In [9]:
import random

import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn.functional as F
%matplotlib inline

In [10]:
#from google.colab import drive
#drive.mount('/content/drive')
#file_path = '/content/drive/MyDrive/names.txt'


In [11]:
# Load the corpus: one name per line. The context manager closes the file
# handle deterministically instead of leaving it open for the kernel's lifetime.
with open('names.txt','r') as names_file:
  words = names_file.read().splitlines()

# Character vocabulary. Index 0 is reserved for '.', which is used both as the
# padding at the start of a context and as the end-of-word token, so the real
# characters are numbered from 1.
chars = sorted(set(''.join(words)))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
# Inverse lookup, derived from stoi so the two tables cannot drift apart.
itos = {i:s for s,i in stoi.items()}

vocab_size = len(itos)

# Dataset construction
block_size = 3 # context length: how many previous characters are used to predict the next one
def build_dataset(words):
  """Turn a list of words into (context, next-character) training pairs.

  Every word is terminated with '.' and preceded by ``block_size`` padding
  tokens (index 0). Each position of the terminated word contributes one
  example: the ``block_size`` indices before it form the context and its own
  index is the label. Uses the module-level ``stoi`` and ``block_size``.

  Prints the shapes of the two tensors and returns them as ``(x, y)``.
  """
  contexts, targets = [], []
  for word in words:
    # e.g. 'olivia' becomes the indices of  . . . o l i v i a .
    encoded = [0] * block_size + [stoi[ch] for ch in word + '.']
    for start in range(len(word) + 1):
      stop = start + block_size
      contexts.append(encoded[start:stop])  # ..., ..o, .ol, oli, liv, ...
      targets.append(encoded[stop])         # the character that follows the window
  x = torch.tensor(contexts)
  y = torch.tensor(targets)
  print(x.shape,y.shape)

  return x,y

# Smoke test on three words so the example shapes are visible before the full build.
build_dataset(words[:3])

# Shuffle a copy with a private, seeded RNG. Shuffling `words` in place made
# this cell non-idempotent: re-running it reshuffled the already-shuffled list
# and silently produced a different train/val/test split. random.Random(42)
# yields the same permutation as random.seed(42) followed by random.shuffle,
# without touching the global random state.
words_shuffled = list(words)
random.Random(42).shuffle(words_shuffled)

# 80% train / 10% validation / 10% test, split on whole words.
n1 = int(0.8*len(words_shuffled))
n2 = int(0.9*len(words_shuffled))

xtr,ytr = build_dataset(words_shuffled[:n1])
xval, yval = build_dataset(words_shuffled[n1:n2])
xtest, ytest = build_dataset(words_shuffled[n2:])


torch.Size([16, 3]) torch.Size([16])
torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [12]:
# Utility: compare a manually derived gradient against the gradient that
# PyTorch's autograd stored on the tensor during loss.backward().

def cmp(s, dt, t):
  """Print how closely the hand-computed gradient ``dt`` matches ``t.grad``.

  Args:
    s: label printed at the start of the report line.
    dt: manually computed gradient tensor.
    t: tensor whose ``.grad`` was populated by autograd (non-leaf tensors
      need ``retain_grad()`` before ``backward()``).

  Prints whether the two are exactly equal, whether they are equal within
  ``torch.allclose`` tolerances, and the largest absolute difference.

  Raises:
    ValueError: if ``t.grad`` is missing or the shapes differ. Without the
      shape check, broadcasting lets a wrongly shaped gradient (for example
      one computed without ``keepdim=True``) be reported as a match.
  """
  if t.grad is None:
    raise ValueError(f'{s}: tensor has no .grad; call retain_grad() and backward() first')
  if dt.shape != t.grad.shape:
    raise ValueError(f'{s}: gradient shape {tuple(dt.shape)} does not match {tuple(t.grad.shape)}')
  ex = torch.all(dt == t.grad).item()
  app = torch.allclose(dt, t.grad)
  maxdiff = (dt - t.grad).abs().max().item()
  print(f'{s:15s}|실제값: {str(ex):5s}| 추정값: {str(app):5s}| 최대차이값:{maxdiff}')


In [13]:
# Network hyperparameters (same setup as the activation / batchnorm notebook)
n_embedding = 10 # dimensionality of each character embedding
n_hidden = 64    # number of units in the hidden layer

g= torch.Generator().manual_seed(2147483647)

C= torch.randn((vocab_size,n_embedding),generator=g)
# Scale by gain/sqrt(fan_in) to control the std of the pre-activations
# (5/3 is the gain conventionally used with tanh).
w1 = torch.randn((n_embedding*block_size,n_hidden), generator= g)*5/3/(n_embedding*block_size)**0.5
# No b1: the batchnorm step that follows subtracts the batch mean, which would
# cancel a bias on this layer, so it is optional.
#b1 = torch.randn(n_hidden,generator =g) * 0.1

w2= torch.randn((n_hidden,vocab_size),generator=g) * 0.1   # small scale keeps the initial logits small
b2 = torch.randn(vocab_size, generator=g)*0.1

# Draw the batchnorm parameters from the seeded generator too. Without
# generator=g they came from the unseeded global RNG, so every kernel restart
# produced different parameters and a different loss.
bngain = torch.randn((1,n_hidden),generator=g)*0.1+ 1.0
bnbias = torch.randn((1,n_hidden),generator=g)*0.1

parameters = [C,w1,w2,b2,bngain,bnbias]
print(sum(p.nelement() for p in parameters)) # total number of parameters
for p in parameters:
  p.requires_grad = True

4073


In [14]:
batch_size = 32
n= batch_size # short alias used in the formulas below

## Build one minibatch
# (batch_size,) is a one-element size tuple, so idx is a 1-D tensor of batch_size random row indices into xtr
idx = torch.randint(0,xtr.shape[0],(batch_size,),generator = g)
xb,yb = xtr[idx],ytr[idx]
# xb: (batch_size, block_size) contexts sampled at random from xtr; yb: the matching batch_size targets

# forward pass
#---------------------------------------------------------------
emb = C[xb] # embed the characters into vectors
# C is the embedding matrix of shape (vocab_size, n_embedding); indexing it with
# the integer tensor xb looks up one row per index, so
# emb has shape (batch_size, block_size, n_embedding)
embconcat = emb.view(emb.shape[0],-1) # concatenate the vectors
# emb.shape[0] is the batch size, so embconcat has shape (batch_size, block_size*n_embedding)

#linear layer | first hidden layer
#--------------------------------------------------
hprebn = embconcat @ w1
# hprebn = hidden pre-activation before batch normalization, i.e. the input to batchnorm

#batchnorm | batch norm layer
#---------------------------------------------------------
bnmean = hprebn.sum(0,keepdim=True)/n # per-feature mean over the batch
bndiff = hprebn - bnmean # deviation from the mean
bndiff2 = bndiff**2 # squared deviation
bnvar = bndiff2.sum(0,keepdim=True)/(n-1) # Bessel's correction: the sample variance divides by n-1 rather than n
bnvar_inv = (bnvar + 1e-5)**-0.5 # 1/sqrt(var + eps), the normalization denominator ("inv" = inverse)


bnraw = bndiff * bnvar_inv # normalized value before the gain/bias (gamma/beta) scale and shift
hpreact = bngain * bnraw + bnbias

# Batch-normalization formula; the 1e-5 epsilon keeps the denominator away from zero.
# One-line equivalent -> hpreact = bngain *(bndiff/(torch.sqrt(bnvar + 1e-5))  ) + bnbias
# It is split into small steps here because the goal is to backpropagate through every intermediate by hand.
#activation | tanh layer
#-----------------------------------------------------
h = torch.tanh(hpreact)

#linear | second hidden layer
#---------------------------------------------------
logits = h @ w2 + b2

#loss function | cross entropy written out by hand
#--------------------------------------------------------

# Computes the same quantity as F.cross_entropy(logits,yb) (up to floating-point rounding),
# broken into the individual steps so each one can be differentiated manually.
logit_maxes = logits.max(1, keepdim=True).values
norm_logits = logits - logit_maxes # subtract each row's max for numerical stability before exp()
counts = norm_logits.exp()
counts_sum = counts.sum(1, keepdims=True)
counts_sum_inv = counts_sum**-1

# Author's note: using (1.0 / counts_sum) instead keeps the manual backward pass from matching autograd exactly.

probs = counts * counts_sum_inv # each row now sums to 1, i.e. a probability distribution
logprobs = probs.log()  # log-probabilities
loss = -logprobs[range(n), yb].mean()
# xb: (batch_size, block_size) tensor of stoi indices, one context window per row
# yb: (batch_size,) tensor of stoi indices, the target character for each row of xb

# Reference backward pass computed by PyTorch's autograd (nothing manual here).
# Clear any previous parameter gradients first.
for param in parameters:
  param.grad = None

# Ask autograd to keep .grad on every intermediate tensor of the forward pass
# so that each one can be compared against a hand-derived gradient.
intermediates = (
  logprobs, probs, counts, counts_sum, counts_sum_inv,
  norm_logits, logit_maxes, logits, h, hpreact, bnraw,
  bnvar_inv, bnvar, bndiff2, bndiff, hprebn, bnmean,
  embconcat, emb,
)
for node in intermediates:
  node.retain_grad()

loss.backward()
loss

tensor(3.4670, grad_fn=<NegBackward0>)

In [15]:
#------------------------------
# loss = -logprobs[range(n), yb].mean(): only one entry per row (the one picked by yb) enters the loss.
# With picked entries a, b, c, ... the loss is -(a+b+c+...)/n, so dloss/da = -1/n.
# Because the loss is a plain average, that gradient is a constant:
# it does not depend on the values of logprobs, and every entry that was not picked gets gradient 0.
dlogprobs =  torch.zeros_like(logprobs)
dlogprobs[range(n),yb] = -1.0/n
cmp('logprobs',dlogprobs,logprobs) # compare the hand-derived gradient with PyTorch's logprobs.grad
#--------------------------------------------------------
dprobs = (1.0/probs) * dlogprobs
# Chain rule: dloss/dprobs = dloss/dlogprobs * dlogprobs/dprobs; logprobs = probs.log(), so dlogprobs/dprobs = 1/probs
cmp('probs',dprobs,probs)
#------------------------------------------------
# probs = counts * counts_sum_inv multiplies a (batch_size, vocab_size) tensor by a (batch_size, 1) tensor:
# each row's counts_sum_inv value is broadcast across all columns of that row.
# Local derivatives: dprobs/dcounts = counts_sum_inv and dprobs/dcounts_sum_inv = counts.
# A broadcast value is reused across the row, so its gradient is the sum over that row (hence sum(1, keepdim=True)).
# NOTE: dcounts below is only the contribution through probs; counts also feeds counts_sum,
# so this is not yet the full gradient and it is not compared against autograd here.
dcounts = counts_sum_inv * dprobs
dcounts_sum_inv =  (dprobs * counts).sum(1,keepdim=True)
cmp('dcounts_sum_inv',dcounts_sum_inv,counts_sum_inv)
# TODO: the remaining gradients (counts_sum and everything upstream) are left for later.

logprobs       |실제값: True | 추정값: True | 최대차이값:0.0
probs          |실제값: True | 추정값: True | 최대차이값:0.0
dcounts_sum_inv|실제값: True | 추정값: True | 최대차이값:0.0


In [21]:
# Bessel's correction (dividing by n-1 instead of n) matters most when estimating the variance of a small sample such as a minibatch.


tensor(3.4670, grad_fn=<NegBackward0>)

In [20]:
# Sanity check: PyTorch's built-in cross-entropy versus the step-by-step loss computed earlier.
fast_loss = F.cross_entropy(logits, yb)
loss_gap = (fast_loss - loss).item()
print(fast_loss.item(), 'diff:', loss_gap)

3.4669768810272217 diff: 7.152557373046875e-07


In [None]:
#batchnorm backward manually