In [23]:
import torch
import numpy
import matplotlib.pyplot as plt
import torch.nn.functional as F

In [24]:
words = open('names.txt','r').read().splitlines()
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)} #i+1 인이유 enumerate의 첫 index - 0이고 이를 시작토큰 .으로 설정
stoi['.'] = 0
itos = {i+1:s for i,s in enumerate(chars)} # {i:s for i,s in stoi.items()} 도 가능
itos[0] = '.'
itos
#데이터셋 만들기
vocab_size = 27
block_size = 8 #context length 몇개의 알파벳을 다음문자를 예측하기 위해 쓸거냐
def build_dataset(words):
  x,y=[],[]
  for w in words:
    context = [0] * block_size
    for ch in w + '.':                #olivia [0,0,0]
      ix =stoi[ch]                    #           o,  l,  i,  v,  i,  a
      x.append(context)               #[[0,0,0]] ...,..o,.ol,oli,liv,iva,...
      y.append(ix)                    #[15] <= olivia 의 stoi가 들어감
      context = context[1:] +[ix]     # 값전달 list
  #build_dataset(words[:3])
  #
  x=torch.tensor(x)
  y=torch.tensor(y)
  print(x.shape,y.shape)

  return x,y

build_dataset(words[:3])

import random
random.seed(42)
random.shuffle(words)

n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

xtr,ytr = build_dataset(words[:n1])
xval, yval = build_dataset(words[n1:n2])
xtest, ytest = build_dataset(words[n2:])


torch.Size([16, 8]) torch.Size([16])
torch.Size([182625, 8]) torch.Size([182625])
torch.Size([22655, 8]) torch.Size([22655])
torch.Size([22866, 8]) torch.Size([22866])


In [25]:
class Linear:
    def __init__(self, fan_in, fan_out, bias = True):
        self.weight = torch.randn((fan_in, fan_out))
        self.weight /= fan_in ** 0.5
        self.bias = torch.zeros((fan_out)) if bias else None

    def __call__(self, x):
        self.out = x @ self.weight
        if self.bias is not None:
            self.out += self.bias
        return self.out

    def parameter(self):
        return [self.weight] + ([] if self.bias is None else [self.bias])



#----------------------------------------------------------------------

class batchnorm1d:

  def __init__(self,dim,eps=1e-5,momentum=0.1):
    # eps 는 batchnorm 과정중 분모가 0이 되는 걸 막는 작은값

    self.eps =  eps
    self.momentum = momentum
    self.training = True
    #parameter

    #gamma 값이 1인 이유= 직선식에서 ax+b 부분의 a부분이기 때문 초기화 값 당연히 1
    #beta 값이 1인이유 = 직선식에서 ax+b 부분의 b부분의 초기화 값이기 때문
    #gamma beta는 scale shift를 위한것 다양한 값을 선택하기 위함
    #gaussian 분포의 x,y축으로의 움직임

    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)

    #buffer
    #위와비슷 초기 mean이 0으로 선택되면 고려x
    #variance또한 마찬가지 1로 설정되면 고려x
    self.runningmean = torch.zeros(dim)
    self.runningvar= torch.ones(dim)

  def __call__(self,x):

    if self.training:
      xmean = x.mean(0,keepdim=True)  #batchmean
      xvar =x.var(0,keepdim=True,unbiased=True)   #batchvariance
    else:
      xmean = self.runningmean
      xvar= self.runningvar

    xhat=(x-xmean)/torch.sqrt(self.eps+xvar) #actiavationm,batchnorm.ipynb
    self.out = self.gamma*xhat + self.beta

    #buffer 업데이트  momentum comes into play & gradient descent 영향을 받지않음
    #부드럽게 업데이트할 수 있으며, 훈련 초기에는 높은 학습률을 유지함
    #훈련이 진행될수록 이동 평균이 더 중요해지게 됨

    #전체 데이터셋 평균,분산만 사용하는게 아니라 학습하며 얻은정보도 활용
    if self.training:
      with torch.no_grad():
        self.runningmean =  (1-self.momentum)*self.runningmean + self.momentum*xmean
        self.runningvar = (1-self.momentum)*self.runningvar + self.momentum*xvar
    return self.out

  def parameter(self):
    return [self.gamma,self.beta]

#----------------------------------------------------------------------------------

class tanh:
  def __call__(self,x):
    self.out = torch.tanh(x)
    return self.out

  def parameter(self):
    return []
#----------------------------------------------------------------

class embedding():
  def __init__(self,n_embd,embedding_dim):
    self.weight = torch.randn((n_embd,embedding_dim))
    #C matrix

  def __call__(self,x):
    self.out = self.weight[x]
    # C[xb]
    return self.out

  def parameter(self):
    return [self.weight] #C를 param으로 설정

#---------------------------------------------------------------

class flatten():
  def __call__(self,x):
    self.out = x.view(x.shape[0],-1)
    return self.out

  def parameter(self):
    return []

#-----------------------------------------------------------------

class sequential:
  def __init__(self,layers):
    self.layers = layers

  def __call__(self,x):
    for layer in self.layers:
      x= layer(x)
    self.out = layer
    return self.out

  def parameter(self):
    return [p for layer in self.layers for p in layer.parameter()]

#--------------------------------------------------------------------
class flattenconsecutive:
    def __init__(self, n):
        # n is the number of consecutive elements we want (2 in our example)
        self.n = n

    def __call__(self, x):
        # in our example: B = 5, T = 8, C = 10
        B, T, C = x.shape
        # we want to convert X to (5, 4, 20)
        x = x.view(B, T // self.n, C * self.n)

        if x.shape[1] == 1:
            x = x.squeeze(1)

        self.out = x
        return self.out

    def parameter(self):
        return []

In [31]:
n_embd = 24
n_hidden = 128

model = sequential([
  embedding(vocab_size, n_embd),
  flattenconsecutive(2), Linear(n_embd * 2, n_hidden, bias=False), batchnorm1d(n_hidden), tanh(),
  flattenconsecutive(2), Linear(n_hidden*2, n_hidden, bias=False), batchnorm1d(n_hidden), tanh(),
  flattenconsecutive(2), Linear(n_hidden*2, n_hidden, bias=False), batchnorm1d(n_hidden), tanh(),
  Linear(n_hidden, vocab_size),
])

with torch.no_grad():
  model.layers[-1].weight *= 0.1

parameters = model.parameter()
print(sum(p.nelement() for p in parameters))
for p in parameters:
  p.requires_grad = True

max_steps = 200000
batch_size = 32
lossi = []

for i in range(max_steps):

  # minibatch construct
  ix = torch.randint(0, xtr.shape[0], (batch_size,))
  Xb, Yb = xtr[ix], ytr[ix] # batch X,Y

  # forward pass
  logits = model(Xb)
  loss = F.cross_entropy(logits, Yb) # loss function

  # backward pass
  for p in parameters:
    p.grad = None
  loss.backward()

  # update: simple SGD
  lr = 0.1 if i < 150000 else 0.01 # step learning rate decay
  for p in parameters:
    p.data += -lr * p.grad

  # track stats
  if i % 10000 == 0: # print every once in a while
    print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
  lossi.append(loss.log10().item())

76579


TypeError: cross_entropy_loss(): argument 'input' (position 1) must be Tensor, not Linear

In [None]:
# sampling from the model
g = torch.Generator().manual_seed(2147483647 + 10)

for _ in range(20):
    out = []
    context = [0] * block_size
    while True:
        # Forward pass
        logits = model(torch.tensor([context]).reshape(1, -1))
        probs = F.softmax(logits, dim = 1)

        ix = torch.multinomial(probs, num_samples = 1).item()

        # Shift the Context Window
        context = context[1:] + [ix]

        if ix == 0:
            break

        out.append(ix)

    print("".join(itos[i] for i in out))

In [None]:
#print(C.shape)
#print([p.shape for layer in layers for p in layer.parameter()])
#numpy broadcasting 덧셈,곱셈

max_step = 20000
batchsize = 32

lossi = []

for i in range(max_step):

# 미니배치 분리 randint 사용해서 random으로 쓸 xtr 가져옴
  idx = torch.randint(0,xtr.shape[0],(batchsize,))
  xb,yb = xtr[idx],ytr[idx]

  #xtr -> xb
  #shape = [전체 문자,block_size] -> 미니배치 -> [minibatch_size,block_size]

  #class 로 embedding flatten정의했기때문에 layer안에 정의되있음
  #emb = C[xb] # emb matrix C에 | [[1,14,6],...] 문자 xb 임베딩 과정
  #x= emb.view(emb.shape[0],-1) #layers 첫 hidden layer차원과 맞추는 과정 = 임베딩
  
  #------------------------
  #x= xb
  #for layer in layers:
  #  x=layer(x)
  #-------------------------  
  #         |
  #         |
  #         V
  
  logits = model(xb)
  loss = F.cross_entropy(logits,yb)

  #backward pass
  for p in parameters:
    p.grad = None
  loss.backward()

  lr = 0.1 if i < 150000 else 0.01
  for p in parameters:
    p.data += -lr * p.grad

  if i % 10000 == 0:
    print(f'{i:6d}/{max_step:6d}:{loss.item():.3f}')
  lossi.append(loss.log10().item())


In [None]:
#layers 를 더 훈련시키지 않는다 -> eval mode
for layer in model.layers:
  layer.training =False

In [None]:
#,,,eval mode down below

In [None]:
logit.shape

#만든 모델 실행
for i in range(20):
  out = []
  context= [0]*block_size
  while True:
    logits = model(torch.tensor([context]))
    probs = F.softmax(logits,dim=1)

In [None]:
for _ in range(20):
    out = []
    context = [0] * block_size
    while True:
        # Forward pass
        logits = model(torch.tensor([context]).reshape(1, -1))
        probs = F.softmax(logits, dim = 1)

        ix = torch.multinomial(probs, num_samples = 1).item()

        # Shift the Context Window
        context = context[1:] + [ix]

        if ix == 0:
            break

        out.append(ix)

    print("".join(itos[i] for i in out))

In [None]:
plt.plot(torch.tensor(lossi).view(-1,1000))

In [None]:
plt.plot(torch.tensor(lossi).view(-1,1000).mean(1))