In [45]:
import torch
import matplotlib.pyplot as plt
import torch.nn.functional as F
%matplotlib inline

In [46]:
# Load the name corpus, one name per line. The `with` block guarantees the
# file handle is closed as soon as the contents have been read.
with open('names.txt','r') as names_file:
  words = names_file.read().splitlines()
words[:9]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper']

In [47]:
# Alphabet of the corpus: every distinct character across all names, sorted.
chars = sorted(set(''.join(words)))

In [48]:
chars

['a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [49]:
# Character <-> integer vocabulary. Index 0 is reserved for '.', the
# start/end-of-word token, so the letters are numbered starting from 1.
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
# Inverse lookup, obtained by flipping each (character, index) pair of stoi.
itos = {idx: ch for ch, idx in stoi.items()}
itos


{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z',
 0: '.'}

In [50]:
stoi

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '.': 0}

In [51]:
# Build the dataset
block_size = 3  # context length: how many previous characters are used to predict the next one

def build_dataset(words):
  """Turn a list of words into (context, next-character) training pairs.

  Each word is terminated with '.' and scanned left to right. For every
  character, the `block_size` preceding character indices (padded on the left
  with 0, the index of '.') form one input row and the character's own index
  is the target. Example for 'olivia': ... -> o, ..o -> l, .ol -> i, oli -> v,
  liv -> i, ivi -> a, via -> .

  Reads the module-level `block_size` and `stoi`.

  Args:
    words: iterable of strings whose characters are all keys of `stoi`.

  Returns:
    (x, y): x is an int64 tensor of shape (N, block_size) holding the contexts,
    y is an int64 tensor of shape (N,) holding the targets. For empty input the
    shapes are (0, block_size) and (0,).

  Raises:
    KeyError: if a word contains a character that is missing from `stoi`.

  Side effects:
    Prints the shapes of x and y.
  """
  contexts, targets = [], []
  for w in words:
    context = [0] * block_size          # start every word from an all-'.' context
    for ch in w + '.':
      ix = stoi[ch]
      contexts.append(context)
      targets.append(ix)
      # Slide the window. Concatenation builds a new list, so the rows already
      # stored in `contexts` are never mutated.
      context = context[1:] + [ix]
  # Pin dtype and shape explicitly: torch.tensor([]) would otherwise be a 1-D
  # float32 tensor, which breaks downstream indexing when `words` is empty.
  x = torch.tensor(contexts, dtype=torch.long).reshape(len(contexts), block_size)
  y = torch.tensor(targets, dtype=torch.long)
  print(x.shape,y.shape)

  return x,y

# Sanity check: build the dataset for the first three names only and display the (x, y) tensors.
build_dataset(words[:3])



torch.Size([16, 3]) torch.Size([16])


(tensor([[ 0,  0,  0],
         [ 0,  0,  5],
         [ 0,  5, 13],
         [ 5, 13, 13],
         [13, 13,  1],
         [ 0,  0,  0],
         [ 0,  0, 15],
         [ 0, 15, 12],
         [15, 12,  9],
         [12,  9, 22],
         [ 9, 22,  9],
         [22,  9,  1],
         [ 0,  0,  0],
         [ 0,  0,  1],
         [ 0,  1, 22],
         [ 1, 22,  1]]),
 tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0]))

In [52]:
import random

# Fixed seed so the shuffle, and therefore the split, is reproducible.
# NOTE: `words` is shuffled in place; re-running only this cell shuffles the
# already-shuffled list again and yields a different split.
random.seed(42)
random.shuffle(words)

# Word-level split boundaries: first 80% train, next 10% validation, last 10% test.
n1 = int(len(words) * 0.8)
n2 = int(len(words) * 0.9)

train_words = words[:n1]
val_words = words[n1:n2]
test_words = words[n2:]

xtr, ytr = build_dataset(train_words)
xval, yval = build_dataset(val_words)
xtest, ytest = build_dataset(test_words)


torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [54]:
# Model hyperparameters
n_embedding = 10        # each character is represented as a 10-dimensional vector
vocab_size = len(itos)  # 27
n_hidden = 200          # number of neurons in the hidden layer

# A single seeded generator feeds every draw below, in this exact order.
g = torch.Generator().manual_seed(2147483647)

# C is the embedding matrix (the usual name for it in NLP): one row per character.
C = torch.randn((vocab_size, n_embedding), generator=g)
W1 = torch.randn((n_embedding * block_size, n_hidden), generator=g)
# b1 is broadcast when added: it lines up with the last dimension of the activations.
b1 = torch.randn(n_hidden, generator=g)
W2 = torch.randn((n_hidden, vocab_size), generator=g)
b2 = torch.randn(vocab_size, generator=g)
parameters = [C, W1, b1, W2, b2]

# 27*10 + 30*200 + 200 + 200*27 + 27 = 11897
print(sum(p.numel() for p in parameters))

for p in parameters:
  p.requires_grad_(True)  # have autograd track these tensors for backpropagation

11897


In [62]:
xtr.shape[0]

182625

In [85]:
# Mini-batch SGD training of the MLP:
# embedding lookup -> tanh hidden layer -> logits -> cross-entropy loss.
max_step = 200000
batch_size = 32
lossi = []  # log10 of the mini-batch loss, one entry per executed step
for i in range(max_step):
  ## build a mini-batch
  # The size argument is the 1-tuple (batch_size,), so idx has shape (32,);
  # rows are sampled independently, so the same row can appear twice.
  idx = torch.randint(0, xtr.shape[0], (batch_size,), generator=g)
  xb, yb = xtr[idx], ytr[idx]  # xb: (32, 3) contexts, yb: (32,) target indices

  # forward pass
  # Indexing the (27, 10) embedding matrix C with the (32, 3) integer tensor xb
  # looks up one embedding row per index -> (32, 3, 10).
  emb = C[xb]
  # Flatten the three context embeddings of each example into one row -> (32, 30).
  embconcat = emb.view(emb.shape[0], -1)
  # Hidden-layer pre-activation (before the non-linearity): (32, 30) @ (30, 200) -> (32, 200).
  hpreact = embconcat @ W1 + b1
  h = torch.tanh(hpreact)  # activation function, still (32, 200)
  logits = h @ W2 + b2     # (32, 200) @ (200, 27) -> (32, 27)
  # yb holds class indices, not one-hot rows (e.g. the targets for 'emma' are 5, 13, 13, 1, 0).
  loss = F.cross_entropy(logits, yb)

  # backward pass
  for p in parameters:
    p.grad = None  # drop the gradients left over from the previous step
  loss.backward()

  # update: the learning rate drops from 0.1 to 0.01 after 100000 steps
  # (fast progress early on, more stable steps later).
  lr = 0.1 if i < 100000 else 0.01
  for p in parameters:
    # Gradient DESCENT: step against the gradient. Adding lr * grad would move
    # the parameters uphill and increase the loss.
    p.data -= lr * p.grad

  # progress report every 10000 steps
  if i % 10000 == 0:
    print(f'{i:7d}/{max_step:7d}: {loss.item():.4f}')
  lossi.append(loss.log10().item())

  # NOTE(review): this stops after the very first step, presumably so the
  # inspection cells below can look at a single batch. Remove it to actually
  # train for max_step steps.
  break

In [None]:
# Lee Jin-wook's curiosity corner: ad-hoc inspection cells below

In [77]:
logits.shape

torch.Size([32, 27])

In [79]:
logits

tensor([[-7.8213e+00,  7.7030e+00,  1.3639e+01, -7.6651e+00,  1.2807e+00,
         -1.9651e+00, -7.8856e-01, -4.6155e+00, -2.3033e+01, -3.7568e+00,
          1.6521e+01,  7.9513e+00,  1.9079e+00, -8.0861e+00,  6.0954e-01,
          2.1489e+01, -8.7937e+00,  1.1714e+01,  2.7585e+01,  2.0314e+01,
         -8.1255e+00,  8.9166e+00, -1.0405e+01,  1.0608e+01,  1.8319e+01,
         -3.4983e+00,  1.9232e+01],
        [ 1.2412e+01, -7.6996e+00,  2.3776e+00,  6.5568e+00, -6.7900e+00,
         -1.5600e+01, -2.1210e+01, -6.8933e-01,  1.3283e+01, -1.2542e+01,
         -4.2807e+00,  2.5917e+01,  1.7245e+00, -1.9725e+01,  2.6588e+00,
          7.7760e+00, -1.5661e+01,  1.4815e+01,  1.6635e+01, -9.3979e+00,
         -6.0412e+00, -2.7174e+00, -1.9348e+00, -4.2945e+00, -9.4654e+00,
         -5.1644e+00,  7.4092e-01],
        [-1.1465e+01, -2.3577e+00, -1.9897e+01, -1.1001e+01, -5.2540e-01,
         -4.8786e+00, -8.6600e+00,  1.4336e+01, -5.2337e+00,  6.3359e+00,
         -1.4439e+01, -1.8999e+00,  4.75

In [80]:
yb

tensor([19,  1,  5,  0, 14, 14, 18,  8, 11, 25,  2,  9, 26, 15, 14, 19,  5, 13,
         0,  0,  8, 12,  9, 25,  1,  5,  8,  8,  5,  1,  1, 22])

In [78]:
yb.shape

torch.Size([32])

In [71]:
h.shape

torch.Size([32, 200])

In [82]:
h

tensor([[-0.9986,  0.0272, -0.9958,  ...,  0.5002,  0.9998,  0.9967],
        [-1.0000,  0.9604, -0.1418,  ..., -0.1266,  1.0000,  1.0000],
        [ 0.9923, -0.9907, -1.0000,  ..., -0.9999, -0.9476,  0.9999],
        ...,
        [-0.9948, -0.2404,  0.9418,  ..., -0.9999,  0.9992,  1.0000],
        [ 0.9780, -0.9986, -0.9931,  ..., -1.0000, -0.9995, -0.9460],
        [ 0.9958,  0.9997, -1.0000,  ..., -0.8982, -1.0000,  0.6227]],
       grad_fn=<TanhBackward0>)

In [83]:
hpreact.shape

torch.Size([32, 200])

In [84]:
hpreact

tensor([[-3.6460,  0.0272, -3.0791,  ...,  0.5495,  4.6690,  3.1952],
        [-9.8810,  1.9506, -0.1427,  ..., -0.1273,  5.3983,  7.5101],
        [ 2.7799, -2.6841, -6.1061,  ..., -5.2841, -1.8080,  4.9415],
        ...,
        [-2.9766, -0.2452,  1.7540,  ..., -5.1640,  3.9082, 15.4719],
        [ 2.2498, -3.6238, -2.8332,  ..., -5.8289, -4.1230, -1.7921],
        [ 3.0773,  4.3283, -6.9878,  ..., -1.4626, -5.4021,  0.7294]],
       grad_fn=<AddBackward0>)

In [65]:
xb.shape

torch.Size([32, 3])

In [64]:
emb.shape

torch.Size([32, 3, 10])

In [68]:
embconcat.shape

torch.Size([32, 30])