<a href="https://colab.research.google.com/github/igor531205/nlp/blob/main/home_work_1/Model_MLP_for_generating_russian_names.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

***Выполнил Пушкарев Игорь Игоревич. Группа 23.М08-мм.***

### Модель генерации имен на основе multilayer perceptron (MLP).

In [None]:
import random
from urllib import request

# Download the raw list of names (one name per line)
link = 'https://raw.githubusercontent.com/igor531205/nlp/main/data/names_rus.txt'
with request.urlopen(link) as response:
    words = response.read().decode().splitlines()

# Shuffle with a fixed seed so the split is the same on every run
random.seed(42)
random.shuffle(words)
n1 = int(0.85 * len(words))  # 85% training data

# The first n1 shuffled names go to training, the remainder to testing
train_data, test_data = words[:n1], words[n1:]

# Write each split to its own file, one name per line
link_train = 'train.txt'
link_test = 'test.txt'
for path, names in ((link_train, train_data), (link_test, test_data)):
    with open(path, 'w', encoding='utf-8') as out:
        out.writelines("%s\n" % name for name in names)

print(f'Names saved to {link_train} and {link_test}')

Names saved to train.txt and test.txt


In [None]:
import torch
import torch.nn.functional as F

# read file 'train.txt'
link_train = 'train.txt'
# The file was written as UTF-8, so decode it explicitly instead of relying on
# the platform default; the context manager also closes the handle.
with open(link_train, 'r', encoding='utf-8') as f:
  words = f.read().splitlines()

# build the vocabulary of characters and mappings to/from integers
chars = sorted(set(''.join(words)))
stoi = {s: i + 1 for i, s in enumerate(chars)}  # index 0 is kept for '.'
stoi['.'] = 0  # '.' is used both as context padding and as the end-of-name token
itos = {i: s for s, i in stoi.items()}
num_chars = len(itos)

# build the dataset
block_size = 3 # context length: how many characters do we take to predict the next one?

def build_dataset(words):
  """Turn a list of names into (context, next-character) training pairs.

  Every name is terminated with '.' and left-padded with `block_size` zeros
  (the index of '.'). Each character, including the terminator, yields one
  example: the `block_size` indices preceding it and its own index.
  Reads the module-level `stoi` mapping and `block_size`.

  Returns:
    (X, Y): tensors holding the context windows and the target indices.
  """
  contexts, targets = [], []
  for word in words:
    # padded index sequence: block_size zeros followed by the encoded name and '.'
    encoded = [0] * block_size + [stoi[symbol] for symbol in word + '.']
    for pos in range(len(encoded) - block_size):
      contexts.append(encoded[pos:pos + block_size])
      targets.append(encoded[pos + block_size])

  return torch.tensor(contexts), torch.tensor(targets)

Xtr, Ytr = build_dataset(words)

# Model / training hyperparameters
emb_dim = 10       # size of each character embedding
hidden_dim = 200   # units in the tanh hidden layer
num_steps = 20000  # number of SGD minibatch updates
batch_size = 32    # examples per minibatch

# Initialize weights
g = torch.Generator().manual_seed(2147483647) # for reproducibility
C = torch.randn((num_chars, emb_dim), generator=g)
W1 = torch.randn((block_size * emb_dim, hidden_dim), generator=g)
b1 = torch.randn(hidden_dim, generator=g)
W2 = torch.randn((hidden_dim, num_chars), generator=g)
b2 = torch.randn(num_chars, generator=g)
parameters = [C, W1, b1, W2, b2]

for p in parameters:
  p.requires_grad = True

# Model optimization
lossi = []
stepi = []

for i in range(num_steps):

  # minibatch construct (drawn from g so the whole run is reproducible)
  ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)

  # forward pass
  emb = C[Xtr[ix]] # (batch_size, block_size, emb_dim)
  h = torch.tanh(emb.view(-1, block_size * emb_dim) @ W1 + b1) # (batch_size, hidden_dim)
  logits = h @ W2 + b2 # (batch_size, num_chars)
  loss = F.cross_entropy(logits, Ytr[ix])

  # backward pass
  for p in parameters:
    p.grad = None
  loss.backward()

  # update: step decay tied to num_steps, so the lower rate is actually used
  # for the second half of training
  lr = 0.1 if i < num_steps // 2 else 0.01
  with torch.no_grad():
    for p in parameters:
      p -= lr * p.grad

  # track stats
  stepi.append(i)
  lossi.append(loss.log10().item())

# Loss over the full training set (no gradients needed for reporting)
with torch.no_grad():
  emb = C[Xtr] # (N, block_size, emb_dim)
  h = torch.tanh(emb.view(-1, block_size * emb_dim) @ W1 + b1) # (N, hidden_dim)
  logits = h @ W2 + b2 # (N, num_chars)
  loss = F.cross_entropy(logits, Ytr)
print(f'train loss: {loss.item():.4f}')

# Save the model (detached so the checkpoint holds plain tensors without autograd state)
link_model = 'model.torch'
torch.save({'C': C.detach(), 'W1': W1.detach(), 'b1': b1.detach(),
            'W2': W2.detach(), 'b2': b2.detach()}, link_model)

print(f'Model saved to {link_model}')

Model saved to model.torch


In [None]:
import torch

# read file 'test.txt'
link_test = 'test.txt'
with open(link_test, 'r', encoding='utf-8') as f:
  words = f.read().splitlines()

# build the vocabulary of characters and mappings to/from integers
# The saved weights are indexed by the vocabulary derived from 'train.txt'.
# Deriving it from the test names instead would shift the indices whenever the
# test split lacks a training character, so the mapping is rebuilt from the
# training file exactly as it was built for training.
link_train = 'train.txt'
with open(link_train, 'r', encoding='utf-8') as f:
  train_words = f.read().splitlines()
chars = sorted(set(''.join(train_words)))
stoi = {s: i + 1 for i, s in enumerate(chars)}  # index 0 is kept for '.'
stoi['.'] = 0
itos = {i: s for s, i in stoi.items()}
num_chars = len(itos)

# Names containing a character the model never saw cannot be encoded; skip them
# and report how many were dropped instead of failing with a KeyError later.
known_words = [w for w in words if all(ch in stoi for ch in w)]
if len(known_words) != len(words):
  print(f'Skipped {len(words) - len(known_words)} test names with characters unseen in training')
words = known_words

# build the dataset
block_size = 3 # context length: how many characters do we take to predict the next one?

def build_dataset(words):
  """Turn a list of names into (context, next-character) evaluation pairs.

  Every name is terminated with '.' and left-padded with `block_size` zeros
  (the index of '.'). Each character, including the terminator, yields one
  example: the `block_size` indices preceding it and its own index.
  Reads the module-level `stoi` mapping and `block_size`.

  Returns:
    (X, Y): tensors holding the context windows and the target indices.
  """
  contexts, targets = [], []
  for word in words:
    # padded index sequence: block_size zeros followed by the encoded name and '.'
    encoded = [0] * block_size + [stoi[symbol] for symbol in word + '.']
    for pos in range(len(encoded) - block_size):
      contexts.append(encoded[pos:pos + block_size])
      targets.append(encoded[pos + block_size])

  return torch.tensor(contexts), torch.tensor(targets)

# F is used below; importing it here lets this cell run without relying on
# names left in the kernel by an earlier cell.
import torch.nn.functional as F

Xte, Yte = build_dataset(words)

# Loading a Saved Model
link_model = 'model.torch'
model = torch.load(link_model)
C, W1, b1, W2, b2 = model['C'], model['W1'], model['b1'], model['W2'], model['b2']

# sample from the model
g = torch.Generator().manual_seed(2147483647 + 10)

# Evaluation and sampling are inference only, so autograd is switched off.
with torch.no_grad():
  # Loss over the whole test set; the flattened width is taken from W1 so it
  # always matches block_size * embedding size of the loaded model.
  emb = C[Xte] # (N, block_size, emb_dim)
  h = torch.tanh(emb.view(-1, W1.shape[0]) @ W1 + b1) # (N, hidden_dim)
  logits = h @ W2 + b2 # (N, num_chars)
  loss = F.cross_entropy(logits, Yte)
  print(f'test loss: {loss.item()}')

  for _ in range(20):

    out = []
    context = [0] * block_size # initialize with all ...
    while True:
      emb = C[torch.tensor([context])] # (1, block_size, emb_dim)
      h = torch.tanh(emb.view(1, -1) @ W1 + b1)
      logits = h @ W2 + b2
      probs = F.softmax(logits, dim=1)
      ix = torch.multinomial(probs, num_samples=1, generator=g).item()
      context = context[1:] + [ix] # slide the window forward by one character
      out.append(ix)
      if ix == 0: # index 0 is '.', the end-of-name token
        break

    print(''.join(itos[i] for i in out))

test loss: 3.6518492698669434
Дония.
Мотапа.
Дитристина.
Ревватимий.
Ларь.
Равдинарастьян.
Посторий.
Фирафия.
Дфедозард.
Анасторий.
Жераскерд.
Йирий.
Дорита.
Позалина.
Уриса.
Энина.
Дросийналентиния.
Зия.
Адраг.
Фрисонстанцихон.
