In [3]:
# 参考教程 https://jaketae.github.io/study/pytorch-rnn/

# 下载训练语料

In [2]:
# NOTE(review): hard-coded absolute path to one user's workspace. The relative paths
# used later in the notebook (e.g. ./data/name2lang/names) resolve against this
# directory, so adjust it before running elsewhere.
%cd /mnt/bn/tob-lq/qianweishuo/pytorchTutorial
%pwd

/mnt/bn/tob-lq/qianweishuo/pytorchTutorial


'/mnt/bn/tob-lq/qianweishuo/pytorchTutorial'

In [21]:
!curl -o name2lang.zip https://download.pytorch.org/tutorial/data.zip
!mkdir -p data/name2lang && unzip name2lang.zip -d data/name2lang
!mv data/name2lang/data/* data/name2lang/
!rm -rf ./data.zip data/name2lang/data

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2814k  100 2814k    0     0  2907k      0 --:--:-- --:--:-- --:--:-- 2904k
Archive:  name2lang.zip
   creating: data/name2lang/data/
  inflating: data/name2lang/data/eng-fra.txt  
   creating: data/name2lang/data/names/
  inflating: data/name2lang/data/names/Arabic.txt  
  inflating: data/name2lang/data/names/Chinese.txt  
  inflating: data/name2lang/data/names/Czech.txt  
  inflating: data/name2lang/data/names/Dutch.txt  
  inflating: data/name2lang/data/names/English.txt  
  inflating: data/name2lang/data/names/French.txt  
  inflating: data/name2lang/data/names/German.txt  
  inflating: data/name2lang/data/names/Greek.txt  
  inflating: data/name2lang/data/names/Irish.txt  
  inflating: data/name2lang/data/names/Italian.txt  
  inflating: data/name2lang/data/names/Japanese.txt  
  inflating: data/name2lang/data/names/Korean

# lang2label, name2tensor

In [22]:
# !pip3 install unidecode

In [3]:
import os
import random
from string import ascii_letters

import torch
from torch import nn
import torch.nn.functional as F
from unidecode import unidecode

# Seed every RNG the notebook draws from: Python's `random` drives the per-epoch
# random.shuffle of the training set, torch drives parameter and hidden-state
# initialisation.
SEED = 42
random.seed(SEED)
_ = torch.manual_seed(SEED)
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
data_dir = "./data/name2lang/names"
# Map each language (file name without extension) to a 1-element LongTensor class id.
# os.listdir returns entries in arbitrary order, so sort the listing to give every
# language the same id across runs and machines.
lang2label = {
    file_name.split(".")[0]: torch.tensor([i], dtype=torch.long)
    for i, file_name in enumerate(sorted(os.listdir(data_dir)))
}
lang2label

{'Arabic': tensor([0]),
 'Chinese': tensor([1]),
 'Czech': tensor([2]),
 'Dutch': tensor([3]),
 'English': tensor([4]),
 'French': tensor([5]),
 'German': tensor([6]),
 'Greek': tensor([7]),
 'Irish': tensor([8]),
 'Italian': tensor([9]),
 'Japanese': tensor([10]),
 'Korean': tensor([11]),
 'Polish': tensor([12]),
 'Portuguese': tensor([13]),
 'Russian': tensor([14]),
 'Scottish': tensor([15]),
 'Spanish': tensor([16]),
 'Vietnamese': tensor([17])}

In [5]:
# Number of target classes (one per entry in lang2label); used as the output size
# of both classifiers defined below.
num_langs = len(lang2label)

In [6]:
# Vocabulary: the ASCII letters followed by a few punctuation characters.
# A character's position in this string is its one-hot index.
vocabulary = ascii_letters + " .,:;-'"
char2idx = dict(zip(vocabulary, range(len(vocabulary))))
num_letters = len(char2idx)
num_letters

59

In [7]:
def name2tensor(name):
    """One-hot encode ``name`` character by character.

    Returns a float tensor of shape ``(len(name), 1, num_letters)``. The middle axis
    is a batch dimension of size 1, following the ``(seq_len, batch_size, input_size)``
    layout that PyTorch RNN layers expect.

    Raises:
        KeyError: if ``name`` contains a character that is not a key of ``char2idx``.
    """
    # Resolve every column index up front; an unknown character raises here.
    columns = [char2idx[char] for char in name]
    encoded = torch.zeros(len(columns), 1, num_letters)
    for step, column in enumerate(columns):
        encoded[step, 0, column] = 1
    return encoded

In [8]:
# Quick check: "abc" should set indices 0, 1 and 2 in a (3, 1, num_letters) tensor.
name2tensor("abc")

tensor([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]]])

# 数据集构建

In [9]:
# Parallel lists: tensor_names[k] is the one-hot encoding of a name and
# target_langs[k] is the label tensor of the language file it came from.
tensor_names = []
target_langs = []
num_skipped = 0  # names dropped because they contain a character outside char2idx

# Sort the listing so the sample order does not depend on the filesystem.
for file in sorted(os.listdir(data_dir)):
    lang = file.split(".")[0]
    label = lang2label.get(lang)
    if label is None:
        # No label for this file: skip it entirely rather than appending tensors
        # without a matching target, which would misalign the two lists.
        continue
    # Read with an explicit encoding: the files contain accented characters and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(os.path.join(data_dir, file), encoding="utf-8") as f:
        # unidecode transliterates accented characters to plain ASCII.
        names = [unidecode(line.rstrip()) for line in f]
    for name in names:
        if not name:
            continue  # blank line: a zero-length sequence cannot be classified
        try:
            encoded = name2tensor(name)
        except KeyError:
            # Best effort: drop names with out-of-vocabulary characters, but count them.
            num_skipped += 1
            continue
        tensor_names.append(encoded)
        target_langs.append(label)

len(target_langs), target_langs[:3]

(20070, [tensor([0]), tensor([0]), tensor([0])])

In [10]:
from sklearn.model_selection import train_test_split

# Stratified 90/10 split over sample indices, so every language keeps its share in
# both subsets.
train_idx, test_idx = train_test_split(
    range(len(target_langs)),
    test_size=0.1,
    shuffle=True,
    random_state=42,  # fixed so the split is reproducible; torch.manual_seed does not seed sklearn
    stratify=[e.item() for e in target_langs],  # plain ints, not stratify=target_langs (a list of tensors)
)
train_dataset = [(tensor_names[i], target_langs[i]) for i in train_idx]
test_dataset = [(tensor_names[i], target_langs[i]) for i in test_idx]

In [11]:
# Sanity-check the split sizes (test_size=0.1, i.e. roughly a 90/10 split).
print(f"Train: {len(train_dataset)}")
print(f"Test: {len(test_dataset)}")

Train: 18063
Test: 2007


# Simple RNN

In [12]:
class MyRNN(nn.Module):
    """Hand-written single-step recurrent cell used for name classification.

    One call to ``forward`` consumes one input vector together with the previous
    hidden state and returns class logits plus the next hidden state; the caller
    is responsible for looping over the sequence.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        # Both projections read the concatenation [input, hidden].
        combined_size = input_size + hidden_size
        self.in2hidden = nn.Linear(combined_size, hidden_size)
        self.in2output = nn.Linear(combined_size, output_size)

    def forward(self, x, hidden_state):
        """Run one time step.

        Args:
            x: input of shape ``(1, input_size)``.
            hidden_state: previous hidden state of shape ``(1, hidden_size)``.

        Returns:
            ``(logits, next_hidden)`` with shapes ``(1, output_size)`` and
            ``(1, hidden_size)``.
        """
        combined = torch.cat((x, hidden_state), dim=1)
        next_hidden = self.in2hidden(combined).sigmoid()
        logits = self.in2output(combined)
        return logits, next_hidden

    def init_hidden(self):
        """Return a fresh ``(1, hidden_size)`` hidden state drawn with Kaiming-uniform init."""
        hidden = torch.empty(1, self.hidden_size)
        nn.init.kaiming_uniform_(hidden)
        return hidden


# Hyper-parameters. Both values are reused when the GRU model is built later on.
hidden_size = 256
learning_rate = 0.001

# The model is not moved to `device` here, so MyRNN and its training loop stay on
# the default (CPU) device.
model = MyRNN(input_size=num_letters, hidden_size=hidden_size, output_size=num_langs)
# CrossEntropyLoss takes raw logits and integer class targets.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [14]:
!nvidia-smi

262.61s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Wed Jun 19 23:29:52 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.129.06   Driver Version: 470.129.06   CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:60:00.0 Off |                    0 |
| N/A   42C    P0    66W / 300W |      3MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [16]:
num_epochs = 2
print_interval = 30
# Optional cap on optimisation steps per epoch. None trains on the whole training
# set; set a small integer (e.g. 300) for a quick smoke test.
max_steps_per_epoch = None

for epoch in range(num_epochs):
    # Enter training mode explicitly; earlier cells may have left the model in eval mode.
    model.train()
    random.shuffle(train_dataset)
    for i, (name, label) in enumerate(train_dataset):
        if max_steps_per_epoch is not None and i >= max_steps_per_epoch:
            break
        if name.size(0) == 0:
            # A zero-length sequence would skip the inner loop and reuse a stale `output`.
            continue

        hidden_state = model.init_hidden()  # (1, hidden_size)
        for char in name:  # each char is (1, num_letters)
            output, hidden_state = model(char, hidden_state)
        # Only the logits after the last character are scored against the target.
        loss = criterion(output, label)

        optimizer.zero_grad()
        loss.backward()
        # Clip the global L2 gradient norm to 1 to limit exploding gradients.
        nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=1, norm_type=2.0)
        optimizer.step()

        if (i + 1) % print_interval == 0:
            print(f"Epoch [{epoch + 1}/{num_epochs}], " f"Step [{i + 1}/{len(train_dataset)}], " f"Loss: {loss.item():.4f}")

Epoch [1/2], Step [30/18063], Loss: 0.0239
Epoch [1/2], Step [60/18063], Loss: 0.1119
Epoch [1/2], Step [90/18063], Loss: 0.1356
Epoch [1/2], Step [120/18063], Loss: 3.0097
Epoch [1/2], Step [150/18063], Loss: 0.0751


KeyboardInterrupt: 

In [17]:
from tqdm.auto import tqdm

# Evaluate MyRNN one test name at a time and report top-1 accuracy.
model.eval()
num_samples = len(test_dataset)
num_correct = 0

with torch.no_grad():
    for name, label in tqdm(test_dataset, desc='逐条测试'):
        hidden_state = model.init_hidden()
        for char in name:
            output, hidden_state = model(char, hidden_state)
        # Predicted class = index of the largest logit after the last character.
        pred = torch.max(output, dim=1).indices
        if pred == label:
            num_correct += 1
print(f"Accuracy: {num_correct / num_samples * 100:.4f}%")

逐条测试:   0%|          | 0/2007 [00:00<?, ?it/s]

Accuracy: 46.8859%


In [18]:
# Inverse of lang2label: integer class id -> language name, used to decode predictions.
label2lang = {label.item(): lang for lang, label in lang2label.items()}


def myrnn_predict(name, net=None):
    """Predict the language of ``name`` with the step-wise ``MyRNN`` classifier.

    Args:
        name: Name to classify. It is passed through ``unidecode`` first, the same
            transliteration applied to the training names, so accented input maps
            onto the ASCII vocabulary.
        net: Model to use. Defaults to the notebook-global ``model``; pass the MyRNN
            instance explicitly once ``model`` has been rebound to another network.

    Returns:
        The predicted language name, looked up in ``label2lang``.

    Raises:
        ValueError: if ``name`` is empty, since there is no character to feed the network.
        KeyError: if ``name`` still contains a character outside ``char2idx`` after
            transliteration.
    """
    net = model if net is None else net
    tensor_name = name2tensor(unidecode(name))
    if tensor_name.size(0) == 0:
        raise ValueError("name must contain at least one character")

    # Remember the current mode so prediction does not silently flip the model
    # into training mode for whoever calls next.
    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            hidden_state = net.init_hidden()
            for char in tensor_name:
                output, hidden_state = net(char, hidden_state)
            pred = output.argmax(dim=1)
    finally:
        net.train(was_training)
    return label2lang[pred.item()]


# Spot-check MyRNN on a few hand-picked names.
sample_names = ['Mike', 'Qin', 'Slaveya']
for sample in sample_names:
    print(myrnn_predict(sample))

Russian
Russian
Russian


# PyTorch GRU on CUDA

In [22]:
class GRUModel(nn.Module):
    """Multi-layer GRU over one-hot characters followed by a linear classifier.

    Args:
        num_layers: Number of stacked GRU layers.
        hidden_size: Width of the GRU hidden state.
        input_size: Size of each input vector. Defaults to the notebook-global
            ``num_letters``.
        output_size: Number of output classes. Defaults to the notebook-global
            ``num_langs``.
    """

    def __init__(self, num_layers, hidden_size, input_size=None, output_size=None):
        super().__init__()
        # Fall back to the notebook globals only when the sizes are not given, so
        # the original two-argument construction keeps working.
        if input_size is None:
            input_size = num_letters
        if output_size is None:
            output_size = num_langs
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Classify a batch of sequences.

        Args:
            x: Tensor of shape ``(seq_len, batch_size, input_size)``.

        Returns:
            Logits of shape ``(batch_size, output_size)``, computed from the GRU
            output at the last time step.
        """
        # Size the initial state from the input so any batch size and device work.
        hidden_state = self.init_hidden(batch_size=x.size(1), device=x.device)
        output, hidden_state = self.gru(x, hidden_state)
        return self.fc(output[-1])

    def init_hidden(self, batch_size=1, device=None):
        """Return an all-zero initial state of shape ``(num_layers, batch_size, hidden_size)``.

        When ``device`` is omitted, the state is created on the device of the
        model's own parameters.
        """
        reference = next(self.parameters())
        if device is None:
            device = reference.device
        return torch.zeros(
            self.num_layers,
            batch_size,
            self.hidden_size,
            device=device,
            dtype=reference.dtype,
        )


# NOTE: this rebinds the notebook-global `model` and `optimizer`. From here on
# `model` is the GRU, so code written against the MyRNN interface
# (model(char, hidden_state)) no longer applies to it.
# hidden_size and learning_rate are reused from the MyRNN section.
model = GRUModel(num_layers=2, hidden_size=hidden_size).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [23]:
from tqdm.auto import trange, tqdm

for epoch in trange(num_epochs):
    # Enter training mode explicitly: the evaluation cell leaves the model in eval
    # mode, so re-running this cell afterwards would otherwise train in eval mode.
    model.train()
    random.shuffle(train_dataset)
    for i, (name, label) in enumerate(tqdm(train_dataset)):
        if name.size(0) == 0:
            continue  # a zero-length sequence has no last time step to classify

        # The whole name is fed at once: (seq_len, 1, num_letters) -> (1, num_langs).
        output = model(name.to(device))
        loss = criterion(output, label.to(device))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i + 1) % print_interval == 0:
            print(f"Epoch [{epoch + 1}/{num_epochs}], " f"Step [{i + 1}/{len(train_dataset)}], " f"Loss: {loss.item():.4f}")

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/18063 [00:00<?, ?it/s]

Epoch [1/2], Step [30/18063], Loss: 0.2804
Epoch [1/2], Step [60/18063], Loss: 5.4819
Epoch [1/2], Step [90/18063], Loss: 1.7093
Epoch [1/2], Step [120/18063], Loss: 1.6225
Epoch [1/2], Step [150/18063], Loss: 0.3381
Epoch [1/2], Step [180/18063], Loss: 2.9073
Epoch [1/2], Step [210/18063], Loss: 4.1206
Epoch [1/2], Step [240/18063], Loss: 1.8794
Epoch [1/2], Step [270/18063], Loss: 4.1809
Epoch [1/2], Step [300/18063], Loss: 2.8752
Epoch [1/2], Step [330/18063], Loss: 2.3095
Epoch [1/2], Step [360/18063], Loss: 9.1140
Epoch [1/2], Step [390/18063], Loss: 3.0998
Epoch [1/2], Step [420/18063], Loss: 0.1925
Epoch [1/2], Step [450/18063], Loss: 3.3917
Epoch [1/2], Step [480/18063], Loss: 0.7813
Epoch [1/2], Step [510/18063], Loss: 0.8431
Epoch [1/2], Step [540/18063], Loss: 2.0390
Epoch [1/2], Step [570/18063], Loss: 1.6805
Epoch [1/2], Step [600/18063], Loss: 0.9868
Epoch [1/2], Step [630/18063], Loss: 4.0865
Epoch [1/2], Step [660/18063], Loss: 1.1449
Epoch [1/2], Step [690/18063], Loss

  0%|          | 0/18063 [00:00<?, ?it/s]

Epoch [2/2], Step [30/18063], Loss: 0.0001
Epoch [2/2], Step [60/18063], Loss: 0.4895
Epoch [2/2], Step [90/18063], Loss: 0.0022
Epoch [2/2], Step [120/18063], Loss: 0.1502
Epoch [2/2], Step [150/18063], Loss: 0.0015
Epoch [2/2], Step [180/18063], Loss: 4.0186
Epoch [2/2], Step [210/18063], Loss: 0.9032
Epoch [2/2], Step [240/18063], Loss: 1.7574
Epoch [2/2], Step [270/18063], Loss: 0.0044
Epoch [2/2], Step [300/18063], Loss: 0.1591
Epoch [2/2], Step [330/18063], Loss: 0.0001
Epoch [2/2], Step [360/18063], Loss: 0.0003
Epoch [2/2], Step [390/18063], Loss: 1.5843
Epoch [2/2], Step [420/18063], Loss: 0.0064
Epoch [2/2], Step [450/18063], Loss: 0.0115
Epoch [2/2], Step [480/18063], Loss: 0.1849
Epoch [2/2], Step [510/18063], Loss: 0.0010
Epoch [2/2], Step [540/18063], Loss: 0.1300
Epoch [2/2], Step [570/18063], Loss: 4.7715
Epoch [2/2], Step [600/18063], Loss: 0.0622
Epoch [2/2], Step [630/18063], Loss: 1.6822
Epoch [2/2], Step [660/18063], Loss: 0.0150
Epoch [2/2], Step [690/18063], Loss

In [26]:
# Compute the denominator here instead of relying on a `num_samples` variable left
# behind by the MyRNN evaluation cell.
num_samples = len(test_dataset)
num_correct = 0

model.eval()
with torch.no_grad():
    for name, label in test_dataset:
        output = model(name.to(device))
        pred = output.argmax(dim=1)
        # Labels stay on the CPU, so bring the prediction back before comparing.
        num_correct += int((pred.cpu() == label).item())

print(f"Accuracy: {num_correct / num_samples * 100:.4f}%")

Accuracy: 81.5147%


In [28]:
def pytorch_predict(name, net=None):
    """Predict the language of ``name`` with the GRU classifier.

    Args:
        name: Name to classify. It is passed through ``unidecode`` first, the same
            transliteration applied to the training names, so accented input maps
            onto the ASCII vocabulary.
        net: Model to use. Defaults to the notebook-global ``model``.

    Returns:
        The predicted language name, looked up in ``label2lang``.

    Raises:
        ValueError: if ``name`` is empty, since there is no character to feed the network.
        KeyError: if ``name`` still contains a character outside ``char2idx`` after
            transliteration.
    """
    net = model if net is None else net
    tensor_name = name2tensor(unidecode(name))
    if tensor_name.size(0) == 0:
        raise ValueError("name must contain at least one character")

    # Restore whatever mode the model was in instead of forcing training mode.
    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            output = net(tensor_name.to(device))
            pred = output.argmax(dim=1)
    finally:
        net.train(was_training)
    return label2lang[pred.item()]


# Spot-check the GRU classifier on a few hand-picked names.
sample_names = ['Mike', 'Michael', 'Qin', 'Zhang', 'Slaveya', 'Svanovski']
for sample in sample_names:
    print(pytorch_predict(sample))

Japanese
German
Chinese
Chinese
Italian
Russian
