In [1]:
# Use the %pip magic (not `! pip`) so the package is installed into the environment of the running kernel.
# The recorded run below resolved datasets 2.18.0; pin that version if exact reproducibility matters.
%pip install datasets

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-

In [2]:
# Mount Google Drive at /content/drive; the helper notebook, the dataset and the
# model checkpoint used in later cells are all read from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import time
import json
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset,DataLoader
import spacy
from datasets import load_dataset
from transformers import AutoTokenizer

In [4]:
# Execute the companion notebook inside this kernel so its definitions become available here.
# NOTE(review): presumably this is where TorchTransformer (used below) is defined — confirm.
%run "/content/drive/MyDrive/MSML641/encoder_decoder.ipynb"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Load the JSON corpus from Drive as a single 'train' split.
data_files = {'train': '/content/drive/MyDrive/MSML641/dataset.json'}
data = load_dataset('json', data_files=data_files)

# One BERT tokenizer per language: English (cased) and Chinese.
tokenizer_en = AutoTokenizer.from_pretrained('bert-base-cased')
tokenizer_zh = AutoTokenizer.from_pretrained('google-bert/bert-base-chinese')

# Switch the split to pandas output, keep only the two token-id columns
# and give them short language names.
data.set_format('pandas')
dataset = (
    data['train'][:][['tokenized_en_id', 'tokenized_zh_id']]
    .rename(columns={'tokenized_en_id': 'english', 'tokenized_zh_id': 'chinese'})
)
dataset.head()

Generating train split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

Unnamed: 0,english,chinese
0,"[101, 14555, 1350, 1120, 1157, 2190, 1104, 110...","[101, 517, 6568, 2168, 518, 2199, 4680, 1045, ..."
1,"[101, 2372, 1128, 4472, 1164, 1103, 2170, 1524...","[101, 872, 2190, 7478, 3828, 1184, 5296, 2773,..."
2,"[101, 2353, 6124, 1108, 1653, 117, 102]","[101, 872, 1957, 3301, 1351, 3221, 4635, 782, ..."
3,"[101, 146, 1156, 1176, 1122, 1106, 1129, 1126,...","[101, 2769, 2361, 3307, 2828, 2124, 1215, 2533..."
4,"[101, 1124, 2993, 1106, 1107, 16811, 1471, 151...","[101, 800, 7444, 6206, 3680, 3299, 711, 5632, ..."


In [6]:
# hyperparameters
# model
vocab_size_en = len(tokenizer_en.vocab)   # number of entries in the English tokenizer vocabulary
vocab_size_zh = len(tokenizer_zh.vocab)   # number of entries in the Chinese tokenizer vocabulary
print(vocab_size_en,vocab_size_zh)
max_length = 512        # max length of the input sequence
n_emb = 512             # embedding size
n_head = 8              # number of heads in multi-head attention
head_size = 64          # number of 'features' output by a single-head self-attention
n_blocks = 3            # number of blocks in a encoder or decoder
n_hidden = 1024         # hidden width passed to the model (presumably the feed-forward size — TODO confirm)
# The concatenated heads must exactly fill the embedding dimension; fail with a readable message otherwise.
assert head_size*n_head == n_emb, (
    f'head_size*n_head ({head_size*n_head}) must equal n_emb ({n_emb})'
)

# training
num_epochs = 20
batch_size = 128
learning_rate = 1e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'   # fall back to CPU when no GPU is present

28996 21128


In [7]:
class textData(Dataset):
  """Map-style dataset over a DataFrame of pre-tokenized sentence pairs.

  The frame must provide an 'english' and a 'chinese' column, each cell holding
  the sequence of token ids of one sentence.
  """

  def __init__(self,data):
    """Store the DataFrame; no copy or conversion is made."""
    super().__init__()
    self.data = data

  def __getitem__(self, index):
    """Return the pair (english_ids, chinese_ids) for row number `index` as tensors.

    Rows are addressed by position (`.iloc`), not by index label, so the dataset
    also works for frames whose index is not 0..n-1 (filtered, shuffled or
    re-indexed frames).
    """
    english_sentence = torch.tensor(self.data['english'].iloc[index])
    chinese_sentence = torch.tensor(self.data['chinese'].iloc[index])
    return english_sentence, chinese_sentence

  def __len__(self):
    """Number of sentence pairs (rows of the DataFrame)."""
    return len(self.data)

In [8]:
def collate(batch):
  """Pad a list of (english_ids, chinese_ids) pairs and build the masks for one batch.

  Returns a dict with:
    'input'                         – English ids, right-padded with the English pad id
    'attention_mask_input'          – square all-False boolean mask over the padded source length
    'attention_mask_input_padding'  – True where 'input' holds the pad id
    'target'                        – Chinese ids, right-padded with the Chinese pad id
    'attention_mask_target'         – square boolean mask of side (target length - 1),
                                      True strictly above the diagonal
    'attention_mask_target_padding' – True where target[:, :-1] holds the pad id
  """
  en_pad = tokenizer_en.pad_token_id
  zh_pad = tokenizer_zh.pad_token_id

  src_seqs = [pair[0] for pair in batch]
  tgt_seqs = [pair[1] for pair in batch]

  src = pad_sequence(src_seqs, batch_first=True, padding_value=en_pad)
  tgt = pad_sequence(tgt_seqs, batch_first=True, padding_value=zh_pad)

  src_len = src.shape[1]
  # The target masks are one step shorter than the padded target (they match target[:, :-1]).
  dec_len = tgt.shape[1] - 1
  steps = torch.arange(dec_len)

  return {
      'input': src,
      'attention_mask_input': torch.zeros((src_len, src_len), dtype=torch.bool),
      'attention_mask_input_padding': src.eq(en_pad),
      'target': tgt,
      # entry [i, j] is True exactly when j > i, i.e. above the main diagonal
      'attention_mask_target': steps.unsqueeze(0) > steps.unsqueeze(1),
      'attention_mask_target_padding': tgt[:, :-1].eq(zh_pad),
  }

In [9]:
# Wrap the DataFrame in the dataset class and serve it in shuffled mini-batches
# that are padded and masked by `collate`.
data = textData(dataset)
dataloader = DataLoader(
    data,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate,
)

In [10]:
# Build the model and move it to the selected device.
model = TorchTransformer(n_emb,head_size,n_head,n_blocks,vocab_size_en,vocab_size_zh,n_hidden,max_length).to(device)
# Re-initialise every parameter with more than one dimension; 1-D parameters keep their defaults.
for p in model.parameters():
  if p.dim() > 1:
    nn.init.kaiming_uniform_(p, nonlinearity='relu')
# The loss is computed against Chinese target ids, so padded positions must be
# identified with the Chinese tokenizer's pad id (not the English one).
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=tokenizer_zh.pad_token_id)
optim = torch.optim.AdamW(model.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=1e-9)
# Cosine decay of the learning rate over `num_epochs` scheduler steps.
# NOTE: the spelling `schedular` is kept because the training cell refers to this name.
schedular = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=num_epochs)

In [11]:
# Report the model size (total element count over all parameters) and the dataset size.
print('parameters of this model: ', sum(map(torch.numel, model.parameters())))
print('number of examples: ', len(data))

parameters of this model:  52275848
number of examples:  100000


In [None]:
# Either restore previously saved weights (load_model = True) or train from scratch and save them.
load_model = False
if load_model:
  # map_location lets a checkpoint written during a GPU run be restored on whatever
  # device this session uses (e.g. a CPU-only runtime).
  model.load_state_dict(torch.load(r'/content/drive/MyDrive/MSML641/model_state_dict.pt', map_location=device))
else:
  count = 0      # number of optimisation steps taken so far
  lossi = []     # loss of every step, plotted at the end
  start = time.time()
  model.train()
  for epoch in range(num_epochs):
    for batch in dataloader:
      inputs = batch['input'].to(device)
      attention_mask_input = batch['attention_mask_input'].to(device)
      attention_mask_input_padding = batch['attention_mask_input_padding'].to(device)
      targets = batch['target'].to(device)
      attention_mask_target = batch['attention_mask_target'].to(device)
      attention_mask_target_padding = batch['attention_mask_target_padding'].to(device)
      # The decoder is fed the target without its last token ...
      # NOTE(review): the source padding mask is passed a second time as the last
      # argument — presumably the memory key-padding mask; confirm against the model's forward().
      y_pred = model(inputs,targets[:,:-1],
                     attention_mask_input,attention_mask_target,
                     attention_mask_input_padding,attention_mask_target_padding,
                     attention_mask_input_padding)

      # ... and is scored against the target shifted left by one position.
      # The transpose moves the last axis of y_pred to position 1, where
      # CrossEntropyLoss expects the class dimension.
      loss = loss_fn(y_pred.transpose(1,2), targets[:,1:])
      optim.zero_grad()
      loss.backward()
      optim.step()
      lossi.append(loss.item())
      count += 1
      if count % 200 == 0:
        print('loss: ',loss.item())
    # The learning-rate schedule advances once per epoch.
    schedular.step()
    # Note: this is the loss of the last batch of the epoch, not an epoch average.
    print('epoch: ',epoch,' loss: ',loss.item())
  torch.save(model.state_dict(), r'/content/drive/MyDrive/MSML641/model_state_dict.pt')
  end = time.time()
  print(f'time for {num_epochs} epoches: ',end-start)
  plt.plot(lossi)
  plt.xlabel('training step')
  plt.ylabel('training loss')

loss:  5.7367777824401855
loss:  5.788347244262695
loss:  5.427490234375
epoch:  0  loss:  5.241091728210449
loss:  5.326956748962402
loss:  5.127538204193115
loss:  5.1583452224731445
loss:  5.231652736663818
epoch:  1  loss:  5.240166187286377
loss:  4.9452900886535645
loss:  4.901640892028809
loss:  5.0972185134887695
loss:  4.864455223083496
epoch:  2  loss:  4.710724353790283
loss:  4.5350847244262695
loss:  4.575887203216553
loss:  4.529055595397949
loss:  4.44680118560791
epoch:  3  loss:  4.227026462554932
loss:  4.396206855773926
loss:  4.275308132171631
loss:  4.378812789916992
loss:  4.468381404876709
epoch:  4  loss:  4.194155216217041
loss:  3.867584228515625
loss:  4.152860164642334
loss:  3.971611738204956
loss:  4.065534591674805
epoch:  5  loss:  4.060426235198975
loss:  3.859006881713867
loss:  4.118797302246094
loss:  4.05768346786499
loss:  3.895028829574585
epoch:  6  loss:  4.038586616516113
loss:  3.898801326751709
loss:  3.8122775554656982
loss:  4.1471428871154

In [None]:
model.eval()
def generate(input_seq,test=False,max_new_tokens=32):
  """Greedily translate an English sentence into Chinese with the global `model`.

  Args:
    input_seq: English source sentence as a plain string.
    test: Unused; kept so that existing calls keep working.
    max_new_tokens: Maximum number of tokens generated after the initial '[CLS]'.

  Returns:
    A string starting with '[CLS]' followed by the decoded text of every
    generated token, in order, including the closing '[SEP]' when the model
    emits it before the token limit is reached.
  """
  tokens = tokenizer_en.tokenize(input_seq,add_special_tokens=True)
  inputs = tokenizer_en.convert_tokens_to_ids(tokens)
  seq_enc = torch.tensor(inputs).unsqueeze(0).to(device)
  # All-False square mask over the source positions, created directly on the model's device.
  mask_enc = torch.zeros((seq_enc.shape[1],seq_enc.shape[1]),dtype=torch.bool,device=device)

  # The decoder input is kept as a list of ids. Rebuilding it from the decoded
  # string on every step would be lossy and would make the length limit count
  # characters instead of tokens.
  output_ids = [tokenizer_zh.cls_token_id]
  output_token = '[CLS]'
  with torch.no_grad():   # inference only: no autograd graph is needed
    memory = model.encode(seq_enc,mask_enc).to(device)
    for _ in range(max_new_tokens):
      seq_dec = torch.tensor(output_ids,device=device).unsqueeze(0)
      size = seq_dec.shape[1]
      # True strictly above the diagonal; built on `device` so it matches seq_dec and memory.
      mask_dec = (torch.tril(torch.ones(size,size,device=device))==0)
      logits = model.linear(model.decode(seq_dec,memory,mask_dec)[:,-1,:])
      # Softmax is monotonic, so the argmax of the logits is the greedy choice.
      next_id = int(torch.argmax(logits,dim=-1).item())
      output_ids.append(next_id)
      output_token += tokenizer_zh.decode(next_id)
      if next_id == tokenizer_zh.sep_token_id:
        break
  return output_token

In [None]:
# Translate one sample sentence and show the raw generated string.
input_seq = "can you go now?"
output_seq = generate(input_seq, test=False)
print(output_seq)