In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import torch
from torch import nn

In [None]:
df = pd.read_csv('files/en2fr.csv')

In [None]:
df.shape[0]

In [None]:
print(df.iloc[30856]['en'])

In [None]:
print(df.iloc[30856]['fr'])

In [None]:
from transformers import XLMTokenizer

In [None]:
tokenizer = XLMTokenizer.from_pretrained('xlm-clm-enfr-1024')

In [None]:
tokenized_en = tokenizer.tokenize("I don't speak French.")

In [None]:
tokenized_en

In [None]:
tokenized_fr=tokenizer.tokenize("Je ne parle pas français.")

In [None]:
tokenized_fr

In [None]:
print(tokenizer.tokenize("How are you?"))
print(tokenizer.tokenize("Comment êtes-vous?"))

In [None]:
from collections import Counter


In [None]:
en = df['en'].tolist()

In [None]:
en_tokens = [['BOS'] +tokenizer.tokenize(x)+ ['EOS'] for x in en]

In [None]:
word_counter = Counter()
for sentence in en_tokens:
    for word in sentence:
        word_counter[word] += 1


In [None]:
frequency = word_counter.most_common(50000)

In [None]:
total_en_words = len(frequency) + 2

In [None]:
en_word_dict = {w[0]:idx+2 for idx, w in enumerate(frequency)}

In [None]:
PAD=0
UNK=1
en_word_dict['PAD'] = PAD
en_word_dict['UNK'] = UNK

In [None]:
en_idx_dict={v:k for k,v in en_word_dict.items()}

In [None]:
# Preview only the first entries: the full mapping can hold tens of thousands
# of tokens and would flood the cell output.
dict(list(en_word_dict.items())[:20])

In [None]:
enidx = [en_word_dict.get(i, UNK) for i in tokenized_en]

In [None]:
enidx

In [None]:
entokens = [en_idx_dict.get(i, "UNK") for i in enidx]

In [None]:
entokens

In [None]:
en_phrase="".join(entokens)
en_phrase=en_phrase.replace("</w>"," ")

In [None]:
for x in '''?:;.,'("-!&)%''':
    en_phrase=en_phrase.replace(f" {x}",f"{x}")
print(en_phrase)

In [None]:
# Build the French vocabulary with the same steps used for the English one:
# tokenize, count, keep the most frequent tokens, then add PAD and UNK.
fr = df["fr"].to_list()
fr_tokens = [["BOS", *tokenizer.tokenize(sentence), "EOS"] for sentence in fr]
word_count = Counter(token for sentence in fr_tokens for token in sentence)
# NOTE: this rebinds `frequency`, which until now held the English counts.
frequency = word_count.most_common(50000)
total_fr_words = 2 + len(frequency)
# Token -> index, starting at 2 because 0 (PAD) and 1 (UNK) are reserved.
fr_word_dict = {token: rank for rank, (token, _count) in enumerate(frequency, start=2)}
fr_word_dict.update({"PAD": PAD, "UNK": UNK})
# Reverse lookup: index -> token.
fr_idx_dict = {index: token for token, index in fr_word_dict.items()}

In [None]:
fridx=[fr_word_dict.get(i,UNK) for i in tokenized_fr]

In [None]:
fridx

In [None]:
frtokens=[fr_idx_dict.get(i,"UNK") for i in fridx]
print(frtokens)

In [None]:
fr_phrase="".join(frtokens)
fr_phrase=fr_phrase.replace("</w>"
,
" ")
for x in '''?:;.,'("-!&)%''':
    fr_phrase=fr_phrase.replace(f" {x}",f"{x}")
print(fr_phrase)

In [None]:
import pickle

In [None]:
# Rebind the four vocabulary dictionaries built in the cells above to the ones
# stored in files/dict.p; every lookup from here on uses the loaded versions.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted source.
with open('files/dict.p', 'rb') as f:
    en_word_dict,en_idx_dict, fr_word_dict,fr_idx_dict = pickle.load(f)

In [None]:
# Convert every tokenized sentence into a list of vocabulary indexes; tokens
# missing from the vocabulary map to UNK. Each language uses its own
# dictionary: French tokens must be looked up in fr_word_dict, not en_word_dict,
# otherwise the target ids do not match the French vocabulary.
out_en_ids = [[en_word_dict.get(w, UNK) for w in s] for s in en_tokens]
out_fr_ids = [[fr_word_dict.get(w, UNK) for w in s] for s in fr_tokens]

In [None]:
# Order the sentence pairs by the length of the English side (stable sort), and
# apply the same permutation to both languages so the pairs stay aligned.
sorted_ids = [
    position
    for position, _ids in sorted(enumerate(out_en_ids), key=lambda pair: len(pair[1]))
]
out_en_ids = [out_en_ids[position] for position in sorted_ids]
out_fr_ids = [out_fr_ids[position] for position in sorted_ids]

In [None]:
batch_size = 128

In [None]:
idx_list = np.arange(0, len(out_en_ids), batch_size)

In [None]:
len(en_tokens)

In [None]:
# Positions into out_en_ids / out_fr_ids that make up each batch: runs of
# `batch_size` consecutive positions, with a shorter final batch if needed.
# The upper bound comes from out_en_ids, the list idx_list was derived from
# and the one these positions index, so a batch can never run past its end.
batch_indexs = [
    np.arange(start, min(len(out_en_ids), start + batch_size))
    for start in idx_list
]

In [None]:
# Pick the compute device: CUDA first, then Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
elif torch.backends.mps.is_available():
    DEVICE = 'mps'
else:
    DEVICE = 'cpu'

In [None]:
# Sizes of the two vocabularies; both are passed to create_model below.
src_vocab, tgt_vocab = len(en_word_dict), len(fr_word_dict)
print(f"there are {src_vocab} distinct English tokens")
print(f"there are {tgt_vocab} distinct French tokens")

In [None]:
from utils.ch09util import PositionalEncoding

In [None]:
pe = PositionalEncoding(256, .1)

In [None]:
x = torch.zeros(1, 8, 256, device=DEVICE)

In [None]:
y = pe(x)

In [None]:
y.shape

In [None]:
y

In [None]:
from utils.ch09util import create_model

In [None]:
# Build the translation model for the two vocabulary sizes.
# NOTE(review): the meaning of N, d_model, d_ff, h and dropout is defined by
# utils.ch09util.create_model — confirm there before tuning.
model = create_model(
    src_vocab,
    tgt_vocab,
    N=6,
    d_model=256,
    d_ff=1024,
    h=8,
    dropout=0.1,
)

In [None]:
from utils.ch09util import NoamOpt

In [None]:
# Adam is created with lr=0 and handed to the NoamOpt wrapper.
# NOTE(review): the positional arguments 256, 1, 2000 are presumably model size,
# scale factor and warm-up steps — confirm against utils.ch09util.NoamOpt.
optimizer = NoamOpt(
    256,
    1,
    2000,
    torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9),
)

In [None]:
from utils.ch09util import SimpleLossCompute, LabelSmoothing, Batch, subsequent_mask

def seq_padding(X, padding=0):
    """Right-pad every sequence in X to the length of the longest one.

    Args:
        X: non-empty collection of index sequences.
        padding: value appended to sequences shorter than the longest one.

    Returns:
        A NumPy array built from the padded sequences, one row per sequence.
    """
    longest = max([len(seq) for seq in X])
    rows = []
    for seq in X:
        shortfall = longest - len(seq)
        if shortfall > 0:
            # Only sequences that are too short get extended.
            rows.append(np.concatenate([seq, [padding] * shortfall]))
        else:
            rows.append(seq)
    return np.array(rows)

class BatchLoader():
    """Single-pass iterator over the batches described by `batch_indexs`.

    Each step looks up the English and French id lists for one batch in the
    notebook globals out_en_ids / out_fr_ids, pads them with seq_padding and
    wraps the two arrays in a Batch.
    """

    def __init__(self):
        # Number of batches handed out so far.
        self.idx = 0

    def __iter__(self):
        return self

    def __next__(self):
        self.idx += 1
        if self.idx > len(batch_indexs):
            raise StopIteration
        rows = batch_indexs[self.idx - 1]
        english = [out_en_ids[row] for row in rows]
        french = [out_fr_ids[row] for row in rows]
        return Batch(seq_padding(english), seq_padding(french))

# Label-smoothing criterion over the French vocabulary (padding index 0),
# bundled with the model's generator and the optimizer into the loss helper
# that the training loop calls.
criterion = LabelSmoothing(
    tgt_vocab,
    padding_idx=0,
    smoothing=0.1,
)
loss_func = SimpleLossCompute(model.generator, criterion, optimizer)

In [None]:
from tqdm import tqdm
# Train for 100 epochs, showing the running average loss per token.
for epoch in range(100):
    model.train()
    tloss = 0
    tokens = 0
    # Wrap the batch iterator in tqdm exactly once and iterate that object;
    # wrapping it a second time draws a duplicate progress bar every epoch.
    loop = tqdm(BatchLoader(), leave=False)
    for batch in loop:
        out = model(batch.src, batch.trg, batch.src_mask, batch.trg_mask)
        # NOTE(review): loss_func was built with the optimizer, so it presumably
        # also runs the backward pass and the optimizer step — confirm in
        # utils.ch09util.SimpleLossCompute.
        loss = loss_func(out, batch.trg_y, batch.ntokens)
        tloss += loss
        tokens += batch.ntokens
        loop.set_postfix(avg_loss=(tloss/tokens).item(), epoch=epoch)
    # The checkpoint is overwritten after every epoch.
    torch.save(model.state_dict(),"files/my_en2fr.pth")

In [None]:

def translate(eng):
    """Translate an English sentence to French with greedy decoding.

    Relies on the notebook globals tokenizer, en_word_dict, fr_word_dict,
    fr_idx_dict, UNK, DEVICE, model and subsequent_mask.

    Args:
        eng: the English sentence as a string.

    Returns:
        The French translation as a string; it is also printed.
    """
    # tokenize the English sentence and add beginning and end tokens
    tokenized_en = ["BOS"] + tokenizer.tokenize(eng) + ["EOS"]
    # convert tokens to indexes
    enidx = [en_word_dict.get(i, UNK) for i in tokenized_en]
    src = torch.tensor(enidx).long().to(DEVICE).unsqueeze(0)
    # create mask to hide padding (index 0)
    src_mask = (src != 0).unsqueeze(-2)
    start_symbol = fr_word_dict["BOS"]
    translation = []
    # The training loop leaves the model in train mode; switch to eval mode so
    # dropout is disabled while decoding, and restore the previous mode after.
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for inference.
        with torch.no_grad():
            # encode the English sentence
            memory = model.encode(src, src_mask)
            # start translation in an autoregressive fashion
            ys = torch.ones(1, 1).fill_(start_symbol).type_as(src.data)
            # stop after at most 100 generated tokens
            for step in range(100):
                out = model.decode(
                    memory, src_mask, ys,
                    subsequent_mask(ys.size(1)).type_as(src.data))
                prob = model.generator(out[:, -1])
                # greedy choice: take the highest-scoring token
                _, next_word = torch.max(prob, dim=1)
                next_id = next_word[0].item()
                ys = torch.cat(
                    [ys, torch.full((1, 1), next_id,
                                    dtype=ys.dtype, device=ys.device)],
                    dim=1)
                # fall back to "UNK" instead of raising on an unmapped index
                sym = fr_idx_dict.get(next_id, "UNK")
                if sym == 'EOS':
                    break
                translation.append(sym)
    finally:
        model.train(was_training)
    # convert tokens to sentences
    trans = "".join(translation)
    trans = trans.replace("</w>", " ")
    for x in '''?:;.,'("-!&)%''':
        trans = trans.replace(f" {x}", f"{x}")
    print(trans)
    return trans

In [None]:
# Translate a short sample sentence; translate() prints the result and returns it.
eng = "Today is a beautiful day!"
translated_fr = translate(eng)

In [None]:
# Translate a longer sample sentence; this rebinds eng and translated_fr.
eng = "A little boy in jeans climbs a small tree while another child looks on."
translated_fr = translate(eng)