# In [None]:
#%%
# IMPORTS
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import math
import copy
import time
import seaborn as sns
import matplotlib.pyplot as plt
from nltk import word_tokenize
from collections import Counter
from torch.autograd import Variable
#%%

#%%
# CORPUS PATHS
# Each corpus file is read line by line by PreProcessing; every line is
# expected to hold an English sentence and its Chinese translation separated
# by a tab character.

# Path to the training corpus (also the only source of the vocabularies)
TRAIN_CORPUS = 'ours/train.txt'

# Path to the development corpus (used for validation loss, sampling and BLEU)
TEST_CORPUS = 'ours/dev.txt'

# Path where the best model weights (state_dict) are saved and reloaded from
SAVE_CORPUS = 'ours/large_model.pt'
#%%
#SET DEVICE
# Preference order: CUDA GPU, then Apple-silicon MPS, then CPU.
# `torch.backends.mps` does not exist on older PyTorch builds, so it is looked
# up defensively instead of being accessed directly (which would raise
# AttributeError before the CPU fallback could be reached).
_MPS_BACKEND = getattr(torch.backends, "mps", None)

if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
elif _MPS_BACKEND is not None and _MPS_BACKEND.is_available():
    DEVICE = torch.device("mps")
else:
    if _MPS_BACKEND is None or not _MPS_BACKEND.is_built():
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    else:
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")
    DEVICE = torch.device("cpu")
    print("Warning: You are using CPU. For better performance, use GPU.")
print("Pytorch version is: ", torch.__version__)
print("You are using: ", DEVICE)
#%%
# HYPER PARAMETERS

# Number of sentence pairs per batch (used by PreProcessing.split_batch)
BATCH_SIZE = 64

# Number of passes over the training data performed by train()
NUM_EPOCHS = 30

# Number of stacked encoder layers and of stacked decoder layers
NUM_LAYERS = 9

# Number of attention heads; MODEL_DIMENSIONS is split evenly across them
NUM_HEADS = 8

# Width of the embeddings and of every layer's input/output
MODEL_DIMENSIONS = 512

# Hidden width of the position-wise feed-forward network
FEED_FORWARD_DIMENSIONS = 2048

# Dropout probability passed to make_model()
DROPOUT = 0.1

# Upper bound on the number of tokens produced by greedy decoding
# (it is passed as `max_len`; it does not filter the training data)
MAX_LENGTH = 60

# Vocabulary id reserved for out-of-vocabulary words
UNK = 0

# Vocabulary id reserved for padding
# NOTE(review): pad_sequence() and Batch both default to padding value 0 and
# the training script builds LabelSmoothing with padding_index=0, so batches
# are actually padded and masked with id 0 (== UNK), not with PAD. Only the
# evaluation code masks on PAD. Aligning these needs a coordinated change.
PAD = 1
#%%
def pad_sequence(sequences, padding=0):
    """Right-pad variable-length sequences to a common length.

    Args:
        sequences: Iterable of sequences (for example lists of token ids).
        padding: Value appended to sequences shorter than the longest one.
            Defaults to 0.

    Returns:
        A 2-D numpy array with one row per input sequence, every row as long
        as the longest input. An empty input yields an empty ``(0, 0)``
        integer array instead of raising ``ValueError`` from ``max()``.
    """
    # Materialise once so generators and tuples are handled uniformly.
    rows = [list(seq) for seq in sequences]
    if not rows:
        return np.empty((0, 0), dtype=np.int64)

    width = max(len(row) for row in rows)
    return np.array([row + [padding] * (width - len(row)) for row in rows])
#%%

class PreProcessing:
    """Load a parallel English/Chinese corpus and turn it into padded batches.

    Construction reads both corpora, builds one vocabulary per language from
    the training corpus only, maps every sentence to token ids and groups the
    sentences into ``Batch`` objects of ``BATCH_SIZE`` pairs.

    Attributes set by ``__init__``:
        train_sequences_en / train_sequences_zh: id sequences of the training
            corpus, sorted by English sentence length.
        test_sequences_en / test_sequences_zh: same for the development corpus.
        dict_en / dict_zh: word -> id mappings.
        index_dict_en / index_dict_zh: id -> word mappings.
        dict_size_en / dict_size_zh: vocabulary sizes including UNK and PAD.
        train_data / test_data: lists of ``Batch`` objects.
    """

    def __init__(self, train_corpus, test_corpus):
        """Run the full preprocessing pipeline.

        Args:
            train_corpus: Path to the tab-separated training corpus.
            test_corpus: Path to the tab-separated development corpus.
        """
        self.train_sequences_en, self.train_sequences_zh = self.generate_sequences_from_corpus(train_corpus)
        self.test_sequences_en, self.test_sequences_zh = self.generate_sequences_from_corpus(test_corpus)
        # Vocabularies come from the training data only; unseen development
        # words are mapped to UNK by map_word().
        self.dict_en, self.dict_size_en, self.index_dict_en = self.generate_dictionary(self.train_sequences_en)
        self.dict_zh, self.dict_size_zh, self.index_dict_zh = self.generate_dictionary(self.train_sequences_zh)
        self.train_sequences_en, self.train_sequences_zh = self.map_word(self.train_sequences_en, self.train_sequences_zh, self.dict_en, self.dict_zh)
        self.test_sequences_en, self.test_sequences_zh = self.map_word(self.test_sequences_en, self.test_sequences_zh, self.dict_en, self.dict_zh)
        self.train_data = self.split_batch(self.train_sequences_en, self.train_sequences_zh, BATCH_SIZE)
        self.test_data = self.split_batch(self.test_sequences_en, self.test_sequences_zh, BATCH_SIZE)

    def generate_sequences_from_corpus(self, corpus):
        """Read a corpus file into tokenised sentence pairs.

        Every usable line holds ``<english>\\t<chinese>``. English is
        lower-cased and word-tokenised; Chinese is tokenised one character at
        a time. Each sentence is wrapped in "BOS" / "EOS" markers.

        Lines without both fields (for example a blank trailing line) are
        skipped instead of raising ``IndexError``.

        Args:
            corpus: Path to a UTF-8 encoded corpus file.

        Returns:
            A tuple ``(en, zh)`` of parallel lists of token lists.
        """
        en, zh = [], []
        with open(corpus, 'r', encoding='utf-8') as file:
            for line in file:
                fields = line.strip().split('\t')
                if len(fields) < 2:
                    # Blank or malformed line: nothing to translate.
                    continue

                en_sentence = ["BOS"]
                en_sentence.extend(word_tokenize(fields[0].lower()))
                en_sentence.append("EOS")

                zh_sentence = ["BOS"]
                # Iterating a string yields single characters, so the target
                # side is tokenised per character.
                for char in fields[1]:
                    zh_sentence.extend(word_tokenize(char))
                zh_sentence.append("EOS")

                en.append(en_sentence)
                zh.append(zh_sentence)
        return en, zh

    def generate_dictionary(self, sequences, max_length=50000):
        """Build word<->id mappings from tokenised sentences.

        Ids 0 (UNK) and 1 (PAD) are reserved; real words receive ids starting
        at 2 in order of decreasing frequency.

        Args:
            sequences: Iterable of token lists.
            max_length: Maximum number of distinct words to keep.

        Returns:
            A tuple ``(word_dict, total_words, index_dict)`` where
            ``total_words`` is the number of kept words plus the two
            reserved entries.
        """
        word_count = Counter(word for seq in sequences for word in seq)
        most_common = word_count.most_common(max_length)
        total_words = len(most_common) + 2

        word_dict = {}
        index_dict = {}
        for index, (word, _) in enumerate(most_common, start=2):
            word_dict[word] = index
            index_dict[index] = word
        # Reserved entries are written last so they win over any corpus token
        # that happens to be spelled 'UNK' or 'PAD'.
        word_dict['UNK'] = UNK
        word_dict['PAD'] = PAD
        index_dict[UNK] = 'UNK'
        index_dict[PAD] = 'PAD'
        return word_dict, total_words, index_dict

    def map_word(self, sequences_en, sequences_zh, dict_en, dict_zh, sort=True):
        """Convert tokenised sentence pairs into id sequences.

        Args:
            sequences_en: English token lists.
            sequences_zh: Chinese token lists, parallel to ``sequences_en``.
            dict_en: English word -> id mapping.
            dict_zh: Chinese word -> id mapping.
            sort: When true, order the pairs by English sentence length so
                that batches contain sentences of similar length.

        Returns:
            A tuple ``(tokens_en, tokens_zh)`` of parallel id lists. Words
            missing from a dictionary are mapped to ``UNK``.
        """
        tokens_en, tokens_zh = [], []
        for en_sentence, zh_sentence in zip(sequences_en, sequences_zh):
            tokens_en.append([dict_en.get(word, UNK) for word in en_sentence])
            tokens_zh.append([dict_zh.get(word, UNK) for word in zh_sentence])

        if sort:
            # Sort indices (stable) so both languages are reordered together.
            order = sorted(range(len(tokens_en)), key=lambda i: len(tokens_en[i]))
            tokens_en = [tokens_en[i] for i in order]
            tokens_zh = [tokens_zh[i] for i in order]
        return tokens_en, tokens_zh

    def split_batch(self, sequences_en, sequences_zh, batch_size, shuffle=True):
        """Group id sequences into padded ``Batch`` objects.

        Args:
            sequences_en: English id sequences.
            sequences_zh: Chinese id sequences, parallel to ``sequences_en``.
            batch_size: Maximum number of sentence pairs per batch.
            shuffle: When true, shuffle the order of the batches (the
                sentences inside a batch stay contiguous).

        Returns:
            A list of ``Batch`` objects.
        """
        starts = list(range(0, len(sequences_en), batch_size))
        if shuffle:
            np.random.shuffle(starts)

        batches = []
        for start in starts:
            stop = min(start + batch_size, len(sequences_en))
            batch_zh = pad_sequence(sequences_zh[start:stop])
            batch_en = pad_sequence(sequences_en[start:stop])
            # NOTE(review): pad_sequence() and Batch are used with their
            # default padding value 0, which is the UNK id rather than PAD.
            # Left unchanged because the loss is configured with the same
            # value; changing it requires a coordinated update.
            batches.append(Batch(batch_en, batch_zh))
        return batches
#%%
def compute_attention(query, key, value, mask=None, dropout_layer=None):
    """Scaled dot-product attention.

    Args:
        query: Tensor whose last two dimensions are (query_len, d_k).
        key: Tensor whose last two dimensions are (key_len, d_k).
        value: Tensor whose last two dimensions are (key_len, d_v).
        mask: Optional tensor broadcastable to the score matrix; positions
            where ``mask == 0`` are excluded from attention.
        dropout_layer: Optional callable applied to the attention weights.

    Returns:
        A tuple ``(weighted_values, attention_weights)``.
    """
    scale = math.sqrt(query.size(-1))
    scores = torch.matmul(query, key.transpose(-2, -1)) / scale

    if mask is not None:
        # Use the most negative finite value instead of -inf: a row that is
        # masked everywhere then softmaxes to a uniform distribution rather
        # than NaN, while partially masked rows still get exactly zero weight
        # on masked positions. The out-of-place fill avoids mutating a tensor
        # that autograd may still need.
        scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

    weights = F.softmax(scores, dim=-1)

    if dropout_layer is not None:
        weights = dropout_layer(weights)

    return torch.matmul(weights, value), weights


def clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    replicas = nn.ModuleList()
    for _ in range(N):
        replicas.append(copy.deepcopy(module))
    return replicas

def subsequent_mask(size):
    """Build a causal mask of shape ``(1, size, size)``.

    Entry ``[0, i, j]`` is ``True`` when position ``i`` may attend to
    position ``j`` (that is, ``j <= i``) and ``False`` for later positions.
    """
    lower_triangle = torch.tril(torch.ones(1, size, size))
    return lower_triangle == 1
#%%
class CreateEmbeddings(nn.Module):
    """Token embedding table whose output is multiplied by ``sqrt(d_model)``."""

    def __init__(self, d_model, vocab):
        """
        Args:
            d_model: Size of each embedding vector.
            vocab: Number of entries in the embedding table.
        """
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab
        self.embedding = nn.Embedding(vocab, d_model)
        self.scale = math.sqrt(d_model)

    def forward(self, x):
        """Look up the embeddings for the ids in ``x`` and scale them."""
        return self.embedding(x) * self.scale

class EncodePositions(nn.Module):
    """Add fixed sine/cosine positional encodings to a sequence, then apply dropout.

    The encoding table is precomputed once for ``max_length`` positions and
    stored as the non-trainable buffer ``positional_matrix`` with shape
    ``(1, max_length, model_dim)``.
    """

    def __init__(self, model_dim, dropout_rate, max_length=5000):
        """
        Args:
            model_dim: Size of the last dimension of the inputs. Odd values
                are supported: the cosine columns simply use one frequency
                fewer than the sine columns.
            dropout_rate: Dropout probability applied after the addition.
            max_length: Number of positions to precompute.
        """
        super(EncodePositions, self).__init__()
        self.model_dim = model_dim
        self.max_length = max_length
        self.dropout_layer = nn.Dropout(p=dropout_rate)

        positional_matrix = torch.zeros(self.max_length, self.model_dim, device=DEVICE)
        positions = torch.arange(0, self.max_length, device=DEVICE).unsqueeze(1)
        frequency_divisor = torch.exp(torch.arange(0, self.model_dim, 2, device=DEVICE) * -(math.log(10000.0) / self.model_dim))
        angles = positions * frequency_divisor

        # Even columns hold sines, odd columns hold cosines. With an odd
        # model_dim there is one odd column fewer than there are frequencies,
        # so the cosine block is trimmed to fit instead of failing on a shape
        # mismatch.
        positional_matrix[:, 0::2] = torch.sin(angles)
        positional_matrix[:, 1::2] = torch.cos(angles[:, :self.model_dim // 2])

        self.register_buffer('positional_matrix', positional_matrix.unsqueeze(0))

    def forward(self, x):
        """Add the encodings for the first ``x.size(1)`` positions to ``x``."""
        x = x + self.positional_matrix[:, :x.size(1)]
        return self.dropout_layer(x)


class MHA(nn.Module):
    """Multi-head attention.

    Holds four ``model_dimensions x model_dimensions`` linear layers: the
    first three project query, key and value, the last one projects the
    concatenated head outputs. The most recent attention weights are kept in
    ``self.attention``.
    """

    def __init__(self, head, model_dimensions, dropout=0.1):
        """
        Args:
            head: Number of attention heads.
            model_dimensions: Size of the last dimension of the inputs.
            dropout: Dropout probability applied to the attention weights.

        Raises:
            ValueError: If ``head`` is not positive or does not evenly divide
                ``model_dimensions``. Without this check the per-head reshape
                in ``forward`` either fails obscurely or silently produces
                wrongly shaped heads.
        """
        super(MHA, self).__init__()

        if head <= 0 or model_dimensions % head != 0:
            raise ValueError(
                "model_dimensions ({}) must be divisible by a positive number "
                "of heads ({})".format(model_dimensions, head)
            )

        self.head_dimensions = model_dimensions // head
        self.head = head
        self.layers = clones(nn.Linear(model_dimensions, model_dimensions), 4)
        self.attention = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` to ``k``/``v``.

        Args:
            q, k, v: Tensors whose first dimension is the batch and whose last
                dimension is ``model_dimensions``.
            mask: Optional mask; a head dimension is inserted at position 1 so
                the same mask is shared by every head.

        Returns:
            Tensor with the batch size of ``q`` and last dimension
            ``model_dimensions``.
        """
        if mask is not None:
            mask = mask.unsqueeze(1)
        num_batches = q.size(0)

        # zip() stops after the three inputs, so only the first three linear
        # layers are used here; each projection is split into heads.
        q, k, v = [layer(tensor).view(num_batches, -1, self.head, self.head_dimensions).transpose(1, 2)
                   for layer, tensor in zip(self.layers, (q, k, v))]

        attention_output, self.attention = compute_attention(q, k, v, mask=mask, dropout_layer=self.dropout)
        # Merge the heads back into a single last dimension.
        attention_output = attention_output.transpose(1, 2).contiguous().view(num_batches, -1, self.head * self.head_dimensions)
        return self.layers[-1](attention_output)


class Batch:
    """A padded batch of source sentences with optional target sentences.

    Attributes:
        source: Source ids as a long tensor on ``DEVICE``.
        source_mask: ``source != pad`` with an extra dimension inserted
            before the last one.
        outputs: Target ids without the last column (decoder input).
        output_targets: Target ids without the first column (labels).
        output_masks: Padding mask combined with the causal mask.
        token_count: Number of non-pad entries in ``output_targets`` (tensor).

    The four target-related attributes exist only when ``outputs`` is given.
    """

    def __init__(self, source, outputs=None, pad=0):
        """
        Args:
            source: 2-D numpy array of source ids.
            outputs: Optional 2-D numpy array of target ids. ``None`` is now
                accepted as the signature advertises; previously it crashed
                in ``torch.from_numpy``.
            pad: Id treated as padding when building the masks.
        """
        source = torch.from_numpy(source).to(DEVICE).long()

        self.source = source
        self.source_mask = (source != pad).unsqueeze(-2)

        if outputs is not None:
            outputs = torch.from_numpy(outputs).to(DEVICE).long()
            # Teacher forcing: the decoder reads tokens [0, n-1) and is
            # trained to predict tokens [1, n).
            self.outputs = outputs[:, :-1]
            self.output_targets = outputs[:, 1:]
            self.output_masks = self.generate_mask(self.outputs, pad)
            self.token_count = (self.output_targets != pad).sum()

    @staticmethod
    def generate_mask(target, padding):
        """Return a mask that hides both padding and future positions.

        Args:
            target: Tensor of target ids.
            padding: Id treated as padding.
        """
        pad_mask = (target != padding).unsqueeze(-2)
        # type_as() also moves the causal mask onto the target's device.
        causal_mask = subsequent_mask(target.size(-1)).type_as(pad_mask)
        return pad_mask & causal_mask

class LayerNormalization(nn.Module):
    """Normalise the last dimension, then apply a learned scale and shift."""

    def __init__(self, num_features, epsiolon=1e-6):
        """
        Args:
            num_features: Size of the last dimension of the inputs.
            epsiolon: Constant added to the variance before the square root
                (the parameter name keeps its original spelling so keyword
                callers continue to work).
        """
        super().__init__()
        self.scale = nn.Parameter(torch.ones(num_features))
        self.shift = nn.Parameter(torch.zeros(num_features))
        self.epsilon = epsiolon

    def forward(self, x):
        """Return ``scale * (x - mean) / sqrt(var + epsilon) + shift``."""
        centered = x - x.mean(-1, keepdim=True)
        spread = torch.sqrt(x.var(-1, keepdim=True) + self.epsilon)
        return self.scale * (centered / spread) + self.shift

class ConnectionLayer(nn.Module):
    """Residual wrapper: ``x + dropout(function(normalise(x)))``."""

    def __init__(self, dimension, drop_rate):
        """
        Args:
            dimension: Feature size handed to the normalisation layer.
            drop_rate: Dropout probability applied to the sub-layer output.
        """
        super().__init__()
        self.normalization = LayerNormalization(dimension)
        self.dropout = nn.Dropout(drop_rate)

    def forward(self, x, function):
        """Apply ``function`` to the normalised input and add the residual."""
        normalized = self.normalization(x)
        transformed = self.dropout(function(normalized))
        return x + transformed

class FeedForward(nn.Module):
    """Two linear layers with a ReLU and dropout in between."""

    def __init__(self, model_dimensions, feed_forward_dimensions, drop_rate=0.1):
        """
        Args:
            model_dimensions: Input and output feature size.
            feed_forward_dimensions: Hidden feature size.
            drop_rate: Dropout probability applied to the hidden activations.
        """
        super().__init__()
        self.linear1 = nn.Linear(model_dimensions, feed_forward_dimensions)
        self.linear2 = nn.Linear(feed_forward_dimensions, model_dimensions)
        self.dropout = nn.Dropout(drop_rate)

    def forward(self, x):
        """Return ``linear2(dropout(relu(linear1(x))))``."""
        hidden = F.relu(self.linear1(x))
        hidden = self.dropout(hidden)
        return self.linear2(hidden)

#%%
# Transformer Architecture
class Transformer(nn.Module):
    """Encoder-decoder container.

    ``forward`` returns the decoder output; the ``generator`` is stored on
    the model but is not applied here.
    """

    def __init__(self, encoder, decoder, source_embeddings, target_embeddings, generator):
        super().__init__()
        # Assignment order determines sub-module registration order and
        # therefore the order of parameters() and state_dict(); keep it.
        self.source_embeddings = source_embeddings
        self.target_embeddings = target_embeddings
        self.generator = generator
        self.encoder = encoder
        self.decoder = decoder

    def encode(self, source, source_mask):
        """Embed ``source`` and run it through the encoder."""
        embedded_source = self.source_embeddings(source)
        return self.encoder(embedded_source, source_mask)

    def decode(self, memory, source_mask, target, target_mask):
        """Embed ``target`` and run the decoder over it and ``memory``."""
        embedded_target = self.target_embeddings(target)
        return self.decoder(embedded_target, memory, source_mask, target_mask)

    def forward(self, source, target, source_mask, target_mask):
        """Encode ``source``, then decode ``target`` against the result."""
        memory = self.encode(source, source_mask)
        return self.decode(memory, source_mask, target, target_mask)

class Decoder(nn.Module):
    """Stack of ``N`` copies of a decoder layer followed by a final normalisation."""

    def __init__(self, layer, N):
        """
        Args:
            layer: Decoder layer to replicate; must expose a ``size`` attribute.
            N: Number of copies in the stack.
        """
        super().__init__()
        self.layers = clones(layer, N)
        self.normalization = LayerNormalization(layer.size)

    def forward(self, x, memory, source_mask, target_mask):
        """Pass ``x`` through every layer in turn, then normalise the result."""
        hidden = x
        for decoder_layer in self.layers:
            hidden = decoder_layer(hidden, memory, source_mask, target_mask)
        return self.normalization(hidden)

class DecoderLayer(nn.Module):
    """Decoder layer: self-attention, attention over the encoder memory, feed-forward.

    Each of the three steps is wrapped in its own ``ConnectionLayer``.
    """

    def __init__(self, size, self_attention, source_attention, feed_forward, dropout):
        super().__init__()
        # Assignment order fixes parameter registration order; keep it.
        self.source_attention = source_attention
        self.feed_forward = feed_forward
        self.sublayer = clones(ConnectionLayer(size, dropout), 3)
        self.size = size
        self.self_attention = self_attention

    def forward(self, x, memory, source_mask, target_mask):
        """Run the three sub-layers in order and return the result."""

        def attend_to_self(hidden):
            return self.self_attention(hidden, hidden, hidden, target_mask)

        def attend_to_memory(hidden):
            return self.source_attention(hidden, memory, memory, source_mask)

        self_block, memory_block, feed_forward_block = self.sublayer
        x = self_block(x, attend_to_self)
        x = memory_block(x, attend_to_memory)
        return feed_forward_block(x, self.feed_forward)

class Encoder(nn.Module):
    """Stack of ``N`` copies of an encoder layer followed by a final normalisation."""

    def __init__(self, layer, N):
        """
        Args:
            layer: Encoder layer to replicate; must expose a ``size`` attribute.
            N: Number of copies in the stack.
        """
        super().__init__()
        self.layers = clones(layer, N)
        self.normalization = LayerNormalization(layer.size)

    def forward(self, x, mask):
        """Pass ``x`` through every layer in turn, then normalise the result."""
        hidden = x
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, mask)
        return self.normalization(hidden)

class EncoderLayer(nn.Module):
    """Encoder layer: self-attention followed by feed-forward.

    Each of the two steps is wrapped in its own ``ConnectionLayer``.
    """

    def __init__(self, size, self_attention, feed_forward, dropout):
        super().__init__()
        # Assignment order fixes parameter registration order; keep it.
        self.sublayer = clones(ConnectionLayer(size, dropout), 2)
        self.size = size
        self.self_attention = self_attention
        self.feed_forward = feed_forward

    def forward(self, x, mask):
        """Run self-attention and the feed-forward network on ``x``."""

        def attend_to_self(hidden):
            return self.self_attention(hidden, hidden, hidden, mask)

        attention_block, feed_forward_block = self.sublayer
        x = attention_block(x, attend_to_self)
        return feed_forward_block(x, self.feed_forward)
#%%
# Building Model

# Fetch the 'punkt' resource from NLTK before any corpus is tokenised
# (PreProcessing calls nltk's word_tokenize).
# NOTE(review): newer NLTK releases may additionally require 'punkt_tab' —
# confirm against the installed version.
import nltk
nltk.download('punkt')

class Generator(nn.Module):
    """Linear projection to vocabulary size followed by a log-softmax."""

    def __init__(self, d_model, vocab):
        """
        Args:
            d_model: Size of the incoming feature dimension.
            vocab: Number of output classes.
        """
        super().__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        """Return log-probabilities over the last dimension."""
        logits = self.proj(x)
        return F.log_softmax(logits, dim=-1)

class LabelSmoothing(nn.Module):
    """KL-divergence loss against a label-smoothed target distribution.

    For every row the target distribution puts ``1 - smoothing`` on the gold
    class and spreads ``smoothing`` evenly over ``size - 2`` other classes.
    The padding class always receives zero probability, and rows whose gold
    class is the padding class are zeroed entirely so they add nothing to
    the loss.
    """

    def __init__(self, size, padding_index, smoothing=0.0):
        """
        Args:
            size: Number of classes (width of the log-probability input).
            padding_index: Class id treated as padding.
            smoothing: Probability mass moved away from the gold class.
        """
        super(LabelSmoothing, self).__init__()
        self.confidence = 1.0 - smoothing
        self.criterion = nn.KLDivLoss(reduction='sum')
        self.padding_index = padding_index
        self.smoothing = smoothing
        self.size = size
        # Kept for compatibility with existing attribute access; this class
        # never assigns anything other than None to it.
        self.true_dist = None

    def forward(self, x, target):
        """Compute the summed KL divergence.

        Args:
            x: 2-D tensor of log-probabilities, one row per prediction.
            target: 1-D tensor of gold class ids, one per row of ``x``.

        Returns:
            Scalar tensor; gradients flow into ``x`` only.
        """
        target = target.detach()
        # Built from a detached view so the target distribution is a constant.
        dist = torch.full_like(x.detach(), self.smoothing / (self.size - 2))
        dist.scatter_(1, target.unsqueeze(1), self.confidence)
        dist[:, self.padding_index] = 0
        # A boolean row mask replaces the former nonzero()/squeeze()/
        # index_fill_ sequence, whose `mask.dim() > 0` guard was always true
        # and whose squeeze() produced a 0-dim index for a single match.
        padded_rows = (target == self.padding_index).unsqueeze(1)
        dist.masked_fill_(padded_rows, 0.0)
        return self.criterion(x, dist)

class LossComputation:
    """Apply the generator, compute the normalised loss and optionally take an optimiser step."""

    def __init__(self, generator, criterion, opt=None):
        """
        Args:
            generator: Callable mapping decoder output to log-probabilities.
            criterion: Callable ``(predictions, targets) -> loss``.
            opt: Optional optimiser wrapper exposing ``step()`` and an
                ``optimizer`` attribute with ``zero_grad()``. When ``None``
                the loss is only evaluated.
        """
        self.generator = generator
        self.criterion = criterion
        self.opt = opt

    def __call__(self, x, y, norm):
        """Compute the loss for one batch.

        Args:
            x: Decoder output.
            y: Target ids.
            norm: Tensor used to normalise the loss (the token count).

        Returns:
            The un-normalised loss, i.e. ``loss.item() * norm.float()``.
        """
        x = self.generator(x)
        loss = self.criterion(x.contiguous().view(-1, x.size(-1)), y.contiguous().view(-1)) / norm
        # Back-propagate only when an optimiser will consume and then clear
        # the gradients. Previously backward() also ran in evaluation mode,
        # where nothing zeroed the gradients, so they accumulated over the
        # whole validation pass and polluted the next training step.
        if self.opt is not None:
            loss.backward()
            self.opt.step()
            self.opt.optimizer.zero_grad()
        return loss.item() * norm.float()

class NoamOpt:
    """Optimiser wrapper that sets the learning rate before every step.

    The rate is ``factor * model_size**-0.5 * min(step**-0.5, step * warmup**-1.5)``:
    it grows linearly for ``warmup`` steps and then decays with the inverse
    square root of the step number.
    """

    def __init__(self, model_size, factor, warmup, optimizer):
        """
        Args:
            model_size: Model dimension used to scale the rate.
            factor: Constant multiplier of the schedule.
            warmup: Number of warm-up steps.
            optimizer: Wrapped optimiser exposing ``param_groups`` and ``step()``.
        """
        self.optimizer = optimizer
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0
        self._step = 0

    def step(self):
        """Advance the step counter, update every group's ``lr`` and step the optimiser."""
        self._step += 1
        rate = self.learning_rate()
        for group in self.optimizer.param_groups:
            group['lr'] = rate
        self._rate = rate
        self.optimizer.step()

    def learning_rate(self, step=None):
        """Return the scheduled learning rate.

        Args:
            step: Step number to evaluate; defaults to the current step.

        Returns:
            The learning rate as a float. Step 0 (or below) yields ``0.0``,
            the limit of the linear warm-up, instead of raising
            ``ZeroDivisionError`` from ``0 ** -0.5``.
        """
        if step is None:
            step = self._step
        if step <= 0:
            return 0.0
        return self.factor * (self.model_size ** (-0.5) * min(step ** (-0.5), step * self.warmup ** (-1.5)))

def make_model(source_vocab, target_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    """Assemble the full encoder-decoder model and move it to ``DEVICE``.

    Args:
        source_vocab: Size of the source vocabulary.
        target_vocab: Size of the target vocabulary.
        N: Number of encoder layers and of decoder layers.
        d_model: Model dimension.
        d_ff: Hidden size of the feed-forward networks.
        h: Number of attention heads.
        dropout: Dropout probability used throughout the model, now including
            the attention weights (previously the attention module silently
            kept its own default regardless of this argument).

    Returns:
        A ``Transformer`` on ``DEVICE`` whose parameters with more than one
        dimension are Xavier-uniform initialised.
    """
    c = copy.deepcopy
    # Prototypes; every use below receives its own deep copy.
    attention = MHA(h, d_model, dropout)
    ff = FeedForward(d_model, d_ff, dropout)
    position = EncodePositions(d_model, dropout)

    # A single .to(DEVICE) on the assembled model replaces the per-component
    # moves; the model is on DEVICE before the re-initialisation below, as
    # it was before.
    model = Transformer(
        Encoder(EncoderLayer(d_model, c(attention), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attention), c(attention), c(ff), dropout), N),
        nn.Sequential(CreateEmbeddings(d_model, source_vocab), c(position)),
        nn.Sequential(CreateEmbeddings(d_model, target_vocab), c(position)),
        Generator(d_model, target_vocab),
    ).to(DEVICE)

    # Re-initialise weight matrices; 1-D parameters (biases, normalisation
    # scale/shift) keep the values given by their constructors.
    for param in model.parameters():
        if param.dim() > 1:
            nn.init.xavier_uniform_(param)
    return model

def get_std_opt(model):
    """Return a ``NoamOpt`` (factor 2, 4000 warm-up steps) wrapping Adam for ``model``."""
    model_size = model.source_embeddings[0].d_model
    adam = torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
    return NoamOpt(model_size, 2, 4000, adam)

def greedy_decode(model, src, src_mask, max_len, start_symbol, end_symbol=None):
    """Greedily decode a single source sentence.

    Args:
        model: Model exposing ``encode``, ``decode`` and ``generator``.
        src: Source ids for one sentence (batch size 1).
        src_mask: Mask for ``src``.
        max_len: Maximum length of the returned sequence, including the
            start symbol.
        start_symbol: Id placed at the beginning of the output.
        end_symbol: Optional id that ends decoding as soon as it is produced.
            With the default ``None`` decoding always runs for
            ``max_len - 1`` steps, exactly as before.

    Returns:
        Tensor of shape ``(1, length)`` holding the start symbol followed by
        the predicted ids (including ``end_symbol`` when decoding stopped
        early).
    """
    memory = model.encode(src, src_mask)
    res = torch.ones(1, 1).fill_(start_symbol).type_as(src)
    for _ in range(max_len - 1):
        target_mask = subsequent_mask(res.size(1)).type_as(src)
        out = model.decode(memory, src_mask, res, target_mask)
        # Only the last position predicts the next token.
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word[0]
        res = torch.cat([res, torch.ones(1, 1).type_as(src).fill_(next_word)], dim=1)
        if end_symbol is not None and next_word.item() == end_symbol:
            break
    return res

def run(data, model, loss_compute, epoch):
    """Run the model over every batch once and return the mean loss per token.

    Args:
        data: Iterable of batches exposing ``source``, ``outputs``,
            ``source_mask``, ``output_masks``, ``output_targets`` and
            ``token_count``.
        model: Callable ``(source, outputs, source_mask, output_masks)``.
        loss_compute: Callable ``(model_output, targets, token_count)``
            returning the un-normalised batch loss.
        epoch: Epoch number, used only in the progress message.

    Returns:
        Total loss divided by the total number of tokens.
    """
    start = time.time()
    total_tokens = 0.
    total_loss = 0.
    tokens = 0.
    for idx, batch in enumerate(data):
        out = model(batch.source, batch.outputs, batch.source_mask, batch.output_masks)
        loss = loss_compute(out, batch.output_targets, batch.token_count)
        total_loss += loss
        total_tokens += batch.token_count
        tokens += batch.token_count
        if idx % 100 == 1:
            # Guard against a zero interval on coarse clocks.
            elapsed = max(time.time() - start, 1e-9)
            # The throughput was already computed and passed to format() but
            # the format string had no placeholder for it, so it was dropped.
            print("Epoch {:d}, Batch: {:d} Loss: {:.4f} Tokens per Sec: {:.2f}k".format(
                epoch,
                idx - 1,
                float(loss / batch.token_count),
                float(tokens) / elapsed / 1000.,
            ))
            start = time.time()
            tokens = 0
    return total_loss / total_tokens

def train(data, model, criterion, optimizer):
    """Train for ``NUM_EPOCHS`` epochs and checkpoint the best model.

    After every epoch the model is evaluated on ``data.test_data``; whenever
    the validation loss improves, the model's ``state_dict`` is written to
    ``SAVE_CORPUS``.

    Args:
        data: Object exposing ``train_data`` and ``test_data`` batch lists.
        model: Model to train; must expose ``generator``.
        criterion: Loss callable handed to ``LossComputation``.
        optimizer: Optimiser wrapper handed to ``LossComputation``.
    """
    # Start from +inf so the first finite validation loss is always saved
    # (the former 1e5 sentinel could leave no checkpoint on disk).
    best_loss = float('inf')

    train_loss_compute = LossComputation(model.generator, criterion, optimizer)
    eval_loss_compute = LossComputation(model.generator, criterion, None)

    for epoch in range(NUM_EPOCHS):
        model.train()
        run(data.train_data, model, train_loss_compute, epoch)
        model.eval()

        test_loss = run(data.test_data, model, eval_loss_compute, epoch)
        # Drop any gradients left behind by the validation pass so they
        # cannot be added to the first training step of the next epoch.
        model.zero_grad()

        if test_loss < best_loss:
            best_loss = test_loss
            torch.save(model.state_dict(), SAVE_CORPUS)

        print("----------------------------\n")
#%%
# Training
# Build the batches and vocabularies, size the model from the vocabularies,
# then train. Running this cell starts training immediately.
data = PreProcessing(TRAIN_CORPUS, TEST_CORPUS)
src_vocab = len(data.dict_en)
tgt_vocab = len(data.dict_zh)
model = make_model(src_vocab, tgt_vocab, N=NUM_LAYERS, d_model=MODEL_DIMENSIONS, d_ff=FEED_FORWARD_DIMENSIONS, h=NUM_HEADS, dropout=DROPOUT)
# NOTE(review): padding_index=0 matches the value batches are actually padded
# with (pad_sequence/Batch defaults), but 0 is also the UNK id while PAD is 1;
# target positions holding UNK are therefore excluded from the loss as well.
criterion = LabelSmoothing(size=tgt_vocab, padding_index=0, smoothing=0.1)
# Same construction as get_std_opt() but with factor 1 instead of 2.
model_opt = NoamOpt(model.source_embeddings[0].d_model, 1, 4000, torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

train(data, model, criterion, model_opt)
#%%
# Evaluating
def evaluate(data, model):
    """Print greedy translations for ten randomly drawn development sentences.

    Args:
        data: Preprocessed corpus exposing the development id sequences and
            the id->word dictionaries.
        model: Trained model; it is switched to evaluation mode.
    """
    model.eval()
    with torch.no_grad():
        sample_ids = np.random.randint(0, len(data.test_sequences_en), size=10)
        for sample_id in sample_ids:
            en_ids = data.test_sequences_en[sample_id]
            zh_ids = data.test_sequences_zh[sample_id]

            english_sentence = " ".join([data.index_dict_en[token] for token in en_ids if token != PAD])
            zh_sentence = " ".join([data.index_dict_zh[token] for token in zh_ids if token != PAD])

            source = torch.from_numpy(np.array(en_ids)).long().to(DEVICE).unsqueeze(0)
            source_mask = (source != PAD).unsqueeze(-2)
            out = greedy_decode(model, source, source_mask, max_len=MAX_LENGTH, start_symbol=data.dict_zh["BOS"])

            # Position 0 is the start symbol; collect words until 'EOS'.
            translation = []
            for position in range(1, out.size(1)):
                sym = data.index_dict_zh[out[0, position].item()]
                if sym == 'EOS':
                    break
                translation.append(sym)

            print("\nOriginal English Sentence: ", english_sentence)
            print("Target Chinese Sentence: ", zh_sentence)
            print("Translated Chinese Sentence: ", " ".join(translation))

# Reload the best checkpoint written by train(). map_location lets weights
# saved on one device type (e.g. CUDA) be loaded on the current DEVICE.
model.load_state_dict(torch.load(SAVE_CORPUS, map_location=DEVICE))
evaluate(data, model)
#%%
# BLEU SCORE
import random
import sacrebleu

def calculate_bleu_sacrebleu(data, model, device, sample_size):
    """Estimate corpus-level BLEU on a random sample of the development set.

    Args:
        data: Preprocessed corpus exposing the development id sequences and
            the dictionaries.
        model: Trained model; it is switched to evaluation mode.
        device: Device the source tensors are moved to; must be the device
            the model lives on.
        sample_size: Number of sentence pairs to sample. Capped at the size
            of the development set.

    Returns:
        The sacreBLEU corpus score as a float.
    """
    model.eval()
    pairs = list(zip(data.test_sequences_en, data.test_sequences_zh))
    sampled_data = random.sample(pairs, min(sample_size, len(pairs)))

    # Sentence markers are stripped from both sides so they do not count as
    # matching n-grams.
    markers = {'BOS', 'EOS'}
    pad_id = data.dict_zh['PAD']

    references = []
    hypotheses = []

    with torch.no_grad():
        for src, ref in sampled_data:
            ref_words = [data.index_dict_zh[word_id] for word_id in ref if word_id != pad_id]
            references.append(' '.join(word for word in ref_words if word not in markers))

            # `src` already holds token ids (the sequences were mapped during
            # preprocessing). Looking the ids up in the word->id dictionary
            # again turned every token into UNK.
            src_tensor = torch.tensor(src, dtype=torch.long).to(device).unsqueeze(0)
            src_mask = (src_tensor != data.dict_en['PAD']).unsqueeze(-2)

            out = greedy_decode(model, src_tensor, src_mask, max_len=MAX_LENGTH, start_symbol=data.dict_zh["BOS"])
            translation = []
            # Skip position 0, which is the start symbol.
            for token_id in out[0, 1:]:
                word = data.index_dict_zh.get(token_id.item(), '[UNK]')
                if word == 'EOS':
                    break
                translation.append(word)

            hypotheses.append(' '.join(translation))

    # corpus_bleu expects a list of reference streams, each parallel to the
    # hypotheses; there is a single reference per sentence here.
    bleu_score = sacrebleu.corpus_bleu(hypotheses, [references])
    return bleu_score.score

# Use the device the model was built on. The former CUDA-or-CPU choice
# ignored MPS and could place the inputs on a different device than the model.
device = DEVICE
model.load_state_dict(torch.load(SAVE_CORPUS, map_location=DEVICE))
bleu_start = time.time()
bleu_score = calculate_bleu_sacrebleu(data, model, device, sample_size=500)
print(f"SacreBLEU Score on Development Set: {bleu_score:.2f}; Time: {time.time() - bleu_start:.4f} seconds")
