In [16]:
import re
import os
from collections import defaultdict
import regex as re
import numpy as np
from collections import defaultdict, Counter, OrderedDict, namedtuple
import pandas as pd

from torchtext.vocab import build_vocab_from_iterator, vocab
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from itertools import chain

import nltk
import sklearn
import scipy.stats
# from sklearn.metrics import make_scorer
# from sklearn.cross_validation import cross_val_score
# from sklearn.grid_search import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics



##  torch imports
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch.utils.data import DataLoader

## torchtext imports
import torchtext
from torchtext.vocab import vocab
from torchtext.data.utils import get_tokenizer

## other imports 
import io
from collections import OrderedDict, Counter
import spacy
import numpy as np
import spacy.cli
import random

import pycrfsuite
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents

In [17]:
# Accent stripping is delegated to a `tokenizers` normalizer pipeline. It
# supersedes an earlier hand-rolled `accent_normalize` helper that ran
# `re.sub` once per entry of an `accents_dict` lookup table.
#
# The pipeline applies, in order: NFD, then StripAccents.
normalizer = normalizers.Sequence(
    [
        NFD(),
        StripAccents(),
    ]
)

In [18]:
def normalize(x):
    """Run `x` through the module-level `normalizer`, then delete every apostrophe."""
    return re.sub("'", "", normalizer.normalize_str(x))


# Special vocabulary symbols. SPECIALS fixes the order in which they are
# handed to the vocabulary builder.
START, END, UNK, PAD, BD = "<start>", "<end>", "<unk>", "<pad>", "<bd>"
SPECIALS = [START, END, UNK, PAD, BD]

In [19]:
# Language name -> short code used as the prefix of that language's data files.
langs_codes = dict(
    Gitksan="git",
    Arapaho="arp",
    Lezgi="lez",
    Nyangbo="nyb",
    Tsez="ddo",
    Uspanteko="usp",
)

def tag_bd(seq):
    """Replace every "-" element of `seq` with the boundary symbol "<bd>".

    The replacement happens in place; the same (mutated) object is returned
    so the call can be used inside an expression.
    """
    for position in range(len(seq)):
        if seq[position] == "-":
            seq[position] = "<bd>"
    return seq

def get_lang_df(lang):
    r"""Read the train and dev "track2-uncovered" files of `lang` into dicts of lists.

    Files are opened at ``data/{lang}/{code}-{train|dev}-track2-uncovered``,
    where ``code`` is ``langs_codes[lang]``. Every line is dispatched on its
    two-character marker; lines with any other prefix are ignored:

    * ``\t`` -> ``"transcription"``
    * ``\g`` -> ``"glosses"``
    * ``\m`` -> ``"morphemes"``
    * ``\l`` -> ``"translation"`` (additionally passed through ``normalize``)

    The stored value is the line with the marker, the spaces that follow it and
    the trailing newline removed.

    Args:
        lang: A key of ``langs_codes``, e.g. ``"Lezgi"``.

    Returns:
        ``(train, dev)``: two ``defaultdict(list)`` objects mapping the field
        names above to the field values in file order. Despite the function
        name, no DataFrame is returned.

    Raises:
        KeyError: ``lang`` is not a key of ``langs_codes``.
        OSError: a data file cannot be opened.
    """
    # (marker, field) pairs. Backslashes are escaped explicitly: "\g", "\m"
    # and "\l" are invalid escape sequences that only work by accident.
    markers = (
        ("\\t", "transcription"),
        ("\\g", "glosses"),
        ("\\m", "morphemes"),
        ("\\l", "translation"),
    )

    ld = {
        "train": defaultdict(list),
        "dev": defaultdict(list),
    }

    path = f"data/{lang}/"
    code = langs_codes[lang]

    for data_type in ("train", "dev"):
        fp = f"{code}-{data_type}-track2-uncovered"
        # Explicit UTF-8: the corpora contain non-ASCII text, and the platform
        # default encoding is not guaranteed to decode it.
        with open(path + fp, "r", encoding="utf-8") as f:
            for line in f:
                for marker, field in markers:
                    if not line.startswith(marker):
                        continue
                    # Slice the marker off instead of `lstrip(marker + " ")`:
                    # lstrip treats its argument as a *set* of characters, so
                    # it would also eat leading "t"/"g"/"m"/"l" letters that
                    # belong to the field text itself.
                    value = line[len(marker):].lstrip(" ").rstrip("\n")
                    if field == "translation":
                        value = normalize(value)
                    ld[data_type][field].append(value)
                    break

    return ld["train"], ld["dev"]



In [20]:
# Load the Lezgi train and dev splits (dicts of per-line field lists).
train, dev = get_lang_df("Lezgi")

In [21]:
# Per-line morpheme and gloss strings of each split, under shorter names.
train_morphs = train["morphemes"]
train_glosses = train["glosses"]
dev_morphs = dev["morphemes"]
dev_glosses = dev["glosses"]

def get_segmented(line):
    """Split every string of `line` on whitespace and return all pieces as one flat list.

    Args:
        line: An iterable of strings (despite the singular name), e.g. all
            morpheme lines of a split.

    Returns:
        A single list holding the whitespace-separated pieces of every string,
        in order; line boundaries are not preserved.
    """
    return [piece for text in line for piece in text.split()]

In [22]:
def get_segmented_data(langdict, splits=["morphemes", "glosses"]):
    """Apply `get_segmented` to each requested field of `langdict`.

    Args:
        langdict: Mapping from field name to a list of line strings.
        splits: Field names to process.

    Returns:
        A new dict with one entry per name in `splits`, each holding the flat
        token list produced by `get_segmented`.
    """
    return {field: get_segmented(langdict[field]) for field in splits}

def get_characterized_data(segments, splits=["morphemes", "glosses"]):
    """Turn every token of each requested field into a delimited character sequence.

    Each token becomes ``[START, <its characters>, END]``, with "-" characters
    rewritten by `tag_bd`.

    Args:
        segments: Mapping from field name to a list of token strings.
        splits: Field names to process.

    Returns:
        A new dict mapping each name in `splits` to a list of symbol lists.
    """
    data = {}
    for field in splits:
        sequences = []
        for token in segments[field]:
            symbols = tag_bd(list(token))
            sequences.append([START, *symbols, END])
        data[field] = sequences
    return data

In [23]:
# Two-stage preprocessing per split: lines -> tokens -> character sequences.
train_segmented = get_segmented_data(train)
train_processed = get_characterized_data(train_segmented)

dev_segmented = get_segmented_data(dev)
dev_processed = get_characterized_data(dev_segmented)

In [24]:
# Pair the i-th morpheme sequence with the i-th gloss sequence; zip stops at
# the shorter of the two lists.
train_test_data = list(zip(train_processed["morphemes"], train_processed["glosses"]))
dev_data = list(zip(dev_processed["morphemes"], dev_processed["glosses"]))

In [25]:
def get_train_val_split(data, props):
    """Hold out every `props`-th example of `data`.

    Examples at positions 0, props, 2*props, ... go to the held-out list; all
    others go to the training list.

    Args:
        data: Iterable of examples.
        props: Stride of the hold-out selection.

    Returns:
        ``(held_out, training)`` -- note the held-out list comes first.
    """
    held_out, training = [], []
    for position, example in enumerate(data):
        bucket = training if position % props else held_out
        bucket.append(example)
    return held_out, training
    

In [26]:
# Every 5th training pair (positions 0, 5, 10, ...) is held out as test data.
test_data, train_data = get_train_val_split(data=train_test_data, props=5)

In [27]:
# One shared vocabulary for morphemes and glosses. The iterator yields whole
# line strings, so each line is consumed symbol by symbol and the entries are
# single characters plus the SPECIALS, which are placed first.
voc = build_vocab_from_iterator(
    chain(train_morphs, train_glosses),
    specials=SPECIALS,
    special_first=True,
)
# Unknown symbols map to the index of UNK.
voc.set_default_index(voc[UNK])

In [28]:

def collate_batch(batch):
    """Convert a list of (morphemes, glosses) symbol sequences into padded index tensors.

    Args:
        batch: Iterable of ``(morphemes, glosses)`` pairs, each a sequence of
            symbols looked up in the module-level vocabulary `voc`.

    Returns:
        ``(inputs, outputs)``: two long tensors, padded with ``voc[PAD]`` and
        laid out sequence-first (``batch_first=False``).
    """
    pad_index = voc[PAD]

    def encode(symbols):
        # One vocabulary lookup per symbol.
        return torch.tensor([voc[symbol] for symbol in symbols], dtype=torch.long)

    sources, targets = [], []
    for morphemes, glosses in batch:
        sources.append(encode(morphemes))
        targets.append(encode(glosses))

    padded_sources = pad_sequence(sources, batch_first=False, padding_value=pad_index)
    padded_targets = pad_sequence(targets, batch_first=False, padding_value=pad_index)
    return padded_sources, padded_targets




In [29]:

# All three loaders share the same settings: one example per batch, shuffled,
# collated into padded index tensors by `collate_batch`.
_LOADER_OPTIONS = dict(batch_size=1, collate_fn=collate_batch, shuffle=True)

train_dataloader = DataLoader(train_data, **_LOADER_OPTIONS)
dev_dataloader = DataLoader(dev_data, **_LOADER_OPTIONS)
test_dataloader = DataLoader(test_data, **_LOADER_OPTIONS)

In [30]:
# Sanity check: show shapes, indices and decoded symbols of one training batch.
for batch in train_dataloader:
    src, trg = batch
    first_src, first_trg = src[:, 0], trg[:, 0]
    itos = voc.get_itos()  # index -> symbol table, fetched once

    print('tensor size of source language:', src.shape)
    print('tensor size of target language:', trg.shape)
    print('the tensor of first example in target language:', first_trg)
    print('the tensor of first example in src language:', first_src)
    print([itos[i] for i in first_src])
    print([itos[i] for i in first_trg])
    break  # only the first batch is inspected

tensor size of source language: torch.Size([12, 1])
tensor size of target language: torch.Size([17, 1])
the tensor of first example in target language: tensor([ 0, 46,  8,  4, 56, 82, 34,  4, 38, 34, 38,  4, 35, 71, 35, 34,  1])
the tensor of first example in src language: tensor([ 0, 77, 28,  4, 19,  7,  4, 54,  4, 19, 11,  1])
['<start>', 'ж', 'е', '<bd>', 'д', 'а', '<bd>', 'й', '<bd>', 'д', 'и', '<end>']
['<start>', 'b', 'e', '<bd>', 'F', 'U', 'T', '<bd>', 'P', 'T', 'P', '<bd>', 'S', 'B', 'S', 'T', '<end>']


In [33]:
from torch import Tensor
import torch.nn as nn
import math
from torch.nn import Transformer

In [None]:
from torchtext.vocab import Vectors

# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to token embeddings, then apply dropout.

    Even embedding columns hold sines and odd columns hold cosines of
    ``position * 10000 ** (-2i / emb_size)``.

    Args:
        emb_size: Size of the last (embedding) dimension; may be even or odd.
        dropout: Dropout probability applied to the sum.
        maxlen: Number of positions that are precomputed. Inputs longer than
            this along dimension 0 cannot be encoded.
    """

    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super().__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        angles = pos * den  # (maxlen, ceil(emb_size / 2))

        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(angles)
        # An odd emb_size has one odd column fewer than even columns, so the
        # cosine block is trimmed to match; for even sizes this is a no-op.
        pos_embedding[:, 1::2] = torch.cos(angles[:, :emb_size // 2])
        # Insert a singleton axis so the table broadcasts over dimension 1 of
        # the input: (maxlen, 1, emb_size).
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        # A buffer moves with the module across devices but is not trained.
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        """Return ``dropout(token_embedding + positions)``.

        Only the first ``token_embedding.size(0)`` rows of the table are used,
        i.e. dimension 0 of the input is treated as the position axis.
        """
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    """Look up embeddings for token indices and scale them by ``sqrt(emb_size)``.

    Args:
        emb_size: Embedding dimension, used only for the scaling factor.
        embedding: Callable module that maps a long tensor of indices to
            embeddings (e.g. an ``nn.Embedding``). It is stored as a
            sub-module, so its parameters belong to this module.
    """

    def __init__(self, emb_size, embedding: nn.Module):
        super().__init__()
        self.embed = embedding
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        """Return the embeddings of `tokens` (cast to long) times ``sqrt(emb_size)``."""
        return self.embed(tokens.long()) * math.sqrt(self.emb_size)

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder Transformer with token embeddings and an output projection.

    Args:
        num_encoder_layers: Number of encoder layers.
        num_decoder_layers: Number of decoder layers.
        emb_size: Model / embedding dimension (``d_model``).
        nhead: Number of attention heads.
        src_vocab_size: Accepted for interface compatibility; not used here
            (the source embedding module is passed in ready-made).
        tgt_vocab_size: Output size of the final linear projection.
        src_embeddings: Module mapping source indices to embeddings.
        tgt_embeddings: Module mapping target indices to embeddings.
        dim_feedforward: Hidden size of the feed-forward sub-layers.
        dropout: Dropout probability for the transformer and the positional
            encoding.
    """

    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 src_embeddings: nn.Module,
                 tgt_embeddings: nn.Module,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super().__init__()
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        # Projects decoder states to one score per target vocabulary entry.
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(emb_size, src_embeddings)
        self.tgt_tok_emb = TokenEmbedding(emb_size, tgt_embeddings)
        # A single positional encoding instance is shared by both sides.
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        """Run the full encoder-decoder pass and return target-vocabulary logits."""
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        # Keyword arguments keep the six masks from being silently swapped.
        outs = self.transformer(
            src_emb,
            tgt_emb,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            memory_mask=None,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        """Embed `src` and run only the encoder stack."""
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        return self.transformer.encoder(src_emb, mask=src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        """Embed `tgt` and run only the decoder stack over `memory`.

        The result is not passed through `generator`.
        """
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(tgt))
        return self.transformer.decoder(tgt_emb, memory, tgt_mask=tgt_mask)

In [None]:
def generate_square_subsequent_mask(sz):
    """Build the ``(sz, sz)`` additive causal mask on the module-level `DEVICE`.

    Entries on and below the diagonal are ``0.0``; entries above it are
    ``-inf``, so position i cannot attend to positions after i.
    """
    blocked = torch.full((sz, sz), float('-inf'), dtype=torch.float, device=DEVICE)
    # Keep -inf strictly above the diagonal; triu zeroes everything else.
    return torch.triu(blocked, diagonal=1)


def create_mask(src, tgt):
    """Build the attention and padding masks for one (src, tgt) batch.

    Reads the module-level `DEVICE` and `PAD_IDX`, which must be defined
    before this is called.

    Args:
        src: Source index tensor whose dimension 0 is the sequence axis.
        tgt: Target index tensor whose dimension 0 is the sequence axis.

    Returns:
        ``(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)``:
        an all-False square boolean mask for the source, the causal mask for
        the target, and the transposed ``== PAD_IDX`` masks of both inputs.
    """
    src_len, tgt_len = src.shape[0], tgt.shape[0]

    # All False: the source side is left fully unmasked.
    src_mask = torch.zeros((src_len, src_len), dtype=torch.bool, device=DEVICE)
    tgt_mask = generate_square_subsequent_mask(tgt_len)

    # Transposed so that dimension 0 indexes the batch.
    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)

    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [None]:
torch.manual_seed(531)

# Device on which the model and every batch live. The mask helpers and the
# train / evaluate loops read this name.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Padding index: must match the `padding_value` used by `collate_batch`, so it
# is taken from the same vocabulary entry.
PAD_IDX = voc[PAD]

# `collate_batch` encodes both morphemes and glosses with the single
# vocabulary `voc`, so source and target share its size.
SRC_VOCAB_SIZE = len(voc)
TGT_VOCAB_SIZE = len(voc)
EMB_SIZE = 300  # must be divisible by NHEAD (300 / 6 = 50 per head)
NHEAD = 6
FFN_HID_DIM = 512
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

# Separate trainable embedding tables for the morpheme (source) and gloss
# (target) sides.
src_embeddings = nn.Embedding(SRC_VOCAB_SIZE, EMB_SIZE)
tgt_embeddings = nn.Embedding(TGT_VOCAB_SIZE, EMB_SIZE)

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, src_embeddings,
                                 tgt_embeddings, FFN_HID_DIM)

# Xavier-initialise every weight matrix; 1-D parameters (biases, norms) keep
# their default initialisation.
for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)

# Padded target positions do not contribute to the loss.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

In [None]:
from torch.utils.data import DataLoader
from tqdm import tqdm

def train_epoch(model, iterator, optimizer):
    """Run one optimisation pass over `iterator` and return the mean batch loss.

    Uses the module-level `DEVICE`, `create_mask` and `loss_fn`.

    Args:
        model: Called as ``model(src, tgt_input, src_mask, tgt_mask,
            src_padding_mask, tgt_padding_mask, memory_key_padding_mask)``.
        iterator: Sized iterable of ``(src, tgt)`` batches.
        optimizer: Optimizer stepped once per batch.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()
    running_loss = 0

    for src, tgt in tqdm(iterator):
        src, tgt = src.to(DEVICE), tgt.to(DEVICE)

        # Teacher forcing: the decoder is fed everything but the last target
        # position and is scored against everything but the first.
        decoder_input, expected = tgt[:-1, :], tgt[1:, :]

        masks = create_mask(src, decoder_input)
        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = masks

        # The source padding mask doubles as the memory key padding mask.
        logits = model(src, decoder_input, src_mask, tgt_mask,
                       src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), expected.reshape(-1))
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator)


def evaluate(model, iterator):
    """Return the mean batch loss of `model` over `iterator` without training.

    Uses the module-level `DEVICE`, `create_mask` and `loss_fn`. The model is
    switched to eval mode and is left in that mode on return.

    Args:
        model: Called with the same seven arguments as in `train_epoch`.
        iterator: Sized iterable of ``(src, tgt)`` batches.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.eval()
    losses = 0

    # No parameter is updated here, so autograd bookkeeping is switched off;
    # the loss values are unchanged but no graph is kept in memory.
    with torch.no_grad():
        for src, tgt in iterator:
            src = src.to(DEVICE)
            tgt = tgt.to(DEVICE)

            # Decoder input drops the last position; the reference drops the first.
            tgt_input = tgt[:-1, :]

            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

            logits = model(src, tgt_input, src_mask, tgt_mask,
                           src_padding_mask, tgt_padding_mask, src_padding_mask)

            tgt_out = tgt[1:, :]
            loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
            losses += loss.item()

    return losses / len(iterator)

In [None]:
from timeit import default_timer as timer
NUM_EPOCHS = 20

# Checkpoints are written here after every epoch. The directory is created up
# front so that a missing folder cannot abort the run after the first epoch
# has already been trained.
CHECKPOINT_DIR = "./drive/MyDrive/Colab Notebooks/ckpt_tut4_emb"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

for epoch in range(1, NUM_EPOCHS + 1):
    # Only the training pass is timed.
    start_time = timer()
    train_loss = train_epoch(transformer, train_dataloader, optimizer)
    end_time = timer()
    # Validation uses the dev-set loader built earlier in the notebook.
    val_loss = evaluate(transformer, dev_dataloader)

    # Model and optimizer state are saved together so training can be resumed.
    state = {
        'epoch': epoch,
        'state_dict': transformer.state_dict(),
        'optimizer': optimizer.state_dict()
    }
    torch.save(state, os.path.join(CHECKPOINT_DIR, "transformer_" + str(epoch) + ".pt"))

    print((f"Epoch: {epoch},  "f"Epoch time = {(end_time - start_time):.3f}s"))
    print(f'\t Train Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {val_loss:.3f} |  Val. PPL: {math.exp(val_loss):7.3f}')