In [123]:
import re
import os
import numpy as np
import math
from collections import defaultdict, Counter, OrderedDict

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator, vocab

from itertools import chain
from torchtext.data.utils import get_tokenizer

## other imports 
import io
import spacy
import numpy as np
import spacy.cli
import random

In [124]:
import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset

In [125]:
#normalize = lambda x: re.sub("'","",normalizer.normalize_str(x))

# Special vocabulary symbols. The list order is the order in which they are
# handed to the vocabulary builder as `specials`.
SPECIALS = ["<start>", "<end>", "<unk>", "<pad>", "<bd>"]

# Individual names for the same five symbols, in the same order as above.
START, END, UNK, PAD, BD = SPECIALS

In [100]:
# Language name -> short code used as the prefix of the data file names.
langs_codes = {
    "Gitksan": "git",
    "Arapaho": "arp",
    "Lezgi": "lez",
    "Nyangbo": "nyb",
    "Tsez": "ddo",
    "Uspanteko": "usp",
}

def tag_bd(seq):
    """Replace every "-" item of ``seq`` with the "<bd>" token, in place.

    Args:
        seq: mutable sequence of symbols (modified by this call).

    Returns:
        The same ``seq`` object that was passed in.
    """
    hyphen_positions = [pos for pos, symbol in enumerate(seq) if symbol == "-"]
    for pos in hyphen_positions:
        seq[pos] = "<bd>"
    return seq

def get_lang_df(lang, normalize=None):
    """Read the train and dev files of one language into per-field lists.

    Files are read from ``data/<lang>/<code>-{train,dev}-track2-uncovered``,
    where ``<code>`` comes from ``langs_codes``. Each line that starts with one
    of the markers ``\\t``, ``\\g``, ``\\m`` or ``\\l`` contributes one entry to
    the matching field; all other lines are ignored.

    Args:
        lang: language name; must be a key of ``langs_codes``.
        normalize: optional callable applied to each translation (``\\l``)
            string. When ``None`` the translation text is stored unchanged.

    Returns:
        ``(train, dev)``: two ``defaultdict(list)`` objects with the keys
        "transcription", "glosses", "morphemes" and "translation" (a key is
        only present if at least one such line was read). Despite the function
        name, plain dicts of lists are returned, not DataFrames.

    Raises:
        KeyError: if ``lang`` is not in ``langs_codes``.
        OSError: if a data file cannot be opened.
    """
    # Two-character line marker -> field name.
    field_markers = {
        "\\t": "transcription",
        "\\g": "glosses",
        "\\m": "morphemes",
        "\\l": "translation",
    }

    ld = {
        "train": defaultdict(list),
        "dev": defaultdict(list)
    }

    path = f"data/{lang}/"
    train_fn = f"{langs_codes[lang]}-train-track2-uncovered"
    dev_fn = f"{langs_codes[lang]}-dev-track2-uncovered"

    for fp in [train_fn, dev_fn]:
        # "<code>-train-..." / "<code>-dev-..." -> "train" / "dev"
        data_type = fp.split("-")[1]
        with open(path + fp, "r", encoding="utf-8") as f:
            for line in f:
                field = field_markers.get(line[:2])
                if field is None:
                    continue

                # Drop exactly the marker, then the separating spaces.
                # (str.lstrip("\\t ") would strip a *set* of characters and
                # also eat leading 't' letters of the content itself.)
                text = line[2:].lstrip(" ").rstrip("\n")
                if field == "translation" and normalize is not None:
                    text = normalize(text)
                ld[data_type][field].append(text)

    return ld["train"],ld["dev"]



In [126]:
def get_segmented(line):
    """Flatten the whitespace-separated tokens of several strings into one list.

    Args:
        line: iterable of strings.

    Returns:
        A single list with the tokens of every string, in order of appearance.
    """
    return [token for text in line for token in text.split()]

def get_segmented_data(langdict,splits=["morphemes","glosses"]):
    """Apply ``get_segmented`` to the selected fields of ``langdict``.

    Args:
        langdict: mapping from field name to a list of strings.
        splits: names of the fields to process.

    Returns:
        A new dict mapping each name in ``splits`` to the flattened token list
        of that field.
    """
    return {field: get_segmented(langdict[field]) for field in splits}

def get_characterized_data(segments,splits=["morphemes","glosses"]):
    """Turn every token of the selected fields into a framed character list.

    Each token is split into characters, passed through ``tag_bd`` and wrapped
    between ``START`` and ``END``.

    Args:
        segments: mapping from field name to a list of token strings.
        splits: names of the fields to process.

    Returns:
        A new dict mapping each name in ``splits`` to a list of symbol lists.
    """
    def as_symbols(token):
        return [START, *tag_bd(list(token)), END]

    return {field: [as_symbols(token) for token in segments[field]] for field in splits}

def get_train_val_split(data, props):
    """Hold out every ``props``-th item of ``data``.

    Items whose position is a multiple of ``props`` (0, props, 2*props, ...)
    go to the held-out list; all others go to the training list.

    Args:
        data: iterable of examples.
        props: spacing between held-out items.

    Returns:
        ``(held_out, train)`` -- note the held-out list comes first.
    """
    held_out, kept = [], []
    for position, example in enumerate(data):
        bucket = kept if position % props else held_out
        bucket.append(example)
    return held_out, kept


In [129]:
# Load the Arapaho corpus and pull out the two fields used below.
train, dev = get_lang_df("Arapaho")
train_morphs = train["morphemes"]
train_glosses = train["glosses"]
dev_morphs = dev["morphemes"]
dev_glosses = dev["glosses"]

# Every corpus entry is a string, so iterating it yields single characters:
# the vocabulary is built over characters of the training morphemes and glosses.
voc = build_vocab_from_iterator(
    chain(train_morphs, train_glosses),
    specials=SPECIALS,
    special_first=True,
)
# Symbols missing from the vocabulary map to the index of UNK.
voc.set_default_index(voc[UNK])


def transform_sequence(x):
    """Map each symbol of ``x`` to its vocabulary index."""
    return [voc[c] for c in x]

In [130]:
def collate_batch(batch):
    """Convert a list of (morphemes, glosses) pairs into two padded tensors.

    Args:
        batch: list of ``(morphemes, glosses)`` symbol sequences.

    Returns:
        ``(source, target)``: long tensors laid out as (length, batch), padded
        with the vocabulary index of ``PAD``.
    """
    def encode(symbols):
        return torch.tensor(transform_sequence(symbols), dtype=torch.long)

    encoded_pairs = [(encode(morphemes), encode(glosses)) for morphemes, glosses in batch]
    pad_index = voc[PAD]

    source = pad_sequence(
        [src for src, _ in encoded_pairs], batch_first=False, padding_value=pad_index
    )
    target = pad_sequence(
        [trg for _, trg in encoded_pairs], batch_first=False, padding_value=pad_index
    )
    return source, target

In [105]:
# Same two-step preprocessing for both corpora: flatten the lines into tokens,
# then turn every token into a START ... END symbol list.
train_processed, dev_processed = (
    get_characterized_data(get_segmented_data(corpus)) for corpus in (train, dev)
)

In [106]:
# Pair the i-th morpheme token with the i-th gloss token.
# NOTE(review): tokens were flattened across lines before pairing, and zip stops
# at the shorter list -- this assumes every line has the same number of morpheme
# and gloss tokens; TODO confirm against the data.
train_test_data = list(zip(train_processed["morphemes"], train_processed["glosses"]))
dev_data = list(zip(dev_processed["morphemes"], dev_processed["glosses"]))

# Every 5th training pair is held out as test data (held-out list is returned first).
test_data, train_data = get_train_val_split(train_test_data, 5)

In [111]:

# batch_size=1 throughout: the decoder in this notebook creates its initial
# state for a single sequence.
train_dataloader = DataLoader(
    train_data,
    batch_size=1,
    collate_fn=collate_batch,
    shuffle=True
)

# Evaluation loaders keep dataset order. The training cell takes the gold dev
# words from `dev_dataloader.dataset` and compares them position by position
# with predictions made while iterating the loader, so a shuffled dev loader
# would pair each prediction with the wrong gold word.
dev_dataloader = DataLoader(
    dev_data,
    batch_size=1,
    collate_fn=collate_batch,
    shuffle=False
)

test_dataloader = DataLoader(
    test_data,
    batch_size=1,
    collate_fn=collate_batch,
    shuffle=False
)

In [112]:
# Sanity check: print the shapes and contents of the first training batch only.
for batch in train_dataloader:
    src, trg = batch
    itos = voc.get_itos()  # index -> symbol lookup, fetched once
    first_src, first_trg = src[:, 0], trg[:, 0]
    print('tensor size of source language:', src.shape)
    print('tensor size of target language:', trg.shape)
    print('the tensor of first example in target language:', first_trg)
    print('the tensor of first example in src language:', first_src)
    print([itos[i] for i in first_src])
    print([itos[i] for i in first_trg])
    break

tensor size of source language: torch.Size([7, 1])
tensor size of target language: torch.Size([6, 1])
the tensor of first example in target language: tensor([ 0, 17, 13,  8, 17,  1])
the tensor of first example in src language: tensor([ 0, 13,  7, 16, 15, 15,  1])
['<start>', 'h', 'i', 'n', 'e', 'e', '<end>']
['<start>', 't', 'h', 'a', 't', '<end>']


In [113]:
import pickle

# Persist the splits and the vocabulary. The output directory is created first
# so the cell also works on a fresh checkout.
os.makedirs("torchdata", exist_ok=True)

# File name under torchdata/ -> object to pickle (the dev split is stored as "val_data").
for filename, payload in (
    ("train_data", train_data),
    ("val_data", dev_data),
    ("test_data", test_data),
    ("vocab", voc),
):
    with open(f"torchdata/{filename}", "wb") as f:
        pickle.dump(payload, f)

In [114]:
# Seed every random number generator in use. The bare name `seed` is never
# imported in this notebook, so the stdlib generator is seeded through the
# `random` module.
random.seed(0)
torch.manual_seed(0)
np.random.seed(0)

import re

# Hyperparameters
EMBEDDING_DIM = 50    # size of the symbol embeddings
RNN_HIDDEN_DIM = 50   # hidden size of the LSTMs
RNN_LAYERS = 1        # number of stacked LSTM layers
BATCH_SIZE = 10       # NOTE(review): not referenced in the visible cells
CHAR_DROPOUT = 0.2    # NOTE(review): not referenced in the visible cells
EPOCHS = 10           # number of passes over the training data

# Maximum length of generated output word forms.
MAXWFLEN = 40

def accuracy(sys,gold):
    """Percentage of positions at which ``sys`` and ``gold`` hold equal items.

    Args:
        sys: system outputs.
        gold: reference outputs; must have the same length as ``sys``.

    Returns:
        A float between 0 and 100.
    """
    assert len(sys) == len(gold)
    hits = 0
    for predicted, expected in zip(sys, gold):
        if predicted == expected:
            hits += 1
    return hits*100.0/len(gold)

In [115]:
class Encoder(nn.Module):
    """Embeds the source symbols and runs them through a bidirectional LSTM."""

    def __init__(self,alphabet):
        """Create the layers; ``len(alphabet)`` fixes the embedding table size."""
        super().__init__()
        self.embedding = nn.Embedding(len(alphabet), EMBEDDING_DIM)
        self.rnn = nn.LSTM(EMBEDDING_DIM, RNN_HIDDEN_DIM, RNN_LAYERS, bidirectional=True)

    def forward(self,ex):
        """Encode the first element of the ``(source, target)`` pair ``ex``.

        Returns the LSTM output sequence; the final hidden and cell states are
        discarded.
        """
        source_ids, _ = ex
        hidden_states, _ = self.rnn(self.embedding(source_ids))
        return hidden_states

# TODO: add an assertion here that checks the size of the Encoder output (none is present).


In [116]:
class Decoder(nn.Module):
    """LSTM decoder that attends over the encoder states at every step.

    The initial LSTM state is created with a leading dimension of 1, so the
    class assumes ``RNN_LAYERS == 1``.
    """

    def __init__(self, alphabet):
        """Create the layers.

        Args:
            alphabet: symbol -> index mapping; its length fixes the embedding
                and output sizes, and it must contain "<start>".
        """
        super(Decoder,self).__init__()
        self.alphabet = alphabet
        self.embedding = nn.Embedding(len(alphabet), EMBEDDING_DIM)
        self.attention = Attention()
        # Each step consumes the previous symbol's embedding concatenated with
        # the attention context over the bidirectional encoder states.
        self.rnn = nn.LSTM(EMBEDDING_DIM+2*RNN_HIDDEN_DIM, RNN_HIDDEN_DIM, RNN_LAYERS, bidirectional=False)
        self.hidden2char = nn.Linear(RNN_HIDDEN_DIM, len(alphabet))

    def _initial_state(self, encoder_hss, batch_size):
        """Zero (h, c) LSTM state on the same device as the encoder states."""
        def zeros():
            return torch.zeros(1, batch_size, RNN_HIDDEN_DIM, device=encoder_hss.device)
        return zeros(), zeros()

    def forward(self,ex,encoder_hss):
        """Score the gold output with teacher forcing.

        Args:
            ex: ``(source, target)`` pair of index tensors laid out as
                (length, batch); only the target is used here.
            encoder_hss: encoder states, (source length, batch, 2*RNN_HIDDEN_DIM).

        Returns:
            ``(log_probs, gold)``: log-probabilities of shape
            (target length - 1, batch, alphabet size) and the target shifted by
            one position (``target[1:]``).
        """
        # Unpack the example (the original line had the assignment reversed).
        _, output = ex

        # Inputs are all target symbols except the last; predictions are all
        # target symbols except the first.
        embedded_output = self.embedding(output[:-1])
        decoder_state = self._initial_state(encoder_hss, encoder_hss.size(1))

        results = []
        for step_embedding in embedded_output:
            context = self.attention(encoder_hss, decoder_state[0])
            step_input = torch.cat([step_embedding.unsqueeze(0), context], dim=2)
            _, decoder_state = self.rnn(step_input, decoder_state)
            results.append(self.hidden2char(decoder_state[0]))

        return F.log_softmax(torch.cat(results), dim=2), output[1:]

    def generate(self,encoder_hss):
        """Greedily decode exactly ``MAXWFLEN`` symbols for a single sequence.

        Args:
            encoder_hss: encoder states of one example,
                (source length, 1, 2*RNN_HIDDEN_DIM).

        Returns:
            List of ``MAXWFLEN`` symbol indices. Decoding does not stop at the
            end symbol; callers are expected to truncate.
        """
        with torch.no_grad():
            decoder_state = self._initial_state(encoder_hss, 1)
            output_char = torch.tensor([[self.alphabet["<start>"]]],
                                       dtype=torch.long, device=encoder_hss.device)
            result = []
            for _ in range(MAXWFLEN):
                output_embedding = self.embedding(output_char)
                context = self.attention(encoder_hss, decoder_state[0])
                _, decoder_state = self.rnn(torch.cat([output_embedding, context], dim=2), decoder_state)
                # Highest-scoring symbol becomes the next input.
                next_id = self.hidden2char(decoder_state[0]).argmax().item()
                result.append(next_id)
                output_char = torch.tensor([[next_id]], dtype=torch.long, device=encoder_hss.device)
            return result
            


In [118]:
class Attention(nn.Module):
    """Feed-forward attention over the encoder states.

    Each encoder state is scored together with the current decoder hidden
    state; the scores are normalised over the source positions and used to
    average the encoder states.
    """

    def __init__(self):
        super().__init__()

        # Input: decoder state (RNN_HIDDEN_DIM) + bidirectional encoder state (2*RNN_HIDDEN_DIM).
        self.linear1 = nn.Linear(3*RNN_HIDDEN_DIM,RNN_HIDDEN_DIM)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(RNN_HIDDEN_DIM,1)

    def forward(self,encoder_hss,decoder_hs):
        """Return the attention-weighted sum of ``encoder_hss``.

        The result keeps a leading dimension of size 1 in place of the source
        positions.
        """
        source_len = encoder_hss.size(0)
        # Repeat the decoder state once per source position.
        query = decoder_hs.expand(source_len, -1, -1)
        hidden = self.relu(self.linear1(torch.cat([query, encoder_hss], dim=2)))
        # Softmax over dim 0 normalises across source positions.
        weights = self.linear2(hidden).softmax(dim=0)
        return (weights * encoder_hss).sum(dim=0, keepdim=True)
        # your code here

# # An assertion to test that your implementation returns an object of the correct size. 
# input, input_length = example.input
# encoder_hss = Encoder(vocab.get_stoi())(example)
# decoder_hs = torch.randn(1,1,RNN_HIDDEN_DIM)

# assert(Attention()(encoder_hss,decoder_hs).size() == torch.Size([1,1,2*RNN_HIDDEN_DIM]))

In [122]:
class WordInflector(nn.Module):
    """Encoder-decoder model mapping a source symbol sequence to an output string."""

    def __init__(self, alphabet):
        """Build the encoder and decoder from a vocabulary object.

        Args:
            alphabet: vocabulary exposing ``get_stoi()`` and ``get_itos()``.
        """
        super().__init__()
        self.c2i = alphabet.get_stoi()
        self.i2c = alphabet.get_itos()

        self.encoder = Encoder(self.c2i)
        self.decoder = Decoder(self.c2i)

    def get_string(self,ids):
        """Join the symbols for ``ids`` and cut at the first "<end>"."""
        symbols = [self.i2c[i] for i in ids]
        string = ''.join(symbols)
        return re.sub("%s.*" % "<end>","",string)

    def forward(self, example):
        """Encode ``example`` and return whatever the decoder returns for it."""
        return self.decoder(example, self.encoder(example))

    def generate(self, data):
        """Decode every example in ``data`` and return the output strings."""
        with torch.no_grad():
            return [
                self.get_string(self.decoder.generate(self.encoder(example)))
                for example in data
            ]
    
if __name__=="__main__":
    train_iter, dev_iter = train_dataloader, dev_dataloader
    # NOTE: this rebinds the module-level name `vocab`, which the import cell
    # also uses for torchtext's `vocab` function.
    vocab = voc

    inflector = WordInflector(vocab)

    # Padding positions do not contribute to the loss.
    loss_function = nn.NLLLoss(ignore_index=inflector.c2i["<pad>"],reduction='mean')
    optimizer = torch.optim.Adam(inflector.parameters())

    # Evaluate in dataset order so that the i-th prediction is compared with
    # the i-th gold word, whatever the shuffle setting of `dev_iter` is.
    dev_eval_iter = DataLoader(
        dev_iter.dataset,
        batch_size=1,
        collate_fn=dev_iter.collate_fn,
        shuffle=False
    )
    # Each gold output is [<start>, chars..., <end>]; predictions come back
    # without either marker, so both are dropped here before comparing.
    gold_dev_words = [''.join(output[1:-1]) for _, output in dev_iter.dataset]

    # torch.save does not create missing directories.
    os.makedirs("ckpt", exist_ok=True)

    for epoch in range(EPOCHS):
        tot_loss = 0

        # Update parameters
        for i, batch in enumerate(train_iter):
            print("Example %u of %u" % (i+1,len(train_iter)),end="\r")
            inflector.zero_grad()
            tag_scores, tgt = inflector(batch)
            # NLLLoss expects (batch, classes, length) scores and (batch, length) targets.
            tgt = tgt.permute(1,0)
            tag_scores = tag_scores.permute(1,2,0)
            loss = loss_function(tag_scores,tgt)
            tot_loss += loss.item()
            loss.backward()
            optimizer.step()
        print()
        avg_loss = tot_loss/len(train_iter)
        print("EPOCH %u: AVG LOSS PER EX: %.5f" % (epoch+1,avg_loss))

        # Evaluate on dev data.
        sys_dev_words = inflector.generate(dev_eval_iter)
        print("DEV ACC: %.2f%%" % accuracy(sys_dev_words,gold_dev_words))

        # NOTE(review): the checkpoint prefix says "tsez" while the data cell
        # loads "Arapaho" -- confirm the intended file name.
        torch.save(inflector , "ckpt/"+"tsez-" + str(epoch) + ".pt")

        

Example 64619 of 111771

KeyboardInterrupt: 

In [None]:
# Display the dev-set predictions from the most recent completed evaluation.
sys_dev_words