***Libraries and overhead***

In [1]:
import torch
import torch.nn as nn
import re
import numpy as np
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm
from torchmetrics import F1Score
from sklearn.model_selection import train_test_split
from collections import defaultdict
from operator import add

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.__version__

'1.12.1+cu102'

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [4]:
ALPHABET = ['а','б','в','г','д','е','ё','ж','з','и','й','к','л','м','н','о','п','р','с','т','у',
            'ф','х','ц','ч','ш','щ','ъ','ы','ь','э','ю','я']

***Helper Functions***

In [5]:
def convert_to_ortho(data, orthographies):
    newdata = []
    for line in data:
        line = line.lower()
        newline = ''
        r = np.random.randint(len(orthographies))
        for char in line:
            if orthographies[r].get(char):
                newline += orthographies[r].get(char)
            else:
                newline += char
        
        newdata.append(newline)
    return newdata

In [6]:
def readin_orthos():
    with open('../data/orthographies.txt') as f:
        rawtext = f.readlines()
    orthos = []
    for line in rawtext:
        if (line[0] == '#'):
            tmp = defaultdict();
        else:
            line = line.rstrip()
            sp = line.split('\t')
            tmp.update({sp[0] : sp[1]})
        if (len(tmp) == 33):
            orthos.append(tmp)


    return orthos

In [7]:
def clean_ortho_data(data):
    cdata = []
    for entry in data:
        if len(entry) == 2:
            continue
        e = re.sub(r'[^\w]','', entry)
        e = e.lower()
        e = list(e)
        e.insert(0, '<STR>')
        e.append('<END>')
        cdata.append(e)
    
    return cdata

In [29]:
def clean_data(data):
    cdata = []
    for entry in data:
        if len(entry) == 2:
            continue
        e = re.sub(r'[^\w]','', entry)
        e = e.lower()
        if re.sub(r'[бвгджзклмнпрстфхцчшщаэыуояеёюиьъй]', '', e):
            continue
        e = list(e)
        e.insert(0, '<STR>')
        e.append('<END>')
        cdata.append(e)
    
    return cdata

In [9]:
def get_labels(data):
    labels = []
    for entry in data:
        tmp = []
        for i in range(len(entry) - 1):
            tmp.append(entry[i + 1])
        labels.append(tmp)
    return labels

In [10]:
class CustomTextDataset(Dataset):
    def __init__(self, text, labels):
        self.labels = labels
        self.text = text

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        label = self.labels[idx]
        text = self.text[idx]
        sample = (text, label)
        return sample

In [11]:
# add the ultimate character to the vector
def convert_label_to_vec(data, dicti):
    new_data = []
    for i in range(len(data)):
        tmp_list = np.zeros(len(dicti))
        # tmp_list = [0 for j in range(len(dicti))]
        tmp_list[dicti[data[i]] - 1] = 1
        new_data.append(tmp_list)

    return np.array(new_data)

In [12]:
# add the penultimate character to the vector
def pair_vecs(data, vecs, dicti):
    vecs[0] *= 2
    for i in range(1, len(data)):
        vecs[i][dicti[data[i - 1]] - 1] += 1
    
    return vecs

In [13]:
def cyr_cyr_step(data):
    for line in data:
        line.pop()
    
    return data

In [14]:
def evaluate(model, val_dataloader):
    """
    After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []
    f1_weighted = []

    # For each batch in our validation set...
    for _, (text, labels) in enumerate(tqdm(val_dataloader, position = 0, leave = True)):
        # Load batch to GPU
        text = text.type(torch.FloatTensor).to(device)
        labels = labels.type(torch.FloatTensor).to(device)

        # Compute logits
        with torch.no_grad():
            output = model(text)

        # Compute loss
        loss = criterion(output, labels)
        val_loss.append(loss.item())

        # Get the predictions
        preds = torch.argmax(output)
        labelloc = torch.argmax(labels)

        # Calculate the accuracy rate
        accuracy = (preds == labelloc).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)


    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)
    f1_weighted = np.mean(f1_weighted)

    return val_loss, val_accuracy, f1_weighted

**Model declarations**

*Probability Model (Cyrillic->Cyrillic prediction)*

In [15]:
class PModel():
    def __init__(self, alphabet):
        self.P = defaultdict()
        for char in alphabet:
            self.P.update({char : defaultdict()})
        
        for key, _ in self.P.items():
            for char in alphabet:
                self.P[key].update({char : 0})

    def display_model(self):
        for key, value in self.P.items():
            print('P[' + key + '] = ' + str(value))
            # for keyo, val in self.P[key].items():
            #     print('P[' + key + '][' + keyo + '] = ' + str(val), end = ' ')
            # print('\n')
    
    def model_build(self, X, Y):
        total = len(X)
        total_pairs = len(X) - 1
        indv_chars = defaultdict(lambda : False)

        for i, (cur, nex) in enumerate(zip(X, Y)):
            self.P[cur][nex] += 1

            if indv_chars[cur]:
                indv_chars[cur] += 1
            else:
                indv_chars.update({cur : 1})
        indv_chars['<END>'] += 1 # off by 1 error since it's not at the end of X

        for key, _ in indv_chars.items():
            indv_chars[key] /= float(total)

        for (cur, nex) in set(zip(X, Y)):
            self.P[cur][nex] = float(self.P[cur][nex]) / (total_pairs)
            self.P[cur][nex] = self.P[cur][nex] / indv_chars[cur]
    
    def model_test(self, X, Y):
        correct = 0
        total = len(X)

        for i in range(len(X)):
            pred = self.predict_next(X[i])
            if pred == Y[i]:
                correct += 1
        
        return float(correct) / total
        
    def predict_next(self, char):
        return max(self.P[char], key=self.P[char].get)

*Feedforward Model (Cyrillic->Cyrillic prediction)*

In [16]:
class NeuralNet(nn.Module):
     def __init__(self, input_size, hidden_size, num_classes):
         super(NeuralNet, self).__init__()
         self.input_size = input_size
         self.l1 = nn.Linear(input_size, hidden_size) 
         self.relu = nn.ReLU()
         self.l2 = nn.Linear(hidden_size, num_classes)
         self.s1 = nn.Softmax(dim = 0)

     def forward(self, x):
         out = self.l1(x)
         out = self.relu(out)
         out = self.l2(out)
         out = self.s1(out)
         return out

*Transducer Model (Latin->Cyrillic prediction)*

In [17]:
# class Joiner(torch.nn.Module):
#   def __init__(self, num_outputs):
#     super(Joiner, self).__init__()
#     self.linear = torch.nn.Linear(joiner_dim, num_outputs)

#   def forward(self, encoder_out, predictor_out):
#     out = encoder_out + predictor_out
#     out = torch.nn.functional.relu(out)
#     out = self.linear(out)
#     return out

In [18]:
# class Encoder(torch.nn.Module):
#   def __init__(self, num_inputs):
#     super(Encoder, self).__init__()
#     self.embed = torch.nn.Embedding(num_inputs, encoder_dim)
#     self.rnn = torch.nn.GRU(input_size=encoder_dim, hidden_size=encoder_dim, num_layers=3, batch_first=True, bidirectional=True, dropout=0.1)
#     self.linear = torch.nn.Linear(encoder_dim*2, joiner_dim)

#   def forward(self, x):
#     out = x
#     out = self.embed(out)
#     out = self.rnn(out)[0]
#     out = self.linear(out)
#     return out

In [19]:
# class Predictor(torch.nn.Module):
#   def __init__(self, num_outputs):
#     super(Predictor, self).__init__()
#     self.embed = torch.nn.Embedding(num_outputs, predictor_dim)
#     self.rnn = torch.nn.GRUCell(input_size=predictor_dim, hidden_size=predictor_dim)
#     self.linear = torch.nn.Linear(predictor_dim, joiner_dim)
    
#     self.initial_state = torch.nn.Parameter(torch.randn(predictor_dim))
#     self.start_symbol = NULL_INDEX # In the original paper, a vector of 0s is used; just using the null index instead is easier when using an Embedding layer.

#   def forward_one_step(self, input, previous_state):
#     embedding = self.embed(input)
#     state = self.rnn.forward(embedding, previous_state)
#     out = self.linear(state)
#     return out, state

#   def forward(self, y):
#     batch_size = y.shape[0]
#     U = y.shape[1]
#     outs = []
#     state = torch.stack([self.initial_state] * batch_size).to(y.device)
#     for u in range(U+1): # need U+1 to get null output for final timestep 
#       if u == 0:
#         decoder_input = torch.tensor([self.start_symbol] * batch_size).to(y.device)
#       else:
#         decoder_input = y[:,u-1]
#       out, state = self.forward_one_step(decoder_input, state)
#       outs.append(out)
#     out = torch.stack(outs, dim=1)
#     return out

In [21]:
class Transducer(nn.Module):
    def __init__(self, input_size, output_size, beam_len,
                 encoder = 0,
                 joiner = 0,
                 predictor = 0):
        super(Transducer, self).__init__()
        if encoder:
            self.encoder = encoder
        else:
            self.encoder = Encoder(input_size, output_size)
        
        if joiner:
            self.joiner = joiner
        else:
            self.joiner = Joiner(input_size, output_size)

        if predictor:
            self.predictor = predictor
        else:
            self.predictor = Predictor(input_size, output_size)

    self.beam_len = beam_len
    
    # take one step through the network to get a final softmax
    def network_step(self, y):
        sftmx = self.predictor.forward(y)
        embed = self.encoder.forward(y)
        out = self.joiner.forward(embed, sftmx)
        return out
        
    # take one step through a full example with multiple network steps
    def forward(self, y):
        batch_size = y.shape[0]
        U = y.shape[1]
        
        preds = []
        best = []
        
        for u in range(U + 1):
            if (u == 0):
                prev = '<STR>'
            else:
                prev = y[:, u - 1]
            
            out = self.network_step(self, y)
            preds.append(torch.max(out))
        return preds

***Data preprocessing***

In [22]:
with open('../data/words.txt') as f:
    data = f.readlines()

In [23]:
orthographies = readin_orthos()

In [24]:
ortho_data = convert_to_ortho(data, orthographies)

In [26]:
cortho_data = clean_ortho_data(ortho_data)

In [30]:
cdata = clean_data(data)

In [31]:
labels = get_labels(cdata)

In [32]:
cdata = cyr_cyr_step(cdata)

In [34]:
X = [char for word in cdata for char in word]

In [35]:
Y = [char for target in labels for char in target]

***Feed forward model***

In [44]:
alpha_dict = defaultdict()
for i in range(len(ALPHABET)):
    alpha_dict.update({ALPHABET[i]: i + 1})

alpha_dict.update({'<STR>' : len(ALPHABET) + 1})
alpha_dict.update({'<END>' : len(ALPHABET) + 2})

In [45]:
print(alpha_dict)

defaultdict(None, {'а': 1, 'б': 2, 'в': 3, 'г': 4, 'д': 5, 'е': 6, 'ё': 7, 'ж': 8, 'з': 9, 'и': 10, 'й': 11, 'к': 12, 'л': 13, 'м': 14, 'н': 15, 'о': 16, 'п': 17, 'р': 18, 'с': 19, 'т': 20, 'у': 21, 'ф': 22, 'х': 23, 'ц': 24, 'ч': 25, 'ш': 26, 'щ': 27, 'ъ': 28, 'ы': 29, 'ь': 30, 'э': 31, 'ю': 32, 'я': 33, '<STR>': 34, '<END>': 35})


In [50]:
numX = convert_label_to_vec(X, alpha_dict)

In [47]:
# numX = pair_vecs(X, numX, alpha_dict)

In [48]:
numY = convert_label_to_vec(Y, alpha_dict)

In [55]:
numX = torch.Tensor(numX).to(device = device, dtype=torch.float)
numY = torch.Tensor(numY).to(device = device, dtype=torch.float)

In [56]:
numX[:3]

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]])

In [57]:
data_obj = CustomTextDataset(numX, numY)

In [58]:
train_dataset, test_dataset = random_split(data_obj, [int(0.7 * len(data_obj)), int(0.3 * len(data_obj))], generator=torch.Generator().manual_seed(42))

In [59]:
batch_data = DataLoader(train_dataset, batch_size = 64)
test_batch_data = DataLoader(train_dataset, batch_size = 64)

In [60]:
input_size = len(alpha_dict)
hidden_size = len(alpha_dict)
num_classes = len(alpha_dict)
num_epochs = 2
learning_rate = 0.1

In [61]:
model = NeuralNet(input_size, hidden_size, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [62]:
n_total_steps = len(batch_data)
for epoch in tqdm(range(num_epochs)):
    for i, (text, labels) in enumerate(tqdm(batch_data, position = 0, leave = True)):
        text = text.type(torch.FloatTensor).to(device)
        labels = labels.type(torch.FloatTensor).to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(text)

        loss = criterion(outputs, labels)

        # Backward and optimize
        loss.backward()
        optimizer.step()

    print ('\n', f'Epoch [{epoch+1}/{num_epochs}], Step[{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

100%|████████████████████████████████████████████████████████████████████████████| 27804/27804 [00:45<00:00, 610.80it/s]
 50%|██████████████████████████████████████████▌                                          | 1/2 [00:45<00:45, 45.52s/it]


 Epoch [1/2], Step[27804/27804], Loss: 3.5491


100%|████████████████████████████████████████████████████████████████████████████| 27804/27804 [00:39<00:00, 699.74it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 2/2 [01:25<00:00, 42.63s/it]


 Epoch [2/2], Step[27804/27804], Loss: 3.5547





In [63]:
evaluate(model, test_batch_data)

100%|███████████████████████████████████████████████████████████████████████████| 27804/27804 [00:21<00:00, 1290.50it/s]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


(3.545538928067662, 0.6653718889368436, nan)

***Probability Model***

In [84]:
P = PModel(alpha_dict.keys())

In [85]:
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.30, random_state=10)

In [86]:
P.model_build(xtrain, ytrain)

In [87]:
P.model_test(xtest, ytest)

0.2591196799408347

***RNN-Transducer Model***

In [48]:
input_size = len(alpha_dict)
encoder_dim = len(alpha_dict)
joiner_dim = len(alpha_dict)
predictor_dim = len(alpha_dict)

num_classes = len(alpha_dict)
num_epochs = 1
learning_rate = 0.01