***Libraries and overhead***

In [69]:
import torch
import torch.nn as nn
import re
import numpy as np
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm
from torchmetrics import F1Score
from sklearn.model_selection import train_test_split
from collections import defaultdict
from operator import add

In [70]:
torch.__version__

'1.12.1+cu102'

In [71]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


***Helper Functions***

In [92]:
def convert_to_ortho(data, orthographies):
    newdata = []
    for line in data:
        line = line.lower()
        newline = ''
        r = np.random.randint(len(orthographies))
        for char in line:
            if orthographies[r].get(char):
                newline += orthographies[r].get(char)
            else:
                newline += char
        
        newdata.append(newline)
    return newdata

In [74]:
def readin_orthos():
    with open('../data/orthographies.txt') as f:
        rawtext = f.readlines()
    orthos = []
    for line in rawtext:
        if (line[0] == '#'):
            tmp = defaultdict();
        else:
            line = line.rstrip()
            sp = line.split('\t')
            tmp.update({sp[0] : sp[1]})
        if (len(tmp) == 33):
            orthos.append(tmp)


    return orthos

In [110]:
def clean_ortho_data(data):
    cdata = []
    for entry in data:
        if len(entry) == 2:
            continue
        e = re.sub(r'[^\w]','', entry)
        e = e.lower()
        e = list(e)
        e.insert(0, '<STR>')
        e.append('<END>')
        cdata.append(e)
    
    return cdata

In [113]:
def clean_data(data):
    clean_data = []
    for entry in data:
        if len(entry) == 2:
            continue
        e = re.sub(r'[^\w]','', entry)
        e = e.lower()
        if re.sub(r'[бвгджзклмнпрстфхцчшщаэыуояеёюиьъй]', '', e):
            continue
        e = list(e)
        e.insert(0, '<STR>')
        e.append('<END>')
        clean_data.append(e)
    
    return clean_data

***Data preprocessing***

In [None]:
with open('../data/words.txt') as f:
    data = f.readlines()

In [98]:
orthographies = readin_orthos()

In [99]:
ortho_data = convert_to_ortho(data, orthographies)

In [111]:
clean_ortho_data = clean_ortho_data(testing_data)

In [109]:
samp[:10]

[['<STR>', 'r', 'e', 'y', '<END>'],
 ['<STR>', 'm', 'i', 'l', 'l', 'a', 'n', 'd', '<END>'],
 ['<STR>', 'e', 'n', 't', 'o', 'n', 'i', '<END>'],
 ['<STR>', 'k', 'u', 'i', 'n', '<END>'],
 ['<STR>', 'd', 'y', 'e', 'b', 'r', 'a', '<END>'],
 ['<STR>', 'p', 'a', 'z', 'h', 'y', 'e', 't', '<END>'],
 ['<STR>', 'f', 'e', 'e', 'l', 'm', 'j', 'e', 'h', '<END>'],
 ['<STR>', 'b', 'y', 'e', 'h', 'r', 'y', 'e', 'h', 'g', '<END>'],
 ['<STR>', 'r', 'y', 'e', 'k', 'i', '<END>'],
 ['<STR>', 'f', 'e', 'e', 'l', 'm', 'y', 'e', 'h', '<END>']]

In [9]:
data[:10]

['Рэй\n',
 'МИЛЛАНД,\n',
 'Энтони\n',
 'КУИН,\n',
 'Дебра\n',
 'ПАЖЕТ\n',
 'в\n',
 'фильме\n',
 'БЕРЕГ\n',
 'РЕКИ\n']

In [101]:
testing_data[:10]

['rey\n',
 'milland,\n',
 'entoni\n',
 'kuin,\n',
 'dyebra\n',
 'pazhyet\n',
 'v\n',
 "feel'mjeh\n",
 'byehryehg\n',
 'ryeki\n']

In [11]:
clean_data[:10]

[['<STR>', 'р', 'э', 'й', '<END>'],
 ['<STR>', 'м', 'и', 'л', 'л', 'а', 'н', 'д', '<END>'],
 ['<STR>', 'э', 'н', 'т', 'о', 'н', 'и', '<END>'],
 ['<STR>', 'к', 'у', 'и', 'н', '<END>'],
 ['<STR>', 'д', 'е', 'б', 'р', 'а', '<END>'],
 ['<STR>', 'п', 'а', 'ж', 'е', 'т', '<END>'],
 ['<STR>', 'ф', 'и', 'л', 'ь', 'м', 'е', '<END>'],
 ['<STR>', 'б', 'е', 'р', 'е', 'г', '<END>'],
 ['<STR>', 'р', 'е', 'к', 'и', '<END>'],
 ['<STR>', 'ф', 'и', 'л', 'ь', 'м', 'е', '<END>']]

In [12]:
def get_labels(data):
    labels = []
    for entry in data:
        tmp = []
        for i in range(len(entry) - 1):
            tmp.append(entry[i + 1])
        labels.append(tmp)
    return labels

In [13]:
labels = get_labels(clean_data)

In [14]:
print(labels[:10])

[['р', 'э', 'й', '<END>'], ['м', 'и', 'л', 'л', 'а', 'н', 'д', '<END>'], ['э', 'н', 'т', 'о', 'н', 'и', '<END>'], ['к', 'у', 'и', 'н', '<END>'], ['д', 'е', 'б', 'р', 'а', '<END>'], ['п', 'а', 'ж', 'е', 'т', '<END>'], ['ф', 'и', 'л', 'ь', 'м', 'е', '<END>'], ['б', 'е', 'р', 'е', 'г', '<END>'], ['р', 'е', 'к', 'и', '<END>'], ['ф', 'и', 'л', 'ь', 'м', 'е', '<END>']]


In [15]:
for entry in clean_data:
    entry.pop()

In [16]:
clean_data[:10]

[['<STR>', 'р', 'э', 'й'],
 ['<STR>', 'м', 'и', 'л', 'л', 'а', 'н', 'д'],
 ['<STR>', 'э', 'н', 'т', 'о', 'н', 'и'],
 ['<STR>', 'к', 'у', 'и', 'н'],
 ['<STR>', 'д', 'е', 'б', 'р', 'а'],
 ['<STR>', 'п', 'а', 'ж', 'е', 'т'],
 ['<STR>', 'ф', 'и', 'л', 'ь', 'м', 'е'],
 ['<STR>', 'б', 'е', 'р', 'е', 'г'],
 ['<STR>', 'р', 'е', 'к', 'и'],
 ['<STR>', 'ф', 'и', 'л', 'ь', 'м', 'е']]

In [17]:
class CustomTextDataset(Dataset):
    def __init__(self, text, labels):
        self.labels = labels
        self.text = text

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        label = self.labels[idx]
        text = self.text[idx]
        sample = (text, label)
        return sample

In [18]:
X = [char for word in clean_data for char in word]

In [19]:
Y = [char for target in labels for char in target]

In [20]:
print(X[:10])
print(Y[:10])

['<STR>', 'р', 'э', 'й', '<STR>', 'м', 'и', 'л', 'л', 'а']
['р', 'э', 'й', '<END>', 'м', 'и', 'л', 'л', 'а', 'н']


In [21]:
ALPHA = list(set(X).union(set(Y)))

In [22]:
alpha_dict = {}
for i in range(len(ALPHA)):
    alpha_dict.update({ALPHA[i]: i + 1})

In [23]:
print(alpha_dict)

{'й': 1, 'у': 2, 'х': 3, 'ь': 4, 'м': 5, 'т': 6, 'д': 7, 'ё': 8, 'э': 9, 'с': 10, 'ъ': 11, 'в': 12, 'ы': 13, 'ф': 14, 'щ': 15, '<STR>': 16, 'я': 17, 'ц': 18, 'п': 19, 'р': 20, 'н': 21, 'ж': 22, 'з': 23, 'е': 24, 'ш': 25, 'б': 26, 'и': 27, 'л': 28, 'а': 29, 'о': 30, 'ю': 31, 'к': 32, 'г': 33, '<END>': 34, 'ч': 35}


In [24]:
# add the ultimate character to the vector
def convert_label_to_vec(data, dicti):
    new_data = []
    for i in range(len(data)):
        tmp_list = np.zeros(len(dicti))
        # tmp_list = [0 for j in range(len(dicti))]
        tmp_list[dicti[data[i]] - 1] = 1
        new_data.append(tmp_list)

    return np.array(new_data)

In [25]:
# add the penultimate character to the vector
def pair_vecs(data, vecs, dicti):
    vecs[0] *= 2
    for i in range(1, len(data)):
        # tmp_list = np.zeros(len(dicti))
        # tmp_list = np.array([0 for j in range(len(dicti))])
        # tmp_list[dicti[data[i - 1]] - 1] = 1
        vecs[i][dicti[data[i - 1]] - 1] += 1
    
    return vecs

In [26]:
numX = convert_label_to_vec(X, alpha_dict)

In [27]:
numX = pair_vecs(X, numX, alpha_dict)

In [28]:
numY = convert_label_to_vec(Y, alpha_dict)

In [29]:
# numY = pair_vecs(Y, numY, alpha_dict)

In [30]:
print(numX[:3], '\n')
print(numY[:3])

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]] 

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [31]:
numX = torch.Tensor(numX)
numY = torch.Tensor(numY)
numX = numX.to(device = device, dtype=torch.float)
numY = numY.to(device = device, dtype=torch.float)

In [32]:
data_obj = CustomTextDataset(numX, numY)

In [33]:
train_dataset, test_dataset = random_split(data_obj, [int(0.7 * len(data_obj)), int(0.3 * len(data_obj))], generator=torch.Generator().manual_seed(42))

In [34]:
batch_data = DataLoader(train_dataset, batch_size = 64)
test_batch_data = DataLoader(train_dataset, batch_size = 64)

***Simple Neural Model***

In [35]:
input_size = len(alpha_dict)
hidden_size = len(alpha_dict)
num_classes = len(alpha_dict)
num_epochs = 2
learning_rate = 0.1

In [36]:
class NeuralNet(nn.Module):
     def __init__(self, input_size, hidden_size, num_classes):
         super(NeuralNet, self).__init__()
         self.input_size = input_size
         self.l1 = nn.Linear(input_size, hidden_size) 
         self.relu = nn.ReLU()
         self.l2 = nn.Linear(hidden_size, num_classes)
         self.s1 = nn.Softmax(dim = 0)

     def forward(self, x):
         out = self.l1(x)
         out = self.relu(out)
         out = self.l2(out)
         out = self.s1(out)
         return out

In [37]:
model = NeuralNet(input_size, hidden_size, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
# criterion = nn.L1Loss()
# criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [38]:
n_total_steps = len(batch_data)
for epoch in tqdm(range(num_epochs)):
    for i, (text, labels) in enumerate(tqdm(batch_data, position = 0, leave = True)):
        text = text.type(torch.FloatTensor).to(device)
        labels = labels.type(torch.FloatTensor).to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(text)

        loss = criterion(outputs, labels)

        # Backward and optimize
        loss.backward()
        optimizer.step()

    print ('\n', f'Epoch [{epoch+1}/{num_epochs}], Step[{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

100%|██████████| 27804/27804 [00:51<00:00, 544.11it/s]
 50%|█████     | 1/2 [00:51<00:51, 51.11s/it]


 Epoch [1/2], Step[27804/27804], Loss: 3.5743


100%|██████████| 27804/27804 [00:48<00:00, 573.62it/s]
100%|██████████| 2/2 [01:39<00:00, 49.80s/it]


 Epoch [2/2], Step[27804/27804], Loss: 3.6056





In [39]:
model.eval()

NeuralNet(
  (l1): Linear(in_features=35, out_features=35, bias=True)
  (relu): ReLU()
  (l2): Linear(in_features=35, out_features=35, bias=True)
  (s1): Softmax(dim=0)
)

In [40]:
for name, param in model.named_parameters():
    print(name, ':', param)

l1.weight : Parameter containing:
tensor([[-1.6224e+01,  1.3914e+01, -7.9100e+00,  ..., -3.9172e+01,
         -1.3434e-01, -2.3193e-01],
        [-1.0444e+01, -7.5761e+00,  6.5266e+00,  ...,  2.3303e+01,
          5.3966e-02, -2.4689e+01],
        [ 1.5998e+00,  4.3962e-01, -1.5900e+00,  ..., -3.6987e+00,
         -1.2969e-02, -2.4832e-01],
        ...,
        [ 6.2511e+00, -5.8042e+00, -6.5932e+00,  ..., -4.0717e+00,
          1.5687e-02, -4.7618e+00],
        [ 4.4949e-02, -5.6893e-01,  6.1298e-02,  ..., -8.6231e-02,
          8.8037e-02, -5.1669e-02],
        [-9.1080e+00, -5.8142e+00, -9.6775e+00,  ...,  1.4624e+01,
          1.3552e-01, -1.2786e+01]], requires_grad=True)
l1.bias : Parameter containing:
tensor([  9.9033,  11.3529,  -5.1598,   9.4761,  -3.7640,   7.9370,  -6.3907,
         -4.7866,  21.4401,  19.7857,   9.6258,   5.8882, -15.5130,  -2.1691,
          8.6503,  -1.5731, -18.3755,  -6.4872,  -4.6135,  33.0635,   6.9219,
         16.5590, -22.1838,  12.2196, -10.4974, 

In [41]:
def evaluate(model, val_dataloader):
    """
    After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []
    f1_weighted = []

    # For each batch in our validation set...
    for _, (text, labels) in enumerate(tqdm(val_dataloader, position = 0, leave = True)):
        # Load batch to GPU
        text = text.type(torch.FloatTensor).to(device)
        labels = labels.type(torch.FloatTensor).to(device)

        # Compute logits
        with torch.no_grad():
            output = model(text)

        # Compute loss
        loss = criterion(output, labels)
        val_loss.append(loss.item())

        # Get the predictions
        preds = torch.argmax(output)
        labelloc = torch.argmax(labels)
        # print(output.argmax())
        # print(preds)
        # print(labels)
        # print(torch.argmax(labels))
        # Calculate the accuracy rate
        accuracy = (preds == labelloc).cpu().numpy().mean() * 100
        # print(accuracy)
        val_accuracy.append(accuracy)

        # Calculate the f1 weighted score
        # f1_metric = F1Score('weighted') 
        # f1_weighted = f1_metric(preds, labelloc)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    print(val_accuracy)
    val_accuracy = np.mean(val_accuracy)
    f1_weighted = np.mean(f1_weighted)

    return val_loss, val_accuracy, f1_weighted

In [42]:
print(evaluate(model, test_batch_data))

100%|██████████| 27804/27804 [00:24<00:00, 1149.07it/s]

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 100.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 100.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 100.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 100.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


***Probability Model***

In [43]:
class PModel():
    def __init__(self, alphabet):
        self.P = defaultdict()
        for char in alphabet:
            self.P.update({char : defaultdict()})
        
        for key, _ in self.P.items():
            for char in alphabet:
                self.P[key].update({char : 0})

    def display_model(self):
        for key, value in self.P.items():
            print('P[' + key + '] = ' + str(value))
            # for keyo, val in self.P[key].items():
            #     print('P[' + key + '][' + keyo + '] = ' + str(val), end = ' ')
            # print('\n')
    
    def model_build(self, X, Y):
        total = len(X)
        total_pairs = len(X) - 1
        indv_chars = defaultdict(lambda : False)

        for i, (cur, nex) in enumerate(zip(X, Y)):
            self.P[cur][nex] += 1

            if indv_chars[cur]:
                indv_chars[cur] += 1
            else:
                indv_chars.update({cur : 1})
        indv_chars['<END>'] += 1 # off by 1 error since it's not at the end of X

        for key, _ in indv_chars.items():
            indv_chars[key] /= float(total)

        for (cur, nex) in set(zip(X, Y)):
            self.P[cur][nex] = float(self.P[cur][nex]) / (total_pairs)
            self.P[cur][nex] = self.P[cur][nex] / indv_chars[cur]
    
    def model_test(self, X, Y):
        correct = 0
        total = len(X)

        for i in range(len(X)):
            pred = self.predict_next(X[i])
            if pred == Y[i]:
                correct += 1
        
        return float(correct) / total
        
    def predict_next(self, char):
        return max(self.P[char], key=self.P[char].get)

In [44]:
P = PModel(ALPHA)

In [45]:
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.30, random_state=42)

In [46]:
P.model_build(xtrain, ytrain)

In [47]:
P.model_test(xtest, ytest)

0.25866204044274327

***RNN-Transducer Model***

In [48]:
input_size = len(alpha_dict)
encoder_dim = len(alpha_dict)
joiner_dim = len(alpha_dict)
predictor_dim = len(alpha_dict)

num_classes = len(alpha_dict)
num_epochs = 1
learning_rate = 0.01

In [49]:
class Encoder(torch.nn.Module):
  def __init__(self, num_inputs):
    super(Encoder, self).__init__()
    self.embed = torch.nn.Embedding(num_inputs, encoder_dim)
    self.rnn = torch.nn.GRU(input_size=encoder_dim, hidden_size=encoder_dim, num_layers=3, batch_first=True, bidirectional=True, dropout=0.1)
    self.linear = torch.nn.Linear(encoder_dim*2, joiner_dim)

  def forward(self, x):
    out = x
    out = self.embed(out)
    out = self.rnn(out)[0]
    out = self.linear(out)
    return out

In [50]:
class Predictor(torch.nn.Module):
  def __init__(self, num_outputs):
    super(Predictor, self).__init__()
    self.embed = torch.nn.Embedding(num_outputs, predictor_dim)
    self.rnn = torch.nn.GRUCell(input_size=predictor_dim, hidden_size=predictor_dim)
    self.linear = torch.nn.Linear(predictor_dim, joiner_dim)
    
    self.initial_state = torch.nn.Parameter(torch.randn(predictor_dim))
    self.start_symbol = NULL_INDEX # In the original paper, a vector of 0s is used; just using the null index instead is easier when using an Embedding layer.

  def forward_one_step(self, input, previous_state):
    embedding = self.embed(input)
    state = self.rnn.forward(embedding, previous_state)
    out = self.linear(state)
    return out, state

  def forward(self, y):
    batch_size = y.shape[0]
    U = y.shape[1]
    outs = []
    state = torch.stack([self.initial_state] * batch_size).to(y.device)
    for u in range(U+1): # need U+1 to get null output for final timestep 
      if u == 0:
        decoder_input = torch.tensor([self.start_symbol] * batch_size).to(y.device)
      else:
        decoder_input = y[:,u-1]
      out, state = self.forward_one_step(decoder_input, state)
      outs.append(out)
    out = torch.stack(outs, dim=1)
    return out

In [51]:
class Joiner(torch.nn.Module):
  def __init__(self, num_outputs):
    super(Joiner, self).__init__()
    self.linear = torch.nn.Linear(joiner_dim, num_outputs)

  def forward(self, encoder_out, predictor_out):
    out = encoder_out + predictor_out
    out = torch.nn.functional.relu(out)
    out = self.linear(out)
    return out