# Mayer & Nelson (2020) Phonotactic learning with neural language models

In [2]:
import random
import torch
import torch.nn as nn
import sys
import numpy as np
import time
from scipy.stats import norm

In [3]:
#from statsmodels.stats.weightstats import CompareMeans

# Part 1: Getting data

In [4]:
def get_corpus_data(filename):
    """
    Reads input file and coverts it to list of lists, adding word boundary 
    markers.
    """
    raw_data = []
    file = open(filename,'r')
    for line in file:
        line = line.rstrip()
        line = ['<s>'] + line.split(' ') + ['<e>']
        raw_data.append(line)
    return raw_data

In [5]:
raw_data = get_corpus_data("../sample_data/corpora/finnish_training.txt")

In [6]:
def process_data(string_training_data, dev=True, training_split=60):
    random.shuffle(string_training_data)
    # all data points need to be padded to the maximum length
    max_chars = max([len(x) for x in string_training_data])
    string_training_data = [
        sequence + ['<p>'] * (max_chars - len(sequence)) 
        for sequence in string_training_data]
    # get the inventory and build both directions of dicts  
    # this will store the set of possible phones
    inventory = list(set(phone for word in string_training_data for phone in word))
    inventory = ['<p>'] + [x for x in inventory if x != '<p>'] #ensure that the padding symbol is at index 0

    # dictionaries for looking up the index of a phone and vice versa
    phone2ix = {p: ix for (ix, p) in enumerate(inventory)}
    ix2phone = {ix: p for (ix, p) in enumerate(inventory)}

    as_ixs = [
        torch.LongTensor([phone2ix[p] for p in sequence]) 
        for sequence in string_training_data
      ]

    if not dev:
        training_data = torch.stack(as_ixs, 0)
        # simpler make a meaningless tiny dev than to have a different eval 
        # training method that doesn't compute Dev perplexity
        dev = torch.stack(as_ixs[-10:], 0)
    else:
        split = int(len(as_ixs) * (training_split/100))
        training_data = torch.stack(as_ixs[:split], 0)
        dev = torch.stack(as_ixs[split:], 0)

    return inventory, phone2ix, ix2phone, training_data, dev

In [7]:
inventory, phone2ix, ix2phone, training, dev = process_data(
        raw_data, dev=True, training_split=60
    )
inventory_size = len(inventory)


# Part 2: Defining the model

In [8]:
class Emb_RNNLM(nn.Module):
    def __init__(self, params):
        super(Emb_RNNLM, self).__init__()
        self.vocab_size = params['inv_size']
        self.d_emb = params['d_emb']
        self.n_layers = params['num_layers']
        self.d_hid = params['d_hid']
        self.embeddings = nn.Embedding(self.vocab_size, self.d_emb)
        
        # input to recurrent layer, default nonlinearity is tanh
        self.i2R = nn.RNN(
            self.d_emb, self.d_hid, batch_first=True, num_layers = self.n_layers
        )
        # recurrent to output layer
        self.R2o = nn.Linear(self.d_hid, self.vocab_size)
        if params['tied']:
            if self.d_emb == self.d_hid:
                self.R2o.weight = self.embeddings.weight
            else:
                print("Dimensions don't support tied embeddings")

    def forward(self, batch):
        batches, seq_len = batch.size()
        embs = self.embeddings(batch)
        output, hidden = self.i2R(embs)
        outputs = self.R2o(output)
        return outputs


In [9]:
rnn_params = {}
rnn_params['d_emb'] = 24
rnn_params['d_hid'] = 64
rnn_params['num_layers'] = 1
rnn_params['batch_size'] = 64
rnn_params['learning_rate'] = .005
rnn_params['epochs'] = 10
rnn_params['tied'] = True
rnn_params['inv_size'] = inventory_size
RNN = Emb_RNNLM(rnn_params)


Dimensions don't support tied embeddings


# Part 3: Model training

In [10]:
def compute_perplexity(dataset, net, bsz=64):
    criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='sum')
    num_examples, seq_len = dataset.size()
    
    batches = [(start, start + bsz) for start in\
               range(0, num_examples, bsz)]
    
    total_unmasked_tokens = 0.
    nll = 0.
    for b_idx, (start, end) in enumerate(batches):
            
        batch = dataset[start:end]
        ut = torch.nonzero(batch).size(0)
        preds = net(batch)
        targets = batch[:, 1:].contiguous().view(-1)
        preds = preds[:, :-1, :].contiguous().view(-1, net.vocab_size)
        loss = criterion(preds, targets)
        nll += loss.detach()
        total_unmasked_tokens += ut

    perplexity = torch.exp(nll / total_unmasked_tokens).cpu()
    return perplexity.data


In [11]:
def train_lm(dataset, dev, params, net):
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = torch.optim.Adam(net.parameters(), lr=params['learning_rate'])
    num_examples, seq_len = dataset.size()    
    batches = [
        (start, start + params['batch_size']) 
        for start in range(0, num_examples, params['batch_size'])
    ]
    
    prev_perplexity = 1e10
    for epoch in range(params['epochs']):
        ep_loss = 0.
        start_time = time.time()
        random.shuffle(batches)
        
        for b_idx, (start, end) in enumerate(batches):
            batch = dataset[start:end]
            preds = net(batch)
            preds = preds[:, :-1, :].contiguous().view(-1, net.vocab_size)
            targets = batch[:, 1:].contiguous().view(-1)
            loss = criterion(preds, targets)
            
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            ep_loss += loss.detach() 
        dev_perplexity = compute_perplexity(dev,net)

        print('epoch: %d, loss: %0.2f, time: %0.2f sec, dev perplexity: %0.2f' %
              (epoch, ep_loss, time.time()-start_time, dev_perplexity))
        # stop early criterion, increasing perplexity on dev 
        if dev_perplexity - prev_perplexity > 0.01:
            print('Stop early reached')
            break



In [14]:
train_lm(training, dev, rnn_params, RNN)


epoch: 0, loss: 1937.15, time: 11.15 sec, dev perplexity: 6.97
epoch: 1, loss: 1832.69, time: 9.71 sec, dev perplexity: 6.72
epoch: 2, loss: 1806.62, time: 12.05 sec, dev perplexity: 6.61
epoch: 3, loss: 1792.68, time: 20.18 sec, dev perplexity: 6.52
epoch: 4, loss: 1784.55, time: 16.06 sec, dev perplexity: 6.48
epoch: 5, loss: 1779.31, time: 16.15 sec, dev perplexity: 6.46
epoch: 6, loss: 1775.42, time: 10.50 sec, dev perplexity: 6.48
epoch: 7, loss: 1773.13, time: 10.30 sec, dev perplexity: 6.47
epoch: 8, loss: 1770.58, time: 9.59 sec, dev perplexity: 6.44
epoch: 9, loss: 1768.98, time: 9.26 sec, dev perplexity: 6.43


Emb_RNNLM(
  (embeddings): Embedding(26, 24)
  (i2R): RNN(24, 64, batch_first=True)
  (R2o): Linear(in_features=64, out_features=26, bias=True)
)

# Part 4: Model evaluation

In [17]:
def get_probs(input_file, model, phone2ix, out_filename):
    inp_file = open(input_file, 'r',encoding='UTF-8')
    out_file = open(out_filename,'w',encoding='UTF-8')
    data_tens = []
    as_strings = []
    for line in inp_file:
        line = line.rstrip()
        as_strings.append(line.replace(' ',''))
        line = line.split(' ')
        line = ['<s>'] + line + ['<e>']
        line_as_tensor = torch.LongTensor([phone2ix[p] for p in line])
        data_tens.append(line_as_tensor)

    num_points = len(data_tens)

    for i,word in enumerate(data_tens):
        curr_string = as_strings[i]
        out_file.write(curr_string + '\t' + str(compute_perplexity(word.unsqueeze(0), model).numpy()) + '\n')
    
    inp_file.close()
    out_file.close()

In [21]:
#RNN.eval()
get_probs("../sample_data/test_data/finnish_test.txt",
    RNN, phone2ix,
    "../output/finnish.csv")
