# Before you run this training code:

1. Connect to your Google Drive.
2. Inside your Drive root path, put the 'input' directory constructed in the previous step
  (or the one we provided in the GitHub repository).

In [None]:
from google.colab import drive
# Project folder inside the mounted Google Drive; every input/output path
# used below is built from this prefix.
root = '/gdrive/My Drive/Punctuator'

# Make Google Drive visible to the Colab runtime under /gdrive.
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
import pickle
# Load the pickled vocabulary (word -> integer id) from the 'input' directory.
# NOTE: pickle can execute arbitrary code; only load files you created yourself.
with open(root + '/input/word_dict', 'rb') as file:
    word_dict = pickle.load(file)

In [None]:
"""
Build the embedding matrix from the pre-trained GloVe vectors.
If the matrix has already been built (a pre-made one is provided in the GitHub repository), do not run this cell.
It downloads the full set of GloVe embedding vectors, so it takes some time.
If you skip this cell, run the next cell instead to get the pre-made embedding matrix.
To run the model on a local computer, also copy the resulting embedding matrix to the local directory.
"""
from torchtext.vocab import GloVe

# Pre-trained GloVe vectors: the '6B' set with 100-dimensional embeddings.
embedding_glove = GloVe(name='6B', dim=100)

def get_embedding_matrix(word_dictionary, embedding_dictionary):
    """Build a (vocabulary size x embedding dim) matrix of pre-trained vectors.

    Args:
        word_dictionary: mapping from word to its integer row index. Indices
            must lie in ``range(len(word_dictionary))``.
        embedding_dictionary: object indexable by word that returns a 1-D
            vector exposing ``.size(0)`` (e.g. a torchtext ``GloVe`` instance).

    Returns:
        A ``numpy.ndarray`` of shape ``(len(word_dictionary), dim)`` whose row
        ``i`` holds the vector of the word mapped to index ``i``. Rows 0 and 1
        are overwritten with the vectors looked up for 'pad' and 'unk'.
    """
    # Imported locally because this cell runs before the notebook's main
    # import cell, so ``np`` is not yet defined at module level.
    import numpy as np

    # Every vector has the same width; probe it once with a common word.
    dim = embedding_dictionary['the'].size(0)
    embedding_matrix = np.zeros((len(word_dictionary), dim))
    for word, num in word_dictionary.items():
        embedding_matrix[num] = embedding_dictionary[word]
    # Indices 0 and 1 are reserved for the padding and unknown-word tokens.
    embedding_matrix[0] = embedding_dictionary['pad']
    embedding_matrix[1] = embedding_dictionary['unk']
    return embedding_matrix

emb_matrix = get_embedding_matrix(word_dict, embedding_glove)

# Save the freshly built matrix so later sessions can load it directly
# instead of downloading GloVe again. (The original code opened the file for
# reading here, which discarded the matrix that had just been computed.)
with open(root + '/input/GloVe_matrix', 'wb') as file:
    pickle.dump(emb_matrix, file)

.vector_cache/glove.6B.zip: 862MB [06:26, 2.23MB/s]                          
 99%|█████████▉| 397727/400000 [00:16<00:00, 24893.85it/s]

In [None]:
"""
Load the pre-made embedding matrix.
Run this cell instead of the one above when the matrix file already exists
in the 'input' directory.
"""
# Read (not write) the stored matrix: this cell is the alternative to building
# it from GloVe, so `emb_matrix` is not defined yet when this cell runs.
with open(root + '/input/GloVe_matrix', 'rb') as file:
    emb_matrix = pickle.load(file)

In [None]:
# Rows of the embedding matrix = number of vocabulary entries,
# columns = width of each word vector.
vocab_size, vector_size = emb_matrix.shape[0], emb_matrix.shape[1]

11176
100


In [None]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import tqdm.notebook as tq
from pathlib import Path
import os


In [None]:
from easydict import EasyDict as edict

# Hyperparameters, collected in one attribute-style dictionary.
args = edict(
    # optimisation
    batch_size=32,
    lr=0.001,
    epochs=20,
    clip=1,
    # model input width (size of one word vector)
    ninp=vector_size,
    dropout=0.2,
    # network size
    nlayers=2,
    nhid=512,
    nhead=8,
    attn_pdrop=0.1,   #0.1
    resid_pdrop=0.1,  #0.1
    embd_pdrop=0.1,   #0.1
)
args.nff = 4 * args.nhid
args.gpu = True

# Basic settings: fixed seeds for reproducibility, and the compute device.
torch.manual_seed(470)
torch.cuda.manual_seed(470)
if torch.cuda.is_available() and args.gpu:
    device = 'cuda:0'
else:
    device = 'cpu'

# Directory that receives the checkpoints; created if it does not exist yet.
result_dir = Path(root) / 'results'
result_dir.mkdir(parents=True, exist_ok=True)

In [None]:
import pickle

class PuncDataset(Dataset):
    """Punctuation restoration dataset.

    Reads two pickled files whose names are formed from a path prefix:
    ``<prefix>train_input_small`` / ``<prefix>train_output`` for the training
    split, or the ``test`` equivalents otherwise.

    Args:
        data_file_path: path prefix (including a trailing separator) of the
            directory holding the pickled files.
        train: load the 'train' files when truthy, the 'test' files otherwise.

    Each item is a dict with keys ``'input'`` (token ids, ``torch.long``) and
    ``'punc'`` (the label tensor stored at the same index).
    """

    def __init__(self, data_file_path, train):
        data_path = data_file_path + ('train' if train else 'test')

        # NOTE: pickle can execute arbitrary code; only load trusted files.
        with open(data_path + '_input_small', 'rb') as file:
            token_ids = pickle.load(file)
        self.input = torch.tensor(token_ids, dtype=torch.long)

        with open(data_path + '_output', 'rb') as file:
            labels = pickle.load(file)
        self.punc = torch.tensor(labels)

    def __len__(self):
        return len(self.input)

    def __getitem__(self, idx):
        return {'input': self.input[idx], 'punc': self.punc[idx]}

In [None]:
from torch.utils.data.dataset import random_split
path_to_dir = root + '/input/'

# Training / validation datasets: the 'train' files are split 8:2 at random.
dataset = PuncDataset(path_to_dir, True)
total_data_length = len(dataset)
train_dataset, valid_dataset = random_split(
    dataset,
    [round(total_data_length * 0.8), round(total_data_length * 0.2)],
)

for split in (dataset, train_dataset, valid_dataset):
    print(len(split))

# Test dataset, read from the separate 'test' files.
test_dataset = PuncDataset(path_to_dir, False)

# Only the training loader shuffles. The train and validation loaders drop a
# trailing incomplete batch; the test loader keeps every sample.
train_dataloader = DataLoader(
    train_dataset, batch_size=args.batch_size, shuffle=True, drop_last=True)
valid_dataloader = DataLoader(
    valid_dataset, batch_size=args.batch_size, shuffle=False, drop_last=True)
test_dataloader = DataLoader(
    test_dataset, batch_size=args.batch_size, shuffle=False, drop_last=False)

29295
23436
5859


In [None]:
# Id used for padding; positions equal to it are excluded from lengths,
# loss and accuracy further down.
pad_id = 0
# Input vocabulary size (one id per row of the embedding matrix).
src_ntoken = vocab_size
# Number of output classes predicted for every token.
trg_ntoken = 6

In [None]:
class LSTMCell(nn.Module):
    """One LSTM time step built from two linear projections.

    The input and the previous hidden state are each projected to
    ``4 * hidden_size`` features; the sum is split along dim 1 into the
    forget, input, cell-candidate and output gates (in that order).
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        gate_width = 4 * hidden_size
        self.linear_input = nn.Linear(input_size, gate_width)
        self.linear_hidden = nn.Linear(hidden_size, gate_width)

    def forward(self, x, state):
        """Advance the cell by one step.

        Args:
            x: input for this step; dim 1 must have ``input_size`` features.
            state: tuple ``(hidden, cell)`` from the previous step.

        Returns:
            ``(new_hidden, (new_hidden, new_cell))``.
        """
        prev_hidden, prev_cell = state
        gates = self.linear_input(x) + self.linear_hidden(prev_hidden)
        forget_raw, input_raw, candidate_raw, output_raw = gates.chunk(4, dim=1)

        forget_gate = torch.sigmoid(forget_raw)
        input_gate = torch.sigmoid(input_raw)
        candidate = torch.tanh(candidate_raw)
        output_gate = torch.sigmoid(output_raw)

        # Keep part of the old memory and add the gated candidate.
        new_cell = (prev_cell * forget_gate) + (input_gate * candidate)
        new_hidden = torch.tanh(new_cell) * output_gate
        return new_hidden, (new_hidden, new_cell)

In [None]:
class LSTMLayer(nn.Module):
    """Runs one LSTMCell over a whole sequence, one step per slice of dim 0."""

    def __init__(self, *cell_args):
        super().__init__()
        self.cell = LSTMCell(*cell_args)

    def forward(self, x, state, length_x=None):
        """Unroll the cell over ``x``.

        Args:
            x: input sequence; it is iterated along dim 0.
            state: initial ``(hidden, cell)`` tuple passed to the first step.
            length_x: optional tensor of per-sequence lengths, which must be
                sorted in descending order. When given, the returned state
                holds, for each sequence, the state reached at its own last
                step instead of the state after the final step.

        Returns:
            ``(outputs, state)`` where ``outputs`` stacks the per-step outputs
            along a new dim 0.
        """
        steps = x.unbind(0)
        use_lengths = length_x is not None
        assert (not use_lengths) or torch.all(length_x == length_x.sort(descending=True)[0])

        outputs = []
        final_hidden = []
        final_cell = []
        for step_no, step_input in enumerate(steps, start=1):
            out, state = self.cell(step_input, state)
            outputs.append(out)
            if not use_lengths:
                continue
            ends_here = length_x == step_no
            if torch.any(ends_here):
                # Shorter sequences finish first; prepending keeps the collected
                # states in the same descending-length order as `length_x`.
                final_hidden.insert(0, state[0][ends_here])
                final_cell.insert(0, state[1][ends_here])

        if use_lengths:
            state = (torch.cat(final_hidden, dim=0), torch.cat(final_cell, dim=0))
        return torch.stack(outputs), state
    

class LSTM(nn.Module):
    """Stack of LSTMLayer modules with dropout applied between layers."""

    def __init__(self, ninp, nhid, num_layers, dropout):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        # The first layer consumes `ninp` features, every later one `nhid`.
        self.layers = nn.ModuleList(
            [LSTMLayer(ninp if depth == 0 else nhid, nhid)
             for depth in range(num_layers)]
        )

    def forward(self, x, states, length_x=None):
        """Run the input through the stacked layers.

        Args:
            x: input sequence handed to the first layer.
            states: one initial ``(hidden, cell)`` tuple per layer; as many
                layers are run as there are entries in this sequence.
            length_x: optional lengths forwarded unchanged to every layer.

        Returns:
            ``(output, states)``: the output of the last layer that ran
            (without dropout) and the list of states returned by each layer.
        """
        layer_input = x
        final_states = []
        for depth, initial_state in enumerate(states):
            output_tensor, layer_state = self.layers[depth](layer_input, initial_state, length_x)
            final_states.append(layer_state)
            # Dropout is applied only to what is fed into the next layer.
            layer_input = self.dropout(output_tensor)

        return output_tensor, final_states

In [None]:
class LSTMModule(nn.Module):
    """Input dropout followed by a multi-layer LSTM sized from the global `args`."""

    def __init__(self):
        super().__init__()
        self.dropout = nn.Dropout(args.dropout)
        self.lstm = LSTM(args.ninp, args.nhid, args.nlayers, args.dropout)

    def forward(self, x, states, length_x=None):
        """Apply dropout to `x`, then return the LSTM's ``(output, states)``."""
        return self.lstm(self.dropout(x), states, length_x)

In [None]:
class BiLSTM(nn.Module):
    """Bidirectional LSTM tagger that scores one output class per input token.

    The token ids are embedded (weights initialised from `emb_matrix`), run
    through one left-to-right and one right-to-left LSTM stack, and the two
    outputs are concatenated per position and classified by a small MLP.
    """

    def __init__(self):
        super(BiLSTM, self).__init__()
        self.LSTM_LtoR = LSTMModule()
        self.LSTM_RtoL = LSTMModule()
        # ff1, ff2 and softmax are not used by forward(); they are kept so the
        # parameter names and initialisation order stay compatible with
        # checkpoints saved by earlier versions of this class.
        self.ff1 = nn.Linear(args.nhid, args.nhid)

        self.ff2 = nn.Linear(args.nhid, trg_ntoken)

        # No Softmax at the end: nn.CrossEntropyLoss expects unnormalised
        # logits and applies log-softmax itself. Softmax has no parameters, so
        # the state_dict keys of this Sequential are unchanged.
        self.seq = nn.Sequential(nn.Linear(args.nhid * 2, args.nhid),
                                 nn.ReLU(),
                                 nn.Dropout(0.2),
                                 nn.Linear(args.nhid, trg_ntoken))
        self.softmax = nn.Softmax(dim=-1)
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=vector_size)
        self.embed.weight = nn.Parameter(torch.tensor(emb_matrix, dtype=torch.float32))

    def forward(self, x, y, length, max_len=None, teacher_forcing=True):
        """Score every token of every sequence.

        Args:
            x: token ids laid out as (sequence length, batch size).
            y: unused; kept for interface compatibility.
            length: per-sequence lengths sorted in descending order.
            max_len: unused; kept for interface compatibility.
            teacher_forcing: unused; kept for interface compatibility.

        Returns:
            Logits of shape (batch size, sequence length, trg_ntoken). Apply
            ``self.softmax`` to obtain probabilities; ``argmax`` is the same
            on logits and on probabilities.
        """
        B = int(x.shape[1])
        x = self.embed(x)
        # Create the initial state on the input's device so the model also
        # works when it is moved to a device other than the global `device`.
        zero_tensor = torch.zeros(B, args.nhid, device=x.device, dtype=x.dtype)
        init_states = [(zero_tensor, zero_tensor)] * args.nlayers

        # Reverse the TIME axis (dim 0). Flipping dim 1 only reorders the
        # batch and leaves the second LSTM reading left-to-right.
        # The whole padded sequence is reversed, so trailing padding is read
        # first by the right-to-left LSTM.
        reverse_x = torch.flip(x, [0])

        output1, _ = self.LSTM_LtoR(x, init_states, length)
        # `length` is not passed here: after the flip it no longer marks the
        # step at which each sequence ends, and the final states are unused.
        output2, _ = self.LSTM_RtoL(reverse_x, init_states, None)

        # Undo the reversal so position t of both outputs refers to token t.
        output2 = torch.flip(output2, [0])

        output = torch.cat((output1, output2), dim=-1)

        output = output.transpose(0, 1)
        output = self.seq(output)

        return output

In [None]:
def sort_batch(x, y):
    """Reorder the columns of `x` and `y` by descending sequence length.

    A column's length is its number of entries in `x` that differ from
    `pad_id`, counted along dim 0.

    Returns:
        ``(x, y, length)`` with both tensors reordered along dim 1 and
        `length` holding the sorted lengths.
    """
    token_counts = (x != pad_id).long().sum(0)
    length, order = token_counts.sort(dim=0, descending=True)
    return x.index_select(1, order), y.index_select(1, order), length

def save_model(model, mode="last"):
    """Write the model's state_dict to ``result_dir/<ClassName>_<mode>.ckpt``."""
    checkpoint_path = result_dir / f'{type(model).__name__}_{mode}.ckpt'
    torch.save(model.state_dict(), checkpoint_path)
    
def load_model(model, mode="last"):
    """Load ``result_dir/<ClassName>_<mode>.ckpt`` into `model` if it exists.

    Args:
        model: module whose parameters are overwritten in place.
        mode: checkpoint tag that forms part of the file name.

    Returns:
        True when a checkpoint was loaded. False when the file does not
        exist; the model is then left untouched and a message is printed, so
        an evaluation of untrained weights does not go unnoticed.
    """
    checkpoint_path = result_dir / f'{type(model).__name__}_{mode}.ckpt'
    if not os.path.exists(checkpoint_path):
        print(f'Checkpoint not found, model weights left unchanged: {checkpoint_path}')
        return False
    # Map to CPU first so a checkpoint saved on a GPU also loads on a
    # CPU-only machine; load_state_dict copies the values onto whatever
    # device the model's parameters already live on.
    state_dict = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(state_dict)
    return True

In [None]:
# Per-class loss weights, one entry per output class (trg_ntoken = 6).
# Classes 0 and 5 get weight 0 and therefore do not contribute to the loss.
loss_weight = torch.tensor([0, 1, 13.8, 13.6, 30, 0], dtype=torch.float32, device=device)
# Targets equal to pad_id are skipped entirely.
criterion = nn.CrossEntropyLoss(weight=loss_weight, ignore_index=pad_id)

In [None]:
def run_epoch(epoch, model, optimizer, is_train = True, data_loader = None):
    """Run one pass over a data loader, optionally updating the model.

    Args:
        epoch: epoch number, used only in the printed summary.
        model: called as ``model(x, y, length)``.
        optimizer: optimizer stepped after every batch when `is_train` is
            true; may be None for evaluation.
        is_train: train (with gradient clipping at ``args.clip``) when true,
            otherwise evaluate. The caller is responsible for wrapping
            evaluation in ``torch.no_grad()``.
        data_loader: loader to iterate; defaults to `train_dataloader` or
            `valid_dataloader` depending on `is_train`.

    Returns:
        The loss averaged over all non-padding targets, or NaN when the
        loader produced no non-padding targets at all.
    """
    total_loss = 0
    n_correct = 0
    n_total = 0

    if data_loader is None:
        data_loader = train_dataloader if is_train else valid_dataloader
    model.train(is_train)

    for batch in data_loader:
        x, y = batch['input'].to(device), batch['punc'].to(device)

        # The model consumes sequences along dim 0, so move the batch to dim 1
        # and sort it by descending length as the LSTM layers require.
        x, y = x.transpose(0, 1), y.transpose(0, 1)
        x, y, length = sort_batch(x, y)
        target = y.transpose(0, 1)

        pred = model(x, y, length)
        loss = criterion(pred.reshape(-1, trg_ntoken), target.reshape(-1))

        not_pad = target != pad_id
        n_targets = not_pad.long().sum().item()
        n_total += n_targets
        n_correct += (pred.argmax(-1) == target)[not_pad].long().sum().item()

        if is_train:
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
            optimizer.step()

        # Weight each batch by its number of real targets.
        total_loss += loss.item() * n_targets

    phase = 'Train' if is_train else 'Valid'
    if n_total == 0:
        # E.g. a loader with drop_last=True and fewer samples than one batch.
        print("Epoch", epoch, phase, "no targets to evaluate")
        return float('nan')

    total_loss /= n_total
    print("Epoch", epoch, phase,
          "Loss", total_loss,
          "Acc", n_correct / n_total)
    return total_loss

In [None]:
def run_experiment(model):
    """Train `model` for ``args.epochs`` epochs with validation after each one.

    Uses Adam with learning rate ``args.lr`` and multiplies the learning rate
    by 0.25 whenever the validation loss stops improving for more than one
    epoch. The checkpoint tagged 'best' is overwritten whenever the
    validation loss improves; the one tagged 'last' after every epoch.
    """
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    # `verbose` is intentionally not passed: it is deprecated in recent
    # PyTorch releases and its old default (False) is what was used here.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.25, patience=1,
        threshold=0.0001, threshold_mode='rel',
        cooldown=0, min_lr=0, eps=1e-08)
    best_val_loss = np.inf
    for epoch in tq.tqdm(range(args.epochs)):
        # train one epoch
        run_epoch(epoch, model, optimizer, is_train=True)

        # calculate validation loss and save
        with torch.no_grad():
            val_loss = run_epoch(epoch, model, None, is_train=False)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            save_model(model, 'best')

        save_model(model)
        scheduler.step(val_loss)

In [None]:
# Build a fresh model, move it to the selected device and train it.
model = BiLSTM()
model = model.to(device)
run_experiment(model)

In [None]:
def reverse_dict(src_dict):
    """Return a new dict mapping each value of `src_dict` back to its key.

    When several keys share a value, the key that comes last in iteration
    order wins.
    """
    return {value: key for key, value in src_dict.items()}

# id -> word lookup, the inverse of the vocabulary.
rev_word_dict = reverse_dict(word_dict)
# Text appended after a word for each of the six output classes.
rev_punc_dict = {
    0: ' ',
    1: ' ',
    2: '. ',
    3: ', ',
    4: '? ',
    5: ' ',
}

In [None]:
"""
If you are running the code from scratch,
you should also run this cell, because its output is needed to apply the model.
Download the resulting file, together with the embedding matrix, to an appropriate directory on your local computer.
"""
# Store the id -> word lookup next to the other input files.
with open(root + '/input/rev_dict', 'wb') as file:
    pickle.dump(rev_word_dict, file)

In [None]:
def run_test(model, mode='best', print_sentences=True):
    """Evaluate `model` on `test_dataloader` and print a report.

    Args:
        model: model to evaluate; the checkpoint tagged `mode` is loaded into
            it first (if that checkpoint exists).
        mode: checkpoint tag passed to `load_model`.
        print_sentences: when true (the default, matching the previous
            behaviour), print every test sentence twice: once with the
            reference labels and once with the predicted ones.

    Returns:
        The loss averaged over all non-padding targets, or NaN when there
        were none.

    Printed statistics cover classes 1..4 only: `total_counts[c-1]` is the
    number of targets of class c, and `confusion_matrix[t-1][p-1]` counts
    targets of class t predicted as class p (predictions outside 1..4 are
    counted in `total_counts` but not in the matrix).
    """
    n_tracked = 4  # classes 1..4 are tracked in the confusion matrix

    with torch.no_grad():
        model.eval()
        load_model(model, mode)

        gt_list = []
        pred_list = []

        total_loss = 0
        n_correct = 0
        n_total = 0

        confusion_matrix = [[0] * n_tracked for _ in range(n_tracked)]
        total_counts = [0] * n_tracked

        for batch in test_dataloader:
            x, y = batch['input'].to(device), batch['punc'].to(device)

            x, y = x.transpose(0, 1), y.transpose(0, 1)
            x, y, length = sort_batch(x, y)
            # Batch-first views of the sorted batch, aligned with `pred`.
            target = y.transpose(0, 1)
            source = x.transpose(0, 1)

            pred = model(x, y, length)
            loss = criterion(pred.reshape(-1, trg_ntoken), target.reshape(-1))
            prediction = pred.argmax(-1)

            not_pad = target != pad_id
            n_targets = not_pad.long().sum().item()
            n_total += n_targets
            n_correct += (prediction == target)[not_pad].long().sum().item()
            total_loss += loss.item() * n_targets

            # Plain Python lists: iterate over the real batch and sequence
            # sizes instead of assuming 32 x 100 and swallowing IndexError.
            source_rows = source.tolist()
            target_rows = target.tolist()
            pred_rows = prediction.tolist()
            gt_list.extend(target_rows)
            pred_list.extend(pred_rows)

            for target_row, pred_row in zip(target_rows, pred_rows):
                for t, p in zip(target_row, pred_row):
                    if 0 < t <= n_tracked:
                        total_counts[t - 1] += 1
                        if 0 < p <= n_tracked:
                            confusion_matrix[t - 1][p - 1] += 1

            if print_sentences:
                for source_row, target_row, pred_row in zip(source_rows, target_rows, pred_rows):
                    target_parts = []
                    pred_parts = []
                    for token, t, p in zip(source_row, target_row, pred_row):
                        word = rev_word_dict.get(token)
                        if word is None:
                            # Ids without a vocabulary entry are skipped,
                            # as they were before.
                            continue
                        target_parts.append(word + rev_punc_dict[t])
                        pred_parts.append(word + rev_punc_dict[p])
                    print("<<example>>")
                    print(''.join(target_parts))
                    print(''.join(pred_parts))

        # Row-normalise the confusion matrix; a class that never occurs
        # yields a row of zeros instead of a division by zero.
        percent_matrix = [
            [count / total_counts[i] if total_counts[i] else 0.0 for count in row]
            for i, row in enumerate(confusion_matrix)
        ]

        if n_total:
            total_loss /= n_total
            accuracy = n_correct / n_total
        else:
            total_loss = float('nan')
            accuracy = float('nan')

        print('Test',
                "Loss", total_loss,
                "Acc", accuracy)

        for i in range(min(10, len(gt_list))):
            print(f"--------- Translation Example {i+1} ---------")
            print(''.join(map(str, gt_list[i])))
            print(''.join(map(str, pred_list[i])))
            print()
        print()
        print()

        print(total_counts)
        print(confusion_matrix)
        print(percent_matrix)

        return total_loss

In [None]:
# Build a fresh model on the selected device and evaluate it with the
# weights stored in the 'best' checkpoint.
model = BiLSTM()
model = model.to(device)
run_test(model, mode='best')