## Loading the data, padding (based on 2.0)

In [1]:
import sys
import os
import numpy as np
import torch
import random

# device = torch.device('cuda:3')
device = torch.device('cpu')

In [2]:
def read_chinese_data(inputfilename):
    with open(inputfilename, "r") as inputfile:
        sentences = []
        collection_words = ['ç']
        collection_labels = []
        for line in inputfile:
            if line[0] == '#':
                continue
            columns = line.split()
            #print(words)
            if columns == []:
                sentences.append((''.join(collection_words) + 'ñ', collection_labels))
                collection_words = []
                collection_labels = []
                continue
            collection_words.append(columns[1])
            collection_labels += [1] + ([0] * (len(columns[1]) - 1))
            
    return sentences

##### comment
Adding a start token 'ç' and an end token 'ñ'.

In [3]:
train_sentences = read_chinese_data('/scratch/lt2316-h20-resources/zh_gsd-ud-train.conllu')

In [5]:
print(train_sentences[0])

('ç看似簡單，只是二選一做決擇，但其實他們代表的是你周遭的親朋好友，試著給你不同的意見，但追根究底，最後決定的還是自己。ñ', [1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1])


In [4]:
test_sentences = read_chinese_data('/scratch/lt2316-h20-resources/zh_gsd-ud-test.conllu')

In [5]:
def index_chars(sentences):
    megasentence = ''.join(sentences)
    char_list = set()
    for c in megasentence:
        char_list.add(c)
    char_list = [0] + list(char_list)
    return char_list, {char_list[x]:x for x in range(len(char_list))}

In [6]:
int_index, char_index = index_chars([x[0] for x in train_sentences + test_sentences])

In [84]:
all_values = char_index. values()
max_value = max(all_values) 
print(max_value)

3649


In [1]:
#int_index

In [7]:
def convert_sentence(sentence, index):
    return [index[x] for x in sentence]

In [8]:
def pad_lengths(sentences, max_length, padding=0):
    return [x + ([padding] * (max_length - len(x))) for x in sentences]

In [9]:
def create_dataset(x, device="cpu"):
    converted = [(convert_sentence(x1[0], char_index), x1[1]) for x1 in x]
    X, y = zip(*converted)
    lengths = [len(x2) for x2 in X]
    padded_X = pad_lengths(X, max(lengths))
    Xt = torch.LongTensor(padded_X).to(device)
    padded_y = pad_lengths(y, max(lengths), padding=-1)
    yt = torch.LongTensor(padded_y).to(device)
    lengths_t = torch.LongTensor(lengths).to(device)
    return Xt, lengths_t, yt

In [10]:
train_X_tensor, train_lengths_tensor, train_y_tensor = create_dataset(train_sentences, device)
test_X_tensor, test_lengths_tensor, test_y_tensor = create_dataset(test_sentences, device)

## Packing the sequences for RNN

In [11]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

## Batching (based on 1.0, 1.1, 1.2)

In [12]:
class Batcher:
    def __init__(self, X, lengths, y, device, batch_size=50, max_iter=None):
        self.X = X
        self.lengths = lengths # We need the lengths to efficiently use the padding.
        self.y = y
        self.device = device
        self.batch_size=batch_size
        self.max_iter = max_iter
        self.curr_iter = 0
        
    def __iter__(self):
        return self
    
    def __next__(self):
        if self.curr_iter == self.max_iter:
            raise StopIteration
        permutation = torch.randperm(self.X.size()[0], device=self.device)
        permX = self.X[permutation]
        permlengths = self.lengths[permutation]
        permy = self.y[permutation]
        splitX = torch.split(permX, self.batch_size)
        splitlengths = torch.split(permlengths, self.batch_size)
        splity = torch.split(permy, self.batch_size)
        
        self.curr_iter += 1
        return zip(splitX, splitlengths, splity)

In [13]:
b = Batcher(train_X_tensor, train_lengths_tensor, train_y_tensor, torch.device('cpu'), max_iter=100)

## Modeling

### Original model

In [14]:
import torch.nn as nn

In [15]:
import torch.optim as optim

In [16]:
class Segmenter(nn.Module):
    def __init__(self, vocab_size, emb_size):
        super().__init__()
        
        self.vocab_size = vocab_size
        self.emb_size = emb_size
        
        self.emb = nn.Embedding(self.vocab_size, self.emb_size, 0)
        self.lstm = nn.LSTM(self.emb_size, 150, batch_first=True)
        self.sig1 = nn.Sigmoid()
        self.lin = nn.Linear(150, 2)
        self.softmax = nn.LogSoftmax(2)
        
    def forward(self, x, lengths):
        embs = self.emb(x)
        packed = pack_padded_sequence(embs, lengths.to("cpu"), batch_first=True, enforce_sorted=False)
        output1, _ = self.lstm(packed)
        unpacked, _ = pad_packed_sequence(output1, batch_first=True)
        output2 = self.sig1(unpacked)
        output3 = self.lin(output2)
        return self.softmax(output3)

In [17]:
def train(X, lengths, y, vocab_size, emb_size, batch_size, epochs, device, model=None):
    b = Batcher(X, lengths, y, device, batch_size=batch_size, max_iter=epochs)
    if not model:
        m = Segmenter(vocab_size, emb_size).to(device)
    else:
        m = model
    loss = nn.NLLLoss(ignore_index=-1)
    optimizer = optim.Adam(m.parameters(), lr=0.005)
    epoch = 0
    for split in b:
        tot_loss = 0
        for batch in split:
            optimizer.zero_grad()
            o = m(batch[0], batch[1])
            l = loss(o.permute(0,2,1), batch[2][:, :max(batch[1])])
            tot_loss += l
            l.backward()
            optimizer.step()
        print("Total loss in epoch {} is {}.".format(epoch, tot_loss))
        epoch += 1
    return m

In [18]:
model_s = train(train_X_tensor, train_lengths_tensor, train_y_tensor, len(int_index), 200, 50, 30, "cpu")
torch.save(model, 'chinese_segmentation.pt')

Total loss in epoch 0 is 33.48582077026367.
Total loss in epoch 1 is 18.672042846679688.
Total loss in epoch 2 is 13.95911693572998.
Total loss in epoch 3 is 10.847987174987793.
Total loss in epoch 4 is 8.672901153564453.
Total loss in epoch 5 is 6.825079441070557.
Total loss in epoch 6 is 5.620857238769531.
Total loss in epoch 7 is 4.421230792999268.
Total loss in epoch 8 is 3.560142755508423.
Total loss in epoch 9 is 2.812448263168335.
Total loss in epoch 10 is 2.171895980834961.
Total loss in epoch 11 is 1.8763411045074463.
Total loss in epoch 12 is 1.6573987007141113.
Total loss in epoch 13 is 1.336803913116455.
Total loss in epoch 14 is 1.3257462978363037.
Total loss in epoch 15 is 1.6597785949707031.
Total loss in epoch 16 is 1.9198561906814575.
Total loss in epoch 17 is 2.2084600925445557.
Total loss in epoch 18 is 2.0858240127563477.
Total loss in epoch 19 is 1.4297027587890625.
Total loss in epoch 20 is 1.0907490253448486.
Total loss in epoch 21 is 0.9485206604003906.
Total lo

NameError: name 'model' is not defined

In [33]:
# oops guess i didn't save that model properly but anyways i'm doing the evaluation on the same run,
#i hope whoever is grading this does not need to run this model again :') sorry sorry'

##### comment
This model and a few more on this notebook had to be trained using cpu instead of cuda because of cuda runtime errors. Since training on cpu is very time-consuming, the models were only trained on 30 epochs (as was done originally) and no further testing with more epochs was done.

### Model for part 1

In [37]:
class Predicter(nn.Module):
    def __init__(self, vocab_size, emb_size):
        super().__init__()
        
        self.vocab_size = vocab_size
        self.emb_size = emb_size
        
        self.emb = nn.Embedding(self.vocab_size, self.emb_size, 0)
        self.lstm = nn.LSTM(self.emb_size, 150, batch_first=True)
        self.lin = nn.Linear(150, self.vocab_size) #changed this here, used to be 2 instead of vocab_size
        self.softmax = nn.LogSoftmax(2)
        
    def forward(self, x, lengths):
        embs = self.emb(x)
        packed = pack_padded_sequence(embs, lengths.to("cpu"), batch_first=True, enforce_sorted=False)
        output1, _ = self.lstm(packed)
        unpacked, _ = pad_packed_sequence(output1, batch_first=True)
        output3 = self.lin(unpacked)
        return self.softmax(output3)
        

In [113]:
def train(X, lengths, y, vocab_size, emb_size, batch_size, epochs, device, model=None):
    b = Batcher(X, lengths, y, device, batch_size=batch_size, max_iter=epochs)
#     if not model:
#         m = Segmenter(vocab_size, emb_size).to(device)
#     else:
#         m = model
    m = Predicter(vocab_size, emb_size).to(device)
    loss = nn.NLLLoss(ignore_index=-1)
    optimizer = optim.Adam(m.parameters(), lr=0.005)
    epoch = 0
    for split in b:
        tot_loss = 0
        for batch in split:
            optimizer.zero_grad()
            o = m(batch[0], batch[1])
            l = loss(o[:, :-1, :].permute(0,2,1), batch[0][:, 1:max(batch[1])]) #rip 1st char
            tot_loss += l
            l.backward()
            optimizer.step()
        print("Total loss in epoch {} is {}.".format(epoch, tot_loss))
        epoch += 1
    return m

In [38]:
model = train(train_X_tensor, train_lengths_tensor, train_y_tensor, len(int_index), 200, 50, 30, "cuda:3")
torch.save(model, 'chinese_generation.pt')

Total loss in epoch 0 is 573.6555786132812.
Total loss in epoch 1 is 510.2818603515625.
Total loss in epoch 2 is 461.9465637207031.
Total loss in epoch 3 is 418.2381286621094.
Total loss in epoch 4 is 376.1671142578125.
Total loss in epoch 5 is 335.3100891113281.
Total loss in epoch 6 is 296.4182434082031.
Total loss in epoch 7 is 258.9539489746094.
Total loss in epoch 8 is 222.69390869140625.
Total loss in epoch 9 is 191.03509521484375.
Total loss in epoch 10 is 164.3633575439453.
Total loss in epoch 11 is 143.44784545898438.
Total loss in epoch 12 is 129.9801788330078.
Total loss in epoch 13 is 116.51019287109375.
Total loss in epoch 14 is 108.3027114868164.
Total loss in epoch 15 is 100.05379486083984.
Total loss in epoch 16 is 95.44648742675781.
Total loss in epoch 17 is 89.84992980957031.
Total loss in epoch 18 is 85.84286499023438.
Total loss in epoch 19 is 82.64210510253906.
Total loss in epoch 20 is 81.01806640625.
Total loss in epoch 21 is 78.18431854248047.
Total loss in epoc

##### comments

In order to fit the requirements of the task, the size of the linear layer was changed from 2 (in the original model, we were classifying with either 0 or 1) to the length of the vocabulary, as we are now predicting words.

The Sigmoid function was removed, since this function is useful for binary classification but not for the task of text generation.

In the model's training loop, in order to get the right predictions and loss, the end token of the sentences were removed in the second dimension, as well as the start token in the targets of the model.

## Generation (part 1)

In [None]:
model = torch.load("chinese_generation.pt")

In [47]:
#average
length = 0
for sentence in test_sentences:
    length += len(sentence[0])

avg = length / len(test_sentences)
print(int(avg))

39


In [41]:
def probabilities_generator():
    start_symbol = char_index['ç']
    starter = char_index[random.choice(int_index)]
    end_symbol = char_index['ñ']
    #     variance = 0.25
    generated = ''

    x = np.zeros((1, 39), dtype=int)
    for column in x:
        column[0] += start_symbol
        column[1] += starter
    p = torch.tensor([39])
    o = torch.tensor(x).to(device)

    for i in range(37):
        out = model(o, p)
        the_character = torch.argmax(out, 2)
        o[0][i+2] += the_character[0][i+2]

    return o

In [55]:
def probabilities_to_text(t):

    sentence = ''
    for v in t[0]:
        c = int_index[v]
        sentence += c
    print(sentence[1:])

In [56]:
b = probabilities_generator()
c = probabilities_to_text(b)

壹韌念補。縣南的子中。，留的木更通、敏、害。及生於i》飾於西。）建了子的體。


In [58]:
b = probabilities_generator()
c = probabilities_to_text(b)

梧者業養用宜的子。以，顯為子中的嶼學移。才行，自語由子的篇。及稱王，跡成人的


In [62]:
b = probabilities_generator()
c = probabilities_to_text(b)

佈和，勵的髮照放或能出，送的子與萄的性。及在大中江的殿。，任安的子。，留拒堂


In [63]:
b = probabilities_generator()
c = probabilities_to_text(b)

禕是大中的面。及彈的池。及王創。，於.發中的型範在子為能，態圈，度的度，掉的


##### comments

To generate sentences, we use the average sentence length (39).

The probabilities_generator function predicts the following character based on the starter symbol plus another character chosen at random from the list of words. When a new character is predicted, it is added to the tensor and used to predict the following word, until the end.

(Copy and paste the generated sentences to google translate if you wanna have a laugh, they're quite weird and funny)

### Model for part 2

In [35]:
class DoubleBoii(nn.Module):
    def __init__(self, segmenter, predicter):
        super(DoubleBoii, self).__init__()
        
        self.seg = segmenter
        self.pre = predicter
        
    def forward(self, x, lengths):
        
        segmentation = self.seg(x, lengths)
        prediction = self.pre(x, lengths)
        
        return segmentation, prediction

In [119]:
def train(X, lengths, y, vocab_size, emb_size, batch_size, epochs, device, model=None):
    b = Batcher(X, lengths, y, device, batch_size=batch_size, max_iter=epochs)

    seg = Segmenter(vocab_size, emb_size).to(device)
    pre = Predicter(vocab_size, emb_size).to(device)
    doubleboii = DoubleBoii(seg, pre)

    loss = nn.NLLLoss(ignore_index=-1)
    optimizer = optim.Adam(doubleboii.parameters(), lr=0.005)
    epoch = 0
    
    for split in b:
        tot_loss = 0
        for batch in split:
            optimizer.zero_grad()
            o, u = doubleboii(batch[0], batch[1])
            l_segmenter = loss(o.permute(0,2,1), batch[2][:, :max(batch[1])]) #loss of segmenter
            l_predicter = loss(u[:, :-1, :].permute(0,2,1), batch[0][:, 1:max(batch[1])]) #loss of predicter
            tot = l_segmenter + l_predicter
            tot_loss += tot
            #add two losses before this
            tot.backward()
            optimizer.step()
        print("Total loss in epoch {} is {}.".format(epoch, tot_loss))
        epoch += 1
    return doubleboii

In [120]:
momo = train(train_X_tensor, train_lengths_tensor, train_y_tensor, len(int_index), 200, 50, 30, device)
torch.save(momo, 'chinese_doubleboii.pt')

Total loss in epoch 0 is 603.0822143554688.
Total loss in epoch 1 is 524.9391479492188.
Total loss in epoch 2 is 472.2152404785156.
Total loss in epoch 3 is 424.95611572265625.
Total loss in epoch 4 is 381.0309143066406.
Total loss in epoch 5 is 338.6737060546875.
Total loss in epoch 6 is 298.1492614746094.
Total loss in epoch 7 is 259.1299743652344.
Total loss in epoch 8 is 223.43145751953125.
Total loss in epoch 9 is 192.60321044921875.
Total loss in epoch 10 is 166.27276611328125.
Total loss in epoch 11 is 145.16412353515625.
Total loss in epoch 12 is 129.30235290527344.
Total loss in epoch 13 is 116.69541931152344.
Total loss in epoch 14 is 108.91262817382812.
Total loss in epoch 15 is 100.5002212524414.
Total loss in epoch 16 is 94.72045135498047.
Total loss in epoch 17 is 89.19795989990234.
Total loss in epoch 18 is 88.46723175048828.
Total loss in epoch 19 is 84.53884887695312.
Total loss in epoch 20 is 82.37654113769531.
Total loss in epoch 21 is 78.7981948852539.
Total loss in

##### comments

To make this model, the two previous models are combined via a class (DoubleBoii), which returns the output tensors for both models.

In the training loop, one loss is calculated for each of the two models, and then concatenated.

## Part 3 - Evaluation

#### Segmenter model

In [19]:
model_s.eval()

Segmenter(
  (emb): Embedding(3650, 200, padding_idx=0)
  (lstm): LSTM(200, 150, batch_first=True)
  (sig1): Sigmoid()
  (lin): Linear(in_features=150, out_features=2, bias=True)
  (softmax): LogSoftmax(dim=2)
)

In [20]:
with torch.no_grad():
    rawpredictions_s = model_s(test_X_tensor, test_lengths_tensor)

In [21]:
predictions_s = torch.argmax(rawpredictions_s, 2)

In [23]:
collectpreds_s = []
collecty_s = []
for i in range(test_X_tensor.size(0)):
    collectpreds_s.append(predictions_s[i][:test_lengths_tensor[i]])
    collecty_s.append(test_y_tensor[i][:test_lengths_tensor[i]])

In [24]:
allpreds_s = torch.cat(collectpreds_s)
classes_s = torch.cat(collecty_s)

In [25]:
classes_s = classes_s.float()
allpreds_s = allpreds_s.float()

In [26]:
tp_s = sum(classes_s * allpreds_s)
fp_s = sum(classes_s * (~allpreds_s.bool()).float())
tn_s = sum((~classes_s.bool()).float() * (~allpreds_s.bool()).float())
fn_s = sum((~classes_s.bool()).float() * allpreds_s)

tp_s, fp_s, tn_s, fn_s

(tensor(10844.), tensor(667.), tensor(6447.), tensor(747.))

In [27]:
accuracy_s = (tp_s + tn_s) / (tp_s + fp_s + tn_s + fn_s)
accuracy_s

tensor(0.9244)

In [28]:
recall_s = tp_s / (tp_s + fn_s)
recall_s

tensor(0.9356)

In [29]:
precision_s = tp_s / (tp_s + fp_s)
precision_s

tensor(0.9421)

In [30]:
f1_s = (2 * recall_s * precision_s) / (recall_s + precision_s)
f1_s

tensor(0.9388)

#### Dual model

In [38]:
model_d = torch.load('chinese_doubleboii.pt')

In [57]:
model_d.eval()

DoubleBoii(
  (seg): Segmenter(
    (emb): Embedding(3650, 200, padding_idx=0)
    (lstm): LSTM(200, 150, batch_first=True)
    (sig1): Sigmoid()
    (lin): Linear(in_features=150, out_features=2, bias=True)
    (softmax): LogSoftmax(dim=2)
  )
  (pre): Predicter(
    (emb): Embedding(3650, 200, padding_idx=0)
    (lstm): LSTM(200, 150, batch_first=True)
    (lin): Linear(in_features=150, out_features=3650, bias=True)
    (softmax): LogSoftmax(dim=2)
  )
)

In [58]:
with torch.no_grad():
    rawpredictions_d, _ = model_d(test_X_tensor, test_lengths_tensor)

In [59]:
predictions_d = torch.argmax(rawpredictions_d, 2)

In [60]:
collectpreds_d = []
collecty_d = []
for i in range(test_X_tensor.size(0)):
    collectpreds_d.append(predictions_d[i][:test_lengths_tensor[i]])
    collecty_d.append(test_y_tensor[i][:test_lengths_tensor[i]])

In [61]:
allpreds_d = torch.cat(collectpreds_d)
classes_d = torch.cat(collecty_d)

In [62]:
classes_d = classes_d.float()
allpreds_d = allpreds_d.float()

In [63]:
tp_d = sum(classes_d * allpreds_d)
fp_d = sum(classes_d * (~allpreds_d.bool()).float())
tn_d = sum((~classes_d.bool()).float() * (~allpreds_d.bool()).float())
fn_d = sum((~classes_d.bool()).float() * allpreds_d)

tp_d, fp_d, tn_d, fn_d

(tensor(6324.), tensor(5187.), tensor(3327.), tensor(3867.))

In [64]:
accuracy_d = (tp_d + tn_d) / (tp_d + fp_d + tn_d + fn_d)
accuracy_d

tensor(0.5160)

In [65]:
recall_d = tp_d / (tp_d + fn_d)
recall_d

tensor(0.6205)

In [66]:
precision_d = tp_d / (tp_d + fp_d)
precision_d

tensor(0.5494)

In [67]:
f1_d = (2 * recall_d * precision_d) / (recall_d + precision_d)
f1_d

tensor(0.5828)

##### comments

It is not possible to see exactly at around what epoch the models converge because, due to cuda runtime errors, the models were only trained in 30 epochs. In 30 epochs, it is complicated to point where the models converge.

Regardless, the accuracy scores of the models are 0.9244 for the segmentation model and 0.5160 for the dual model. The segmentation model is, then, more accurate than the dual model. Their F1 scores are 0.9388 for the segmentation model and 0.5828 for the dual model. Again, the segmentation model has a higher score, which denotes better performance.

#### Perplexity of predicter model

In [53]:
def perplexity_p(X, lengths, y, vocab_size, emb_size, batch_size, epochs, device, model=None):
    b = Batcher(X, lengths, y, device, batch_size=batch_size, max_iter=epochs)

    m = Predicter(vocab_size, emb_size).to(device)
    loss = nn.CrossEntropyLoss()
    optimizer = optim.Adam(m.parameters(), lr=0.005)
    for split in b:
        tot_loss = 0
        for batch in split:
            o = m(batch[0], batch[1])
            l = loss(o[:, :-1, :].permute(0,2,1), batch[0][:, 1:max(batch[1])]) 
            tot_loss += l
            l.backward()
    p = torch.exp(tot_loss)
    print("Perplexity is {}.".format(p))

    return p.item()

In [55]:
model_p = torch.load('chinese_generation.pt')

In [56]:
perplexity_p(test_X_tensor, test_lengths_tensor, test_y_tensor, len(int_index), 200, 50, 30, device, model_p)

Perplexity is 3.509243098723996e+35.


3.509243098723996e+35

#### Perplexity of dual model

In [69]:
perplexity_p(test_X_tensor, test_lengths_tensor, test_y_tensor, len(int_index), 200, 50, 30, device, model_d)

Perplexity is 3.897350175616371e+35.


3.897350175616371e+35

##### comments

The result of the perplexity is high in both models, which is bad. The perplexity of the predicter model is a little bit lower than the perplexity of the dual model.