# Data Processing

In [1]:
import os
import pandas as pd
import fnmatch

In [2]:
#function that gets tokens from files
def tokens_from_files(directory, pattern='*.final'):
        """Recursively find all files matching the pattern."""
        file_path_list = []
        for root, dirnames, filenames in os.walk(directory):
            for filename in fnmatch.filter(filenames, pattern):
                file_path_list.append(os.path.join(root, filename))
        
        token_label_list=_extract_tokens(file_path_list)
        return token_label_list

#function that extracts tokens from a list of files (nested in the function above)
def _extract_tokens(file_list):
    token_label_list = []
    for file in file_list:
        with open(file,'r',errors='replace') as f:
             for token_label in f.read().splitlines():
                token_label_list.append(token_label)
    return token_label_list


#Function that takes a list of tokens, cleans it and outputs dataframe
def deep_clean(str_list):
    str_list = list(filter(str.strip, str_list)) #remove empty lines
    last_label='' #variable that keeps track of the label of the previous row
    df = pd.DataFrame(columns = ["Token", "Label"]) #creating an empty dataframe with two columns

    for line in str_list:
        line="".join(line.rstrip().lstrip()) #removes spaces on the left and right of each line
        if line[-1]=='I' and last_label in'Oo': #checks that no 'I' labels are preceded by an'O'
            line = list(line)
            line[-1] = 'B'  #if the condition is true, the 'I' is replaced by a 'B'
            line=''.join(line)
        if line[-1]=='0': #if the label is annotated as '0' it is changed to 'O'
            line = list(line)
            line[-1] = 'O'
            line=''.join(line)
        if line[-1] in 'OBIobi': #make sure all labels are either 'O' 'I' or 'B'
            try:
                row=line.split() #split by space or tab
                df.loc[len(df)] = row #add row to dataframe
            except ValueError:
                pass #skip problematic lines
            df['Label'] = df['Label'].str.upper() #make all labels upper case
        last_label=line[-1] #updates label of previous row for next loop
    return df

def make_sent(df):
    '''Turns every row in the data frame to a tuple. 
    The tuples of one sentence are saved in a list.
    The sentences are saved in a list.
    Input: dataframe
    Returns: list of list of tuples'''
    tuple_list=list(df.itertuples(index=False, name=None))
    stoplist=['.','?','!']
    sent_list=[]
    sent=[]
    for el in tuple_list:
        sent.append(el)
        if el[0] in stoplist:
            sent_list.append(sent)
            sent=[]
    return sent_list

## Loading data directory

In [3]:
# Note: This cell takes a long time to execute

path='../Dataset/' #add your directory path

str_list = tokens_from_files(path) #find final annotation files in directory and extract tokens
df=deep_clean(str_list) #create dataframe with tokens and labels (and remove/correct erroneous lines)

In [4]:
df

Unnamed: 0,Token,Label
0,Translation,B
1,models,I
2,used,O
3,for,O
4,statistical,B
...,...,...
26660,mutual,O
26661,disambiguation,B
26662,and,O
26663,generalization,B


In [5]:
df.groupby('Label').count()

Unnamed: 0_level_0,Token
Label,Unnamed: 1_level_1
B,3256
I,3508
II,1
O,19900


In [6]:
# replace label II to I
df['Label'] = df['Label'].replace(['II'], 'I')

In [7]:
df[df['Label']=='II']

Unnamed: 0,Token,Label


In [8]:
# make a sentence with pair of token and label
data=make_sent(df)

In [9]:
print("In our dataset contain "+str(len(data))+" sentences")

In our dataset contain 965 sentences


In [10]:
print(data[1])

[('The', 'O'), ('common', 'O'), ('assumption', 'O'), ('is', 'O'), ('that', 'O'), ('parallel', 'B'), ('texts', 'I'), ('are', 'O'), ('symmetrical', 'O'), (':', 'O'), ('The', 'O'), ('direction', 'O'), ('of', 'O'), ('translation', 'O'), ('is', 'O'), ('deemed', 'O'), ('irrelevant', 'O'), ('and', 'O'), ('is', 'O'), ('consequently', 'O'), ('ignored', 'O'), ('.', 'O')]


In [11]:
# split data into two lists of token and labels
tokens = []
labels = []
for sent in data:
    sub_tokens = []
    sub_labels = []
    for token in sent:
        sub_tokens.append(token[0])
        sub_labels.append(token[1])
    tokens.append(sub_tokens)
    labels.append(sub_labels)

In [12]:
print(tokens[-1])
print(labels[-1])

['We', 'show', 'that', 'these', 'structures', 'have', 'the', 'potential', 'to', 'capture', 'the', 'full', 'sentential', 'contexts', 'of', 'a', 'lexeme', 'and', 'provide', 'a', 'uniform', 'basis', 'for', 'the', 'composition', 'of', 'distributional', 'knowledge', 'in', 'a', 'way', 'that', 'captures', 'both', 'mutual', 'disambiguation', 'and', 'generalization', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'B', 'O']


# Preparing data for training

In [13]:
# covert label into integer
all_labels = set() 
for sent in labels: 
    for label in sent:
        all_labels.add(label) 
        
print(all_labels) 

label2int = dict()
for i, label in enumerate(all_labels):
    label2int[label] = i

print(label2int)


int_labels = [[label2int[label] for label in sent] for sent in labels]
print(int_labels)

{'B', 'I', 'O'}
{'B': 0, 'I': 1, 'O': 2}
[[0, 1, 2, 2, 0, 1, 1, 2, 2, 2, 0, 1, 2, 2, 0, 1, 2], [2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 0, 1, 2, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 0, 1, 2, 2, 2, 2, 2, 0, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2], [2, 2, 2, 2, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 2, 0, 1, 1, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 0, 1, 1, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2], [0, 1, 1, 2, 0, 2, 2, 2, 0, 1, 2, 2,

In [14]:
#Covert token into integer
all_tokens = set()
for sent in tokens:
    for token in sent:
        all_tokens.add(token.lower()) 
        
print(all_tokens)

token2int = dict()
for i, token in enumerate(all_tokens):
    token2int[token] = i

print(token2int)

int_tokens = [[token2int[token.lower()] for token in sent] for sent in tokens] 
print(int_tokens)

{'noisy', 'biases', 'additionally', 'content', 'shown', 'hole', 'wisdom', 'computer', 'attested', 'individually', 'strongly', 'subsampling', 'thereby', 'encode', 'comparatively', 'attempt', 'extensions', 'symmetrical', 'automation', 'gale', 'arise', 'interact', 'filtering', 'bar', 'define', 'strengths', 'norwegian', 'ccgs', 'i', 'reproducible', 'interpolating', 'logic', 'rarely', 'substitutes', 'symbols', 'minimum', 'mrls', 'computational', 'description', 'dependent', 'strategies', 'partitioned', 'documents', 'attributes', 'effectiveness', 'lattices', 'choose', 'printer', 'etc', 'produced', 'imply', 'collins', 'enhanced', 'hold', 'ers', 'allocation', 'identifies', 'yielding', 'resistant', 'linguistically', 'hundreds', 'work', 'test', 'provide', 'g', 'least', 'although', 'linking', 'induces', 'parallel', 'framenet', 'perceptibly', 'takes', 'setting', 'reading', 'monolingual', 'resolution', ':', 'speciﬁcally', 'positioned', 'compete', 'typologically', 'dependencies', 'efforts', 'reduce',

In [15]:
# create reverse dictionaries (int2label, int2token) to map integer labels and integer tokens back to labels and tokens
int2label = {value: key for key, value in label2int.items()}
int2token = {value: key for key, value in token2int.items()}


print(int2label)
print(int2token)


{0: 'B', 1: 'I', 2: 'O'}
{0: 'noisy', 1: 'biases', 2: 'additionally', 3: 'content', 4: 'shown', 5: 'hole', 6: 'wisdom', 7: 'computer', 8: 'attested', 9: 'individually', 10: 'strongly', 11: 'subsampling', 12: 'thereby', 13: 'encode', 14: 'comparatively', 15: 'attempt', 16: 'extensions', 17: 'symmetrical', 18: 'automation', 19: 'gale', 20: 'arise', 21: 'interact', 22: 'filtering', 23: 'bar', 24: 'define', 25: 'strengths', 26: 'norwegian', 27: 'ccgs', 28: 'i', 29: 'reproducible', 30: 'interpolating', 31: 'logic', 32: 'rarely', 33: 'substitutes', 34: 'symbols', 35: 'minimum', 36: 'mrls', 37: 'computational', 38: 'description', 39: 'dependent', 40: 'strategies', 41: 'partitioned', 42: 'documents', 43: 'attributes', 44: 'effectiveness', 45: 'lattices', 46: 'choose', 47: 'printer', 48: 'etc', 49: 'produced', 50: 'imply', 51: 'collins', 52: 'enhanced', 53: 'hold', 54: 'ers', 55: 'allocation', 56: 'identifies', 57: 'yielding', 58: 'resistant', 59: 'linguistically', 60: 'hundreds', 61: 'work', 6

## creating training and validation data

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

max_len = 16
batch_size = 64
embed_size = 300
hidden_size = 128

In [17]:
# creating tensor
X = torch.zeros(len(tokens), max_len, dtype=torch.long)


for i, int_text in enumerate(int_tokens):
    if len(int_text) < max_len: 
        int_text = int_text + [len(token2int)] * (max_len - len(int_text)) 
    X[i] = torch.LongTensor(int_text[:max_len]) 
    
Y = torch.zeros(len(tokens), max_len, dtype=torch.long)
for i, int_label in enumerate(int_labels):
    if len(int_label) < max_len:
        int_label = int_label + [len(label2int)] * (max_len - len(int_label))
    Y[i] = torch.LongTensor(int_label[:max_len])

print(X.size())
print(Y.size())


torch.Size([965, 16])
torch.Size([965, 16])


In [18]:
X_train = X[:772]  # train 80% of dataset
X_valid = X[772:862] # around 10% for both valid and test set
X_test = X[862:]

Y_train = Y[:772]
Y_valid = Y[772:862]
Y_test = Y[862:]

In [19]:
from torch.utils.data import TensorDataset, DataLoader
train_set = TensorDataset(X_train, Y_train)
valid_set = TensorDataset(X_valid, Y_valid)
test_set = TensorDataset(X_test, Y_test)

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=batch_size)
test_loader = DataLoader(test_set, batch_size=batch_size)

## using pre-trained Fasttext embeddings

In [20]:
token2int['<eos>'] = len(token2int) 
label2int['<eos>'] = len(label2int)
vocab_size = len(token2int)
embeddings = torch.zeros(vocab_size, embed_size)


with open('wiki.en.filtered.vec') as f:
    for line in f: 
        token = line.split(' ')[0] 
        embed = line.split(' ')[1:]
        tokens.append((token, embed)) 
        
        if token in token2int.keys(): 
            embeddings[token2int[token]] = torch.FloatTensor([float(x) for x in embed]) 

print(embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [-0.0152,  0.0724, -0.2539,  ...,  0.0842, -0.0279, -0.1278],
        [-0.1792,  0.3355, -0.0599,  ...,  0.1347,  0.5910, -0.1912],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])


In [21]:
label2int

{'B': 0, 'I': 1, 'O': 2, '<eos>': 3}

## RNN model

In [22]:
class RNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size, padding_idx=token2int['<eos>'])
        self.embed.weight = nn.Parameter(embeddings, requires_grad=False)
        self.rnn = nn.GRU(embed_size, hidden_size, bias=False, num_layers=1, bidirectional=False, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.decision = nn.Linear(hidden_size * 1 * 1, len(label2int))
        
    def forward(self, x):
        embed = self.embed(x)
        output, hidden = self.rnn(embed) 
        return self.decision(self.dropout(output))

rnn_model = RNN()
rnn_model

RNN(
  (embed): Embedding(3364, 300, padding_idx=3363)
  (rnn): GRU(300, 128, bias=False, batch_first=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (decision): Linear(in_features=128, out_features=4, bias=True)
)

In [46]:
# evaluation function
def perf(model, loader):
    criterion = nn.CrossEntropyLoss()
    model.eval()
    total_loss = correct = num_loss = num_perf = 0
    for x, y in loader:
        with torch.no_grad():
            y_scores = model(x)
            loss = criterion(y_scores.view(y.size(0) * y.size(1), -1), y.view(y.size(0) * y.size(1)))
            y_pred = torch.max(y_scores, 2)[1]
            mask = (y != label2int['<eos>'])
            correct += torch.sum((y_pred.data == y) * mask)
            total_loss += loss.item()
            num_loss += len(y)
            num_perf += torch.sum(mask).item()
    return total_loss / num_loss, correct.item() / num_perf

perf(rnn_model, valid_loader) 

(0.033788356516096324, 0.7866666666666666)

In [47]:
def fit(model, epochs):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    # Training loop
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        num_samples = 0
        
        for x_data, y_data in train_loader:
            x_data = x_data.to(device)
            y_data = y_data.to(device)
            optimizer.zero_grad()
            y_scores = model(x_data)
            loss = criterion(y_scores.transpose(1, 2), y_data) # Modifications faites ici
            num_samples += len(y_data)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        valid_loss, valid_acc = perf(model, valid_loader)
        print(f'Epoch {epoch + 1}/{epochs} | Train loss: {total_loss/num_samples:.4f} | Valid loss: {valid_loss:.4f} | Acc: {valid_acc:.4%}')
    return model

In [48]:
device = torch.device('cpu')
rnn_model = fit(rnn_model, 200)
torch.save(rnn_model.state_dict(), 'model_tag.pt')

Epoch 1/200 | Train loss: 0.0012 | Valid loss: 0.0347 | Acc: 78.3158%
Epoch 2/200 | Train loss: 0.0010 | Valid loss: 0.0341 | Acc: 78.1053%
Epoch 3/200 | Train loss: 0.0011 | Valid loss: 0.0340 | Acc: 78.5965%
Epoch 4/200 | Train loss: 0.0010 | Valid loss: 0.0335 | Acc: 79.0175%
Epoch 5/200 | Train loss: 0.0009 | Valid loss: 0.0349 | Acc: 77.8947%
Epoch 6/200 | Train loss: 0.0010 | Valid loss: 0.0359 | Acc: 76.9123%
Epoch 7/200 | Train loss: 0.0009 | Valid loss: 0.0334 | Acc: 78.8070%
Epoch 8/200 | Train loss: 0.0009 | Valid loss: 0.0340 | Acc: 77.6842%
Epoch 9/200 | Train loss: 0.0009 | Valid loss: 0.0343 | Acc: 77.8947%
Epoch 10/200 | Train loss: 0.0009 | Valid loss: 0.0342 | Acc: 79.0175%
Epoch 11/200 | Train loss: 0.0009 | Valid loss: 0.0361 | Acc: 77.7544%
Epoch 12/200 | Train loss: 0.0009 | Valid loss: 0.0352 | Acc: 79.0877%
Epoch 13/200 | Train loss: 0.0010 | Valid loss: 0.0354 | Acc: 78.1754%
Epoch 14/200 | Train loss: 0.0012 | Valid loss: 0.0340 | Acc: 79.0175%
Epoch 15/200 | 

Epoch 117/200 | Train loss: 0.0009 | Valid loss: 0.0375 | Acc: 78.4561%
Epoch 118/200 | Train loss: 0.0009 | Valid loss: 0.0378 | Acc: 78.3860%
Epoch 119/200 | Train loss: 0.0009 | Valid loss: 0.0374 | Acc: 77.9649%
Epoch 120/200 | Train loss: 0.0009 | Valid loss: 0.0373 | Acc: 78.1754%
Epoch 121/200 | Train loss: 0.0009 | Valid loss: 0.0376 | Acc: 78.3158%
Epoch 122/200 | Train loss: 0.0009 | Valid loss: 0.0381 | Acc: 78.5263%
Epoch 123/200 | Train loss: 0.0009 | Valid loss: 0.0375 | Acc: 78.3860%
Epoch 124/200 | Train loss: 0.0009 | Valid loss: 0.0372 | Acc: 78.1053%
Epoch 125/200 | Train loss: 0.0008 | Valid loss: 0.0374 | Acc: 78.3860%
Epoch 126/200 | Train loss: 0.0009 | Valid loss: 0.0381 | Acc: 78.1754%
Epoch 127/200 | Train loss: 0.0009 | Valid loss: 0.0384 | Acc: 78.4561%
Epoch 128/200 | Train loss: 0.0009 | Valid loss: 0.0388 | Acc: 78.5263%
Epoch 129/200 | Train loss: 0.0008 | Valid loss: 0.0389 | Acc: 78.3158%
Epoch 130/200 | Train loss: 0.0008 | Valid loss: 0.0384 | Acc: 7

# evaluation 

In [49]:
# Evaualtion on test set
loss, accuracy = perf(rnn_model, test_loader)

In [50]:
print("Our model with test set we got Accuracy :"+str(accuracy*100)+'%')

Our model with test set we got Accuracy :79.0914747977598%


In [51]:
def tag_sentence(model, i):
    int2label[3] = 'N/A'
    sentence = X_test[i]
    labels = Y_test[i]
    model.eval()
    y_scores = model(sentence)
    y_pred = y_scores.argmax(1) 
    print('TOKEN'.ljust(10), 'PRED'.ljust(5), 'TRUE')
    print('-'*20)
    for j, pred in enumerate(y_pred):
        print(
              int2token[sentence[j].item()].ljust(10),
              int2label[pred.item()].ljust(5),
              int2label[labels[j].item()]
        )

In [52]:
tag_sentence(rnn_model,3)

TOKEN      PRED  TRUE
--------------------
we         O     O
test       O     O
two        O     O
ways       O     O
of         O     O
measuring  O     O
clusterability B     B
:          I     O
(          O     O
1          B     O
)          O     O
existing   B     O
measures   B     O
from       O     O
the        O     O
machine    B     B


In [53]:
list_pred = []
for sent in X_test:
    rnn_model.eval()
    y_scores = rnn_model(sent)
    y_pred = y_scores.argmax(1)
    
    for j, pred in enumerate(y_pred):
        list_pred.append(pred.item())

In [54]:
list_refs = []
for tag in Y_test:
    for sub_tag in tag:
#         print(sub_tag.item())
        list_refs.append(sub_tag.item())

In [55]:
pair_BI=[]
pred_BI=[]
refs_BI=[]
for i in range(len(list_refs)):
    if list_refs[i]!=label2int['O']:
        pair_BI.append([list_refs[i],list_pred[i]])
# print(pair_BI)

for i in range(len(pair_BI)):
    refs_BI.append(pair_BI[i][0])
    pred_BI.append(pair_BI[i][1])



In [56]:
label2int['O']

2

In [57]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
print("Evaluation with IBO tag")
print('f1-score:',f1_score(list_refs,list_pred,average='weighted'))
print('precision_score',precision_score(list_refs,list_pred,average='weighted'))
print('recall_score',recall_score(list_refs,list_pred,average='weighted'))
print('accuracy_score',accuracy_score(list_refs,list_pred))

Evaluation with IBO tag
f1-score: 0.7875494710127203
precision_score 0.7837184443219074
recall_score 0.7930825242718447
accuracy_score 0.7930825242718447


In [58]:
print("Evaluation with only IB tag")
print('f1-score:',f1_score(refs_BI,pred_BI,average='weighted'))
print('precision_score',precision_score(refs_BI,pred_BI,average='weighted'))
print('recall_score',recall_score(refs_BI,pred_BI,average='weighted'))
print('accuracy_score',accuracy_score(refs_BI,pred_BI))

Evaluation with only IB tag
f1-score: 0.5795602632569925
precision_score 0.8518604411461554
recall_score 0.4497354497354497
accuracy_score 0.4497354497354497


  _warn_prf(average, modifier, msg_start, len(result))
