In [3]:
import os
import pandas as pd
import fnmatch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
#pip install keybert

## Functions

In [4]:
#function that gets tokens from files
def tokens_from_files(directory, pattern='*.final'):
    """
    Recursively find all files matching the pattern,calls another function to extract tokens
    Input: path to directory (string) 
    Returns: list of tokens """
    
    file_path_list = []
    for root, dirnames, filenames in os.walk(directory):
        for filename in fnmatch.filter(filenames, pattern):
            file_path_list.append(os.path.join(root, filename))

    token_label_list=_extract_tokens(file_path_list)
    return token_label_list

def _extract_tokens(file_list):
    """
    Extracts tokens from a list of files
    Input: list of file names 
    Returns: list of tokens """
    
    token_label_list = []
    for file in file_list:
        with open(file,'r',errors='replace') as f:
             for token_label in f.read().splitlines():
                token_label_list.append(token_label)
    return token_label_list

#Function that takes a list of tokens, cleans it and outputs dataframe
def deep_clean(str_list):
    '''
    Cleans data and converts list of tokens to dataframe
    Input: list of tokens
    Returns: dataframe with columns 'Token' and 'Label' '''
    
    str_list = list(filter(str.strip, str_list)) #remove empty lines
    last_label='' #variable that keeps track of the label of the previous row
    df = pd.DataFrame(columns = ["Token", "Label"]) #creating an empty dataframe with two columns

    for line in str_list:
        line="".join(line.rstrip().lstrip()) #removes spaces on the left and right of each line
        if line[-1]=='I' and last_label in'Oo': #checks that no 'I' labels are preceded by an'O'
            line = list(line)
            line[-1] = 'B'  #if the condition is true, the 'I' is replaced by a 'B'
            line=''.join(line)
        if line[-1]=='0': #if the label is annotated as '0' it is changed to 'O'
            line = list(line)
            line[-1] = 'O'
            line=''.join(line)
        if line[-2:]=='II': #if the label is annotated as 'II' it is changed to 'I'
            line = list(line)
            line[-2:] = 'I'
            line=''.join(line)
        if line[-1] in 'OBIobi': #make sure all labels are either 'O' 'I' or 'B'
            try:
                row=line.split() #split by space or tab
                df.loc[len(df)] = row #add row to dataframe
            except ValueError:
                pass #skip problematic lines
            df['Label'] = df['Label'].str.upper() #make all labels upper case
        last_label=line[-1] #updates label of previous row for next loop
    return df
def df_to_text(df):
    '''
    Converts dataframe of tokens to text
    Input: dataframe
    Returns: string'''
    
    data_csv = df['Token'].to_csv(header=None, index=False).split('\n')
    text_from_df = ' '.join(data_csv)
    return text_from_df

def normalize_model_tag(label):
    '''Normalizes tags on model data
       Input: tag, dtype: string
       Output: tag, dtype: string'''
    
    map_dict = {'B-ORG':'B','B-LOC':'B','B-PER':'B','B-MISC':'MISC', 
                  'I-ORG':'ORG','I-LOC':'LOC','I-PER':'PER','I-MISC':'MISC'}
    replace_if_need = lambda entity: map_dict.get(entity,entity) #if the tag is in the dict, give its replacement, otherwise keep it 
    label['entity'] = replace_if_need(label['entity'])
    return label

### Main

In [5]:
# Note: This cell takes a long time to execute

path='../Dataset/' #add your directory path

str_list = tokens_from_files(path) #find final annotation files in directory and extract tokens
df=deep_clean(str_list) #create dataframe with tokens and labels (and remove/correct erroneous lines)

In [6]:
str_list = tokens_from_files(path+'train') #find final annotation files in directory and extract tokens
df_train=deep_clean(str_list) #create dataframe with tokens and labels (and remove/correct erroneous lines)

In [7]:
str_list = tokens_from_files(path+'test') #find final annotation files in directory and extract tokens
df_test=deep_clean(str_list) #create dataframe with tokens and labels (and remove/correct erroneous lines)

In [8]:
df

Unnamed: 0,Token,Label
0,Translation,B
1,models,I
2,used,O
3,for,O
4,statistical,B
...,...,...
26660,mutual,O
26661,disambiguation,B
26662,and,O
26663,generalization,B


In [9]:
text=df_to_text(df)

In [10]:
text

'Translation models used for statistical machine translation are compiled from parallel corpora that are manually translated . The common assumption is that parallel texts are symmetrical : The direction of translation is deemed irrelevant and is consequently ignored . Much research in Translation Studies indicates that the direction of translation matters "," however "," as translated language ( translationese ) has many unique properties . It has already been shown that phrase tables constructed from parallel corpora translated in the same direction as the translation task outperform those constructed from corpora translated in the opposite direction . We reconfirm that this is indeed the case "," but emphasize the importance of also using texts translated in the “ wrong ” direction . We take advantage of information pertaining to the direction of translation in constructing phrase tables by adapting the translation model to the special properties of translationese . We explore two a

## Comparison with KeyBERT

In [1]:
from keybert import KeyBERT
kw_model = KeyBERT(model="all-MiniLM-L6-v2")

#kw_model.extract_keywords(doc, keyphrase_ngram_range=(3, 3), stop_words='english', use_maxsum=True, nr_candidates=20, top_n=5)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [11]:
kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 4), stop_words='english', top_n=200)

[('phrase tables adapting translation', 0.7342),
 ('tables statistical machine translation', 0.6856),
 ('translation constructing phrase tables', 0.6739),
 ('statistical machine translation specifically', 0.6675),
 ('tables adapting translation model', 0.6642),
 ('various statistical machine translation', 0.6628),
 ('tables adapting translation', 0.6601),
 ('statistical machine translation models', 0.6562),
 ('statistical machine translation', 0.6562),
 ('machine translation attempts', 0.6346),
 ('statistical machine translation model', 0.6344),
 ('current machine translation', 0.633),
 ('statistical machine translation actually', 0.6293),
 ('adapting translation model', 0.6291),
 ('translation models used statistical', 0.6255),
 ('machine translation information', 0.6202),
 ('techniques statistical machine translation', 0.6197),
 ('various existing machine translation', 0.619),
 ('traditional statistical machine translation', 0.6178),
 ('improved statistical machine translation', 0.61

## CRF

In [12]:
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [13]:
def make_sent(df):
    '''Turns every row in the data frame to a tuple. 
    The tuples of one sentence are saved in a list.
    The sentences are saved in a list.
    Input: dataframe
    Returns: list of list of tuples'''
    tuple_list=list(df.itertuples(index=False, name=None))
    stoplist=['.','?','!']
    sent_list=[]
    sent=[]
    for el in tuple_list:
        sent.append(el)
        if el[0] in stoplist:
            sent_list.append(sent)
            sent=[]
    return sent_list
    

In [14]:
train_sents=make_sent(df_train)
test_sents=make_sent(df_test)

In [15]:
train_sents

[[('The', 'O'),
  ('most', 'O'),
  ('widely', 'O'),
  ('adopted', 'O'),
  ('approaches', 'O'),
  ('for', 'O'),
  ('evaluation', 'B'),
  ('of', 'O'),
  ('summary', 'B'),
  ('content', 'I'),
  ('follow', 'O'),
  ('some', 'O'),
  ('protocol', 'O'),
  ('for', 'O'),
  ('comparing', 'O'),
  ('a', 'O'),
  ('summary', 'O'),
  ('with', 'O'),
  ('gold', 'B'),
  ('-', 'I'),
  ('standard', 'I'),
  ('human', 'B'),
  ('summaries', 'I'),
  (',', 'O'),
  ('which', 'O'),
  ('are', 'O'),
  ('traditionally', 'O'),
  ('called', 'O'),
  ('model', 'B'),
  ('summaries', 'I'),
  ('.', 'O')],
 [('This', 'O'),
  ('evaluation', 'B'),
  ('paradigm', 'I'),
  ('falls', 'O'),
  ('short', 'O'),
  ('when', 'O'),
  ('human', 'B'),
  ('summaries', 'I'),
  ('are', 'O'),
  ('not', 'O'),
  ('available', 'O'),
  ('and', 'O'),
  ('becomes', 'O'),
  ('less', 'O'),
  ('accurate', 'O'),
  ('when', 'O'),
  ('only', 'O'),
  ('a', 'O'),
  ('single', 'O'),
  ('model', 'B'),
  ('is', 'O'),
  ('available', 'O'),
  ('.', 'O')],
 [('We

In [16]:
def word2features(sent, i):
    word = sent[i][0]
    
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),      
    }
    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True
                
    return features

In [17]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]

In [18]:
sent2features(train_sents[0])[0]


{'bias': 1.0,
 'word.lower()': 'the',
 'word[-3:]': 'The',
 'word[-2:]': 'he',
 'word.isupper()': False,
 'word.istitle()': True,
 'word.isdigit()': False,
 'BOS': True,
 '+1:word.lower()': 'most',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False}

In [19]:
%%time
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

CPU times: user 63.1 ms, sys: 3.46 ms, total: 66.6 ms
Wall time: 64.7 ms


In [20]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)
try:
    crf.fit(X_train, y_train)
except AttributeError:
    pass
predictions = crf.predict(X_test)

CPU times: user 679 ms, sys: 24.1 ms, total: 703 ms
Wall time: 701 ms


## Evaluation

In [29]:
# Evaluation with IBO tag
labels = list(crf.classes_)
labels.remove('O')
labels

['B', 'I']

In [30]:
y_pred = crf.predict(X_test)
f1=round(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels),3)
acc=round(metrics.flat_accuracy_score(y_test,y_pred),3)
prec=round(metrics.flat_precision_score(y_test, y_pred,average='weighted'),3)
rec=round(metrics.flat_recall_score(y_test, y_pred,average='weighted'),3)

In [31]:
print(f'Accuracy:  {acc}\nPrecision: {prec}\nRecall:    {rec}\nF1-score:  {f1}')

Accuracy:  0.879
Precision: 0.875
Recall:    0.879
F1-score:  0.723
