# <span style="font-family:Courier New; color:#CCCCCC">**Named Entity Recognition CRF**</span>

## <span style="font-family:Courier New; color:#336666">**Load Data and Imports**</span>

In [1]:
from preprocessing import convert_BIO
from NER_evaluation import *
from feature_getter import Feature_getter
import pycrfsuite

import nltk
nltk.download('conll2002')
from nltk.corpus import conll2002

# CoNLL-2002 ships three files per language: 'train' is used for fitting,
# 'testa' is bound to the *_val names and 'testb' to the *_test names.
esp_train, esp_val, esp_test = (
    conll2002.iob_sents(f'esp.{split}') for split in ('train', 'testa', 'testb')
)

ned_train, ned_val, ned_test = (
    conll2002.iob_sents(f'ned.{split}') for split in ('train', 'testa', 'testb')
)

[nltk_data] Downloading package conll2002 to
[nltk_data]     C:\Users\Jordi\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


## <span style="font-family:Courier New; color:#336666">**Train Classifier**</span>

In [2]:
# Convert the Spanish training split with convert_BIO and fit a CRF tagger that
# uses the default Feature_getter() configuration. 'model.crf.tagger' is passed
# to train() as the model file.
# NOTE(review): esp_train is overwritten with its converted form, so re-running
# this cell (or the later `train_sents = convert_BIO(esp_train)`) applies
# convert_BIO a second time — confirm convert_BIO is idempotent, or bind the
# result to a new name.
esp_train = convert_BIO(esp_train)
model = nltk.tag.CRFTagger(feature_func = Feature_getter())
model.train(esp_train, 'model.crf.tagger')

In [None]:
# Convert the Spanish test split, reduce every item to its first element
# (the token) and tag the resulting sentences with the model trained above.
# NOTE(review): esp_test is overwritten with its converted form, and the feature
# selection cell further down calls convert_BIO(esp_test) again — confirm that a
# second conversion is harmless.
esp_test = convert_BIO(esp_test)
X_esp_test = [[word[0] for word in sent] for sent in esp_test]
pred = model.tag_sents(X_esp_test)

In [4]:
# Score the test predictions against the converted gold sentences.
# compute_metrics returns two values; only the first is displayed here.
# results_agg_ent is presumably a per-entity-type breakdown — TODO confirm.
results, results_agg_ent = compute_metrics(esp_test, pred)
results

{'correct': 2577,
 'incorrect': 530,
 'partial': 108,
 'missed': 393,
 'spurious': 268,
 'possible': 3608,
 'actual': 3483,
 'precision': 0.739879414298019,
 'recall': 0.7142461197339246,
 'F1-score': 0.726836835425187}

## <span style="font-family:Courier New; color:#336666">**Feature selection**</span>

In [None]:
# Build the train / test / validation sets used by the experiments below.
# NOTE(review): esp_train and esp_test were already replaced by their
# convert_BIO output in earlier cells, so they are converted twice here while
# esp_val is converted only once — confirm convert_BIO is idempotent.
train_sents = convert_BIO(esp_train)
test_sents = convert_BIO(esp_test)
val_sents = convert_BIO(esp_val)

# Token-only views (first element of every item), the input given to tag_sents.
X_val_sents = [[word[0] for word in sent] for sent in val_sents]
X_test_sents = [[word[0] for word in sent] for sent in test_sents]

Unigram

### Best n-gram

Unigram

In [5]:
# Baseline: CRFTagger created without a feature_func, so NLTK's built-in
# feature function is used instead of Feature_getter.
# The model file 'model.crf.tagger' is shared by every experiment in this
# section, so each training run overwrites the previous model on disk.
model = nltk.tag.CRFTagger()
model.train(train_sents, 'model.crf.tagger')

# Evaluate on the validation sentences; the second return value is discarded.
pred = model.tag_sents(X_val_sents)
results, _ = compute_metrics(val_sents, pred)

# Keep only the F1-score for the comparison between configurations.
res_uni = results['F1-score']

In [6]:
res_uni

0.6810721454717203

Bigram

In [5]:
# Bigram experiment: Feature_getter with only the bigram and sufix flags on.
model = nltk.tag.CRFTagger(
    feature_func=Feature_getter(
        bigram=True,
        trigram=False,
        morphology=False,
        length=False,
        prefix=False,
        sufix=True,
        lemma=False,
        POS=False,
        shape=False,
    )
)
model.train(train_sents, 'model.crf.tagger')

# Validation F1-score of this configuration.
pred = model.tag_sents(X_val_sents)
results, _ = compute_metrics(val_sents, pred)
res_bi = results['F1-score']

Processing sentence 10238...

In [6]:
res_bi

0.6605460832240371

Trigrams

In [8]:
# Trigram experiment: bigram, trigram and sufix flags on, everything else off.
model = nltk.tag.CRFTagger(
    feature_func=Feature_getter(
        bigram=True,
        trigram=True,
        morphology=False,
        length=False,
        prefix=False,
        sufix=True,
        lemma=False,
        POS=False,
        shape=False,
    )
)
model.train(train_sents, 'model.crf.tagger')

# Validation F1-score of this configuration.
pred = model.tag_sents(X_val_sents)
results, _ = compute_metrics(val_sents, pred)
res_tri = results['F1-score']

Processing sentence 10238...

In [9]:
res_tri

0.6542188607894115

### Including morphology

Without morphology

In [12]:
res_uni 

0.6810721454717203

Including morphology

In [14]:
# Morphology experiment: morphology and sufix flags on, no n-gram features.
# NOTE(review): the "without morphology" reference (res_uni) was trained with
# NLTK's default feature function rather than Feature_getter(morphology=False),
# so the two runs may differ in more than the morphology flag — confirm.
model = nltk.tag.CRFTagger(
    feature_func=Feature_getter(
        bigram=False,
        trigram=False,
        morphology=True,
        length=False,
        prefix=False,
        sufix=True,
        lemma=False,
        POS=False,
        shape=False,
    )
)
model.train(train_sents, 'model.crf.tagger')

# Validation F1-score of this configuration.
pred = model.tag_sents(X_val_sents)
results, _ = compute_metrics(val_sents, pred)
best_morphology = results['F1-score']

Processing sentence 10238...

In [15]:
best_morphology

0.6142925890279114

Including all other variables


In [17]:
# Every Feature_getter flag enabled except the bigram / trigram ones.
model = nltk.tag.CRFTagger(
    feature_func=Feature_getter(
        bigram=False,
        trigram=False,
        morphology=True,
        length=True,
        prefix=True,
        sufix=True,
        lemma=True,
        POS=True,
        shape=True,
    )
)
model.train(train_sents, 'model.crf.tagger')

# Validation F1-score of this configuration.
pred = model.tag_sents(X_val_sents)
results, _ = compute_metrics(val_sents, pred)
best_other = results['F1-score']

Processing sentence 10238...

In [18]:
best_other

0.6606003308910423

Interactions(all)

In [20]:
# Feature_getter() with its default arguments (the same configuration as the
# first model trained in this notebook, but fitted on train_sents).
# Which flags the defaults enable is defined in feature_getter.py — not visible here.
model = nltk.tag.CRFTagger(feature_func = Feature_getter())
model.train(train_sents, 'model.crf.tagger')

# Evaluate on the validation sentences; the second return value is discarded.
pred = model.tag_sents(X_val_sents)
results, _ = compute_metrics(val_sents, pred)

# Keep only the F1-score for the comparison between configurations.
best_all = results['F1-score']

Processing sentence 10238...

In [21]:
best_all

0.682744960969358

## <span style="font-family:Courier New; color:#336666">**Hyperparameter selection**</span>

We will begin with hyperparameter selection. However, we will perform it on the base features of the classifier. The reason is the long runtime that training with all features entails, together with the assumption that the hyperparameters do not interact differently with the different features.

We will write a custom function that performs a grid search over the values we want to test.

In [22]:
import pandas as pd

In [23]:
# Search grid: 3 x 3 x 3 = 27 combinations. Each combination is passed to
# CRFTagger as its training_opt dict by the grid-search functions below.
hyperparameters = {
    # Per the CRFsuite documentation, c1 is the L1 regularization coefficient.
    'c1': [0.1, 0.5, 1.0],
    # Per the CRFsuite documentation, c2 is the L2 regularization coefficient.
    'c2': [0.1, 0.5, 1.0],
    # Upper bound on the number of optimizer iterations.
    'max_iterations': [50, 100, 200]
}

In [31]:
def gridsearch_cv(hyperparameters,train_sents,val_sents,X_val_sents,feature_func=None):
    """Grid search over CRF training options, scored on held-out sentences.

    One ``nltk.tag.CRFTagger`` is trained for every combination of ``c1``,
    ``c2`` and ``max_iterations`` and evaluated with ``compute_metrics``. The
    combination with the highest 'F1-score' wins; on ties the first one
    encountered is kept.

    Parameters
    ----------
    hyperparameters : dict
        Must contain the keys 'c1', 'c2' and 'max_iterations', each mapping to
        an iterable of candidate values.
    train_sents : list
        Training sentences, passed unchanged to ``CRFTagger.train``.
    val_sents : list
        Gold sentences, passed as first argument to ``compute_metrics``.
    X_val_sents : list
        Token-only sentences, passed to ``CRFTagger.tag_sents``.
    feature_func : callable or None, optional
        Feature function forwarded to ``CRFTagger``. With ``None`` (default)
        the argument is not passed at all, so the tagger uses its built-in
        features exactly as before this parameter existed.

    Returns
    -------
    tuple
        ``(best_f1, best_params, results_df)``: the best F1-score, the
        training options that produced it, and a DataFrame with one row per
        combination (columns 'c1', 'c2', 'max_iterations', 'F1-score').
        For an empty grid the result is ``(0, {}, <empty DataFrame>)``.

    Notes
    -----
    Every model is written to 'model.crf.tagger', so the file left on disk
    belongs to the last combination tried, not necessarily the best one.
    """
    from itertools import product

    grid = list(product(hyperparameters['c1'],
                        hyperparameters['c2'],
                        hyperparameters['max_iterations']))
    num_combinations = len(grid)
    # Only forward feature_func when the caller supplied one.
    tagger_kwargs = {} if feature_func is None else {'feature_func': feature_func}

    # Rows are collected in a list and turned into a DataFrame once at the end;
    # growing the frame row by row is slow and upcasts max_iterations to float.
    records = []
    # Start below any real score so that an all-zero grid still reports the
    # parameters of its first combination instead of an empty dict.
    best_f1 = float('-inf')
    best_params = {}
    for current_combination, (c1, c2, max_iter) in enumerate(grid, start=1):
        print(f'Fitting model {current_combination} of {num_combinations}', end = '\r')
        training_opt = {'c1': c1, 'c2': c2, 'max_iterations': max_iter}
        model = nltk.tag.CRFTagger(training_opt=training_opt, **tagger_kwargs)
        model.train(train_sents, 'model.crf.tagger')

        pred = model.tag_sents(X_val_sents)
        results, _ = compute_metrics(val_sents, pred)
        f1 = results['F1-score']
        records.append({**training_opt, 'F1-score': f1})
        if f1 > best_f1:
            best_f1 = f1
            best_params = dict(training_opt)

    if not best_params:
        # Nothing was selected (empty grid): keep the historical (0, {}) result.
        best_f1 = 0

    results_df = pd.DataFrame(records, columns=['c1', 'c2', 'max_iterations', 'F1-score'])
    return best_f1,best_params,results_df

In [32]:
# Run the grid search with CRFTagger's default features (no feature_func);
# `dataframe` holds one row per hyperparameter combination.
best, best_params,dataframe = gridsearch_cv(hyperparameters,train_sents,val_sents,X_val_sents)

Fitting model 27 of 27

In [35]:
dataframe


Unnamed: 0,c1,c2,max_iterations,F1-score
0,0.1,0.1,50.0,0.695958
1,0.1,0.1,100.0,0.70631
2,0.1,0.1,200.0,0.704532
3,0.1,0.5,50.0,0.684173
4,0.1,0.5,100.0,0.68849
5,0.1,0.5,200.0,0.689566
6,0.1,1.0,50.0,0.67844
7,0.1,1.0,100.0,0.679816
8,0.1,1.0,200.0,0.676269
9,0.5,0.1,50.0,0.690627


Now let's try with the complete model

In [36]:
def gridsearch_cv_complete(hyperparameters,train_sents,val_sents,X_val_sents):
    """Grid search over CRF training options using the Feature_getter() features.

    Same procedure as ``gridsearch_cv``, except that every tagger is built with
    a fresh default ``Feature_getter()`` as its feature function. One
    ``nltk.tag.CRFTagger`` is trained per combination of ``c1``, ``c2`` and
    ``max_iterations`` and scored with ``compute_metrics``; the highest
    'F1-score' wins and the first combination is kept on ties.

    Parameters
    ----------
    hyperparameters : dict
        Must contain the keys 'c1', 'c2' and 'max_iterations', each mapping to
        an iterable of candidate values.
    train_sents : list
        Training sentences, passed unchanged to ``CRFTagger.train``.
    val_sents : list
        Gold sentences, passed as first argument to ``compute_metrics``.
    X_val_sents : list
        Token-only sentences, passed to ``CRFTagger.tag_sents``.

    Returns
    -------
    tuple
        ``(best_f1, best_params, results_df)``: the best F1-score, the
        training options that produced it, and a DataFrame with one row per
        combination (columns 'c1', 'c2', 'max_iterations', 'F1-score').
        For an empty grid the result is ``(0, {}, <empty DataFrame>)``.

    Notes
    -----
    Every model is written to 'model.crf.tagger', so the file left on disk
    belongs to the last combination tried, not necessarily the best one.
    """
    from itertools import product

    grid = list(product(hyperparameters['c1'],
                        hyperparameters['c2'],
                        hyperparameters['max_iterations']))
    num_combinations = len(grid)

    # Rows are collected in a list and turned into a DataFrame once at the end;
    # growing the frame row by row is slow and upcasts max_iterations to float.
    records = []
    # Start below any real score so that an all-zero grid still reports the
    # parameters of its first combination instead of an empty dict.
    best_f1 = float('-inf')
    best_params = {}
    for current_combination, (c1, c2, max_iter) in enumerate(grid, start=1):
        print(f'Fitting model {current_combination} of {num_combinations}', end = '\r')
        training_opt = {'c1': c1, 'c2': c2, 'max_iterations': max_iter}
        # A new Feature_getter per model, as in the original implementation.
        model = nltk.tag.CRFTagger(training_opt=training_opt, feature_func=Feature_getter())
        model.train(train_sents, 'model.crf.tagger')

        pred = model.tag_sents(X_val_sents)
        results, _ = compute_metrics(val_sents, pred)
        f1 = results['F1-score']
        records.append({**training_opt, 'F1-score': f1})
        if f1 > best_f1:
            best_f1 = f1
            best_params = dict(training_opt)

    if not best_params:
        # Nothing was selected (empty grid): keep the historical (0, {}) result.
        best_f1 = 0

    results_df = pd.DataFrame(records, columns=['c1', 'c2', 'max_iterations', 'F1-score'])
    return best_f1,best_params,results_df

In [38]:
# Same grid, but every tagger uses the default Feature_getter() features.
# This trains one model per combination with the full feature set, so expect a
# long runtime.
best_complete, best_params_complete,dataframe_complete = gridsearch_cv_complete(hyperparameters,train_sents,val_sents,X_val_sents)

Processing sentence 10238...

In [40]:
dataframe_complete.sort_values(by = 'F1-score', ascending = False)

Unnamed: 0,c1,c2,max_iterations,F1-score
8,0.1,1.0,200.0,0.680509
7,0.1,1.0,100.0,0.680135
17,0.5,1.0,200.0,0.677962
5,0.1,0.5,200.0,0.677239
14,0.5,0.5,200.0,0.676632
13,0.5,0.5,100.0,0.675064
4,0.1,0.5,100.0,0.67335
6,0.1,1.0,50.0,0.672271
2,0.1,0.1,200.0,0.671928
11,0.5,0.1,200.0,0.671798
