# <span style="font-family:Courier New; color:#CCCCCC">**CADEC Named Entity Recognition CRF**</span>

## <span style="font-family:Courier New; color:#336666">**Load Data and Imports**</span>

In [2]:
from preprocessing import convert_BIO
from NER_evaluation import *
from feature_getter import Feature_getter
import pycrfsuite
from collections import Counter
import pandas as pd
import nltk
import re

In [3]:
def decode_cadec(file_path):
    """Parse a whitespace-separated, column-per-entity annotation file into sentences.

    Each non-blank line is split on whitespace: field 0 is the token and
    fields 1..5 are label columns, one per entity type (1 -> 'ADR', 2 -> 'Di',
    3 -> 'Dr', 4 -> 'S', 5 -> 'F'). The first column whose label is not 'O'
    decides the tag: the first character of that label is kept as the prefix
    and the entity name comes from the column index, giving e.g. 'B-ADR'.
    If every label column is 'O' (or the line has no label columns) the
    token is tagged 'O'.

    Lines whose token ends in '.<digits>' are skipped entirely.
    A blank line closes the current sentence; consecutive blank lines
    therefore yield empty sentences, as before.

    Parameters
    ----------
    file_path : str
        Path of the annotation file to read.

    Returns
    -------
    list[list[tuple[str, str, str]]]
        One list per sentence of ``(token, '', tag)`` triples. The middle
        element is always the empty string (presumably a placeholder
        expected by ``convert_BIO`` -- confirm against the preprocessing module).

    Raises
    ------
    KeyError
        If a non-'O' label appears in a column beyond the fifth.
    """
    pattern = r'\.\d+$'
    ent_map = {1: 'ADR', 2: 'Di', 3: 'Dr', 4: 'S', 5: 'F'}
    data = list()
    with open(file_path, "r") as file:
        sent = list()
        for line in file:
            tok = line.split()
            if tok == []:
                # Blank line = sentence boundary: store the tokens collected so far.
                data.append(sent)
                sent = list()
            elif not re.search(pattern, tok[0]):
                tag = 'O'
                # Only the first annotated column counts; later columns are ignored.
                for i, label in enumerate(tok[1:], start=1):
                    if label != 'O':
                        tag = label[0] + '-' + ent_map[i]
                        break
                sent.append((tok[0], '', tag))
        # Fix: a file that does not end with a blank line used to lose its
        # final sentence, because sentences were only flushed on blank lines.
        if sent:
            data.append(sent)
    return data

# Parse the train and test .conll files into lists of sentences,
# each sentence being a list of (token, '', tag) triples.
train_data = decode_cadec('data/train.conll')
test_data = decode_cadec('data/test.conll')

In [3]:
# Re-encode both splits with convert_BIO's default arguments
# (presumably the plain BIO scheme, given the variable names -- confirm in preprocessing).
train_BIO = convert_BIO(train_data)
test_BIO = convert_BIO(test_data)

# Split every test item into parallel per-sentence lists:
# element 0 goes to X (tagger input), element 1 goes to y (gold tag).
X_test_BIO = [[item[0] for item in sentence] for sentence in test_BIO]
y_test_BIO = [[item[1] for item in sentence] for sentence in test_BIO]

## <span style="font-family:Courier New; color:#336666">**Train Classifier**</span>

In [4]:
# Baseline run: CRF tagger using the project's Feature_getter as feature
# function and no training_opt, i.e. the library's default training options.
model = nltk.tag.CRFTagger(feature_func = Feature_getter())
# Train on the BIO-encoded training split; the second argument is the model
# file path (assumes the models/ directory already exists -- TODO confirm).
model.train(train_BIO, 'models/model_CADEC_BIO.crf.tagger')

In [10]:
# Shared results table: save_results adds/overwrites one row per experiment label.
results_df = pd.DataFrame()


def save_results(nclf, results, results_agg_ent, df):
    """Write the metrics of one experiment into row ``nclf`` of ``df``.

    Parameters
    ----------
    nclf : hashable
        Row label identifying the experiment.
    results : mapping
        Overall metrics; must provide "precision", "recall" and "F1-score".
    results_agg_ent : mapping
        Per-entity metrics; must provide a mapping with an "F1-score" entry
        for each of "ADR", "Di", "Dr", "S" and "F".
    df : pandas.DataFrame
        Table to update. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    The column named 'total acc' stores ``results["precision"]``, not an
    accuracy value; the name is kept unchanged for compatibility.
    """
    # (column name, key in `results`) pairs, in the column order of the table.
    overall_columns = (
        ('total acc', "precision"),
        ('total recall', "recall"),
        ('total F1', "F1-score"),
    )
    for column, metric in overall_columns:
        df.loc[nclf, column] = results[metric]

    # One F1 column per entity type, e.g. 'ADR F1'.
    for entity in ("ADR", "Di", "Dr", "S", "F"):
        df.loc[nclf, entity + ' F1'] = results_agg_ent[entity]["F1-score"]

    return df


# Tag the test sentences with the most recently trained `model`.
pred = model.tag_sents(X_test_BIO)
# Score the predictions against the annotated test split; returns overall
# metrics and per-entity metrics (compute_metrics is not defined in this
# notebook -- presumably it comes from the NER_evaluation star import).
results, results_agg_ent = compute_metrics(test_BIO, pred,mode = 'CADEC')
# Last expression of the cell: stores the row in results_df and displays the table.
save_results("BIO_w/o_hiper", results, results_agg_ent, results_df)

Unnamed: 0,total acc,total recall,total F1,ADR F1,Di F1,Dr F1,S F1,F F1
BIO_w/o_hiper,0.737,0.605,0.664,0.664,0.305,0.847,0.25,0.182


### <span style="font-family:Courier New; color:#336666">**BIO with CoNLL hyperparameters**</span>

In [11]:
# Training options handed to nltk.tag.CRFTagger through `training_opt`.
# Per the section heading these values were carried over from the CoNLL task.
customed_hyperparams = {
    'c1': 0.01,
    'c2': 1,
    'max_iterations': 200,
    'feature.possible_transitions': False,
    'feature.possible_states': True,
    'feature.minfreq': 0,
}

In [12]:
# Same tagger as the baseline, but trained with customed_hyperparams.
# Rebinds `model`, so the evaluation cell below must run after this one.
model = nltk.tag.CRFTagger(feature_func = Feature_getter(), training_opt = customed_hyperparams)
# Uses a separate model file so the baseline model file is not overwritten.
model.train(train_BIO, 'models/model_CADEC_BIO_hyp.crf.tagger')

In [14]:
# Tag the BIO test sentences with the currently bound `model`.
pred = model.tag_sents(X_test_BIO)
# Score the predictions against the annotated BIO test split.
results, results_agg_ent = compute_metrics(test_BIO, pred,mode = "CADEC")
# Last expression of the cell: stores row "BIO_hiper" in results_df and displays the table.
save_results("BIO_hiper", results, results_agg_ent, results_df)

Unnamed: 0,total acc,total recall,total F1,ADR F1,Di F1,Dr F1,S F1,F F1
BIO_w/o_hiper,0.737,0.605,0.664,0.664,0.305,0.847,0.25,0.182
BIO_hiper,0.736,0.608,0.666,0.669,0.298,0.842,0.273,0.174


<span style="font-family:Courier New">We can see that the Di, S and F categories are heavily misclassified. As for the hyperparameters, those tuned for the previous problem seem to slightly improve the F1 score here as well.</span>

### <span style="font-family:Courier New; color:#336666">**IO**</span>

In [15]:
# Re-encode both splits with begin=False
# (presumably the IO scheme without B- prefixes, per the section heading -- confirm in preprocessing).
train_IO = convert_BIO(train_data, begin=False)
test_IO = convert_BIO(test_data, begin=False)

# Split every test item into parallel per-sentence lists:
# element 0 goes to X (tagger input), element 1 goes to y (gold tag).
X_test_IO = [[item[0] for item in sentence] for sentence in test_IO]
y_test_IO = [[item[1] for item in sentence] for sentence in test_IO]

In [16]:
# CRF tagger with customed_hyperparams, trained on the IO-encoded split.
# Rebinds `model`, so the evaluation cell below must run after this one.
model = nltk.tag.CRFTagger(feature_func = Feature_getter(), training_opt = customed_hyperparams)
# The second argument is the model file path for this tagging scheme.
model.train(train_IO, 'models/model_CADEC_IO_hyp.crf.tagger')

In [17]:
# Tag the IO test sentences with the currently bound `model`.
pred = model.tag_sents(X_test_IO)
# Score the predictions against the annotated IO test split.
results, results_agg_ent = compute_metrics(test_IO, pred,mode = "CADEC")
# Last expression of the cell: stores row "IO_hiper" in results_df and displays the table.
save_results("IO_hiper", results, results_agg_ent, results_df)

Unnamed: 0,total acc,total recall,total F1,ADR F1,Di F1,Dr F1,S F1,F F1
BIO_w/o_hiper,0.737,0.605,0.664,0.664,0.305,0.847,0.25,0.182
BIO_hiper,0.736,0.608,0.666,0.669,0.298,0.842,0.273,0.174
IO_hiper,0.725,0.602,0.658,0.656,0.302,0.839,0.25,0.171


### <span style="font-family:Courier New; color:#336666">**BIOS**</span>

In [18]:
# Re-encode both splits with begin=True and single=True
# (presumably BIO plus a dedicated tag for single-token entities -- confirm in preprocessing).
train_BIOS = convert_BIO(train_data, begin=True, single=True)
test_BIOS = convert_BIO(test_data, begin=True, single=True)

# Split every test item into parallel per-sentence lists:
# element 0 goes to X (tagger input), element 1 goes to y (gold tag).
X_test_BIOS = [[item[0] for item in sentence] for sentence in test_BIOS]
y_test_BIOS = [[item[1] for item in sentence] for sentence in test_BIOS]

In [19]:
# CRF tagger with customed_hyperparams, trained on the BIOS-encoded split.
# Rebinds `model`, so the evaluation cell below must run after this one.
model = nltk.tag.CRFTagger(feature_func = Feature_getter(), training_opt = customed_hyperparams)
# The second argument is the model file path for this tagging scheme.
model.train(train_BIOS, 'models/model_CADEC_BIOS_hyp.crf.tagger')

In [20]:
# Tag the BIOS test sentences with the currently bound `model`.
pred = model.tag_sents(X_test_BIOS)
# Score the predictions against the annotated BIOS test split.
results, results_agg_ent = compute_metrics(test_BIOS, pred,mode = "CADEC")
# Last expression of the cell: stores row "BIOS_hiper" in results_df and displays the table.
save_results("BIOS_hiper", results, results_agg_ent, results_df)

Unnamed: 0,total acc,total recall,total F1,ADR F1,Di F1,Dr F1,S F1,F F1
BIO_w/o_hiper,0.737,0.605,0.664,0.664,0.305,0.847,0.25,0.182
BIO_hiper,0.736,0.608,0.666,0.669,0.298,0.842,0.273,0.174
IO_hiper,0.725,0.602,0.658,0.656,0.302,0.839,0.25,0.171
BIOS_hiper,0.738,0.613,0.67,0.669,0.272,0.852,0.219,0.203


### <span style="font-family:Courier New; color:#336666">**BIOES**</span>

In [27]:
# Re-encode both splits with begin, single and end all enabled
# (presumably the BIOES scheme, per the section heading -- confirm in preprocessing).
train_BIOES = convert_BIO(train_data, begin = True,single = True,end = True)
test_BIOES = convert_BIO(test_data, begin = True, single = True,end = True)

# Split every test item into parallel per-sentence lists:
# element 0 goes to X (tagger input), element 1 goes to y (gold tag).
X_test_BIOES = [[word[0] for word in sent] for sent in test_BIOES]
# Named like y_test_BIO / y_test_IO / y_test_BIOS; the original cell
# misspelled this with a double underscore.
y_test_BIOES = [[word[1] for word in sent] for sent in test_BIOES]
# Backward-compatible alias for code that still uses the misspelled name.
y__test_BIOES = y_test_BIOES

In [28]:
# CRF tagger with customed_hyperparams, trained on the BIOES-encoded split.
# Rebinds `model`, so the evaluation cell below must run after this one.
model = nltk.tag.CRFTagger(feature_func = Feature_getter(), training_opt = customed_hyperparams)
# The second argument is the model file path for this tagging scheme.
model.train(train_BIOES, 'models/model_CADEC_BIOES_hyp.crf.tagger')

In [29]:
# Tag the BIOES test sentences with the currently bound `model`.
pred = model.tag_sents(X_test_BIOES)
# Score the predictions against the annotated BIOES test split.
results, results_agg_ent = compute_metrics(test_BIOES, pred,mode = "CADEC")
# Last expression of the cell: stores row "BIOES_hiper" in results_df and displays the table.
save_results("BIOES_hiper", results, results_agg_ent, results_df)

Unnamed: 0,total acc,total recall,total F1,ADR F1,Di F1,Dr F1,S F1,F F1
BIO_w/o_hiper,0.737,0.605,0.664,0.664,0.305,0.847,0.25,0.182
BIO_hiper,0.736,0.608,0.666,0.669,0.298,0.842,0.273,0.174
IO_hiper,0.725,0.602,0.658,0.656,0.302,0.839,0.25,0.171
BIOS_hiper,0.738,0.613,0.67,0.669,0.272,0.852,0.219,0.203
BIOES_hiper,0.73,0.604,0.661,0.66,0.235,0.845,0.182,0.216
