In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from main import NER

### Ablation groups - each group is a list of features to remove from the feature set


In [None]:
# Feature-name runs that recur across several ablation groups.
_CONTEXT_FEATS = ["prev_token", "prev_pos", "next_token", "next_pos"]
_SHAPE_FEATS = ["is_acronym", "is_cap", "is_oov"]

# One entry per ablation run: the feature keys that are REMOVED from every
# feature dict before training (not the keys that are kept).
feat_groups = [
    ["lemma", "pos", "chunk", *_CONTEXT_FEATS, *_SHAPE_FEATS],
    ["pos", "chunk", *_CONTEXT_FEATS],
    ["lemma", "pos", "chunk", *_SHAPE_FEATS],
    ["lemma", "pos", "chunk", *_CONTEXT_FEATS],
    ["lemma", "pos", *_CONTEXT_FEATS, *_SHAPE_FEATS],
    ["lemma", "chunk", *_CONTEXT_FEATS, *_SHAPE_FEATS],
    ["lemma"],
    # Keys wv0 .. wv299 (presumably the word-vector dimensions; confirm in NER).
    [f"wv{dim}" for dim in range(300)],
]

# Parse the training split once; each ablation run filters these feature
# dicts instead of re-reading the file.
train_path = 'data/conll2003.train.conll'
ner = NER()
feats, gold = ner.extract_features_and_labels(train_path)

In [None]:
# Train and run one classifier per ablation group. Run i drops the keys listed
# in feat_groups[i] from every training feature dict before fitting.
# NOTE(review): models are named "lr<i>" here while the scoring cell reads
# "out/ablation<i>.txt" - confirm that classify_data writes to that path.
for i, group in enumerate(feat_groups):
    mname = "lr" + str(i)
    # Rebinds the notebook-level `ner`; later cells see the instance created
    # in the last iteration.
    ner = NER(mname)

    print("Ablation " + str(i) + "\n")

    # Set membership keeps the per-key test O(1); a group can hold hundreds of
    # names and the test runs for every key of every feature dict.
    removed = set(group)
    new_feats = [{k: v for k, v in d.items() if k not in removed} for d in feats]

    ml_model, vectoriser = ner.create_classifier(new_feats, gold)
    ner.classify_data(ml_model, vectoriser)

In [None]:
# Score every ablation run on the test split, one F1 value per class and run.
# NOTE(review): assumes the labels returned by extract_features_and_labels are
# exactly the strings below (no B-/I- prefixes) - confirm against the NER class.
CLASS_LABELS = ["ORG", "MISC", "O", "LOC", "PER"]

features, gold_labels = ner.extract_features_and_labels('data/conll2003.test.conll')

per_class_f1 = {}
for i in range(len(feat_groups)):
    # NOTE(review): presumably the predictions written for run i - verify the path.
    out = "out/ablation" + str(i) + ".txt"
    out_labels = ner.extract_features_and_labels(out, simple=True)[1]

    # average=None yields one score per entry of `labels`, in that order, so the
    # array lines up with the DataFrame index. A macro-averaged scalar would be
    # broadcast to every class row and make the rows indistinguishable.
    per_class_f1['F_' + str(i)] = f1_score(
        gold_labels, out_labels, labels=CLASS_LABELS, average=None
    )

# Rows are classes, columns are ablation runs (F_0, F_1, ...).
scores = pd.DataFrame(per_class_f1, index=CLASS_LABELS)

# Unweighted mean over the class rows, i.e. the macro-averaged F1 of each run.
scores.loc['AVERAGE'] = scores.mean()

scores

In [None]:
# Bar chart of the AVERAGE row of `scores`: one bar per ablation run.
average_f1 = scores.loc["AVERAGE"]

fig, ax = plt.subplots()
ax.bar(average_f1.index, average_f1.values)
# Label the figure so it can be read without the surrounding cells.
ax.set_xlabel("Ablation run")
ax.set_ylabel("Average F1")
ax.set_title("Average F1 per ablation run")
plt.show()