# Exercise

Ruijie Wang, Pascal Severin Andermatt | 12.10.2022  
Based on [Named Entity Recognition and Classification with Scikit-Learn](https://towardsdatascience.com/named-entity-recognition-and-classification-with-scikit-learn-f05372f07ba2), [ACNER](https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus), [Hidden Markov Model for POS-tagging](https://medium.com/mlearning-ai/introduction-to-hidden-markov-model-hmm-with-simple-ner-d1353ff35842) and [sklearn-crfsuite](https://sklearn-crfsuite.readthedocs.io/en/latest/index.html).

In [None]:
import matplotlib.pyplot as plt
import seaborn as sn
import sklearn
from itertools import chain
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import numpy as np

## Conditional Random Fields (CRFs)

- Collate the original dataframe into a nested list.

In [None]:
# Load the annotated corpus, decoding the CSV as ISO-8859-1.
df = pd.read_csv('ner_dataset.csv', encoding="ISO-8859-1")
# Reduce the number of samples here for faster training
# Forward-fill missing cells with the value from the row above
# (presumably 'Sentence #' is only set on the first token of each sentence -
# verify against the CSV). `DataFrame.ffill()` is the non-deprecated
# equivalent of `fillna(method='ffill')`.
df = df.ffill()

def collate(dataframe):
    """Group token rows into sentences.

    Args:
        dataframe: frame with 'Sentence #', 'Word', 'POS' and 'Tag' columns,
            one row per token.

    Returns:
        A list with one entry per distinct 'Sentence #' value, in the sorted
        key order produced by ``groupby``. Each entry is a list of
        ``(word, pos, tag)`` tuples in row order. An empty frame yields ``[]``.
    """
    # Iterate over the groups directly instead of going through
    # ``groupby.apply``: the result no longer depends on how pandas wraps the
    # lambda's return value, and newer pandas versions do not warn about
    # applying a function to the grouping column.
    return [
        list(zip(group['Word'].tolist(),
                 group['POS'].tolist(),
                 group['Tag'].tolist()))
        for _, group in dataframe.groupby('Sentence #')
    ]

# One list per sentence, each holding (word, POS, tag) tuples.
sentences = collate(df)

* Next, we extract more features (word suffixes, POS-tag prefixes, lower/title/upper flags, features of neighboring words) and convert them to the data structure required by sklearn-crfsuite — each sentence should be converted to a list of dicts.

In [None]:
def word2features(sent, i):
    """Build the feature dict for the token at position `i` of `sent`.

    `sent` is a sequence of token tuples whose first two items are the word
    and its POS tag. The result describes the token itself plus, where they
    exist, its left and right neighbours; sentence boundaries are flagged
    with 'BOS' / 'EOS' instead.
    """
    token, pos = sent[i][0], sent[i][1]

    feats = {
        'word.lower()': token.lower(),      # lower-cased word
        'word[-3:]': token[-3:],            # three-character suffix
        'word[-2:]': token[-2:],            # two-character suffix
        'word.isupper()': token.isupper(),  # all cased characters are uppercase
        'word.istitle()': token.istitle(),  # word is title-cased
        'word.isdigit()': token.isdigit(),  # word consists of digits only
        'postag': pos,                      # full POS tag
        'postag[:2]': pos[:2],              # first two characters of the POS tag
    }

    if i <= 0:
        # No left neighbour: mark the beginning of the sentence.
        feats['BOS'] = True
    else:
        prev_token, prev_pos = sent[i - 1][0], sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_pos
        feats['-1:postag[:2]'] = prev_pos[:2]

    if i >= len(sent) - 1:
        # No right neighbour: mark the end of the sentence.
        feats['EOS'] = True
    else:
        next_token, next_pos = sent[i + 1][0], sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_pos
        feats['+1:postag[:2]'] = next_pos[:2]

    return feats

def sent2features(sent):
    """Return one `word2features` dict per token position of `sent`."""
    return [word2features(sent, position) for position, _ in enumerate(sent)]

def sent2labels(sent):
    """Return the tag (third item) of every (word, POS, tag) triple in `sent`."""
    return [tag for _word, _pos, tag in sent]

In [None]:
# X and y in the required format: per sentence, one list of feature dicts
# and one list of tag strings.
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

# Sorted unique tags, for reporting classification results.
classes = np.unique(df.Tag.values).tolist()

# Tags without 'O'. Filter it out by value rather than popping the last
# element, so a real class is never dropped when 'O' is absent or does not
# sort last.
new_classes = [tag for tag in classes if tag != 'O']

# Split train and test sets (fixed random_state keeps the split reproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

- Train a CRF model

In [None]:
# Task 1: Train a CRF model using sklearn_crfsuite and find the best parameters
# Hint: see the API reference at https://sklearn-crfsuite.readthedocs.io/en/latest/api.html
# and the tutorial at https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html

# Task 2: Check the code below and run it to see the classification report/confusion matrix

# Task 3: Add some features to the word2features function and see if you can improve the results

# NOTE: the `...` below is an exercise placeholder; replace it with the
# keyword arguments you choose for Task 1 before running this cell.
crf = sklearn_crfsuite.CRF(
    ...
)

# This might take a while (about 3 minutes). If you want to speed up the training, you can reduce the number of training samples.
crf.fit(X_train, y_train)

In [None]:
def confusion_matrix(y_true, y_pred, summarize_iob_tags=False):
    """Print a classification report and plot a row-normalised confusion matrix.

    Args:
        y_true: gold tags, one list of tag strings per sentence.
        y_pred: predicted tags, in the same nested layout as `y_true`.
        summarize_iob_tags: if true, reduce every tag to its first character
            (e.g. 'B-art' -> 'B') before scoring.

    Returns:
        None. The report and accuracy are printed and the heatmap is drawn
        on a new matplotlib figure.
    """
    # Flatten the per-sentence lists. Use the `y_true` argument here, not the
    # notebook-level `y_test`, so the function scores what it is given.
    flat_true = list(chain.from_iterable(y_true))
    flat_pred = list(chain.from_iterable(y_pred))
    if summarize_iob_tags:
        # summarize IOB tags to only keep the first letter (eg. B-art -> B)
        flat_true = [t[0] for t in flat_true]
        flat_pred = [t[0] for t in flat_pred]

    # Only labels that occur in the gold data become rows/columns.
    labels = np.unique(flat_true)
    print(sklearn.metrics.classification_report(flat_true, flat_pred))
    print(f"Accuracy: {sklearn.metrics.accuracy_score(flat_true, flat_pred)} for {labels}")

    cm = sklearn.metrics.confusion_matrix(flat_true, flat_pred, labels=labels)
    # Divide each row by its total so cells are fractions of the true label.
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    df_cm = pd.DataFrame(cm, index=labels, columns=labels).fillna(0)
    plt.figure(figsize=(10, 7))
    sn.heatmap(df_cm, annot=True)

# Predict a tag sequence for every sentence of the test set.
y_pred = crf.predict(X_test)

# Report twice: first with tags reduced to their first letter, then on the full tag set.
confusion_matrix(y_test, y_pred, summarize_iob_tags=True)
confusion_matrix(y_test, y_pred, summarize_iob_tags=False)

# [ELI5](https://eli5.readthedocs.io/en/latest/)
* ELI5 is a Python package which can be used to check weights of `sklearn_crfsuite.CRF` models.
* Inspect model weights

In [None]:
import eli5

In [None]:
# Inspect the weights of the trained CRF model.
eli5.show_weights(crf, top=10)  # top: number of features to show

* We can specify a subset of tags to check.

In [None]:
# Restrict the display to the listed tags via `targets`.
eli5.show_weights(crf, top=10, targets=['O', 'B-org', 'I-per'])

- Or check only some of the features for all tags.

In [None]:
# Only show features whose name matches the pattern. The pattern is a raw
# string so that `\.` is not treated as an (invalid) string escape sequence,
# which newer Python versions warn about.
eli5.show_weights(crf, top=10, feature_re=r'^word\.is',
                  horizontal_layout=False, show=['targets'])