## Dependencies

Set up logging and import the required packages.

In [ ]:
import logging
# Route every record at DEBUG level and above to the file 'notebook.log'.
logging.basicConfig(filename='notebook.log', level=logging.DEBUG)
# Write a first record so the log shows that setup completed.
logging.info('Ready to log.')

def log_print(text, level='info'):
    '''Echo `text` to stdout and record it in the log.

    `level` picks the logging call: 'info' uses logging.info, 'error' uses
    logging.error, and any other value falls back to logging.debug.
    '''
    print(text)

    if level == 'error':
        logging.error(text)
        return

    writer = logging.info if level == 'info' else logging.debug
    writer(text)


In [ ]:
import scipy
import pandas as pd
import numpy as np
import eli5
from sklearn_crfsuite import metrics, scorers, CRF, utils
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, confusion_matrix
from sklearn.base import BaseEstimator, TransformerMixin 
import pickle
from datetime import datetime as dt
import spacy
import json
# import seaborn as sns
# import matplotlib.pyplot as plt

import os
import sys
sys.path.append(os.path.join(os.getcwd(),'../NER-evaluation'))

log_print('Imports successful.')

## Inspect

In [ ]:
def get_frames(path, k_fold=True):
    ''' Read the train, validation and test multi-index frames stored under `path`.

    Each split is read from `<path><split>.csv` with ('sentence_id', 'token_id')
    as the index, and missing values in the 'token' column are replaced by the
    string 'NA'.

    With `k_fold` set, the validation rows are appended to the training frame.
    Their sentence ids are first shifted past the largest training sentence id
    so that the combined index has no duplicates.

    Return
    ------
    data : dict
     Frames keyed by 'train', 'validation' and 'test'.
    '''

    data = {}
    for split in ('train', 'validation', 'test'):
        raw = pd.read_csv(path+f'{split}.csv', index_col=['sentence_id','token_id'])
        data[split] = raw.fillna(value={'token': 'NA'})

    if not k_fold:
        return data

    # first sentence id that is not used by the training frame
    first_free_id = data['train'].index.max(0)[0]+1

    held_out = data['validation']
    shifted_ids = held_out.index.levels[0]+first_free_id
    held_out.index = held_out.index.set_levels(shifted_ids, level=0)

    # verify_integrity makes concat raise if any index entry is duplicated
    data['train'] = pd.concat([data['train'], held_out], verify_integrity=True)

    return data

In [ ]:
# Load both corpora. The data directory is one level up when the working
# directory contains 'src', otherwise it sits next to the working directory.
try:
    if 'src' in os.getcwd():
        path = '../data/'
    else:
        path = './data/'

    data_conll = get_frames(path+'conll/')
    data_wnut = get_frames(path+'wnut/')
except Exception as e:
    logging.error(f'Failed: {e}')
    raise
else:
    logging.info('Frames loaded from file.')

In [ ]:
# WNUT entity types paired with the CoNLL entity type each one is folded into.
_wnut_to_conll = (
    ('product', 'MISC'),
    ('creative-work', 'MISC'),
    ('corporation', 'ORG'),
    ('group', 'MISC'),
    ('person', 'PER'),
    ('location', 'LOC'),
)

# Expand every pair to its B-/I- tags; the outside tag 'O' maps to itself.
tag_map = {
    f'{bio}-{wnut}': f'{bio}-{conll}'
    for wnut, conll in _wnut_to_conll
    for bio in ('B', 'I')
}
tag_map['O'] = 'O'

In [ ]:
# Convert the WNUT tags to the CoNLL tag set and check that both corpora then
# use exactly the same tags.
try:
    converted_tags = set(tag_map.values())

    for split, frame in data_wnut.items():
        # Series.map would silently turn an unknown tag into NaN, so reject
        # unknown tags up front and name them in the error.
        unknown = set(frame['tag'].unique()) - set(tag_map) - converted_tags
        if unknown:
            raise ValueError(
                f'WNUT {split} frame has tags missing from tag_map: {sorted(map(str, unknown))}'
            )

        # Tags that are already CoNLL-style pass through unchanged, so
        # re-running this cell does not wipe the column.
        frame['tag'] = frame['tag'].map(lambda tag: tag_map.get(tag, tag))

    # Sort by entity type first, then by the B/I/O prefix.
    conll_sort = sorted(data_conll['train']['tag'].unique(), key=lambda x: (x[1:],x[0]))
    wnut_sort = sorted(data_wnut['train']['tag'].unique(), key=lambda x: (x[1:],x[0]))

    # Compare the whole lists: a zip-based comparison stops at the shorter
    # list and would accept a tag set that is only a prefix of the other.
    # Raised explicitly so the check also runs under `python -O`.
    if conll_sort != wnut_sort:
        raise AssertionError('Mismatched tags between CoNLL and WNUT data.')

    log_print(f'Tag conversion successful. Tags: {str(wnut_sort)}')

except Exception as e:
    logging.error(f'Failed: {e}')
    raise

In [ ]:
# Get the self-annotated tweets
path = '../data/' if 'src' in os.getcwd() else './data/'
tweets = pd.read_csv(path+'twitter/test_29_11_19.csv', index_col=['sentence_id','token_id']).fillna(value={'token': 'NA'})

# Tags used in the tweet annotations, folded into the four CoNLL entity types.
tweet_map = {
    'B-EVENT':'B-MISC',
    'I-EVENT':'I-MISC',
    'B-NORP':'B-MISC',
    'I-NORP':'I-MISC',
    'B-WORK_OF_ART':'B-MISC',
    'I-WORK_OF_ART':'I-MISC',
    'B-PRODUCT':'B-MISC',
    'I-PRODUCT':'I-MISC',
    'B-FAC':'B-LOC',
    'I-FAC':'I-LOC',
    'B-PERSON':'B-PER',
    'I-PERSON':'I-PER',
    'B-GPE':'B-LOC',
    'I-GPE':'I-LOC',
    'B-LOC':'B-LOC',
    'I-LOC':'I-LOC',
    'B-ORG':'B-ORG',
    'I-ORG':'I-ORG',
    'O':'O'
}

# Series.map turns any tag without an entry into NaN, which would only show
# up later as an opaque TypeError when the tags are sorted. Name the
# offending tags here instead.
unmapped = set(tweets['tag'].unique()) - set(tweet_map)
if unmapped:
    raise ValueError(f'Tweet tags missing from tweet_map: {sorted(map(str, unmapped))}')

tweets['tag'] = tweets['tag'].map(tweet_map)

# Sort by entity type first, then by the B/I/O prefix.
tweet_tags = sorted(tweets['tag'].unique(), key=lambda x: (x[1:],x[0]))
log_print(f'Testing tweets ready. Tags: {str(tweet_tags)}')
tweets.head()

## Feature Engineering

Good features are critical, because the model can only assign weights to the features we give it. We use a number of orthographic features, prefixes, suffixes, POS tags and lemmas. Tag transitions are also modelled under the hood by crfsuite. The context window has radius 1, i.e. each token also sees the features of its immediate left and right neighbours.

In [ ]:
class Sent2():
    '''Extract features, tokens and labels from a single sentence.

    A sentence is a DataFrame with one row per token. `features` and `tokens`
    read its 'token' column; `labels` reads its 'tag' column.

    Parameters
    ----------
    attributes : list of str, optional
        Columns copied verbatim into every token's feature dict. `None`
        means no extra columns are copied.
    '''

    def __init__(self,attributes=None):
        self.attrs = attributes

    def __get_features(self, word, prefix):
        '''Get the features dictionary for a single token.

        Parameters
        ----------
        word : row of the sentence frame, or None
            None stands for a missing neighbour at a sentence boundary.
        prefix : str
            '' for the current token, '-1' for the left neighbour and '+1'
            for the right neighbour; prepended to every feature name.

        Returns
        -------
        dict
            {'BOS': True} or {'EOS': True} for a missing left or right
            neighbour, otherwise the token's attribute and affix features.
        '''

        # Test for the boundary explicitly. Catching TypeError here would
        # also hide genuine errors and mislabel real tokens as 'EOS'.
        if word is None:
            key = 'BOS' if prefix == '-1' else 'EOS'
            return {key: True}

        # str() keeps the slicing below valid for tokens that were parsed
        # as numbers.
        token = str(word['token'])

        features = {
            f'{prefix}{attr}': word[attr] for attr in (self.attrs or ())
        }

        # NOTE: prefix3 takes three characters. It used to take two, which
        # made it a duplicate of prefix2; models fitted with the old
        # features should be re-trained.
        features.update({
            f'{prefix}bias': 1.0,
            f'{prefix}prefix2': token[:2],
            f'{prefix}prefix3': token[:3],
            f'{prefix}suffix2': token[-2:],
            f'{prefix}suffix3': token[-3:],
        })

        return features

    def __word2features(self, sent, token_id):
        '''Get the features dictionary over the context window of one token.'''

        # rows of the context window; None where the sentence ends
        current = sent.iloc[token_id]
        left = sent.iloc[token_id - 1] if token_id > 0 else None
        right = sent.iloc[token_id + 1] if token_id + 1 < len(sent) else None

        features = {}

        # add features from all tokens in context window
        for row, prefix in ((current, ''), (left, '-1'), (right, '+1')):
            features.update(self.__get_features(row, prefix))

        # only the current token keeps its bias feature
        features.pop('-1bias', None)
        features.pop('+1bias', None)

        return features

    def features(self, sent):
        '''Convert a sentence to a list of context window feature dicts, one per token.'''
        return [self.__word2features(sent, i) for i in range(len(sent))]

    def tokens(self, sent):
        '''Get the tokens of a sentence joined by single spaces.'''
        return ' '.join(str(token) for token in sent['token'])

    def labels(self,sent):
        '''Get the sequence of target tags for a sentence.'''
        return sent['tag'].tolist()

Sample sentences from our dataframes and create the input and output pairs for training/testing. An input is a list of feature vectors, one for each time step.

In [ ]:
def get_sentences(frame, p=1, name='unknown'):
    ''' Convert the multi-index frame into a DataFrame with one row per sentence.

    Parameters
    ----------
    frame : DataFrame
        Token-level frame whose index has a 'sentence_id' level and which has
        'token' and 'tag' columns. A 'cluster' column, if present, is dropped
        and never used as a feature.
    p : float
        Proportion of sentences to keep. If p < 1, that share of the
        sentences is sampled at random without replacement.
    name : str
        Label stored in the 'dist' column of every returned row.

    Returns
    -------
    DataFrame
        Indexed by sentence_id, with columns 'x' (list of feature dicts, one
        per token), 'y' (list of tags), 'z' (space-joined tokens) and 'dist'.

    '''

    frame = frame.drop(columns='cluster', errors='ignore')

    if p<1:
        # Sample from the sentence ids that actually occur. Drawing
        # positions 0..n-1 and using them as ids is only correct when the
        # ids are contiguous and start at zero.
        level_ids = frame.index.get_level_values('sentence_id')
        sentence_ids = level_ids.unique()
        sample_ids = np.random.choice(sentence_ids,
                                size=int(p*len(sentence_ids)),
                                replace=False)
        frame = frame[level_ids.isin(sample_ids)]

    # every remaining column except the target is a token feature
    att = [column for column in frame.columns.values if column != 'tag']

    # one extractor and one grouping serve all three passes
    extractor = Sent2(att)
    sentences = frame.groupby(level='sentence_id')

    x = sentences.apply(lambda sent: extractor.features(sent))
    y = sentences.apply(lambda sent: extractor.labels(sent))
    z = sentences.apply(lambda sent: extractor.tokens(sent))

    return pd.DataFrame(data={'x': x, 'y': y, 'z': z, 'dist': [name]*len(x)})

In [ ]:
# proportion of sentences kept from every frame
p = 1

try:
    log_print('Getting sentences from multi-index dataframes...')

    # Entries currently switched off:
    # 'tweets': get_sentences(tweets, 1, 'tweet'),
    # 'conll': get_sentences(data_conll['train'], p, 'conll')
    dists = {
        f'wnut_{split}': get_sentences(data_wnut[split], p, 'wnut')
        for split in ('test', 'train')
    }

    # combine conll and wnut data and shuffle
    # dists['mixed'] = pd.concat([train_conll,train_wnut]).reset_index(drop=True).sample(frac=1)
except Exception as e:
    logging.error(f'Failed: {e}')
    raise


## Train

The 'O' tag is not of interest to us; we are more interested in the entity tags.
Let's evaluate models based on a flat F1 score computed over every tag except 'O'.

In [ ]:
# Tags to score on: every tag seen in the WNUT training frame except 'O'.
labels = data_wnut['train']['tag'].unique().tolist()
labels.remove('O')

In [ ]:
def get_name(dist_):
    '''Return the pickle path for a model trained on `dist_`, stamped with today's date as dd_mm_yy.'''
    stamp = dt.now().strftime("%d_%m_%y")
    return f'./models/crf_{stamp}_{dist_}.pickle'

In [ ]:
# For grid searching for decent model regularisation parameters

# dist = 'mixed'

# model_name = get_name(dist)
# train = dists[dist]

# try:
#     iters = 20
#     cv = 3
#     params_space = {
#     'c1': scipy.stats.expon(scale=1),
#     'c2': scipy.stats.expon(scale=0.02),
#     }

#     logging.info(f'Fitting {iters*cv} models...')
#     rs = RandomizedSearchCV(CRF(all_possible_transitions=True), params_space, n_iter=iters, cv=cv, scoring=flat_f1, n_jobs=10, verbose=1)
#     rs.fit(train['x'], train['y'])
#     clf = rs.best_estimator_

#     log_print(f'Fit successful. Best params {rs.best_params_}')
# except Exception as e:
#     logging.error(f'Failed: {e}')
#     raise e


In [ ]:
# For fitting full model now (if grid search previously, full model in crf variable)

dist = 'wnut_train'

model_name = get_name(dist)
train = dists[dist]
# L1 (c1) and L2 (c2) regularisation weights
best_params = {'c1':0.02, 'c2':0.005}

try:
    log_print(f'Fitting full model...')

    clf = CRF(all_possible_transitions=True, **best_params)
    clf.max_iterations = 100
    clf.epsilon = 1e-6
    clf.fit(train['x'], train['y'])
except Exception as e:
    log_print(f'Failed: {e}', level='error')
    raise
else:
    log_print(f'Fit successful.')

    # Save only after a successful fit. Saving unconditionally could write
    # an unfitted model, or a `clf` left over from an earlier cell, under
    # today's model name.
    try:
        os.makedirs(os.path.dirname(model_name), exist_ok=True)
        with open(model_name, 'wb') as f:
            pickle.dump(clf,f)
    except Exception as e:
        # Saving stays best-effort (the fitted model is still in `clf`),
        # but the failure is reported instead of being swallowed.
        log_print(f'Could not save model to {model_name}: {e}', level='error')

## Evaluate

In [ ]:
from ner_evaluation.ner_eval import collect_named_entities, compute_metrics, compute_precision_recall_wrapper

In [ ]:
def score(y_true, y_pred):
    ''' Sum the per-sentence metric counts, overall and per entity type.

    `y_true` and `y_pred` are parallel sequences of tag sequences. Returns
    `(results, type_results)`: the summed counts for every evaluation schema,
    and the same broken down by entity type. Both are None when there are no
    sentences.
    '''

    tags = ['MISC','PER','LOC','ORG']
    metrics_ = ['correct', 'incorrect', 'partial', 'missed', 'spurious', 'possible', 'actual']
    measures = ['ent_type', 'partial', 'exact', 'strict']

    results, type_results = None, None

    for gold_seq, pred_seq in zip(y_true,y_pred):
        gold_entities = collect_named_entities(gold_seq)
        pred_entities = collect_named_entities(pred_seq)
        sent_results, sent_type_results = compute_metrics(gold_entities, pred_entities, tags)

        # the first sentence's count dicts become the accumulators
        if results is None:
            results, type_results = sent_results, sent_type_results
            continue

        # add this sentence's counts, overall and by entity type
        for eval_schema in measures:
            for metric in metrics_:
                results[eval_schema][metric] += sent_results[eval_schema][metric]
                for e_type in tags:
                    type_results[e_type][eval_schema][metric] += sent_type_results[e_type][eval_schema][metric]

    return results, type_results

In [ ]:
models = ['crf_25_11_19_conll','crf_25_11_19_wnut','crf_27_11_19_mixed']
# models = ['crf_09_12_19_wnut_train']

# The 'tweets' entry may be switched off where `dists` is built; build it on
# demand so this cell does not stop with a KeyError.
if 'tweets' not in dists:
    dists['tweets'] = get_sentences(tweets, 1, 'tweet')
test = dists['tweets']

# tags ordered by entity type first, then by the B/I prefix
sorted_labels = sorted(labels, key = lambda x: (x[1:],x[0]))


def _safe_f1(precision, recall):
    '''F1 rounded to 3 places; 0.0 when precision and recall are both zero.'''
    total = precision + recall
    if total == 0:
        # nothing predicted and/or nothing correct: avoid ZeroDivisionError
        return 0.0
    return round(2*((precision*recall)/total),3)


for name in models:
    model_name = f'./models/{name}.pickle'

    # load the model (only unpickle model files you created yourself)
    with open(model_name, 'rb') as f:
        clf = pickle.load(f)
        log_print(f'Loaded pre-trained model {model_name}.')

    # regularisation weights
    log_print(f'c1:{clf.c1}, c2:{clf.c2}')

    # predict on test set
    y_pred = clf.predict(test['x'])

    # flat f1 score
    flat_f1_score = metrics.flat_f1_score(test['y'], y_pred,
                        average='weighted', labels=labels)

    log_print(f'{name} flat F1 score: '+str(flat_f1_score))

    # save predictions for wnut shared task
    # with open('wnut_predictions.txt', 'wb') as f:
    #     for sent_tokens, sent_trues, sent_preds in zip(data_wnut['test'].groupby(level='sentence_id'), test['y'], y_pred):
    #         for token, gold, pred in zip(sent_tokens[1]['token'], sent_trues, sent_preds):
    #             f.write(f'{token}\t{gold}\t{pred}\n'.encode('utf-8'))
    #         f.write('\n'.encode('utf-8'))

    # per-tag classification report
    print(metrics.flat_classification_report(
        test['y'], y_pred, labels=sorted_labels, digits=3
    ))

    # overall and per-entity type metric counts for each measure
    overall_results, type_results = score(test['y'], y_pred)

    # compute precision, recall, f1 for each measure, per entity type
    for tag, tag_counts in type_results.items():
        res = compute_precision_recall_wrapper(tag_counts)
        for measure, prf in res.items():
            print(tag, measure)
            print('f1:', _safe_f1(prf['precision'], prf['recall']))
        print('\n')

    # the same over all entity types together
    results = compute_precision_recall_wrapper(overall_results)
    for k, res in results.items():
        print(k)
        print('f1:', _safe_f1(res['precision'], res['recall']))


In [ ]:
# Transition and per-tag weights of each model

# Available models:
# 'crf_25_11_19_conll', 'crf_25_11_19_wnut', 'crf_27_11_19_mixed'

name = 'crf_27_11_19_mixed'
# `test` is not used in this cell itself; the confusion-matrix cell reads it.
# NOTE(review): the 'tweets' entry is commented out where `dists` is built in
# this file, so this lookup raises KeyError unless it is added elsewhere.
test = dists['tweets']
model_name = f'./models/{name}.pickle'

# Load the pickled model (only unpickle model files you created yourself).
with open(model_name, 'rb') as f:
    clf = pickle.load(f)
    log_print(f'Loaded pre-trained model {model_name}.')

# Last expression of the cell, so the notebook renders the weight tables.
# Per eli5's documentation a tuple `top` limits the number of positive and
# negative features shown - TODO confirm against the installed version.
eli5.show_weights(clf, top=(5,5))

In [ ]:
# Confusion matrices

model_name = './models/crf_27_11_19_mixed.pickle'

# Load the pickled model (only unpickle model files you created yourself).
with open(model_name,'rb') as f:
    clf = pickle.load(f)

y_pred = clf.predict(test['x'])

# flatten the per-sentence tag sequences into one long sequence each
flat_pred = [tag for sent in y_pred for tag in sent]
flat_true = [tag for sent in test['y'] for tag in sent]

conf = confusion_matrix(flat_true, flat_pred, labels=sorted_labels)

# confusion_matrix puts the true labels on the rows and the predictions on the
# columns. The public constructor keeps that orientation; the private
# DataFrame._from_arrays treats every row of `conf` as a column, which
# transposes the matrix against the axis labels set below.
conf_df = pd.DataFrame(conf, index=sorted_labels, columns=sorted_labels)

# NOTE: `sns` is seaborn. Its import is commented out in the imports cell and
# has to be enabled before this cell can run.
ax = sns.heatmap(data=conf_df, cmap='Blues')
ax.set(xlabel='Predicted', ylabel='True')

In [ ]:
def sent_to_flat_frame(X):
    '''Flatten sentences of feature dicts into one frame.

    Takes a sequence of sentences, each a list of feature dicts (one per
    token). Returns a frame with one row per token and one column per feature
    name, indexed by ('sentence_id', 'token_id'), where both ids are the
    positions within `X` and within the sentence. Having every feature in its
    own column makes it easy to permute a single feature later.
    '''

    features, sentence_id, token_id = [], [], []

    for sent_pos, sent in enumerate(X):
        for tok_pos, feature_dict in enumerate(sent):
            features.append(feature_dict)
            sentence_id.append(sent_pos)
            token_id.append(tok_pos)

    df = pd.DataFrame.from_records(features)
    df.index = pd.MultiIndex.from_arrays(
        [sentence_id, token_id], names=['sentence_id','token_id']
    )

    return df


def importances(clf, scorer, X, y_true, labels, columns_to_shuffle=None, n_iters=3):
    '''Permutation importance: shuffle feature columns and report the score drop.

    The baseline is `scorer(y_true, clf.predict(X), average='weighted',
    labels=labels)`. Every entry of `columns_to_shuffle` (default: every
    feature column) is then permuted across all tokens `n_iters` times; a
    tuple entry permutes each of its columns independently within the same
    run. Returns `(base_score, reductions)`, where `reductions` maps
    `str(entry)` to the list of `base_score - shuffled_score` values.
    '''

    def _score(inputs):
        return scorer(y_true, clf.predict(inputs), average='weighted', labels=labels)

    def _to_sentences(flat):
        # back to one list of feature dicts per sentence, leaving out the
        # features a token does not have (null cells of the flat frame)
        return flat.groupby(level='sentence_id'
                  ).apply(lambda sent: [{k:v for k,v in m.items() if pd.notnull(v)} for m in sent.to_dict('records')])

    base_score = _score(X)

    X_flat = sent_to_flat_frame(X)

    if columns_to_shuffle is None:
        columns_to_shuffle = X_flat.columns

    reductions = {}

    for c in columns_to_shuffle:
        print(f'Shuffling on {c}')
        targets = c if isinstance(c,tuple) else (c,)
        drops = []

        for _ in range(n_iters):
            shuffled = X_flat.copy()
            for column in targets:
                shuffled[column] = np.random.permutation(X_flat[column].values)

            drops.append(base_score - _score(_to_sentences(shuffled)))

        reductions[str(c)] = drops

    return (base_score,reductions)

In [ ]:
# Permutation importance

models = ['crf_25_11_19_conll','crf_25_11_19_wnut','crf_27_11_19_mixed']

# The 'tweets' entry may be switched off where `dists` is built; build it on
# demand so this cell does not stop with a KeyError.
if 'tweets' not in dists:
    dists['tweets'] = get_sentences(tweets, 1, 'tweet')
test = dists['tweets']
fi_s = {}

# None makes `importances` shuffle every feature column. Set a list such as
# ['norm','lower'] to restrict the run. Defined here because the name was
# otherwise never assigned, which raised a NameError below.
columns = None

for model in models:

    model_name = f'./models/{model}.pickle'

    # Load the pickled model (only unpickle model files you created yourself).
    with open(model_name,'rb') as f:
        clf = pickle.load(f)

    log_print('Feature importances for '+model_name)

    base_score, reductions = importances(clf, metrics.flat_f1_score, test['x'], test['y'], labels, columns)

    # summarise the score drops per shuffled column, largest mean drop first
    feature_importances = {k:{'mean':np.mean(v),'std':np.std(v), 'raw':v} for k,v in reductions.items()}
    fi_sorted = sorted(feature_importances.items(), key=lambda x: x[1]['mean'], reverse=True)
    fi_s[model] = fi_sorted

    log_print(str(fi_sorted))


In [ ]:
from scipy.stats import spearmanr

# Spearman rank correlation between the 'norm' and 'lower' columns of the tweets
X = tweets[['norm','lower']]
rank_result = spearmanr(X)
corr = rank_result.correlation
corr

In [ ]:
# if os.name == 'nt':
#     _x = [s['c1'] for s in rs.cv_results_['params']]
#     _y = [s['c2'] for s in rs.cv_results_['params']]
#     _c = [round(s,2) for s in rs.cv_results_['mean_test_score']]

#     fig = plt.figure()
#     fig.set_size_inches(12, 12)
#     ax = plt.gca()
#     ax.set_yscale('log')
#     ax.set_xscale('log')
#     ax.set_xlabel('C1')
#     ax.set_ylabel('C2')
#     ax.set_title("Randomized Hyperparameter Search Sampled CV Results (min={:0.3}, max={:0.3})".format(
#         min(_c), max(_c)
#     ))

#     sns.scatterplot(x=_x, y=_y, hue=_c, size=_c, ax=ax)

#     # ax.scatter(_x, _y, c=_c, s=60, alpha=0.9, edgecolors=[0,0,0])

#     print(f'Dark blue => {round(min(_c),3)}, dark red => {round(max(_c),3)}')

#     logging.info('Evaluation successful.')