In [2]:
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim

from future.utils import iteritems
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from subprocess import check_output

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from collections import Counter
import sklearn_crfsuite

import keras
from keras.models import Model, Input, Sequential
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint, EarlyStopping 
import string

import tensorflow
from tensorflow.keras.utils import to_categorical

Using TensorFlow backend.


In [3]:
beth = pd.read_csv('train_data/conll.txt', sep=" ", 
                         names = ["word", "POS_tag","chunk_tag", "NER_tag", "Sentence #"],
                         index_col = False)

In [4]:
beth = beth.dropna()

In [30]:
beth.groupby('NER_tag')['word'].count()

NER_tag
O            111655
problem       16774
test           8259
treatment      8662
Name: word, dtype: int64

In [7]:
punc = ["'", "!", "(", ")", "-", "[", "]", "{", "}", ",", ":", ";", "@", "<", ">"
        "#", "?", "~", "_", "&", "*", "/","^"]
beth_train = beth_train[~beth_train['word'].isin(punc)]
beth_train = beth_train.dropna()
beth_test = beth_test[~beth_test['word'].isin(punc)]
beth_test = beth_test.dropna()

In [5]:
beth_X = beth.drop('NER_tag', axis=1)
v = DictVectorizer(sparse=False)
beth_X = v.fit_transform(beth_X.to_dict('records'))
beth_y = beth.NER_tag.values

classes = np.unique(beth_y)
classes = classes.tolist()

X_train, X_test, y_train, y_test = train_test_split(
    beth_X, beth_y, test_size = 0.2, random_state=0)

In [6]:
new_classes = classes.copy()

#### Baseline

#### 1.1 Naive Bayes

In [11]:
nb = MultinomialNB(alpha=0.01)
nb.partial_fit(X_train, y_train, classes)

MultinomialNB(alpha=0.01)

In [12]:
print(classification_report(y_pred=nb.predict(X_test), y_true=y_test, labels = new_classes))

              precision    recall  f1-score   support

           O       0.96      0.93      0.94     22395
     problem       0.73      0.77      0.75      3398
        test       0.70      0.81      0.75      1558
   treatment       0.67      0.76      0.71      1719

    accuracy                           0.89     29070
   macro avg       0.76      0.82      0.79     29070
weighted avg       0.90      0.89      0.90     29070



#### 1.2 Linear classifiers with SGD training

In [13]:
sgd = SGDClassifier()
sgd.partial_fit(X_train, y_train, classes)

SGDClassifier()

In [14]:
print(classification_report(y_pred=sgd.predict(X_test), y_true=y_test, labels=new_classes))

              precision    recall  f1-score   support

           O       0.86      0.99      0.92     22395
     problem       0.82      0.46      0.59      3398
        test       0.82      0.46      0.59      1558
   treatment       0.86      0.27      0.41      1719

    accuracy                           0.86     29070
   macro avg       0.84      0.54      0.63     29070
weighted avg       0.85      0.86      0.83     29070



#### CRF

In [7]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s['word'].values.tolist(), 
                                                           s['POS_tag'].values.tolist(), 
                                                           s['NER_tag'].values.tolist())]
        self.grouped = self.data.groupby('Sentence #').apply(agg_func)
        self.sentences = [s for s in self.grouped]
        
    def get_next(self):
        try: 
            s = self.grouped['Sentence:{}'.format(self.n_sent)]
            self.n_sent += 1
            return s 
        except:
            return None

In [8]:
getter = SentenceGetter(beth)
sentences = getter.sentences

In [9]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[:4]': word[:4],
        'word[:3]': word[:3],
        'word[:2]': word[:2],
        'word[-4:]': word[-4:],
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'word.ispunctuation': (word in string.punctuation),
        'postag': postag,
        'postag[:2]': postag[:2]
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word[:3]': word1[:3],
            '-1:word[:2]': word1[:2],
            '-1:word[-3:]': word1[-3:],
            '-1:word[-2:]': word1[-2:],
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
            '-1:word.ispunctuation': (word1 in string.punctuation)
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:word[:3]': word1[:3],
            '+1:word[:2]': word1[:2],
            '+1:word[-3:]': word1[-3:],
            '+1:word[-2:]': word1[-2:],
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
            '+1:word.ispunctuation': (word1 in string.punctuation)
        })
    else:
        features['EOS'] = True
        
    if i < len(sent) - 2:
        word2 = sent[i+2][0]
        postag2 = sent[i+2][1]
        features.update({
            '+2:word': word2,
            '+2:len(word)': len(word2),
            '+2:word.lower()': word1.lower(),
            '+2:word.istitle()': word1.istitle(),
            '+2:word.isupper()': word1.isupper(),
            '+2:word[:3]': word2[:3],
            '+2:word[:2]': word2[:2],
            '+2:word[-3:]': word2[-3:],
            '+2:word[-2:]': word2[-2:],
            '+2:word.isdigit()': word2.isdigit(),
            '+2:word.ispunctuation': (word2 in string.punctuation),
            '+2:postag': postag2,
            '+2:postag[:2]': postag2[:2],
    })
    return features

In [10]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

In [11]:
def sent2labels(sent):
    return [label for token, postag, label in sent]

In [12]:
def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [13]:
beth_X = [sent2features(s) for s in sentences]
beth_y = [sent2labels(s) for s in sentences]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(
    beth_X, beth_y, test_size=0.2, random_state=0)

In [19]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.01,
    c2=0.01,
    max_iterations=100,
    all_possible_transitions=True
)

In [None]:
pip install -U 'scikit-learn<0.24'

In [20]:
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.01, c2=0.01,
    keep_tempfiles=None, max_iterations=100)

In [21]:
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred, labels = new_classes))

              precision    recall  f1-score   support

           O       0.95      0.97      0.96     21068
     problem       0.84      0.81      0.82      3367
        test       0.83      0.79      0.81      1584
   treatment       0.86      0.79      0.83      1578

    accuracy                           0.93     27597
   macro avg       0.87      0.84      0.85     27597
weighted avg       0.93      0.93      0.93     27597



In [None]:
def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

In [24]:
c_1 = [0.01, 0.05, 0.1, 0.25, 0.5]
c_2 = [0.01, 0.05, 0.1, 0.25, 0.5]

for i in range(len(c_1)):
    for j in range(len(c_2)):
        crf = sklearn_crfsuite.CRF(
            algorithm='lbfgs',
            c1=c_1[i],
            c2=c_2[j],
            max_iterations=100,
            all_possible_transitions=True
            )
        crf.fit(X_train, y_train)
        y_pred = crf.predict(X_test)
        print("C1:", c_1[i], "C2", c_2[j])
        print(metrics.flat_classification_report(y_test, y_pred, labels = new_classes))
        

C1: 0.01 C2 0.01
              precision    recall  f1-score   support

           O       0.95      0.97      0.96     21068
     problem       0.84      0.81      0.82      3367
        test       0.83      0.79      0.81      1584
   treatment       0.86      0.79      0.83      1578

    accuracy                           0.93     27597
   macro avg       0.87      0.84      0.85     27597
weighted avg       0.93      0.93      0.93     27597

C1: 0.01 C2 0.05
              precision    recall  f1-score   support

           O       0.95      0.97      0.96     21068
     problem       0.83      0.81      0.82      3367
        test       0.83      0.79      0.81      1584
   treatment       0.88      0.79      0.83      1578

    accuracy                           0.93     27597
   macro avg       0.88      0.84      0.86     27597
weighted avg       0.93      0.93      0.93     27597

C1: 0.01 C2 0.1
              precision    recall  f1-score   support

           O       0.95  