In [3]:
import sklearn, pickle
import pandas as pd

In [2]:
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load a candidates file that was produced by a trusted pipeline.
with open('candidates_TrainDevTest.pickle', 'rb') as f:
    candidate_dict = pickle.load(f)

# Report how many candidates each split holds. The single-argument,
# parenthesised print produces the same output on Python 2 and Python 3.
for splt in candidate_dict:
    print("Split = %i : imported %i candidates" % (splt, len(candidate_dict[splt])))

Split = 0 : imported 9917 candidates
Split = 1 : imported 6227 candidates
Split = 2 : imported 8285 candidates


In [4]:
splt = 0

In [5]:
# Rebuild text

In [151]:
from copy import deepcopy
from itertools import groupby
from operator import itemgetter

def recreate_text_representation(value, entity_replacement = True, span_replacement = True, as_tokens=False , replace_conseq_entities = False):
    """Re-generate the sentence text of one candidate from the snorkel dict format.

    value: dict of a certain candidate. Keys read here: 'words' and 'entity_types',
        plus 'chem_idx' and 'gene_idx' when span_replacement is on.
        The dict and the lists it holds are not modified.
    entity_replacement: Boolean: whether to replace every token whose entity type is
        not 'O' with the upper-cased name of that type (eg. CH2 -> CHEMICAL).
    span_replacement: Boolean: whether to replace the whole spans of the entity
        candidates with ENTITY1 (positions in 'chem_idx') and ENTITY2 (positions in
        'gene_idx'), each span being collapsed to a single token.
    as_tokens: Boolean: return the list of tokens instead of one joined string.
    replace_conseq_entities: Boolean: collapse every run of consecutive identical
        tokens into one token. Note that this applies to any repeated token, not
        only to entity placeholders.

    Returns a list of tokens if as_tokens is True, otherwise the tokens joined
    with single spaces.
    """
    # A shallow copy is sufficient: tokens are only re-assigned or deleted below,
    # never mutated in place, so the caller's 'words' list stays untouched.
    tokens = list(value['words'])
    if entity_replacement:
        for i, t_type in enumerate(value['entity_types']):
            if t_type != u'O':
                tokens[i] = str(t_type).upper()

    #TODO: potentially replace with lemmas here maybe

    # Replace spans with ENTITY1 / ENTITY2. After the deletions below the token
    # positions no longer line up with the other per-token lists in `value`.
    if span_replacement:
        chem_idx = value['chem_idx']
        gene_idx = value['gene_idx']
        for idx in chem_idx:
            tokens[idx] = "ENTITY1"
        # Gene positions are written second, so they win where the spans overlap.
        for idx in gene_idx:
            tokens[idx] = "ENTITY2"
        # Convert each span to one token by dropping every position but its first.
        # A set is used so that a position shared by overlapping spans is deleted
        # once only; deleting it twice would remove an unrelated later token.
        idx_to_del = set(gene_idx[1:]) | set(chem_idx[1:])
        # Delete from the back so earlier deletions do not shift later indices.
        for index in sorted(idx_to_del, reverse=True):
            del tokens[index]

    if replace_conseq_entities:
        # Build a real list (map() is a lazy iterator on Python 3).
        tokens = [token for token, _ in groupby(tokens)]

    if as_tokens:
        return tokens
    else:
        return ' '.join(tokens) #TODO : slow - optimize

# Load train, dev, test datasets

In [152]:
candidate_dict[0][1].keys()

['gene_idx',
 'lemmas',
 'chem_idx',
 'sent_id',
 'pos_tags',
 'label',
 'words',
 'cand_id',
 'entity_types',
 'doc_id']

In [156]:
# take text + labels from train ds
splt = 0

def candidate_dict_to_df(candidate_dict, entity_replacement = True, span_replacement = True, as_tokens=False , replace_conseq_entities = False ):
    """Convert the snorkel candidate dict of ONE split into a dataframe.

    candidate_dict: dictionary whose values are the per-candidate dicts from snorkel
                ** requires the inner split dictionary (eg candidate_dict = candidate_dictionary[split_number])
                Each candidate must provide 'cand_id', 'label', 'doc_id' and 'sent_id',
                plus whatever recreate_text_representation reads.

    entity_replacement,
    span_replacement,
    as_tokens,
    replace_conseq_entities: forwarded unchanged to recreate_text_representation
                (see that function's docstring)

    Returns a DataFrame indexed by 'cand_id' with the columns
    'text', 'label', 'doc_id' and 'sent_id' (in that order).
    """
    # Materialise the candidates once so every column is built from the same
    # sequence in the same order.
    candidates = list(candidate_dict.values())

    texts = [
        recreate_text_representation(
            cand,
            entity_replacement=entity_replacement,
            span_replacement=span_replacement,
            as_tokens=as_tokens,
            replace_conseq_entities=replace_conseq_entities,
        )
        for cand in candidates
    ]

    # Passing the texts as a named column (rather than as row data) keeps one cell
    # per candidate even when as_tokens=True makes every text a list of tokens.
    df = pd.DataFrame({'text': texts}, index=[cand['cand_id'] for cand in candidates])
    # Real lists are assigned (map() would be a lazy iterator on Python 3).
    df['label'] = [cand['label'] for cand in candidates]
    df['doc_id'] = [cand['doc_id'] for cand in candidates]
    df['sent_id'] = [cand['sent_id'] for cand in candidates]
    return df

In [162]:
# Split keys of candidate_dict, in order: 0 -> train, 1 -> dev, 2 -> test.
df_train, df_dev, df_test = (
    candidate_dict_to_df(candidate_dict[split_id]) for split_id in (0, 1, 2)
)

In [163]:
df_train.head()

Unnamed: 0,text,label,doc_id,sent_id
1,CONCLUSION : Our results indicate that ENTITY1...,1,15650315,24197
2,CONCLUSION : Our results indicate that ENTITY1...,-1,15650315,24197
3,To identify important covariates associated wi...,-1,23238783,26907
4,To identify important covariates associated wi...,-1,23238783,26907
5,To identify important covariates associated wi...,-1,23238783,26907


In [164]:
df_dev.head()

Unnamed: 0,text,label,doc_id,sent_id
16028,"BACKGROUND : CHEMICAL , the first dual inhibit...",-1,18803986,6601
16029,"BACKGROUND : CHEMICAL , the first dual inhibit...",-1,18803986,6601
16030,"BACKGROUND : CHEMICAL , the first dual inhibit...",-1,18803986,6601
16031,"BACKGROUND : CHEMICAL , the first dual inhibit...",-1,18803986,6601
16033,"BACKGROUND : ENTITY1 , the first dual inhibito...",1,18803986,6601


**Combine the train + dev datasets**

In [None]:
# Stack dev below train, keeping the original cand_id index of both frames.
# pd.concat is used because DataFrame.append was deprecated and later removed
# from pandas; the resulting frame is the same.
df_train_dev = pd.concat([df_train, df_dev])

# Investigate the label distribution

In [169]:
df_train_dev.label.value_counts()

-1    12120
 1     4024
Name: label, dtype: int64

In [170]:
df_test.label.value_counts()

-1    6305
 1    1980
Name: label, dtype: int64

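### Check the label distribution after dropping duplicates
(duplicates are detected on `sent_id` only, ignoring the label column, so this basically keeps one candidate per sentence and drops the other candidates inside the same sentence)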

In [171]:
# One row per sentence is kept (duplicates are matched on sent_id only), then the
# labels of the surviving rows are counted. Each print call takes a single
# argument so the output is the same on Python 2 and Python 3.
print("Train+dev:")
print(df_train_dev.drop_duplicates(subset=['sent_id']).label.value_counts())
print("Test:")
print(df_test.drop_duplicates(subset=['sent_id']).label.value_counts())

Train+dev:
-1    2771
 1    1180
Name: label, dtype: int64
Test:
-1    1410
 1     614
Name: label, dtype: int64


# Build a BOW classifier

In [32]:
from sklearn.feature_extraction.text import CountVectorizer

In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [33]:
# Bag-of-words featurizer. It is constructed without arguments, so every
# CountVectorizer setting is left at the library default.
vectorizer = CountVectorizer()

In [167]:
# Earlier variant that kept train and dev separate (disabled; the combined
# train+dev frame is used instead):
# X_train = vectorizer.fit_transform(df_train.text)
# y_train = df_train.label

# X_dev = vectorizer.transform(df_dev.text)
# y_dev = df_dev.label

# Fit the vectorizer on the combined train+dev text and build the training matrix.
X_train_dev = vectorizer.fit_transform(df_train_dev.text)
y_train_dev = df_train_dev.label

# The test text is only transformed with the already-fitted vectorizer,
# it is never used for fitting.
X_test = vectorizer.transform(df_test.text)
y_test = df_test.label

In [172]:
len(df_train_dev)

16144

In [284]:
from sklearn.metrics import confusion_matrix

In [216]:
from sklearn.metrics import classification_report
from io import StringIO
import re

def report_to_df(report):
    """Parse the text table produced by classification_report into a DataFrame.

    report: the report as one string.

    The text is turned into space-separated values: runs of spaces become one
    space, the "avg / total" row label is fused into a single field, the space
    that starts a line is dropped, and "Classes" is prepended as the header of
    the first (index) column. The result is then read with pandas.
    """
    single_spaced = re.sub(r" +", " ", report)
    fused_label = single_spaced.replace("avg / total", "avg/total")
    left_trimmed = fused_label.replace("\n ", "\n")
    csv_buffer = StringIO("Classes" + left_trimmed)
    return pd.read_csv(csv_buffer, sep=' ', index_col=0)

def train_evaluate(model, to_latex=False):
    """Fit `model` on the train+dev features and report its test-set performance.

    model: estimator exposing fit(X, y) and predict(X); it is fitted in place.
    to_latex: Boolean: additionally print the report wrapped in a LaTeX table
        environment.

    Reads the notebook-level X_train_dev, y_train_dev, X_test and y_test.
    Prints the test confusion matrix and returns the classification report as a
    DataFrame (see report_to_df).
    """
    model.fit(X_train_dev,y_train_dev)
    y_pred = model.predict(X_test)
    report = report_to_df(classification_report(y_test, y_pred))
    # Two single-argument print calls give the same output on Python 2 and 3.
    print("confusion_matrix:")
    print(confusion_matrix(y_test, y_pred))

    if to_latex:
        print("Latex table:\n")
        # Backslashes are escaped explicitly: "\c" and "\e" are not valid string
        # escapes and only worked because unknown escapes are passed through.
        print("\\begin{table}[H]\\centering" + report.to_latex() + "\\caption{Table label}\n\\end{table}")

    return report


## Logistic Regression

In [None]:
# SVC is otherwise imported only further down in the notebook; importing it in
# this cell keeps the cell runnable when the notebook is executed top to bottom.
from sklearn.svm import SVC
SVC(kernel='poly',)

In [316]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [317]:
# Registry mapping a display name to an unfitted estimator. Only the kNN entry
# is active; the other candidates are kept below as commented-out entries.
# NOTE(review): the '5' in 'kNN_5' presumably refers to the default number of
# neighbours of KNeighborsClassifier - confirm against the sklearn version in use.
models = {
#     'LogisticRegression': LogisticRegression(),
#     'SVC_linear': SVC(kernel = 'linear'),
# #     'SVC_rbf': SVC(kernel = 'rbf'),
# #     'SVC_rbf_C=100': SVC(kernel = 'rbf', C=100),
# #     'SVC_rbf_C=500': SVC(kernel = 'rbf', C=500),
# #     'SVC_rbf_C=1000': SVC(kernel = 'rbf', C=1000),
# #     'SVC_poly3_C=100': SVC(kernel = 'poly', C=100,),
# #     'SVC_poly3_C=500': SVC(kernel = 'poly', C=500, ),
# #     'SVC_poly3_C=1000': SVC(kernel = 'poly', C=1000, ),
# #     'SVC_rbf': SVC(kernel = 'rbf'),
# #     'SVC_rbf': SVC(kernel = 'rbf'),
#     'RandomForestClassifier': RandomForestClassifier(),
    'kNN_5': KNeighborsClassifier()
#     'RandomForestClassifier': RandomForestClassifier(n_estimators=100, max_depth=5),



}

In [302]:
from collections import defaultdict
import time

In [217]:
train_evaluate(LogisticRegression(), to_latex=True)

confusion_matrix:
[[5538  767]
 [1181  799]]
Latex table:

\begin{table}[H]\centering\begin{tabular}{lrrrr}
\toprule
{} &  precision &  recall &  f1-score &  support \\
Classes   &            &         &           &          \\
\midrule
-1        &       0.82 &    0.88 &      0.85 &     6305 \\
1         &       0.51 &    0.40 &      0.45 &     1980 \\
avg/total &       0.75 &    0.76 &      0.75 &     8285 \\
\bottomrule
\end{tabular}
\caption{Table label}
\end{table}


Unnamed: 0_level_0,precision,recall,f1-score,support
Classes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,0.82,0.88,0.85,6305
1,0.51,0.4,0.45,1980
avg/total,0.75,0.76,0.75,8285


## SVM

In [95]:
from sklearn.svm import SVC

**Linear kernel**

In [218]:
train_evaluate(SVC(kernel = 'linear'), to_latex=True)

confusion_matrix:
[[5034 1271]
 [1072  908]]
Latex table:

\begin{table}[H]\centering\begin{tabular}{lrrrr}
\toprule
{} &  precision &  recall &  f1-score &  support \\
Classes   &            &         &           &          \\
\midrule
-1        &       0.82 &    0.80 &      0.81 &     6305 \\
1         &       0.42 &    0.46 &      0.44 &     1980 \\
avg/total &       0.73 &    0.72 &      0.72 &     8285 \\
\bottomrule
\end{tabular}
\caption{Table label}
\end{table}


Unnamed: 0_level_0,precision,recall,f1-score,support
Classes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,0.82,0.8,0.81,6305
1,0.42,0.46,0.44,1980
avg/total,0.73,0.72,0.72,8285


In [229]:
train_evaluate(SVC(kernel = 'linear', class_weight='balanced'), to_latex=True)

confusion_matrix:
[[4471 1834]
 [ 750 1230]]
Latex table:

\begin{table}[H]\centering\begin{tabular}{lrrrr}
\toprule
{} &  precision &  recall &  f1-score &  support \\
Classes   &            &         &           &          \\
\midrule
-1        &       0.86 &    0.71 &      0.78 &     6305 \\
1         &       0.40 &    0.62 &      0.49 &     1980 \\
avg/total &       0.75 &    0.69 &      0.71 &     8285 \\
\bottomrule
\end{tabular}
\caption{Table label}
\end{table}


Unnamed: 0_level_0,precision,recall,f1-score,support
Classes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,0.86,0.71,0.78,6305
1,0.4,0.62,0.49,1980
avg/total,0.75,0.69,0.71,8285


**Polynomial kernel, degree = 2**

In [219]:
train_evaluate(SVC(kernel = 'poly', degree=2), to_latex=True)

confusion_matrix:
[[6305    0]
 [1980    0]]
Latex table:

\begin{table}[H]\centering\begin{tabular}{lrrrr}
\toprule
{} &  precision &  recall &  f1-score &  support \\
Classes   &            &         &           &          \\
\midrule
-1        &       0.76 &    1.00 &      0.86 &     6305 \\
1         &       0.00 &    0.00 &      0.00 &     1980 \\
avg/total &       0.58 &    0.76 &      0.66 &     8285 \\
\bottomrule
\end{tabular}
\caption{Table label}
\end{table}


Unnamed: 0_level_0,precision,recall,f1-score,support
Classes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,0.76,1.0,0.86,6305
1,0.0,0.0,0.0,1980
avg/total,0.58,0.76,0.66,8285


In [230]:
train_evaluate(SVC(kernel = 'poly', degree=2, class_weight='balanced'), to_latex=True)

confusion_matrix:
[[ 480 5825]
 [  47 1933]]
Latex table:

\begin{table}[H]\centering\begin{tabular}{lrrrr}
\toprule
{} &  precision &  recall &  f1-score &  support \\
Classes   &            &         &           &          \\
\midrule
-1        &       0.91 &    0.08 &      0.14 &     6305 \\
1         &       0.25 &    0.98 &      0.40 &     1980 \\
avg/total &       0.75 &    0.29 &      0.20 &     8285 \\
\bottomrule
\end{tabular}
\caption{Table label}
\end{table}


Unnamed: 0_level_0,precision,recall,f1-score,support
Classes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,0.91,0.08,0.14,6305
1,0.25,0.98,0.4,1980
avg/total,0.75,0.29,0.2,8285


**Polynomial kernel, degree = 3**

In [220]:
train_evaluate(SVC(kernel = 'poly', degree=3), to_latex=True)

confusion_matrix:
[[6305    0]
 [1980    0]]
Latex table:

\begin{table}[H]\centering\begin{tabular}{lrrrr}
\toprule
{} &  precision &  recall &  f1-score &  support \\
Classes   &            &         &           &          \\
\midrule
-1        &       0.76 &    1.00 &      0.86 &     6305 \\
1         &       0.00 &    0.00 &      0.00 &     1980 \\
avg/total &       0.58 &    0.76 &      0.66 &     8285 \\
\bottomrule
\end{tabular}
\caption{Table label}
\end{table}


Unnamed: 0_level_0,precision,recall,f1-score,support
Classes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,0.76,1.0,0.86,6305
1,0.0,0.0,0.0,1980
avg/total,0.58,0.76,0.66,8285


In [232]:
train_evaluate(SVC(kernel = 'poly', degree=3,class_weight='balanced'), to_latex=True)

confusion_matrix:
[[   0 6305]
 [   0 1980]]
Latex table:

\begin{table}[H]\centering\begin{tabular}{lrrrr}
\toprule
{} &  precision &  recall &  f1-score &  support \\
Classes   &            &         &           &          \\
\midrule
-1        &       0.00 &    0.00 &      0.00 &     6305 \\
1         &       0.24 &    1.00 &      0.39 &     1980 \\
avg/total &       0.06 &    0.24 &      0.09 &     8285 \\
\bottomrule
\end{tabular}
\caption{Table label}
\end{table}


Unnamed: 0_level_0,precision,recall,f1-score,support
Classes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,0.0,0.0,0.0,6305
1,0.24,1.0,0.39,1980
avg/total,0.06,0.24,0.09,8285


**Gaussian (RBF) kernel**

In [221]:
train_evaluate(SVC(kernel = 'rbf'), to_latex=True)

confusion_matrix:
[[6305    0]
 [1980    0]]
Latex table:

\begin{table}[H]\centering\begin{tabular}{lrrrr}
\toprule
{} &  precision &  recall &  f1-score &  support \\
Classes   &            &         &           &          \\
\midrule
-1        &       0.76 &    1.00 &      0.86 &     6305 \\
1         &       0.00 &    0.00 &      0.00 &     1980 \\
avg/total &       0.58 &    0.76 &      0.66 &     8285 \\
\bottomrule
\end{tabular}
\caption{Table label}
\end{table}


Unnamed: 0_level_0,precision,recall,f1-score,support
Classes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,0.76,1.0,0.86,6305
1,0.0,0.0,0.0,1980
avg/total,0.58,0.76,0.66,8285


In [233]:
train_evaluate(SVC(kernel = 'rbf', class_weight = 'balanced'), to_latex=True)

confusion_matrix:
[[2622 3683]
 [ 350 1630]]
Latex table:

\begin{table}[H]\centering\begin{tabular}{lrrrr}
\toprule
{} &  precision &  recall &  f1-score &  support \\
Classes   &            &         &           &          \\
\midrule
-1        &       0.88 &    0.42 &      0.57 &     6305 \\
1         &       0.31 &    0.82 &      0.45 &     1980 \\
avg/total &       0.74 &    0.51 &      0.54 &     8285 \\
\bottomrule
\end{tabular}
\caption{Table label}
\end{table}


Unnamed: 0_level_0,precision,recall,f1-score,support
Classes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,0.88,0.42,0.57,6305
1,0.31,0.82,0.45,1980
avg/total,0.74,0.51,0.54,8285


In [181]:
#### Penalize RBF

In [222]:
train_evaluate(SVC(kernel = 'rbf', C=100), to_latex=True)

confusion_matrix:
[[5882  423]
 [1465  515]]
Latex table:

\begin{table}[H]\centering\begin{tabular}{lrrrr}
\toprule
{} &  precision &  recall &  f1-score &  support \\
Classes   &            &         &           &          \\
\midrule
-1        &       0.80 &    0.93 &      0.86 &     6305 \\
1         &       0.55 &    0.26 &      0.35 &     1980 \\
avg/total &       0.74 &    0.77 &      0.74 &     8285 \\
\bottomrule
\end{tabular}
\caption{Table label}
\end{table}


Unnamed: 0_level_0,precision,recall,f1-score,support
Classes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,0.8,0.93,0.86,6305
1,0.55,0.26,0.35,1980
avg/total,0.74,0.77,0.74,8285


In [234]:
train_evaluate(SVC(kernel = 'rbf', C=100, class_weight = 'balanced'), to_latex=True)

confusion_matrix:
[[4600 1705]
 [ 548 1432]]
Latex table:

\begin{table}[H]\centering\begin{tabular}{lrrrr}
\toprule
{} &  precision &  recall &  f1-score &  support \\
Classes   &            &         &           &          \\
\midrule
-1        &       0.89 &    0.73 &      0.80 &     6305 \\
1         &       0.46 &    0.72 &      0.56 &     1980 \\
avg/total &       0.79 &    0.73 &      0.75 &     8285 \\
\bottomrule
\end{tabular}
\caption{Table label}
\end{table}


Unnamed: 0_level_0,precision,recall,f1-score,support
Classes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,0.89,0.73,0.8,6305
1,0.46,0.72,0.56,1980
avg/total,0.79,0.73,0.75,8285


In [224]:
train_evaluate(SVC(kernel = 'rbf', C=500), to_latex=True)

confusion_matrix:
[[5549  756]
 [1206  774]]
Latex table:

\begin{table}[H]\centering\begin{tabular}{lrrrr}
\toprule
{} &  precision &  recall &  f1-score &  support \\
Classes   &            &         &           &          \\
\midrule
-1        &       0.82 &    0.88 &      0.85 &     6305 \\
1         &       0.51 &    0.39 &      0.44 &     1980 \\
avg/total &       0.75 &    0.76 &      0.75 &     8285 \\
\bottomrule
\end{tabular}
\caption{Table label}
\end{table}


Unnamed: 0_level_0,precision,recall,f1-score,support
Classes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,0.82,0.88,0.85,6305
1,0.51,0.39,0.44,1980
avg/total,0.75,0.76,0.75,8285


In [235]:
train_evaluate(SVC(kernel = 'rbf', C=500, class_weight = 'balanced'), to_latex=True)

confusion_matrix:
[[4611 1694]
 [ 610 1370]]
Latex table:

\begin{table}[H]\centering\begin{tabular}{lrrrr}
\toprule
{} &  precision &  recall &  f1-score &  support \\
Classes   &            &         &           &          \\
\midrule
-1        &       0.88 &    0.73 &      0.80 &     6305 \\
1         &       0.45 &    0.69 &      0.54 &     1980 \\
avg/total &       0.78 &    0.72 &      0.74 &     8285 \\
\bottomrule
\end{tabular}
\caption{Table label}
\end{table}


Unnamed: 0_level_0,precision,recall,f1-score,support
Classes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,0.88,0.73,0.8,6305
1,0.45,0.69,0.54,1980
avg/total,0.78,0.72,0.74,8285


In [223]:
train_evaluate(SVC(kernel = 'rbf', C=1000), to_latex=True)

confusion_matrix:
[[5476  829]
 [1159  821]]
Latex table:

\begin{table}[H]\centering\begin{tabular}{lrrrr}
\toprule
{} &  precision &  recall &  f1-score &  support \\
Classes   &            &         &           &          \\
\midrule
-1        &       0.83 &    0.87 &      0.85 &     6305 \\
1         &       0.50 &    0.41 &      0.45 &     1980 \\
avg/total &       0.75 &    0.76 &      0.75 &     8285 \\
\bottomrule
\end{tabular}
\caption{Table label}
\end{table}


Unnamed: 0_level_0,precision,recall,f1-score,support
Classes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,0.83,0.87,0.85,6305
1,0.5,0.41,0.45,1980
avg/total,0.75,0.76,0.75,8285


In [236]:
train_evaluate(SVC(kernel = 'rbf', C=1000, class_weight = 'balanced'), to_latex=True)

confusion_matrix:
[[4588 1717]
 [ 671 1309]]
Latex table:

\begin{table}[H]\centering\begin{tabular}{lrrrr}
\toprule
{} &  precision &  recall &  f1-score &  support \\
Classes   &            &         &           &          \\
\midrule
-1        &       0.87 &    0.73 &      0.79 &     6305 \\
1         &       0.43 &    0.66 &      0.52 &     1980 \\
avg/total &       0.77 &    0.71 &      0.73 &     8285 \\
\bottomrule
\end{tabular}
\caption{Table label}
\end{table}


Unnamed: 0_level_0,precision,recall,f1-score,support
Classes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,0.87,0.73,0.79,6305
1,0.43,0.66,0.52,1980
avg/total,0.77,0.71,0.73,8285


# Explore diversity between classifiers

In [116]:
# Test-set predictions of two SVMs, each fitted on train+dev:
# an RBF kernel with C=1000 and a linear kernel.
results_a = (
    SVC(kernel='rbf', C=1000)
    .fit(X_train_dev, y_train_dev)
    .predict(X_test)
)
results_b = (
    SVC(kernel='linear')
    .fit(X_train_dev, y_train_dev)
    .predict(X_test)
)


In [118]:
# cohen_kappa_score is otherwise imported only in a later cell; importing it
# here keeps this cell runnable when the notebook is executed top to bottom.
from sklearn.metrics import cohen_kappa_score
cohen_kappa_score(results_a, results_b)

0.7092279180754234

In [237]:
from sklearn.metrics import cohen_kappa_score

def cohens_kappa(model1,model2):
    """
    Returns Cohen's kappa agreement between the predictions of two models.

    Each model is fitted on the notebook-level X_train_dev / y_train_dev
    (model1 first, then model2) and then predicts on X_test; the two
    prediction vectors are compared with cohen_kappa_score.
    """
    predictions_1, predictions_2 = [
        estimator.fit(X_train_dev, y_train_dev).predict(X_test)
        for estimator in (model1, model2)
    ]
    return cohen_kappa_score(predictions_1, predictions_2)

In [238]:
cohens_kappa(SVC(kernel = 'rbf', C=1000),
             SVC(kernel = 'linear')
            )

0.7092279180754234

In [239]:
cohens_kappa(SVC(kernel = 'rbf', C=1000),
             SVC(kernel = 'rbf', C=500),
            )

0.8443934207622563