In [1]:
# Name of the experiment; the setup cell below uses it to build the pickle directory
# ('pickles/<name>/') and the database URL ('postgres:///snorkel<name>').
experiment_name = 'CDR'
# Reload edited project modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook.
%matplotlib inline

In [2]:
# %run init.py
# Experiment setup: derives all pickle paths from `experiment_name`, creates the output
# directories, points Snorkel at the experiment database and opens a session.
import os

print("Setting up variables & DB connection for experiment:\n")
print("*******************\n%s\n*******************"%experiment_name)

# point to appropriate DBs, pickle files etc.
pkl_paths = 'pickles/%s/'%experiment_name
path_candidate_dict_pkl = pkl_paths+ 'candidate_dict.pickle' # TODO rename that
path_pubmed_ids_pkl = pkl_paths + 'pubmed_ids.pickle'
path_base_learners = pkl_paths + 'base_learner_predictions'

# Create the output directories up front: later cells open files for writing under
# `pkl_paths`, `path_base_learners` and `path_base_learners + '/sdp'` and would otherwise
# fail on a fresh checkout. (isdir + makedirs keeps this Python 2 compatible.)
for _out_dir in (pkl_paths, path_base_learners, path_base_learners + '/sdp'):
    if not os.path.isdir(_out_dir):
        os.makedirs(_out_dir)


# Shortcuts to connect to database, initialize candidate subclass and return snorkel session
#TODO: set experiment_name and restructure dir
# NOTE(review): SNORKELDB is set before anything from snorkel is imported -- presumably
# snorkel reads it at import time, so keep this ordering.
os.environ['SNORKELDB'] = 'postgres:///snorkel'+experiment_name

from snorkel import SnorkelSession
session = SnorkelSession()
from snorkel.models import  Document, Sentence
import matplotlib.pyplot as plt
from snorkel.annotations import save_marginals
from snorkel.models import Candidate, candidate_subclass
# Candidate type used throughout the notebook: a (chemical, disease) mention pair.
ChemicalDisease = candidate_subclass('ChemicalDisease', ['chemical', 'disease'])

# Same text as the original two-argument print statement (note the double space).
print("\nSnorkel session connected to:  %s" % os.environ['SNORKELDB'])

Setting up variables & DB connection for experiment:

*******************
CDR
*******************


  """)



Snorkel session connected to:  postgres:///snorkelCDR


# Part I: Corpus Preprocessing


In [3]:
import os
from snorkel.parser import XMLMultiDocPreprocessor

# Corpus file selection: the small sample is used only when a `CI` environment
# variable is present (testing), otherwise the full BioC corpus is read.
if 'CI' in os.environ:
    file_path = 'data/CDR.BioC.small.xml'
else:
    file_path = 'data/CDR.BioC.xml'

# XPath selectors telling the preprocessor where documents, their text and ids live.
_xml_selectors = dict(
    doc='.//document',
    text='.//passage/text/text()',
    id='.//id/text()',
)
doc_preprocessor = XMLMultiDocPreprocessor(path=file_path, **_xml_selectors)

In [4]:
from snorkel.parser import CorpusParser
# from utils import TaggerOneTagger
from snorkel.utils_cdr import TaggerOneTagger, CDRTagger
from snorkel.parser.spacy_parser import Spacy


# Parse the corpus with the spaCy parser; the TaggerOne tagger's `tag` method is
# handed to CorpusParser as its `fn` hook.
tagger_one = TaggerOneTagger()
corpus_parser = CorpusParser(fn=tagger_one.tag, parser=Spacy())
# list(...) materialises every document yielded by the preprocessor before parsing.
# The captured output ("Clearing existing...") shows that apply() clears previous results first.
corpus_parser.apply(list(doc_preprocessor))

Clearing existing...
Running UDF...



In [5]:
from snorkel.models import Document, Sentence

# Sanity check: number of Document and Sentence rows now stored in the database.
# (Under Python 2 these calls print a tuple, as seen in the captured output.)
print("Documents:", session.query(Document).count())
print("Sentences:", session.query(Sentence).count())

('Documents:', 1500L)
('Sentences:', 14593L)


# Part II: Candidate Extraction

In [6]:
# modified to implement random splitting of (official training set) to D_B & D_U
#
# Loads the official train/dev/test document ids, randomly splits the official training
# ids into a labelled part D_B (`train_ids`) and an unlabelled part D_U (`unlab_ids`),
# then collects the sentences of every parsed document into the matching split set.
from six.moves.cPickle import load
import random
seed = 448
random.seed(seed)

# Number of official training documents kept as the labelled set D_B; the rest become D_U.
N_LABELLED_DOCS = 300

with open('data/doc_ids.pkl', 'rb') as f:
    train_off_ids, dev_ids, test_ids = load(f)
dev_ids, test_ids = set(dev_ids), set(test_ids)

#split train_ids into D_B & D_U
random.shuffle(train_off_ids)  # shuffles in place, seeded above
train_ids = set(train_off_ids[:N_LABELLED_DOCS]) # This will be D_B
unlab_ids = set(train_off_ids[N_LABELLED_DOCS:]) # This will be D_U
print( len(train_ids), len(dev_ids), len(test_ids), len(unlab_ids))

train_sents, dev_sents, test_sents, unlab_sents = set(), set(), set(), set()
# (id set, sentence set) pairs, tested in the same priority order as the original
# if/elif chain: train, dev, test, unlab.
_split_buckets = [
    (train_ids, train_sents),
    (dev_ids, dev_sents),
    (test_ids, test_sents),
    (unlab_ids, unlab_sents),
]
docs = session.query(Document).order_by(Document.name).all()
for doc in docs:
    sentences = doc.sentences
    if not sentences:
        # A document without sentences contributes nothing (and, as before, is not
        # checked against the id sets).
        continue
    # The split depends only on the document, so decide it once per document
    # instead of once per sentence.
    for ids, sents in _split_buckets:
        if doc.name in ids:
            sents.update(sentences)
            break
    else:
        raise Exception('ID <{0}> not found in any id set'.format(doc.name))

print(len(train_sents), len(dev_sents), len(test_sents),len(unlab_sents))

(300, 100, 500, 600)
(2940, 920, 5056, 5677)


# Candidate extraction

In [7]:
from snorkel.candidates import PretaggedCandidateExtractor

# Extractor producing ChemicalDisease candidates from the pre-tagged
# 'Chemical' and 'Disease' entity types.
candidate_extractor = PretaggedCandidateExtractor(ChemicalDisease, ['Chemical', 'Disease'])

In [8]:
# Run candidate extraction once per split; the position in this tuple is the split
# number stored on the candidates (0 = train, 1 = dev, 2 = test, 3 = unlab).
_sentences_by_split = (train_sents, dev_sents, test_sents, unlab_sents)
for k, sents in enumerate(_sentences_by_split):
    candidate_extractor.apply(sents, split=k)
    # Count what ended up in the database for this split.
    _n_split_cands = (
        session.query(ChemicalDisease)
        .filter(ChemicalDisease.split == k)
        .count()
    )
    print("Number of candidates:", _n_split_cands)

Clearing existing...
Running UDF...

('Number of candidates:', 2860L)
Clearing existing...
Running UDF...

('Number of candidates:', 920L)
Clearing existing...
Running UDF...

('Number of candidates:', 4680L)
Clearing existing...
Running UDF...

('Number of candidates:', 5576L)


# Load gold labels

In [20]:
from load_external_annotations import load_external_labels
from snorkel.annotations import load_gold_labels

# Attach the external gold annotations (annotator 'gold') to the candidates of each
# of the four splits (0 = train, 1 = dev, 2 = test, 3 = unlab).
for splt in range(4):
    load_external_labels(session, ChemicalDisease, split=splt, annotator='gold')

AnnotatorLabels created: 2860
AnnotatorLabels created: 920
AnnotatorLabels created: 0
AnnotatorLabels created: 5576


In [19]:
# Gold labels of split 2 (the test candidates), loaded for annotator 'gold';
# the bare expression displays the returned sparse matrix.
L_gold_test = load_gold_labels(session, annotator_name='gold', split=2)
L_gold_test

<4680x1 sparse matrix of type '<type 'numpy.int64'>'
	with 4680 stored elements in Compressed Sparse Row format>

In [21]:
# Number of stored entries in the test gold-label matrix.
L_gold_test.nnz

4680

In [77]:
from utils import check_class_imbalance

In [87]:
# Class distribution of the stored test gold labels (helper imported from `utils`).
check_class_imbalance(L_gold_test.data)

-1    0.676282
 1    0.323718
dtype: float64

In [22]:
from snorkel.models import StableLabel
from sqlalchemy import and_

In [24]:
# For every split, report how many candidates have at least one gold label ("mapped")
# and how many have none ("un-mapped"), and check that the two add up to the total.
for k in range(4):
    split_cands = session.query(ChemicalDisease).filter(ChemicalDisease.split == k)

    n_total = split_cands.count()
    # Use .any() (an EXISTS test) for the mapped count as well: filtering on the bare
    # relationship joins against the label table and counts (candidate, label) rows,
    # which over-counts candidates carrying more than one gold label and is not the
    # complement of the un-mapped query below.
    n_mapped = split_cands.filter(ChemicalDisease.gold_labels.any()).count()
    n_unmapped = split_cands.filter(~ChemicalDisease.gold_labels.any()).count()

    # Format strings reproduce the text of the original multi-argument print statements.
    print('split =  %s' % k)
    print('Total cands: %s' % n_total)
    print('Mapped cands: %s' % n_mapped)
    print('Un-mapped cands: %s' % n_unmapped)
    print(n_total == n_mapped + n_unmapped)
    print('')


 split =  0
Total cands: 2860
Mapped cands: 2860
Un-mapped cands: 0
True

split =  1
Total cands: 920
Mapped cands: 920
Un-mapped cands: 0
True

split =  2
Total cands: 4680
Mapped cands: 4680
Un-mapped cands: 0
True

split =  3
Total cands: 5576
Mapped cands: 5576
Un-mapped cands: 0
True



#### Exporting candidates from snorkel to sklearn for ML model training

In [62]:
from sklearn_bridge import export_snorkel_candidates

In [63]:
# Export the candidates of all four splits (train, dev, test, unlab) into plain
# Python structures keyed by split number.
# NOTE(review): the trailing `True` argument presumably restricts the export to
# labelled candidates -- confirm against sklearn_bridge.export_snorkel_candidates.
candidates = {}
for i in range(4):
    candidates[i] = export_snorkel_candidates(session, ChemicalDisease, i, True)
    _n_in_split = len(candidates[i].keys())
    print('Extracted %i candidates from split = %i '%(_n_in_split, i))

# Grand total over the four splits.
nr_cands_extracted = sum(len(candidates[_split].keys()) for _split in range(4))
print('Extracted %i candidates in total'%nr_cands_extracted)

Extracted 2860 candidates from split = 0 
Extracted 920 candidates from split = 1 
Extracted 4680 candidates from split = 2 
Extracted 5576 candidates from split = 3 
Extracted 14036 candidates in total


In [64]:
# Show where the exported candidate dictionary is about to be written.
print("Saving to: %s" % (path_candidate_dict_pkl,))

Saving to: pickles/CDR/candidate_dict.pickle


In [65]:
# `pickle` is otherwise only imported further down the notebook (base-learner section),
# so import it here to keep this cell runnable in a top-to-bottom "Restart & Run All".
import pickle

# Persist the exported candidates so the base-learner section can reload them without
# touching the database. `candidates` is already a plain dict, so it is dumped directly.
with open(path_candidate_dict_pkl, 'wb') as f:
    pickle.dump(candidates, f)

# #########################################
# Once this is done, the results are persisted in the Snorkel (Postgres) database and in the candidate pickle, so the steps above do not need to be re-run unless more documents are added.
# #########################################

# -------------------------------
# Part 1A (training the Base Learners)
# -------------------------------

In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

In [32]:
# Rebuild text representation
from sklearn_bridge import recreate_text_representation, candidate_dict_to_df

In [33]:
from MLutils import report_to_df, train_evaluate, diversity_heatmap

In [34]:
import sklearn, pickle, time, seaborn
import pandas as pd
from itertools import product

In [66]:
# Reload the candidate dictionary written by the export step.
# NOTE: pickle.load executes arbitrary code from the file -- only load pickles you created.
with open(path_candidate_dict_pkl, 'rb') as f:
    candidate_dict = pickle.load(f)

# One summary line per split: split number and number of candidates it holds.
for splt, _split_cands in candidate_dict.items():
    _n_imported = len(_split_cands.keys())
    print("Split = %i : imported %i candidates" %(splt, _n_imported))

Split = 0 : imported 2860 candidates
Split = 1 : imported 920 candidates
Split = 2 : imported 4680 candidates
Split = 3 : imported 5576 candidates


In [67]:
# Set pipeline options
# Each list below is one axis of the grid; the cartesian product of all axes is built
# in the `combs` cell and every combination is trained in the training loop.

# Whether the text representation is the shortest dependency path between the entities.
# When True, the training loop ignores the trimming option (both in the data it builds
# and in the result file name).
shortest_dep_paths = [True, False]
# (trim_text, trim_window) pairs passed to candidate_dict_to_df as trim_text / window.
trimmings = [(False,0),
             (True,0), 
             (True, 5)
            ]
# Passed to candidate_dict_to_df as `lemmas`.
use_lemmas_ = [True, 
               #False
              ]
# Whether the training set is randomly under-sampled before fitting.
random_undersample_ = [True]
# Text vectorizers to try.
# NOTE(review): two entries use ngram_range=(0,3); n-gram sizes normally start at 1 --
# confirm that a lower bound of 0 is intended rather than (1,3). Changing it would also
# change the generated result file names.
text_vectorizer_ = [CountVectorizer(binary=True, min_df=5, stop_words='english'),
                    #CountVectorizer(binary=True, min_df=1 , stop_words=None),
                    CountVectorizer(binary=True, min_df=5 , ngram_range=(0,3)),
                    
# #                    CountVectorizer(binary=True, min_df=5 , stop_words='english'),
                    TfidfVectorizer(binary=False, min_df=5 , stop_words='english'),
                    #TfidfVectorizer(binary=False, min_df=1 , stop_words=None ),
                    TfidfVectorizer(binary=False, min_df=5, ngram_range=(0,3) ),
# #                     TfidfVectorizer(binary=True, min_df=5 , stop_words='english'),
                   ]

# Number of TruncatedSVD (LSA) components appended to the features; None disables LSA.
lsa_dims = [200, None]

In [69]:
# select models to train
# Maps a display name to an (unfitted) scikit-learn estimator. The training loop refits
# each estimator for every pipeline combination and calls predict_proba on it; for that
# reason the loop sets `model.probability = True` before fitting.
# Commented-out entries are alternatives that are currently disabled.
models = {
    'LogisticRegression': LogisticRegression(),
    'SVC_linear': SVC(kernel = 'linear'),
# # # #     'SVC_rbf': SVC(kernel = 'rbf'),
    'SVC_rbf_C=75': SVC(kernel = 'rbf', C=75),
    'SVC_rbf_C=150': SVC(kernel = 'rbf', C=150),
    'SVC_rbf_C=250': SVC(kernel = 'rbf', C=250),
# #     'kNN_5': KNeighborsClassifier(),
# #     'kNN_25': KNeighborsClassifier(n_neighbors=25),
# #     'kNN_2': KNeighborsClassifier(n_neighbors=2),
    'RandomForestClassifier': RandomForestClassifier(n_estimators=100, max_depth=5),
# #     'NeuralNet(10,10)' : MLPClassifier(hidden_layer_sizes=(10,10)),
# #     'BernoulliNB': BernoulliNB(),
}


In [70]:
# Full grid of pipeline configurations, materialised as a list so it can be counted
# and iterated more than once. The element order inside each tuple is relied upon by
# the training loop: (trimming, use_lemmas, undersample, vectorizer, lsa_dim, sdp).
_option_axes = (
    trimmings,
    use_lemmas_,
    random_undersample_,
    text_vectorizer_,
    lsa_dims,
    shortest_dep_paths,
)
combs = list(product(*_option_axes))
# Total number of (configuration, model) pairs that will be trained.
print(len(combs)*len(models))


288


# Perform training 

In [71]:
import imblearn
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
# import numpy as np
from scipy.sparse import hstack
from MLutils import get_positive_logit, logits_to_bin_labels, logits_to_neg_labels, classif_report_from_dicts

In [88]:
# Train every model in `models` for every pipeline configuration in `combs`.
#
# For each configuration the cell
#   1. derives a result file name from the options,
#   2. skips the configuration if that result file already exists,
#   3. builds the train/val/test/unlab text frames, vectorises them, optionally appends
#      LSA components and optionally under-samples the training set,
#   4. fits each model, evaluates it on the validation split and -- if it is good
#      enough -- stores its positive-class probabilities for val/test/unlab,
#   5. pickles the per-model results under `path_base_learners`.
debug=False

# Base learners whose positive-class F1 on the validation split is below this are dropped.
MIN_VAL_F1 = 0.5


def _ensure_dir(path):
    """Create directory `path` (including parents) if it does not exist yet."""
    if not os.path.isdir(path):
        os.makedirs(path)


def _split_frames(trim_text, window, lemmas, shortest_dep_path):
    """Return the (train, val, test, unlab) frames built from candidate_dict[0..3]."""
    return tuple(
        candidate_dict_to_df(candidate_dict[split], trim_text=trim_text, window=window,
                             lemmas=lemmas, shortest_dep_path=shortest_dep_path)
        for split in range(4)
    )


start_time = time.time()
for comb in combs:

    #pass pipeline selections
    # `lsa_dim` (singular) is used on purpose: rebinding `lsa_dims` here would overwrite
    # the option list and break a re-run of the cell that builds `combs`.
    (trim_text, trim_window), use_lemmas, random_undersample, text_vectorizer, lsa_dim, shortest_dep_path = comb

    # determine name
    pkl_name = 'results_dict,'
    if shortest_dep_path:
        pkl_name += 'ShortDepPath,'
    elif trim_text:
        pkl_name += 'trim='+str(trim_window)+','
    if use_lemmas:
        pkl_name += 'lemmas'+','
    if random_undersample:
        pkl_name += ',RuS'+','
    #CV, TFIDF  (TfidfVectorizer must be tested first: it subclasses CountVectorizer)
    if isinstance(text_vectorizer, TfidfVectorizer):
        pkl_name += ',TfIdf_'+','
    elif isinstance(text_vectorizer, CountVectorizer):
        pkl_name += ',CV_'+','
    if text_vectorizer.binary:
        pkl_name += 'bin_'+','
    if text_vectorizer.min_df:
        pkl_name += 'minFreq='+str(text_vectorizer.min_df)+','
    if text_vectorizer.stop_words:
        pkl_name += '_stopw='+str(text_vectorizer.stop_words)+','
    if text_vectorizer.ngram_range:
        pkl_name += '_ngrams='+str(text_vectorizer.ngram_range)+','
    if lsa_dim:
        pkl_name += ',LSA'+str(lsa_dim)+','
    pkl_name = pkl_name.replace(',,',',')

    # Single source of truth for the result file. The name keeps its trailing comma on
    # purpose: that is what earlier runs wrote to disk, and changing it would make the
    # "already exists" check miss every cached result.
    pkl_path = path_base_learners+'/'+pkl_name+'.pkl'

    # Debug pkl name
    if debug:
        print(pkl_name)
        continue

    # Report the path that is actually checked/written (the old message stripped the
    # trailing comma and therefore showed a file name that never exists on disk).
    print('Saving as: \n'+pkl_path)
    print(time.time() - start_time)
    print('')

    if os.path.isfile(pkl_path):
        # NOTE: all trimming variants of a shortest-dep-path configuration share one
        # name, so only the first of them is trained and the others end up here.
        print(pkl_name + " exists, skipping . . .")
        continue

    # Fail before the expensive training rather than after it if the output dir is missing.
    _ensure_dir(path_base_learners)

    if shortest_dep_path:
        sdp_dir = path_base_learners+'/sdp'
        sdp_cache = sdp_dir+'/SDP_df_TrainValTestUnlab.pickle'
        #reload from pickle df_train,val,test etc
        # NOTE(review): the cached frames are always built with lemmas=True, regardless
        # of `use_lemmas` -- fine while use_lemmas_ == [True]; revisit if that changes.
        try:
            # pickle.load executes arbitrary code from the file; only load trusted caches.
            with open(sdp_cache , 'rb') as f:
                (df_train, df_val, df_test, df_unlab) = pickle.load(f)
        except Exception:
            # Missing or unreadable cache: rebuild it and save it. (`Exception` instead
            # of a bare except so that Ctrl-C / SystemExit are not swallowed.)
            (df_train, df_val, df_test, df_unlab) = _split_frames(
                trim_text=False, window=0, lemmas=True, shortest_dep_path=shortest_dep_path)
            _ensure_dir(sdp_dir)
            with open(sdp_cache , 'wb') as f:
                pickle.dump((df_train, df_val, df_test, df_unlab),f)
    else:
        # create train/test set
        (df_train, df_val, df_test, df_unlab) = _split_frames(
            trim_text=trim_text, window=trim_window, lemmas=use_lemmas,
            shortest_dep_path=shortest_dep_path)


    # get indices (only for val, test, unlab) to zip with results in a dict
    # NOTE(review): `test_ids` / `unlab_ids` rebind the document-id sets created in the
    # splitting cell; kept for compatibility with later cells -- confirm and rename.
    val_ids = list(df_val.index)
    test_ids = list(df_test.index)
    unlab_ids = list(df_unlab.index)


    #TODO: move random undersampling here (will fasten up algorithm)

    # text 2 matrix (CV/TF-idf): vocabulary is fitted on the training split only
    X_train = text_vectorizer.fit_transform(df_train.text)
    y_train = df_train.label

    X_val = text_vectorizer.transform(df_val.text)
    X_test = text_vectorizer.transform(df_test.text)
    X_unlab = text_vectorizer.transform(df_unlab.text)

    # gold labels keyed by candidate index (used for the validation report)
    y_val_gold = dict(df_val.label)
    y_test_gold = dict(df_test.label)


    # LSA
    if lsa_dim: #TODO save in diff. matrix to append later?
        # NOTE(review): no random_state is set here (nor on the models), so results are
        # not exactly reproducible between runs.
        svd = TruncatedSVD(n_components=lsa_dim)
        X_train_svd = svd.fit_transform(X_train)
        X_val_svd = svd.transform(X_val)
        X_test_svd = svd.transform(X_test)
        X_unlab_svd = svd.transform(X_unlab)

        # The LSA components are appended to (not substituted for) the sparse features.
        #TODO make the concatenation optional via a keyword
        X_train = hstack((X_train,X_train_svd))
        X_val = hstack((X_val,X_val_svd))
        X_test = hstack((X_test,X_test_svd))
        X_unlab = hstack((X_unlab,X_unlab_svd))

    # undersampling
    if random_undersample:
        rus = RandomUnderSampler(random_state=42)
        X_train, y_train = rus.fit_sample(X_train, y_train)

    # train/predict, construct results_dict
    results_dict = dict()
    for model_name,model in models.items():
        print('Training ' + model_name)

        # Only estimators that actually have a `probability` switch (the SVCs) need it
        # turned on for predict_proba; do not plant a stray attribute on the others.
        if hasattr(model, 'probability'):
            model.probability = True

        #train & predict
        model.fit(X_train,y_train)
        positive_class_position = list(model.classes_).index(1) #either 0 or 1, depending on model.classes_ (used to get + logits)

        logits = model.predict_proba(X_val)
        y_val_pred_pos_logit = dict(zip(val_ids, get_positive_logit(logits, positive_class_position)))
        y_val_pred_neg = dict(zip(val_ids, logits_to_neg_labels(logits))) #only used for evaluation

        #save classif report
        class_report = report_to_df(
            classif_report_from_dicts(y_val_gold, y_val_pred_neg)
                                 )
        print(class_report)

        f1_positive = class_report.loc['1','f1-score']
        if f1_positive<MIN_VAL_F1:
            # Too weak on the validation split: do not keep this base learner.
            continue

        # perform predictions for the rest of the data
        logits = model.predict_proba(X_test)
        y_test_pred_pos_logit = dict(zip(test_ids, get_positive_logit(logits, positive_class_position)))

        logits = model.predict_proba(X_unlab)
        y_unlab_pred_pos_logit = dict(zip(unlab_ids, get_positive_logit(logits, positive_class_position)))

        # TODO: do smth for later, when true lbls not in place.

        results_dict[model_name] = {
                                    "label_val_prob+" : y_val_pred_pos_logit,
                                   "label_test_prob+" : y_test_pred_pos_logit,
                                   "label_unlab_prob+" : y_unlab_pred_pos_logit,
                                    "classification_report": class_report,
                                    "f1+": f1_positive
                                   }

    #save predictions in dict
    with open(pkl_path, 'wb') as f:
        pickle.dump(results_dict,f)

#     diversity_heatmap(results_dict, title = pkl_name)

Saving as: 
results_dict,ShortDepPath,lemmas,RuS,CV_,bin_,minFreq=5,_stopw=english,_ngrams=(1, 1),LSA200.pkl
0.00162291526794

Training LogisticRegression
           precision  recall  f1-score  support
Classes                                        
-1              0.75    0.66      0.70      614
1               0.45    0.55      0.49      306
avg/total       0.65    0.62      0.63      920
Training SVC_rbf_C=250
           precision  recall  f1-score  support
Classes                                        
-1              0.75    0.67      0.71      614
1               0.45    0.56      0.50      306
avg/total       0.65    0.63      0.64      920
Training RandomForestClassifier
           precision  recall  f1-score  support
Classes                                        
-1              0.72    0.79      0.75      614
1               0.47    0.37      0.41      306
avg/total       0.63    0.65      0.64      920
Training SVC_linear
           precision  recall  f1-score  support
Cl

Training SVC_rbf_C=150
           precision  recall  f1-score  support
Classes                                        
-1              0.78    0.64      0.70      614
1               0.47    0.65      0.55      306
avg/total       0.68    0.64      0.65      920
Saving as: 
results_dict,lemmas,RuS,CV_,bin_,minFreq=5,_ngrams=(0, 3),LSA200.pkl
195.900989771

Training LogisticRegression
           precision  recall  f1-score  support
Classes                                        
-1              0.74    0.64      0.69      614
1               0.43    0.55      0.48      306
avg/total       0.64    0.61      0.62      920
Training SVC_rbf_C=250
           precision  recall  f1-score  support
Classes                                        
-1              0.74    0.65      0.69      614
1               0.44    0.55      0.49      306
avg/total       0.64    0.62      0.63      920
Training RandomForestClassifier
           precision  recall  f1-score  support
Classes                       

           precision  recall  f1-score  support
Classes                                        
-1              0.69    0.58      0.63      614
1               0.37    0.49      0.42      306
avg/total       0.58    0.55      0.56      920
Training SVC_rbf_C=150
           precision  recall  f1-score  support
Classes                                        
-1              0.69    0.54      0.60      614
1               0.35    0.51      0.42      306
avg/total       0.58    0.53      0.54      920
Saving as: 
results_dict,ShortDepPath,lemmas,RuS,TfIdf_,minFreq=5,_stopw=english,_ngrams=(1, 1).pkl
396.669467926

Training LogisticRegression
           precision  recall  f1-score  support
Classes                                        
-1              0.77    0.58      0.66      614
1               0.43    0.64      0.52      306
avg/total       0.65    0.60      0.61      920
Training SVC_rbf_C=250
           precision  recall  f1-score  support
Classes                                    

  'precision', 'predicted', average, warn_for)


           precision  recall  f1-score  support
Classes                                        
-1              0.00    0.00      0.00      614
1               0.33    1.00      0.50      306
avg/total       0.11    0.33      0.17      920
Training SVC_rbf_C=150
           precision  recall  f1-score  support
Classes                                        
-1              0.73    0.78      0.75      614
1               0.49    0.42      0.45      306
avg/total       0.65    0.66      0.65      920
Saving as: 
results_dict,ShortDepPath,lemmas,RuS,TfIdf_,minFreq=5,_ngrams=(0, 3).pkl
548.173787832

Training LogisticRegression
           precision  recall  f1-score  support
Classes                                        
-1              0.79    0.65      0.72      614
1               0.49    0.66      0.56      306
avg/total       0.69    0.66      0.66      920
Training SVC_rbf_C=250
           precision  recall  f1-score  support
Classes                                        
-1        

           precision  recall  f1-score  support
Classes                                        
-1              0.77    0.58      0.66      614
1               0.44    0.65      0.52      306
avg/total       0.66    0.60      0.61      920
Training RandomForestClassifier
           precision  recall  f1-score  support
Classes                                        
-1              0.74    0.66      0.70      614
1               0.44    0.53      0.48      306
avg/total       0.64    0.62      0.63      920
Training SVC_linear
           precision  recall  f1-score  support
Classes                                        
-1              0.78    0.60      0.67      614
1               0.45    0.66      0.53      306
avg/total       0.67    0.62      0.63      920
Training SVC_rbf_C=75
           precision  recall  f1-score  support
Classes                                        
-1              0.77    0.57      0.66      614
1               0.43    0.66      0.52      306
avg/total     

Training SVC_rbf_C=75
           precision  recall  f1-score  support
Classes                                        
-1              0.76    0.72      0.74      614
1               0.50    0.55      0.52      306
avg/total       0.68    0.67      0.67      920
Training SVC_rbf_C=150
           precision  recall  f1-score  support
Classes                                        
-1              0.77    0.68      0.72      614
1               0.47    0.58      0.52      306
avg/total       0.67    0.65      0.65      920
Saving as: 
results_dict,ShortDepPath,lemmas,RuS,TfIdf_,minFreq=5,_ngrams=(0, 3).pkl
849.429516792

results_dict,ShortDepPath,lemmas,RuS,TfIdf_,minFreq=5,_ngrams=(0, 3), exists, skipping . . .
Saving as: 
results_dict,trim=0,lemmas,RuS,TfIdf_,minFreq=5,_ngrams=(0, 3).pkl
849.429691792

Training LogisticRegression
           precision  recall  f1-score  support
Classes                                        
-1              0.78    0.69      0.73      614
1               

Training LogisticRegression
           precision  recall  f1-score  support
Classes                                        
-1              0.77    0.65      0.70      614
1               0.46    0.61      0.53      306
avg/total       0.67    0.64      0.65      920
Training SVC_rbf_C=250
           precision  recall  f1-score  support
Classes                                        
-1              0.77    0.63      0.69      614
1               0.45    0.61      0.52      306
avg/total       0.66    0.62      0.64      920
Training RandomForestClassifier
           precision  recall  f1-score  support
Classes                                        
-1              0.75    0.67      0.71      614
1               0.45    0.55      0.50      306
avg/total       0.65    0.63      0.64      920
Training SVC_linear
           precision  recall  f1-score  support
Classes                                        
-1              0.76    0.59      0.66      614
1               0.43    0.63     

           precision  recall  f1-score  support
Classes                                        
-1              0.75    0.57      0.65      614
1               0.42    0.61      0.49      306
avg/total       0.64    0.59      0.60      920
Training SVC_linear
           precision  recall  f1-score  support
Classes                                        
-1              0.77    0.69      0.72      614
1               0.48    0.58      0.52      306
avg/total       0.67    0.65      0.66      920
Training SVC_rbf_C=75
           precision  recall  f1-score  support
Classes                                        
-1              0.76    0.63      0.69      614
1               0.45    0.61      0.52      306
avg/total       0.66    0.62      0.63      920
Training SVC_rbf_C=150
           precision  recall  f1-score  support
Classes                                        
-1              0.74    0.85      0.79      614
1               0.57    0.39      0.46      306
avg/total       0.68   

# ----------->
# Open CNN-text-classification-keras-CDR and generate results for CNN

# -------------------------------
# Part 2 (snorkel LF)
# -------------------------------

In [3]:
import seaborn as sns
from snorkel.annotations import save_marginals

In [5]:
from snorkel.lf_helpers import *
import pickle,glob
import pandas as pd
from MLutils import cohen_kappa_score, plot_marginals_histogram, neg_to_bin_labels, diversity_heatmap, merge_pickles_pred_dicts


In [6]:
# load gold labels
# Split numbers follow the extraction order used earlier: 1 = dev, 2 = test.
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
L_gold_test = load_gold_labels(session, annotator_name='gold', split=2)

In [10]:
# make sure unmapped cands are gone
from sqlalchemy import  any_,or_,and_

# Gather the ids of all candidates, in every split, that carry no gold label.
# (This cell only collects the ids; nothing is deleted here.)
to_drop = []
for k in range(4):
    _is_unmapped = and_(ChemicalDisease.split==k, ~ChemicalDisease.gold_labels.any())
    query = session.query(ChemicalDisease).filter(_is_unmapped)
    print('Adding %i candidates from split=%i in to_drop list'%(query.count(), k))
    to_drop.extend([_cand.id for _cand in query.all()])

Adding 0 candidates from split=0 in to_drop list
Adding 0 candidates from split=1 in to_drop list
Adding 0 candidates from split=2 in to_drop list
Adding 0 candidates from split=3 in to_drop list


In [11]:
# Alphabetically sorted list of the pickles found in the KMeansPeaks sub-directory of
# the base-learner folder; displayed as the cell's output.
_kmeans_pattern = path_base_learners+"/KMeansPeaks/*.pickle"
l = sorted(glob.glob(_kmeans_pattern))
l

[]

In [None]:
# TODO move in LSTM training section

# needed to train the LSTM here
# Candidate lists per split, ordered by id. The candidate class of this notebook is
# ChemicalDisease; the previous `REGULATOR` name is not defined anywhere in it (left
# over from another experiment) and raised a NameError.
def _candidates_in_split(split):
    """Return all ChemicalDisease candidates of `split`, ordered by candidate id."""
    return (session.query(ChemicalDisease)
            .filter(ChemicalDisease.split == split)
            .order_by(ChemicalDisease.id)
            .all())

train = _candidates_in_split(0)
dev = _candidates_in_split(1)
test = _candidates_in_split(2)
unlab = _candidates_in_split(3)