In [1]:
import os, sys
sys.path.append(os.path.abspath('./src'))
import utils, json

In [2]:
# PATHS
data_path = '/scratch/juanmoo1'
EMA_dump_path = os.path.join(data_path, './jsons/EMA_dump.json')
EMA_xmls_path = os.path.join(data_path, './xmls/')
EMA_annotations_path = os.path.join(data_path, './bayer/VendorEMAforMIT/annotations.xlsx')

pickle_dumps_path = os.path.join(data_path, './pickle_dumps/')

In [3]:
import pickle, os

def save_value(key, val):
    with open(os.path.join(pickle_dumps_path, 'checkpoint.pickle'), 'rb') as f:
        try:
            saved_env = pickle.load(f)
            saved_env.keys()
        except:
            saved_env = dict()
            
    saved_env[key] = val
    
    with open(os.path.join(pickle_dumps_path, 'checkpoint.pickle'), 'wb') as f:
        f.write(pickle.dumps(saved_env))
    
def load_value(key):
    with open(os.path.join(pickle_dumps_path, 'checkpoint.pickle'), 'rb') as f:
        s = f.read()
        try:
            saved_env = pickle.loads(s)
            ans = saved_env[key]
        except:
            ans = None
        
        return ans

In [16]:
processed_documents = load_value('processed_documents')

## Train/Test Procedures

In [17]:
import random

def cross_validation(doc_list, train_algo, test_algo, k, verbose=False, config=None):
    N = len(doc_list)
    size = N//k
    indeces = list(range(N))
    random.shuffle(indeces)
    all_indeces = set(indeces)
    
    pres_list = []
    rec_list = []
    
    for j in range(N//size):
        train_indeces = indeces[j * size:(j + 1) * size] + indeces[size * k + j: size * k + j + 1]
        test_indeces = list(all_indeces - set(train_indeces))
        
        train_docs = [doc_list[i] for i in train_indeces]
        test_docs = [doc_list[i] for i in test_indeces]
        
        if verbose:
            print('Fold %d starting!'%(j + 1))
        
        m, tt, ht = train_algo(train_docs, config=config)
        pres, rec = test_algo(m, tt, ht, test_docs)
        
        pres_list.append(pres)
        rec_list.append(rec)
        
        if verbose:
            print('precision:', pres)
            print('recall:', rec)
            print('-' * 10 + '\n')
    
    return sum(pres_list)/k, sum(rec_list)/k

In [50]:
from scipy.sparse import hstack, vstack
from functools import reduce
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score

def model_test(model, text_tokenizer, header_tokenizer, doc_list, use_headers = True):
    test_texts = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['texts'] for doc_name in doc_list])
    tokenized_texts = text_tokenizer.transform(test_texts)
    test_labels = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['labels'] for doc_name in doc_list])
    
    if use_headers:
        test_header1 = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['header1'] for doc_name in doc_list])
        test_header2 = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['header2'] for doc_name in doc_list])
        tokenized_header1 = header_tokenizer.transform(test_header1)
        tokenized_header2 = header_tokenizer.transform(test_header2)
        X_test = hstack([tokenized_texts, tokenized_header1, tokenized_header2])
    else:
        X_test = tokenized_texts
        
    Y_test = test_labels
    pred = model.predict(X_test)
    cm = np.array(confusion_matrix(Y_test, pred))
    
    # Diagonal elemetns were correctly classified
    diagonal = cm.diagonal()
    
    # Input class Counts
    class_sum = cm.sum(axis=1)
    
    # Predicted class counts
    pred_sum = cm.sum(axis=0)
    
    # Per-class performance w/ no-examples -> 0 perf
    precision = np.where(class_sum == 0, 0, diagonal/class_sum)
    recall = np.where(pred_sum == 0, 0, diagonal/pred_sum)
    
    # Frequency Weighted Performance
    c_freq = cm.sum(axis=1)/cm.sum()
    pres = c_freq * precision
    rec = c_freq * recall
    
    # Remove 'other' Category
    c_freq = c_freq[0:1] + c_freq[2:]
    pres = pres[0:1] + pres[2:]
    rec = rec[0:1] + rec[2:]
    
    return pres.sum()/c_freq.sum(), rec.sum()/c_freq.sum(), cm

In [70]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier

def mlp_train(doc_list, config=None, use_headers=True):
    train_texts = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['texts'] for doc_name in doc_list])
    train_labels = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['labels'] for doc_name in doc_list])
    
    if use_headers:
        train_header1 = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['header1'] for doc_name in doc_list])
        train_header2 = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['header2'] for doc_name in doc_list])

    if config is None:
        config = load_value('best_config_header')
        
    text_tokenizer = CountVectorizer(ngram_range=config['ngram_config'], stop_words=config['stop_config'])
    
    if use_headers:
        header_tokenizer = CountVectorizer(ngram_range=config['ngram_config'], stop_words=config['stop_config'])
        tokenized_header1 = header_tokenizer.fit_transform(train_header1)
        tokenized_header2 = header_tokenizer.transform(train_header2)
    else:
        header_tokenizer = None

    
    tokenized_texts = text_tokenizer.fit_transform(train_texts)
    
    if use_headers:
        X_train = hstack([tokenized_texts, tokenized_header1, tokenized_header2])
    else:
        X_train = tokenized_texts
        
        
    Y_train = train_labels
    
    model = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(100, 100), random_state=None, verbose=True)
    model.fit(X_train, Y_train)
    
    return (model, text_tokenizer, header_tokenizer)

In [30]:
m, tt, ht = mlp_train(list(processed_documents), config=None)

Iteration 1, loss = 0.42394903
Iteration 2, loss = 0.09654049
Iteration 3, loss = 0.05834669
Iteration 4, loss = 0.04552078
Iteration 5, loss = 0.03875755
Iteration 6, loss = 0.03603909
Iteration 7, loss = 0.03344196
Iteration 8, loss = 0.03180807
Iteration 9, loss = 0.03088413
Iteration 10, loss = 0.02959805
Iteration 11, loss = 0.02924986
Iteration 12, loss = 0.02878188
Iteration 13, loss = 0.02848419
Iteration 14, loss = 0.02819246
Iteration 15, loss = 0.02748516
Iteration 16, loss = 0.02750734
Iteration 17, loss = 0.02704540
Iteration 18, loss = 0.02705784
Iteration 19, loss = 0.02642833
Iteration 20, loss = 0.02735820
Iteration 21, loss = 0.02654729
Iteration 22, loss = 0.02644453
Iteration 23, loss = 0.02622576
Iteration 24, loss = 0.02587575
Iteration 25, loss = 0.02596482
Iteration 26, loss = 0.02593038
Iteration 27, loss = 0.02577982
Iteration 28, loss = 0.02543931
Iteration 29, loss = 0.02560631
Iteration 30, loss = 0.02541220
Iteration 31, loss = 0.02513286
Iteration 32, los

In [35]:
p, r, cm = model_test(m, tt, ht, list(processed_documents))

In [36]:
print('Precision:', p)
print('Recall:', r)
print(cm)

Precision: 0.8451717734447539
Recall: 0.8891629383142611
[[  214     9     0     0     0     0     0     0     0     2     0    39]
 [   23 25159     1     0     1     2     0     0     0    22     4    17]
 [    0     4   119     0     0     0     0     0     0     0     0     0]
 [    0     7     0    85     0     0     0     0     0     0     0     0]
 [    0     2     0     0    38     0     0     0     0     0     0     1]
 [    0    24     1     0     0    51     0     0     0     0     0     0]
 [    0     0     0     0     0     0     1     0     0     0     0     0]
 [    1     6     0     0     0     0     0    30     0     0     0     4]
 [    0     0     0     0     0     0     0     0     4     0     0     0]
 [    1    39     0     0     0     0     0     0     0   289     0     9]
 [    0     0     0     0     0     0     0     0     0     0    21     0]
 [   10    57     0     0     0     0     0     1     0     0     0   863]]


In [38]:
# Save Model
save_value('neural_model_100_100', m)
save_value('neural_model_tt_100_100', tt)
save_value('neural_model_ht_100_100', ht)

# 80/20 Train-test

In [72]:
# Data Split
import random

names = list(processed_documents)
random.shuffle(names)

i = int(.8 * len(names))
train_docs = names[:i]
test_docs = names[i:]

# Save Used Split
save_value('8020_split_train', train_docs)
save_value('8020_split_test', test_docs)

In [None]:
# Model Training
m2, tt2, ht2 = mlp_train(train_docs, config=None, use_headers=True)

# Save Model
save_value('neural_model_100_100_8020', m2)
save_value('neural_model_tt_100_100_8020', tt2)
save_value('neural_model_ht_100_100_8020', ht2)

Iteration 1, loss = 0.52337805
Iteration 2, loss = 0.10967314
Iteration 3, loss = 0.06513091
Iteration 4, loss = 0.05140556
Iteration 5, loss = 0.04262207
Iteration 6, loss = 0.03943536
Iteration 7, loss = 0.03670238
Iteration 8, loss = 0.03456121
Iteration 9, loss = 0.03334999
Iteration 10, loss = 0.03309002
Iteration 11, loss = 0.03212916
Iteration 12, loss = 0.03062918
Iteration 13, loss = 0.03004202
Iteration 14, loss = 0.03040245
Iteration 15, loss = 0.03014385
Iteration 16, loss = 0.02993898
Iteration 17, loss = 0.02914089
Iteration 18, loss = 0.02938927
Iteration 19, loss = 0.02816157
Iteration 20, loss = 0.02848160
Iteration 21, loss = 0.02815876
Iteration 22, loss = 0.02791965
Iteration 23, loss = 0.02752088
Iteration 24, loss = 0.02760487
Iteration 25, loss = 0.02714258
Iteration 26, loss = 0.02744343


In [None]:
m2 = load_value('neural_model_100_100_8020')
tt2 = load_value('neural_model_tt_100_100_8020')
ht2 = load_value('neural_model_ht_100_100_8020')
# Model Test
p, r, cm = model_test(m2, tt2, ht2, test_docs)
print('Precision:', p)
print('Recall:', r)
print(cm)

## Without Headers

In [54]:
# Training
m3, tt3, ht3 = mlp_train(train_docs, config=None, use_headers=False)

Iteration 1, loss = 0.56314502
Iteration 2, loss = 0.12537349
Iteration 3, loss = 0.08026345
Iteration 4, loss = 0.06622139
Iteration 5, loss = 0.05712021
Iteration 6, loss = 0.05104173
Iteration 7, loss = 0.04939483
Iteration 8, loss = 0.04801441
Iteration 9, loss = 0.04716561
Iteration 10, loss = 0.04639908
Iteration 11, loss = 0.04508791
Iteration 12, loss = 0.04596397
Iteration 13, loss = 0.04444380
Iteration 14, loss = 0.04442796
Iteration 15, loss = 0.04403545
Iteration 16, loss = 0.04320014
Iteration 17, loss = 0.04549352
Iteration 18, loss = 0.04350482
Iteration 19, loss = 0.04310921
Iteration 20, loss = 0.04252026
Iteration 21, loss = 0.04250790
Iteration 22, loss = 0.04237036
Iteration 23, loss = 0.04256695
Iteration 24, loss = 0.04243339
Iteration 25, loss = 0.04217606
Iteration 26, loss = 0.04198119
Iteration 27, loss = 0.04228552
Iteration 28, loss = 0.04226891
Iteration 29, loss = 0.04188171
Iteration 30, loss = 0.04201180
Iteration 31, loss = 0.04202332
Iteration 32, los

In [55]:
# Save Model
save_value('neural_model_100_100_nohead', m3)
save_value('neural_model_tt_100_100_nohead', tt3)

In [None]:
# Model Test
m3 = load_value('neural_model_100_100_nohead')
tt3 = load_value('neural_model_tt_100_100_nohead')
ht3 = None

p, r, cm = model_test(m3, tt3, ht3, list(processed_documents), use_headers=False)
print('Precision:', p)
print('Recall:', r)
print(cm)