In [18]:
import os, sys
sys.path.append(os.path.abspath('./src'))
import json
from utils import save_value, load_value, load_env_keys

In [19]:
# PATHS
data_path = '/scratch/juanmoo1'
EMA_dump_path = os.path.join(data_path, './jsons/EMA_dump.json')
EMA_xmls_path = os.path.join(data_path, './xmls/')
EMA_annotations_path = os.path.join(data_path, './bayer/VendorEMAforMIT/annotations.xlsx')

pickle_dumps_path = os.path.join(data_path, './pickle_dumps/')
checkpoint_file = os.path.normpath(os.path.join(pickle_dumps_path, 'checkpoint.pickle'))

In [20]:
processed_documents = load_value('processed_documents', path=checkpoint_file)

## Train/Test Procedures for MLP

In [21]:
import random

def cross_validation(doc_list, train_algo, test_algo, k, verbose=False, config=None):
    N = len(doc_list)
    size = N//k
    indeces = list(range(N))
    random.shuffle(indeces)
    all_indeces = set(indeces)
    
    pres_list = []
    rec_list = []
    
    for j in range(N//size):
        train_indeces = indeces[j * size:(j + 1) * size] + indeces[size * k + j: size * k + j + 1]
        test_indeces = list(all_indeces - set(train_indeces))
        
        train_docs = [doc_list[i] for i in train_indeces]
        test_docs = [doc_list[i] for i in test_indeces]
        
        if verbose:
            print('Fold %d starting!'%(j + 1))
        
        m, tt, ht = train_algo(train_docs, config=config)
        pres, rec = test_algo(m, tt, ht, test_docs)
        
        pres_list.append(pres)
        rec_list.append(rec)
        
        if verbose:
            print('precision:', pres)
            print('recall:', rec)
            print('-' * 10 + '\n')
    
    return sum(pres_list)/k, sum(rec_list)/k

In [22]:
from scipy.sparse import hstack, vstack
from functools import reduce
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score

def model_test(model, text_tokenizer, header_tokenizer, doc_list, use_headers = True):
    test_texts = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['texts'] for doc_name in doc_list])
    tokenized_texts = text_tokenizer.transform(test_texts)
    test_labels = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['labels'] for doc_name in doc_list])
    
    if use_headers:
        test_header1 = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['header1'] for doc_name in doc_list])
        test_header2 = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['header2'] for doc_name in doc_list])
        tokenized_header1 = header_tokenizer.transform(test_header1)
        tokenized_header2 = header_tokenizer.transform(test_header2)
        X_test = hstack([tokenized_texts, tokenized_header1, tokenized_header2])
    else:
        X_test = tokenized_texts
        
    Y_test = test_labels
    pred = model.predict(X_test)
    cm = np.array(confusion_matrix(Y_test, pred))
    
    # Diagonal elemetns were correctly classified
    diagonal = cm.diagonal()
    
    # Input class Counts
    class_sum = cm.sum(axis=1)
    
    # Predicted class counts
    pred_sum = cm.sum(axis=0)
    
    # Per-class performance w/ no-examples -> 0 perf
    precision = np.where(class_sum == 0, 0, diagonal/class_sum)
    recall = np.where(pred_sum == 0, 0, diagonal/pred_sum)
    
    # Frequency Weighted Performance
    c_freq = cm.sum(axis=1)/cm.sum()
    pres = c_freq * precision
    rec = c_freq * recall
    
    # Remove 'other' Category
    c_freq = c_freq[0:1] + c_freq[2:]
    pres = pres[0:1] + pres[2:]
    rec = rec[0:1] + rec[2:]
    
    return pres.sum()/c_freq.sum(), rec.sum()/c_freq.sum(), cm

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier

def mlp_train(doc_list, config=None, use_headers=True):
    train_texts = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['texts'] for doc_name in doc_list])
    train_labels = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['labels'] for doc_name in doc_list])
    
    if use_headers:
        train_header1 = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['header1'] for doc_name in doc_list])
        train_header2 = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['header2'] for doc_name in doc_list])

    if config is None:
        config = load_value('best_header_config', path=checkpoint_file)
        
    text_tokenizer = CountVectorizer(ngram_range=config['ngram_config'], stop_words=config['stop_config'])
    
    if use_headers:
        header_tokenizer = CountVectorizer(ngram_range=config['ngram_config'], stop_words=config['stop_config'])
        tokenized_header1 = header_tokenizer.fit_transform(train_header1)
        tokenized_header2 = header_tokenizer.transform(train_header2)
    else:
        header_tokenizer = None

    
    tokenized_texts = text_tokenizer.fit_transform(train_texts)
    
    if use_headers:
        X_train = hstack([tokenized_texts, tokenized_header1, tokenized_header2])
    else:
        X_train = tokenized_texts
        
        
    Y_train = train_labels
    
    model = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(100, 100), random_state=None, verbose=True)
    model.fit(X_train, Y_train)
    
    return (model, text_tokenizer, header_tokenizer)

In [None]:
m, tt, ht = mlp_train(list(processed_documents), config=None)
# Save Model
save_value('neural_model_100_100', path=checkpoint_file)
save_value('neural_model_tt_100_100', tt, path=checkpoint_file)
save_value('neural_model_ht_100_100', ht, path=checkpoint_file)

In [4]:
m = load_value('neural_model_100_100', path=checkpoint_file)
tt = load_value('neural_model_tt_100_100', path=checkpoint_file)
ht = load_value('neural_model_ht_100_100', path=checkpoint_file)

In [5]:
type(m)

NoneType

In [8]:
p, r, cm = model_test(m, tt, ht, list(processed_documents))
print('Precision:', p)
print('Recall:', r)
print(cm)

AttributeError: 'NoneType' object has no attribute 'transform'

# 80/20 Train-test

In [30]:
# Data Split
import random

names = list(processed_documents)
random.shuffle(names)

i = int(.8 * len(names))
train_docs = names[:i]
test_docs = names[i:]

# Save Used Split
save_value('8020_split_train', train_docs, path=checkpoint_file)
save_value('8020_split_test', test_docs, path=checkpoint_file)

In [31]:
# Model Training
m2, tt2, ht2 = mlp_train(train_docs, config=None, use_headers=True)

# Save Model
save_value('neural_model_100_100_8020', m2, path=checkpoint_file)
save_value('neural_model_tt_100_100_8020', tt2, path=checkpoint_file)
save_value('neural_model_ht_100_100_8020', ht2, path=checkpoint_file)

Iteration 1, loss = 0.53449386
Iteration 2, loss = 0.11339391
Iteration 3, loss = 0.06942646
Iteration 4, loss = 0.05202540
Iteration 5, loss = 0.04540920
Iteration 6, loss = 0.04124154
Iteration 7, loss = 0.03727206
Iteration 8, loss = 0.03557919
Iteration 9, loss = 0.03419705
Iteration 10, loss = 0.03378738
Iteration 11, loss = 0.03323914
Iteration 12, loss = 0.03285407
Iteration 13, loss = 0.03114829
Iteration 14, loss = 0.03106815
Iteration 15, loss = 0.03023612
Iteration 16, loss = 0.03095014
Iteration 17, loss = 0.03062793
Iteration 18, loss = 0.03030639
Iteration 19, loss = 0.03057404
Iteration 20, loss = 0.02969658
Iteration 21, loss = 0.02933685
Iteration 22, loss = 0.02931908
Iteration 23, loss = 0.02899157
Iteration 24, loss = 0.02892000
Iteration 25, loss = 0.02909354
Iteration 26, loss = 0.02836927
Iteration 27, loss = 0.02849505
Iteration 28, loss = 0.02862916
Iteration 29, loss = 0.02800112
Iteration 30, loss = 0.02859718
Iteration 31, loss = 0.02795442
Iteration 32, los

In [40]:
m2 = load_value('neural_model_100_100_8020', path=checkpoint_file)
tt2 = load_value('neural_model_tt_100_100_8020', path=checkpoint_file)
ht2 = load_value('neural_model_ht_100_100_8020', path=checkpoint_file)
test_docs = load_value('8020_split_test', path=checkpoint_file)

# Model Test
p, r, cm = model_test(m2, tt2, ht2, test_docs)
print('Precision:', p)
print('Recall:', r)
print(cm)

Precision: 0.4799999999999999
Recall: 0.2781074979650294
[[  47   15    0    0    0    0    0    0    0    7    0    9]
 [  99 5887    7    6    0   13    0    1    0   32    0  128]
 [   0    3    0    0    0    1    0    0    0    0    0    0]
 [   0    1    0    0    0    0    0    0    0    0    0    0]
 [   0   19    0    0    0    0    0    0    0    0    0    2]
 [   0    0    0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    1    0    0]
 [   1   12    0    0    0    0    0    0    0    0    0    5]
 [   0    1    0    0    0    0    0    0    0    0    0    0]
 [   2   64    0    0    0    1    0    0    0   33    0    4]
 [   0    8    0    0    0    0    0    0    0    0    0    0]
 [   4  124    0    0    0    0    0    2    0    7    0   25]]


  precision = np.where(class_sum == 0, 0, diagonal/class_sum)
  recall = np.where(pred_sum == 0, 0, diagonal/pred_sum)


## Without Headers

In [38]:
# Training
m3, tt3, ht3 = mlp_train(train_docs, config=None, use_headers=False)
# Save Model
save_value('neural_model_100_100_nohead', m3, path=checkpoint_file)
save_value('neural_model_tt_100_100_nohead', tt3, path=checkpoint_file)

Iteration 1, loss = 0.84406257
Iteration 2, loss = 0.15876564
Iteration 3, loss = 0.10261932
Iteration 4, loss = 0.08203058
Iteration 5, loss = 0.07047248
Iteration 6, loss = 0.06466662
Iteration 7, loss = 0.06038527
Iteration 8, loss = 0.05784284
Iteration 9, loss = 0.05684197
Iteration 10, loss = 0.05629444
Iteration 11, loss = 0.05483570
Iteration 12, loss = 0.05396532
Iteration 13, loss = 0.05399739
Iteration 14, loss = 0.05299765
Iteration 15, loss = 0.05296739
Iteration 16, loss = 0.05283849
Iteration 17, loss = 0.05256334
Iteration 18, loss = 0.05262052
Iteration 19, loss = 0.05186606
Iteration 20, loss = 0.05175251
Iteration 21, loss = 0.05237662
Iteration 22, loss = 0.05190384
Iteration 23, loss = 0.05168975
Iteration 24, loss = 0.05156262
Iteration 25, loss = 0.05153554
Iteration 26, loss = 0.05160857
Iteration 27, loss = 0.05218182
Iteration 28, loss = 0.05098553
Iteration 29, loss = 0.05227251
Iteration 30, loss = 0.05169069
Iteration 31, loss = 0.05092994
Iteration 32, los

In [41]:
# Model Test
m3 = load_value('neural_model_100_100_nohead', path=checkpoint_file)
tt3 = load_value('neural_model_tt_100_100_nohead', path=checkpoint_file)
ht3 = None

p, r, cm = model_test(m3, tt3, ht3, list(processed_documents), use_headers=False)
print('Precision:', p)
print('Recall:', r)
print(cm)

Precision: 0.564531104921077
Recall: 0.7621791303973712
[[  134   120     0     0     0     0     0     0     0     8     0     2]
 [   53 25021     1     1     6     8     0     3     0    64     4    68]
 [    0    33    90     0     0     0     0     0     0     0     0     0]
 [    0    11     0    81     0     0     0     0     0     0     0     0]
 [    0    22     1     0    18     0     0     0     0     0     0     0]
 [    0    40     0     0     0    36     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     1     0     0]
 [    0    27     0     0     0     0     0    13     0     0     0     1]
 [    0     1     0     0     0     0     0     0     3     0     0     0]
 [    2   132     0     0     0     0     0     0     0   204     0     0]
 [    0     6     0     0     0     0     0     0     0     0    11     4]
 [    2   287     0     0     0     0     0     2     0     4     0   636]]


  recall = np.where(pred_sum == 0, 0, diagonal/pred_sum)
