In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Configure logging for the notebook: INFO and above, no log file,
# every record rendered as "<timestamp> - <level> - <message>".
import logging

logging_format = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(
    format=logging_format,
    filename=None,
    level=logging.INFO,
)

# Logger used by every cell below
log = logging.getLogger(__name__)
    
from att_emb_enc import preproc as pp
from att_emb_enc import utils
from att_emb_enc import embed
from att_emb_enc import modelling as mod
from tqdm import tqdm

# Sklearn
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split

# Datetime
from datetime import datetime

Using TensorFlow backend.


In [3]:
# Load (or build) the model inputs and the embedding matrix for the same vocabulary.
# 20000 is the vocabulary cap; it shows up as "maxfeat20000" in both cache file names.
max_features = 20000

# NOTE(review): the meaning of the trailing False flag is not visible here; confirm in utils.
X, Y, vocabulary = utils.create_or_load_preproc(max_features, 300, False)
embed_matrix = utils.create_or_load_embeddings(vocabulary, max_features, 300, False)

log.info("Inputs and Embedding Matrix Files Loaded")

2019-06-11 07:05:12,913 - INFO - Input Data Loaded From data/processed_inputs/X_Y_word_index_maxfeat20000_maxlen300.pkl
2019-06-11 07:05:12,951 - INFO - Embeddings File Loaded From data/processed_embeddings/avg_embeddings_maxfeat20000.pkl

2019-06-11 07:05:12,951 - INFO - Inputs and Embedding Matrix Files Loaded


In [4]:
# Hold out 10% of all articles as the final test set (shuffled, fixed seed)
held_out_split = train_test_split(X, Y, test_size=0.1, shuffle=True, random_state=42)
X_train, X_test, y_train, y_test = held_out_split

# Report the label-matrix shape of each partition
for partition_name, labels in (("Training", y_train), ("Testing", y_test)):
    log.info(f"{partition_name} Shape = {labels.shape}")

2019-06-11 07:05:12,990 - INFO - Training Shape = (31397, 37)
2019-06-11 07:05:12,990 - INFO - Testing Shape = (3489, 37)


In [None]:
# Number of grid-search runs: 6 training-set sizes * 4 hidden sizes * 2 activations = 48
6 * 4 * 2

In [5]:
# Create Parameter Grid
# Every combination of hidden size, training-set size and output activation
# (4 * 6 * 2 = 48 runs). ParameterGrid iterates with the alphabetically last
# key ('number_of_texts') varying fastest and 'activation' slowest.
param_space = {
    'n_hidden': [16, 32, 64, 96],
    'number_of_texts': [1000, 1875, 3750, 7500, 15000, 30000],
    'activation': ['softmax', 'sigmoid']
}
param_grid = list(ParameterGrid(param_space))

# One list per result column; the results DataFrame cell below reads these names.
n_samples_li = []
n_hidden_li = []
activation_li = []
training_time_li = []
val_loss_li = []
val_acc_li = []
val_prec_li = []
val_rec_li = []
test_loss_li = []
test_acc_li = []
test_prec_li = []
test_rec_li = []

# NOTE(review): in the recorded run, models 0-23 (the 'softmax' half of the grid)
# all report test precision/recall of 0.0 -- confirm whether softmax is a
# meaningful output activation for these 37-column labels.
for ix, grid in enumerate(param_grid):
    n_texts = grid['number_of_texts']
    n_hidden = grid['n_hidden']
    activation = grid['activation']

    # Get Data for sample size: the first n_texts rows of the training split,
    # then an ordered 75/25 train/validation split. With shuffle=False the
    # split is deterministic, so no random_state is needed.
    X_train_grid = X_train[0:n_texts].copy()
    Y_train_grid = y_train[0:n_texts].copy()
    Xtr, Xval, ytr, yval = train_test_split(
        X_train_grid, Y_train_grid, test_size=0.25, shuffle=False)

    # Train Model -- time only the training call so that 'training_time'
    # does not also include the test-set evaluation below.
    # NOTE(review): the positional 300, 20000, 300 are passed through unchanged;
    # confirm their meaning against mod.train_and_return_model.
    start = datetime.now().timestamp()
    model, history = mod.train_and_return_model(
        Xtr, Xval, ytr, yval, embed_matrix, 300, 20000, 300, n_hidden, activation)
    end = datetime.now().timestamp()

    # Extract Results
    val_loss, val_acc, val_prec, val_rec = utils.extract_validation_results(history)
    test_loss, test_acc, test_prec, test_rec = model.evaluate(
        X_test, y_test, batch_size=1024, verbose=0)

    # Save To Lists
    n_samples_li.append(n_texts)
    n_hidden_li.append(n_hidden)
    activation_li.append(activation)
    training_time_li.append(end - start)
    val_loss_li.append(val_loss)
    val_acc_li.append(val_acc)
    val_prec_li.append(val_prec)
    val_rec_li.append(val_rec)
    test_loss_li.append(test_loss)
    test_acc_li.append(test_acc)
    test_prec_li.append(test_prec)
    test_rec_li.append(test_rec)

    # Log Results -- include the activation so that runs sharing the same
    # n_samples / n_hidden can be told apart in the log.
    log.info(f"Model {ix} - Prec {test_prec} - Rec {test_rec}")
    log.info(
        f"Settings - n_samples = {n_texts}, n_hidden = {n_hidden}, "
        f"activation = {activation}")

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


Epoch 00023: early stopping


2019-06-11 07:05:25,293 - INFO - Model 0 - Prec 0.0 - Rec 0.0
2019-06-11 07:05:25,295 - INFO - Settings - n_samples = 1000, n_hidden = 16


Epoch 00047: early stopping


2019-06-11 07:05:59,545 - INFO - Model 1 - Prec 0.0 - Rec 0.0
2019-06-11 07:05:59,547 - INFO - Settings - n_samples = 1875, n_hidden = 16


Epoch 00029: early stopping


2019-06-11 07:06:38,793 - INFO - Model 2 - Prec 0.0 - Rec 0.0
2019-06-11 07:06:38,795 - INFO - Settings - n_samples = 3750, n_hidden = 16


Epoch 00028: early stopping


2019-06-11 07:07:52,578 - INFO - Model 3 - Prec 0.0 - Rec 0.0
2019-06-11 07:07:52,580 - INFO - Settings - n_samples = 7500, n_hidden = 16


Epoch 00033: early stopping


2019-06-11 07:10:42,960 - INFO - Model 4 - Prec 0.0 - Rec 0.0
2019-06-11 07:10:42,962 - INFO - Settings - n_samples = 15000, n_hidden = 16


Epoch 00026: early stopping


2019-06-11 07:15:09,184 - INFO - Model 5 - Prec 0.0 - Rec 0.0
2019-06-11 07:15:09,185 - INFO - Settings - n_samples = 30000, n_hidden = 16


Epoch 00032: early stopping


2019-06-11 07:15:26,696 - INFO - Model 6 - Prec 0.0 - Rec 0.0
2019-06-11 07:15:26,697 - INFO - Settings - n_samples = 1000, n_hidden = 32


Epoch 00015: early stopping


2019-06-11 07:15:42,406 - INFO - Model 7 - Prec 0.0 - Rec 0.0
2019-06-11 07:15:42,407 - INFO - Settings - n_samples = 1875, n_hidden = 32


Epoch 00024: early stopping


2019-06-11 07:16:24,298 - INFO - Model 8 - Prec 0.0 - Rec 0.0
2019-06-11 07:16:24,300 - INFO - Settings - n_samples = 3750, n_hidden = 32


Epoch 00031: early stopping


2019-06-11 07:18:04,985 - INFO - Model 9 - Prec 0.0 - Rec 0.0
2019-06-11 07:18:04,986 - INFO - Settings - n_samples = 7500, n_hidden = 32


Epoch 00033: early stopping


2019-06-11 07:21:35,114 - INFO - Model 10 - Prec 0.0 - Rec 0.0
2019-06-11 07:21:35,115 - INFO - Settings - n_samples = 15000, n_hidden = 32


Epoch 00026: early stopping


2019-06-11 07:27:03,554 - INFO - Model 11 - Prec 0.0 - Rec 0.0
2019-06-11 07:27:03,555 - INFO - Settings - n_samples = 30000, n_hidden = 32


Epoch 00027: early stopping


2019-06-11 07:27:25,876 - INFO - Model 12 - Prec 0.0 - Rec 0.0
2019-06-11 07:27:25,877 - INFO - Settings - n_samples = 1000, n_hidden = 64


Epoch 00025: early stopping


2019-06-11 07:27:59,758 - INFO - Model 13 - Prec 0.0 - Rec 0.0
2019-06-11 07:27:59,759 - INFO - Settings - n_samples = 1875, n_hidden = 64


Epoch 00010: early stopping


2019-06-11 07:28:28,881 - INFO - Model 14 - Prec 0.0 - Rec 0.0
2019-06-11 07:28:28,882 - INFO - Settings - n_samples = 3750, n_hidden = 64


Epoch 00026: early stopping


2019-06-11 07:30:27,014 - INFO - Model 15 - Prec 0.0 - Rec 0.0
2019-06-11 07:30:27,016 - INFO - Settings - n_samples = 7500, n_hidden = 64


Epoch 00029: early stopping


2019-06-11 07:34:40,971 - INFO - Model 16 - Prec 0.0 - Rec 0.0
2019-06-11 07:34:40,972 - INFO - Settings - n_samples = 15000, n_hidden = 64


Epoch 00019: early stopping


2019-06-11 07:40:12,547 - INFO - Model 17 - Prec 0.0 - Rec 0.0
2019-06-11 07:40:12,549 - INFO - Settings - n_samples = 30000, n_hidden = 64


Epoch 00020: early stopping


2019-06-11 07:40:37,942 - INFO - Model 18 - Prec 0.0 - Rec 0.0
2019-06-11 07:40:37,943 - INFO - Settings - n_samples = 1000, n_hidden = 96


Epoch 00011: early stopping


2019-06-11 07:41:03,638 - INFO - Model 19 - Prec 0.0 - Rec 0.0
2019-06-11 07:41:03,639 - INFO - Settings - n_samples = 1875, n_hidden = 96


Epoch 00018: early stopping


2019-06-11 07:42:03,936 - INFO - Model 20 - Prec 0.0 - Rec 0.0
2019-06-11 07:42:03,937 - INFO - Settings - n_samples = 3750, n_hidden = 96


Epoch 00019: early stopping


2019-06-11 07:43:59,467 - INFO - Model 21 - Prec 0.0 - Rec 0.0
2019-06-11 07:43:59,468 - INFO - Settings - n_samples = 7500, n_hidden = 96


Epoch 00014: early stopping


2019-06-11 07:46:45,619 - INFO - Model 22 - Prec 0.0 - Rec 0.0
2019-06-11 07:46:45,620 - INFO - Settings - n_samples = 15000, n_hidden = 96


Epoch 00015: early stopping


2019-06-11 07:52:50,004 - INFO - Model 23 - Prec 0.0 - Rec 0.0
2019-06-11 07:52:50,006 - INFO - Settings - n_samples = 30000, n_hidden = 96


Epoch 00065: early stopping


2019-06-11 07:53:30,731 - INFO - Model 24 - Prec 0.0 - Rec 0.0
2019-06-11 07:53:30,732 - INFO - Settings - n_samples = 1000, n_hidden = 16


Epoch 00043: early stopping


2019-06-11 07:54:15,907 - INFO - Model 25 - Prec 0.0 - Rec 0.0
2019-06-11 07:54:15,908 - INFO - Settings - n_samples = 1875, n_hidden = 16


Epoch 00014: early stopping


2019-06-11 07:54:50,254 - INFO - Model 26 - Prec 0.0 - Rec 0.0
2019-06-11 07:54:50,254 - INFO - Settings - n_samples = 3750, n_hidden = 16


Epoch 00032: early stopping


2019-06-11 07:56:32,621 - INFO - Model 27 - Prec 0.4423076914571006 - Rec 0.27380952348356014
2019-06-11 07:56:32,622 - INFO - Settings - n_samples = 7500, n_hidden = 16


Epoch 00032: early stopping


2019-06-11 07:59:43,390 - INFO - Model 28 - Prec 0.6279069752839372 - Rec 0.3214285710459184
2019-06-11 07:59:43,391 - INFO - Settings - n_samples = 15000, n_hidden = 16


Epoch 00026: early stopping


2019-06-11 08:04:32,684 - INFO - Model 29 - Prec 0.5098039205690119 - Rec 0.30952380915532884
2019-06-11 08:04:32,685 - INFO - Settings - n_samples = 30000, n_hidden = 16


Epoch 00040: early stopping


2019-06-11 08:05:08,276 - INFO - Model 30 - Prec 0.0 - Rec 0.0
2019-06-11 08:05:08,276 - INFO - Settings - n_samples = 1000, n_hidden = 32


Epoch 00039: early stopping


2019-06-11 08:05:59,294 - INFO - Model 31 - Prec 0.0 - Rec 0.0
2019-06-11 08:05:59,295 - INFO - Settings - n_samples = 1875, n_hidden = 32


Epoch 00018: early stopping


2019-06-11 08:06:48,115 - INFO - Model 32 - Prec 0.0 - Rec 0.0
2019-06-11 08:06:48,116 - INFO - Settings - n_samples = 3750, n_hidden = 32


Epoch 00023: early stopping


2019-06-11 08:08:19,701 - INFO - Model 33 - Prec 0.49999999375 - Rec 0.04761904756235828
2019-06-11 08:08:19,702 - INFO - Settings - n_samples = 7500, n_hidden = 32


Epoch 00020: early stopping


2019-06-11 08:10:45,767 - INFO - Model 34 - Prec 0.38461538165680476 - Rec 0.05952380945294785
2019-06-11 08:10:45,768 - INFO - Settings - n_samples = 15000, n_hidden = 32


Epoch 00028: early stopping


2019-06-11 08:17:09,321 - INFO - Model 35 - Prec 0.605263157098338 - Rec 0.5476190469671203
2019-06-11 08:17:09,321 - INFO - Settings - n_samples = 30000, n_hidden = 32


Epoch 00022: early stopping


2019-06-11 08:17:43,228 - INFO - Model 36 - Prec 0.0 - Rec 0.0
2019-06-11 08:17:43,229 - INFO - Settings - n_samples = 1000, n_hidden = 64


Epoch 00033: early stopping


2019-06-11 08:18:42,449 - INFO - Model 37 - Prec 0.0 - Rec 0.0
2019-06-11 08:18:42,450 - INFO - Settings - n_samples = 1875, n_hidden = 64


Epoch 00032: early stopping


2019-06-11 08:20:16,399 - INFO - Model 38 - Prec 0.4482758605231867 - Rec 0.15476190457766442
2019-06-11 08:20:16,400 - INFO - Settings - n_samples = 3750, n_hidden = 64


Epoch 00028: early stopping


2019-06-11 08:22:43,560 - INFO - Model 39 - Prec 0.6923076896449704 - Rec 0.21428571403061225
2019-06-11 08:22:43,562 - INFO - Settings - n_samples = 7500, n_hidden = 64


Epoch 00023: early stopping


2019-06-11 08:27:02,318 - INFO - Model 40 - Prec 0.553191488184699 - Rec 0.30952380915532884
2019-06-11 08:27:02,320 - INFO - Settings - n_samples = 15000, n_hidden = 64


Epoch 00020: early stopping


2019-06-11 08:34:24,704 - INFO - Model 41 - Prec 0.7878787866850322 - Rec 0.6190476183106577
2019-06-11 08:34:24,706 - INFO - Settings - n_samples = 30000, n_hidden = 64


Epoch 00020: early stopping


2019-06-11 08:35:29,789 - INFO - Model 42 - Prec 0.0 - Rec 0.0
2019-06-11 08:35:29,790 - INFO - Settings - n_samples = 1000, n_hidden = 96


Epoch 00018: early stopping


2019-06-11 08:36:20,978 - INFO - Model 43 - Prec 0.0 - Rec 0.0
2019-06-11 08:36:20,980 - INFO - Settings - n_samples = 1875, n_hidden = 96


Epoch 00030: early stopping


2019-06-11 08:38:31,077 - INFO - Model 44 - Prec 0.5185185165980796 - Rec 0.16666666646825398
2019-06-11 08:38:31,078 - INFO - Settings - n_samples = 3750, n_hidden = 96


Epoch 00022: early stopping


2019-06-11 08:42:00,613 - INFO - Model 45 - Prec 0.49999999886363633 - Rec 0.26190476159297055
2019-06-11 08:42:00,615 - INFO - Settings - n_samples = 7500, n_hidden = 96


Epoch 00015: early stopping


2019-06-11 08:46:34,432 - INFO - Model 46 - Prec 0.4411764692906574 - Rec 0.17857142835884354
2019-06-11 08:46:34,434 - INFO - Settings - n_samples = 15000, n_hidden = 96


Epoch 00017: early stopping


2019-06-11 08:54:56,325 - INFO - Model 47 - Prec 0.6521739120982988 - Rec 0.5357142850765306
2019-06-11 08:54:56,328 - INFO - Settings - n_samples = 30000, n_hidden = 96


In [12]:
import pandas as pd

# Assemble one row per grid-search run from the lists filled by the training loop.
result_columns = {
    'n_samples': n_samples_li,
    'n_hidden': n_hidden_li,
    'activation': activation_li,
    'training_time': training_time_li,
    'val_loss': val_loss_li,
    'val_accuracy': val_acc_li,
    'val_precision': val_prec_li,
    'val_recall': val_rec_li,
    'test_loss': test_loss_li,
    'test_accuracy': test_acc_li,
    'test_precision': test_prec_li,
    'test_recall': test_rec_li,
}
results_df = pd.DataFrame(result_columns)

# Append F1 - Score for the validation split first, then the test split
for split_name in ('val', 'test'):
    results_df[f'{split_name}_f1'] = results_df.apply(
        lambda row, s=split_name: utils.calc_f1(row[f'{s}_precision'], row[f'{s}_recall']),
        axis=1,
    )

# Persist the runs ordered from lowest to highest test loss
ranked_results = results_df.sort_values('test_loss', ascending=True)
ranked_results.to_csv('data/results_enc_emb_att.csv', index=False)

In [13]:
# Show the grid-search runs ranked from lowest to highest test loss
results_df.sort_values(by='test_loss')

Unnamed: 0,n_samples,n_hidden,activation,training_time,val_loss,val_accuracy,val_precision,val_recall,test_loss,test_accuracy,test_precision,test_recall,val_f1,test_f1
47,30000,96,sigmoid,501.888605,0.160303,0.93595,0.664384,0.625806,0.163766,0.934776,0.652174,0.535714,0.644518,0.588235
41,30000,64,sigmoid,442.382329,0.16513,0.936807,0.837838,0.6,0.168144,0.936162,0.787879,0.619048,0.699248,0.693333
35,30000,32,sigmoid,383.550871,0.1661,0.934094,0.617834,0.625806,0.169503,0.931801,0.605263,0.547619,0.621795,0.575
29,30000,16,sigmoid,289.291727,0.17132,0.929452,0.56701,0.354839,0.175454,0.927053,0.509804,0.309524,0.436508,0.385185
46,15000,96,sigmoid,273.81542,0.175283,0.931596,0.480769,0.287356,0.184181,0.925751,0.441176,0.178571,0.359712,0.254237
28,15000,16,sigmoid,190.766743,0.177503,0.928872,0.5,0.275862,0.185247,0.92579,0.627907,0.321429,0.355556,0.425197
34,15000,32,sigmoid,146.063606,0.177813,0.928274,0.666667,0.068966,0.185629,0.925635,0.384615,0.059524,0.125,0.103093
40,15000,64,sigmoid,258.754622,0.175295,0.930422,0.5,0.287356,0.185879,0.927277,0.553191,0.309524,0.364964,0.396947
45,7500,96,sigmoid,209.534456,0.197488,0.92062,0.592593,0.333333,0.190144,0.924078,0.5,0.261905,0.426667,0.34375
39,7500,64,sigmoid,147.158974,0.197551,0.920332,0.65,0.270833,0.191587,0.922382,0.692308,0.214286,0.382353,0.327273
