In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Setup Logging
import logging
logging_format = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO,filename=None, format=logging_format)
log = logging.getLogger(__name__)
    
from minimal_bert import preproc as pp
from minimal_bert import modelling as mod
from minimal_bert import utils
from minimal_bert.utils import prepare_and_save_bert_encoded_df as prep_bert
from minimal_bert.utils import load_bert_and_prep_for_training as load_bert

# Sklearny
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [3]:
# Bulk Process All Articles
# prep_bert(maxlen=300)

# Split Test Set From All Articles
X_train, X_test, y_train, y_test = load_bert(None,0.1)
log.info(f"Testing Shape = {y_train.shape}")

2019-06-10 13:48:53,252 - INFO - Testing Shape = (31397, 24)


In [None]:
# Create Parameter Grid
n_sample_li = [1000, 5000, 10000, 20000, 30000, None]
n_hidden = [32, 64, 128, 256]
param_grid = {'n_hidden': n_hidden, 'number_of_texts': n_sample_li}
param_grid = list(ParameterGrid(param_grid))

n_samples_li = []
n_hidden_li = []
val_loss_li = []
val_acc_li = []
val_prec_li = []
val_rec_li = []
test_loss_li = []
test_acc_li = []
test_prec_li = []
test_rec_li = []
for ix, grid in enumerate(param_grid):
    
    # Get Data for sample size
    X = X_train[0:grid['number_of_texts']].copy()
    Y = y_train[0:grid['number_of_texts']].copy()
    Xtr, Xval, ytr, yval = train_test_split(
        X, Y, test_size=0.25, shuffle=True, random_state=42)
    
    # Train Model
    model, history = mod.train_and_return_model(
        Xtr, Xval, ytr, yval, grid['n_hidden'])
    
    # Extract Results
    val_loss, val_acc, val_prec, val_rec = utils.extract_validation_results(history)
    test_loss, test_acc, test_prec, test_rec = model.evaluate(X_test, y_test, batch_size=1024, verbose=0)
    
    # Save To Lists
    n_samples_li.append(grid['number_of_texts'])
    n_hidden_li.append(grid['n_hidden'])
    val_loss_li.append(val_loss)
    val_acc_li.append(val_acc)
    val_prec_li.append(val_prec)
    val_rec_li.append(val_rec)
    test_loss_li.append(test_loss)
    test_acc_li.append(test_acc)
    test_prec_li.append(test_prec)
    test_rec_li.append(test_rec)
    
    # Log Results
    log.info(f"Model {ix} - Prec {test_prec} - Rec {test_rec}")
    log.info(f"Settings - n_samples = {grid['number_of_texts']}, n_hidden = {grid['n_hidden']}")

2019-06-10 13:49:17,862 - INFO - Model 0 - Prec 0.8126036483796238 - Rec 0.8641975308133922
2019-06-10 13:49:17,862 - INFO - Settings - n_samples = 1000, n_hidden = 32


Epoch 00156: early stopping


2019-06-10 13:49:22,563 - INFO - Model 1 - Prec 0.8456790122982223 - Rec 0.885949441452913
2019-06-10 13:49:22,563 - INFO - Settings - n_samples = 5000, n_hidden = 32


Epoch 00084: early stopping


2019-06-10 13:49:29,260 - INFO - Model 2 - Prec 0.8227582590024772 - Rec 0.9223985890110289
2019-06-10 13:49:29,260 - INFO - Settings - n_samples = 10000, n_hidden = 32


Epoch 00063: early stopping


2019-06-10 13:49:41,012 - INFO - Model 3 - Prec 0.8375470682712454 - Rec 0.9153439152901032
2019-06-10 13:49:41,013 - INFO - Settings - n_samples = 20000, n_hidden = 32


Epoch 00061: early stopping


2019-06-10 13:49:58,821 - INFO - Model 4 - Prec 0.8482339955381769 - Rec 0.9035861257552271
2019-06-10 13:49:58,821 - INFO - Settings - n_samples = 30000, n_hidden = 32


Epoch 00057: early stopping


2019-06-10 13:50:15,029 - INFO - Model 5 - Prec 0.8558352402256387 - Rec 0.8794826572087311
2019-06-10 13:50:15,030 - INFO - Settings - n_samples = None, n_hidden = 32


Epoch 00051: early stopping


2019-06-10 13:50:17,677 - INFO - Model 6 - Prec 0.8120260021228588 - Rec 0.8812463256389625
2019-06-10 13:50:17,677 - INFO - Settings - n_samples = 1000, n_hidden = 64


Epoch 00126: early stopping


2019-06-10 13:50:22,637 - INFO - Model 7 - Prec 0.8298097251147035 - Rec 0.9229864784877726
2019-06-10 13:50:22,637 - INFO - Settings - n_samples = 5000, n_hidden = 64


Epoch 00071: early stopping


2019-06-10 13:50:28,411 - INFO - Model 8 - Prec 0.8250652741083516 - Rec 0.9288653732552107
2019-06-10 13:50:28,411 - INFO - Settings - n_samples = 10000, n_hidden = 64


Epoch 00047: early stopping


2019-06-10 13:50:37,870 - INFO - Model 9 - Prec 0.8382193267731693 - Rec 0.9077013520924337
2019-06-10 13:50:37,871 - INFO - Settings - n_samples = 20000, n_hidden = 64


Epoch 00043: early stopping


2019-06-10 13:50:53,154 - INFO - Model 10 - Prec 0.841738197379735 - Rec 0.9223985890110289
2019-06-10 13:50:53,155 - INFO - Settings - n_samples = 30000, n_hidden = 64


Epoch 00047: early stopping


In [None]:
import pandas as pd
results_df = pd.DataFrame({
    'n_samples':n_samples_li,
    'n_hidden':n_hidden_li,
    'loss':loss_li,
    'accuracy':acc_li,
    'precision':prec_li,
    'recall':rec_li
}).sort_values('precision',ascending=False)
results_df