In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Setup Logging
import logging
logging_format = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO,filename=None, format=logging_format)
log = logging.getLogger(__name__)
    
from minimal_bert import preproc as pp
from minimal_bert import modelling as mod
from minimal_bert import utils
from minimal_bert.utils import prepare_and_save_bert_encoded_df as prep_bert
from minimal_bert.utils import load_bert_and_prep_for_training as load_bert

# Sklearn
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split

# Datetime
from datetime import datetime

Using TensorFlow backend.


In [3]:
# Bulk Process All Articles
# prep_bert(maxlen=300)

# Split Test Set From All Articles
X_train, X_test, y_train, y_test = load_bert(None,0.1)
log.info(f"Training Shape = {y_test.shape}")
log.info(f"Testing Shape = {y_train.shape}")

2019-06-10 16:56:56,340 - INFO - Training Shape = (3489, 24)
2019-06-10 16:56:56,340 - INFO - Testing Shape = (31397, 24)


In [None]:
# Create Parameter Grid
param_grid = {
    'n_hidden': [32, 64, 128], 
    'number_of_texts': [1000, 1875, 3750, 7500, 15000, 30000], 
    'activation': ['softmax', 'sigmoid']
}
param_grid = list(ParameterGrid(param_grid))

n_samples_li = []
n_hidden_li = []
activation_li = []
training_time_li = []
val_loss_li = []
val_acc_li = []
val_prec_li = []
val_rec_li = []
test_loss_li = []
test_acc_li = []
test_prec_li = []
test_rec_li = []

for ix, grid in enumerate(param_grid):
    start = datetime.now().timestamp()
    # Get Data for sample size
    X = X_train[0:grid['number_of_texts']].copy()
    Y = y_train[0:grid['number_of_texts']].copy()
    Xtr, Xval, ytr, yval = train_test_split(
        X, Y, test_size=0.25, shuffle=False, random_state=42)
    
    # Train Model
    model, history = mod.train_and_return_model(
        Xtr, Xval, ytr, yval, grid['n_hidden'], grid['activation'])
    
    
    # Extract Results
    val_loss, val_acc, val_prec, val_rec = utils.extract_validation_results(history)
    test_loss, test_acc, test_prec, test_rec = model.evaluate(X_test, y_test, batch_size=1024, verbose=0)
    end = datetime.now().timestamp()
    
    # Save To Lists
    n_samples_li.append(grid['number_of_texts'])
    n_hidden_li.append(grid['n_hidden'])
    training_time_li.append(end - start)
    activation_li.append(grid['activation'])
    val_loss_li.append(val_loss)
    val_acc_li.append(val_acc)
    val_prec_li.append(val_prec)
    val_rec_li.append(val_rec)
    test_loss_li.append(test_loss)
    test_acc_li.append(test_acc)
    test_prec_li.append(test_prec)
    test_rec_li.append(test_rec)
    
    # Log Results
    log.info(f"Model {ix} - Prec {test_prec} - Rec {test_rec}")
    log.info(f"Settings - n_samples = {grid['number_of_texts']}, n_hidden = {grid['n_hidden']}")

In [None]:
import pandas as pd
results_df = pd.DataFrame({
    'n_samples': n_samples_li,
    'n_hidden': n_hidden_li,
    'activation':activation_li,
    'training_time': training_time_li,
    'val_loss': val_loss_li,
    'val_accuracy': val_acc_li,
    'val_precision': val_prec_li,
    'val_recall': val_rec_li,
    'test_loss': test_loss_li,
    'test_accuracy': test_acc_li,
    'test_precision': test_prec_li,
    'test_recall': test_rec_li,
})
# Append F1 - Score
results_df['val_f1'] = results_df.apply(lambda x: utils.calc_f1(x.val_precision, x.val_recall), axis=1)
results_df['test_f1'] = results_df.apply(lambda x: utils.calc_f1(x.test_precision, x.test_recall), axis=1)
results_df.sort_values('test_loss', ascending=True).to_csv('data/results_bert.csv', index=False)
results_df.sort_values('test_loss', ascending=True)

In [2]:
6 * 4 * 2

48