In [2]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Standard library
import csv
import logging
import os

# Setup Logging
# Record layout: timestamp - level - message.
logging_format = '%(asctime)s - %(levelname)s - %(message)s'
# filename=None keeps records on the default stream handler instead of a log file.
logging.basicConfig(
    format=logging_format,
    filename=None,
    level=logging.INFO,
)
# Logger used by every cell in this notebook.
log = logging.getLogger(__name__)
    
from att_emb_enc import preproc as pp
from att_emb_enc import utils
from att_emb_enc import embed
from tqdm import tqdm

# Sklearn
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


### 1. Create Input Parameters For Different Settings

In [4]:
# Input Processing Parameters
# Each combination of vocabulary size (max_features) and sequence length (maxlen)
# becomes one dict in param_grid.
param_grid = list(ParameterGrid({'max_features': [10000, 20000], 'maxlen': [300]}))

log.info("Creating Inputs and Embedding Matrix Files")
for grid in tqdm(param_grid):
    # Preprocessed inputs for this setting. The helper name and the logged
    # "Input Data Loaded From ..." lines indicate results are cached on disk.
    # NOTE(review): the meaning of the positional False flag is not visible here — confirm.
    X, Y, vocabulary = utils.create_or_load_preproc(
        grid['max_features'],
        grid['maxlen'],
        False,
    )

    # Embedding matrix for the same vocabulary; the run log shows it being saved as a pickle.
    # 300 is presumably the embedding dimension (the loaded vector files are 300d) — TODO confirm.
    embed_matrix = utils.create_or_load_embeddings(
        vocabulary,
        grid['max_features'],
        300,
    )
log.info("All Inputs and Embedding Matrix Files Created")

2019-06-10 14:54:33,375 - INFO - Creating Inputs and Embedding Matrix Files
  0%|          | 0/2 [00:00<?, ?it/s]2019-06-10 14:54:33,459 - INFO - Input Data Loaded From data/processed_inputs/X_Y_word_index_maxfeat10000_maxlen300.pkl


Loaded paragram_300_sl999.txt
Loaded glove.840B.300d.txt
Loaded wiki-news-300d-1M.vec


2019-06-10 14:58:57,829 - INFO - 'pattern' package not found; tag filters are not available for English
2019-06-10 14:58:57,836 - INFO - loading projection weights from data/embeddings/GoogleNews-vectors-negative300.bin
2019-06-10 14:59:29,064 - INFO - loaded (3000000, 300) matrix from data/embeddings/GoogleNews-vectors-negative300.bin
2019-06-10 14:59:29,582 - INFO - Embeddings Saved to data/processed_embeddings/avg_embeddings_maxfeat10000.pkl

 50%|█████     | 1/2 [04:56<04:56, 296.20s/it]2019-06-10 14:59:29,648 - INFO - Input Data Loaded From data/processed_inputs/X_Y_word_index_maxfeat20000_maxlen300.pkl


Loaded GoogleNews-vectors-negative300.bin
Loaded paragram_300_sl999.txt
Loaded glove.840B.300d.txt


2019-06-10 15:03:48,565 - INFO - loading projection weights from data/embeddings/GoogleNews-vectors-negative300.bin


Loaded wiki-news-300d-1M.vec


2019-06-10 15:04:20,281 - INFO - loaded (3000000, 300) matrix from data/embeddings/GoogleNews-vectors-negative300.bin
2019-06-10 15:04:20,932 - INFO - Embeddings Saved to data/processed_embeddings/avg_embeddings_maxfeat20000.pkl

100%|██████████| 2/2 [09:47<00:00, 294.75s/it]
2019-06-10 15:04:20,933 - INFO - All Inputs and Embedding Matrix Files Created


Loaded GoogleNews-vectors-negative300.bin


### 2. Prepare Data For Training

In [None]:
# Input Processing Parameters
# Every cached input file (name contains 'X_Y') is crossed with each hidden size.
# `os` and `csv` are provided by the notebook's import cell.
# NOTE(review): `m` is not imported anywhere in the visible notebook — confirm which
# project module it aliases and import it alongside the other att_emb_enc imports.
input_dir = 'data/processed_inputs/'
# Sorted so the grid order is identical on every run (os.listdir order is arbitrary).
input_files = sorted(input_dir + fn for fn in os.listdir(input_dir) if 'X_Y' in fn)
param_grid = {'n_hidden': [32, 64], 'input_file': input_files}
param_grid = list(ParameterGrid(param_grid))
out_file = 'data/results.csv'
overwrite_results = False

# Overwrite Results if Desired: truncate the results CSV and write a fresh header row.
if overwrite_results:
    # The with-block closes the handle even if the write raises; newline='' is the
    # csv module's documented way to open files so no extra blank rows are written.
    with open(out_file, 'w', newline='') as out_connection:
        csv.writer(out_connection).writerow([
            'input_f', 'maxlen', 'max_features', 'n_hidden',
            'loss_m', 'loss_std', 'acc_m', 'acc_std'
        ])

# Run Parameters and Aggregate Results
for grid in tqdm(param_grid):
    # Plain truthiness instead of `is True`: an identity check would treat a
    # numpy/pandas boolean True as "not ran" and silently re-run the grid.
    if m.check_if_grid_has_ran(grid, out_file):
        print('Grid Already Ran, Skipping')
    else:
        m.unpack_and_test_grid(grid, dropout=False, out_file=out_file)

In [None]:
# Bulk Process All Articles
# Left disabled; presumably a one-off preprocessing step whose output is reused — TODO confirm.
# prep_bert(maxlen=300)

# Split Test Set From All Articles
# NOTE(review): `load_bert` is not imported in the visible notebook — confirm its source module.
# The second argument (0.1) is presumably the held-out test fraction — TODO confirm.
X_train, X_test, y_train, y_test = load_bert(None,0.1)
# Report the held-out labels: the message says "Testing Shape", so it must log
# y_test (the original logged y_train under that label).
log.info(f"Testing Shape = {y_test.shape}")

In [None]:
# Create Parameter Grid
# One model is trained per training-set size, with a single hidden size.
n_sample_li = [1000, 5000, 10000, 20000, 30000]
n_hidden = [64]
param_grid = list(ParameterGrid({'n_hidden': n_hidden, 'number_of_texts': n_sample_li}))

# One accumulator list per reported column; each gets one entry per trained model.
n_samples_li, n_hidden_li = [], []
val_loss_li, val_acc_li, val_prec_li, val_rec_li = [], [], [], []
test_loss_li, test_acc_li, test_prec_li, test_rec_li = [], [], [], []

for ix, grid in enumerate(param_grid):

    # Get Data for sample size: the first `number_of_texts` training rows, copied so
    # the full training arrays are left untouched.
    X = X_train[:grid['number_of_texts']].copy()
    Y = y_train[:grid['number_of_texts']].copy()
    # shuffle=False keeps row order, so the validation split is the trailing 25%.
    Xtr, Xval, ytr, yval = train_test_split(
        X, Y, test_size=0.25, shuffle=False, random_state=42)

    # Train Model
    # NOTE(review): `mod` is not imported in the visible notebook — confirm its source module.
    model, history = mod.train_and_return_model(Xtr, Xval, ytr, yval, grid['n_hidden'])

    # Extract Results
    # Assumes both helpers yield (loss, accuracy, precision, recall) in that order — TODO confirm.
    val_loss, val_acc, val_prec, val_rec = utils.extract_validation_results(history)
    test_loss, test_acc, test_prec, test_rec = model.evaluate(
        X_test, y_test, batch_size=1024, verbose=0)

    # Save To Lists
    for results_li, value in (
        (n_samples_li, grid['number_of_texts']),
        (n_hidden_li, grid['n_hidden']),
        (val_loss_li, val_loss),
        (val_acc_li, val_acc),
        (val_prec_li, val_prec),
        (val_rec_li, val_rec),
        (test_loss_li, test_loss),
        (test_acc_li, test_acc),
        (test_prec_li, test_prec),
        (test_rec_li, test_rec),
    ):
        results_li.append(value)

    # Log Results
    log.info(f"Model {ix} - Prec {test_prec} - Rec {test_rec}")
    log.info(f"Settings - n_samples = {grid['number_of_texts']}, n_hidden = {grid['n_hidden']}")

In [None]:
import pandas as pd

# Collect the per-model accumulator lists into one table (one row per trained model).
result_columns = {
    'n_samples': n_samples_li,
    'n_hidden': n_hidden_li,
    'val_loss': val_loss_li,
    'val_accuracy': val_acc_li,
    'val_precision': val_prec_li,
    'val_recall': val_rec_li,
    'test_loss': test_loss_li,
    'test_accuracy': test_acc_li,
    'test_precision': test_prec_li,
    'test_recall': test_rec_li,
}
results_df = pd.DataFrame(result_columns)

# Append F1 - Score
# Computed row by row from each split's precision and recall; validation first,
# then test, so the new columns keep that order.
for split in ('val', 'test'):
    results_df[f'{split}_f1'] = results_df.apply(
        lambda row, split=split: utils.calc_f1(
            row[f'{split}_precision'], row[f'{split}_recall']),
        axis=1,
    )

# Display the table with the lowest test loss first.
results_df.sort_values(by='test_loss')