In [1]:
import pandas as pd

fname = '../data/relevance_labeled_tweets_train.csv'

fname_test = '../data/relevance_labeled_tweets_holdout.csv'

dat = pd.read_csv(fname)
dat_test = pd.read_csv(fname_test)
dat.head(5)
dat['relevant_binary'] = dat['relevant'].apply(lambda x: 1 if x else 0)
dat_test['relevant_binary'] = dat_test['relevant'].apply(lambda x: 1 if x else 0)

In [2]:
train, test = dat, dat_test
import numpy as np
txt_train, y_train = np.array(train.text_cleaned), np.array(train.relevant_binary)
txt_test, y_test = np.array(test.text_cleaned), np.array(test.relevant_binary)
print("train size: {} (avg {} words".format(len(txt_train), round(train.text_cleaned.str.split(" ").str.len().mean())))
print('test size: {} (avg {} words)'.format(len(txt_test),round(test.text_cleaned.str.split(" ").str.len().mean())))

print("Train distribution")
print(train.relevant_binary.value_counts())
print("Test distribution")
print(test.relevant_binary.value_counts())


train size: 1599 (avg 15.0 words
test size: 400 (avg 15.0 words)
Train distribution
1    1318
0     281
Name: relevant_binary, dtype: int64
Test distribution
1    330
0     70
Name: relevant_binary, dtype: int64


In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


# set text-related hyper params 
maxlen = 150
max_features = 2000


# instantiate Tokenizer class (`num_words` to restrict vocab size)
# extract vocab and count words (makes several attrs available)
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(txt_train)

# integer encode the docs
x_train = tokenizer.texts_to_sequences(txt_train)
x_test = tokenizer.texts_to_sequences(txt_test)

# pad the sequences (default params `padding='pre', truncating='pre'`)
x_train = pad_sequences(x_train, maxlen=maxlen)
x_test = pad_sequences(x_test, maxlen=maxlen)

  return f(*args, **kwds)
Using TensorFlow backend.


In [39]:
from sklearn.utils.class_weight import compute_class_weight
weights = compute_class_weight('balanced', [0,1], y_train)
class_weights = {0: weights[0],
                 1: weights[1]}
print(class_weights)

{0: 2.8451957295373664, 1: 0.6066009104704098}


In [5]:
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from talos.model.early_stopper import early_stopper
from talos.model.normalizers import lr_normalizer
def relevance_model(x_train, y_train, x_val, y_val, params):
    model = Sequential()                            
    model.add(Embedding(input_dim=2000,
                        output_dim=params['hidden_dim']))
    model.add(LSTM(params['hidden_dim'], dropout=params['dropout'], recurrent_dropout=params['recurrent_dropout']))
    model.add(Dense(1,
                    activation=params['last_activation']))

    model.compile(optimizer=params['optimizer'](lr=lr_normalizer(params['lr'], params['optimizer'])),
                  loss=params['loss'],
                  metrics=['acc'])

    out = model.fit(x_train, y_train,
                    batch_size=params['batch_size'],
                    epochs=params['epochs'],
                    verbose=0,
                    validation_data=[x_val, y_val],
                    callbacks=early_stopper(params['epochs'], mode='strict'))
    
    return out, model

from keras.optimizers import Adam, Nadam
from keras.activations import sigmoid
from keras.losses import binary_crossentropy
#p = {'lr': [0.1, 0.2, 0.3, 0.4],
#     'hidden_dim': [100, 200, 500],
#     'batch_size': [128, 256],
#     'epochs': [5, 10],
#     'dropout': [0, 0.1, 0.2],
#     'recurrent_dropout': [0, 0.1, 0.2],
#     'optimizer': [Adam, Nadam],
#     'loss': [binary_crossentropy],
#     'last_activation': [sigmoid],
#     'weight_regulizer':[None]}

p = {'lr': [0.1, 0.2],
     'hidden_dim': [100, 200],
     'batch_size': [128, 256],
     'epochs': [5],
     'dropout': [0.1, 0.2],
     'recurrent_dropout': [0.1, 0.2],
     'optimizer': [Adam],
     'loss': [binary_crossentropy],
     'last_activation': [sigmoid],
     'weight_regulizer':[None]}

In [6]:
import talos as ta
h = ta.Scan(x=x_train, y=y_train, x_val=x_test, y_val=y_test, params=p,
            model=relevance_model,
            dataset_name='juul_relevance',
            experiment_no='1')

100%|██████████| 32/32 [13:20<00:00, 23.39s/it]

Scan Finished!





In [99]:

p = ta.Predict(h)
preds_test = p.predict(x_test)
print(preds_test)
preds_test_bool = [bool(pred >= .5) for pred in preds_test]

In [102]:
from sklearn.metrics import precision_score, recall_score, f1_score

# evaluate model performance on full test set with familiar metrics
metrics = [precision_score, recall_score, f1_score]

for metric in metrics:
  print('{}: {}'.format(metric.__name__, round(metric(y_test, preds_test_bool), 4)))

precision_score: 0.9248
recall_score: 0.6333
f1_score: 0.7518


In [103]:
from sklearn.metrics import confusion_matrix
#from plotting_util import human_readable_confusion_table

conf_mat = confusion_matrix(y_test, preds_test_bool)
print('test set confusion matrix: ', conf_mat, sep='\n')

#human_readable_confusion_table(y_test, preds_test_bool)

test set confusion matrix: 
[[ 53  17]
 [121 209]]
