In [0]:
import os
import numpy as np
import keras as k
import pandas as pd
import string
from nltk.corpus import stopwords
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, Convolution1D, GlobalMaxPool1D, Flatten, Dense, Dropout, LSTM
from keras.layers import Bidirectional, MaxPooling1D, Lambda, MaxPooling2D, Reshape
from keras.layers.merge import Concatenate
from keras.models import Model
from keras.utils import to_categorical
from keras.backend import one_hot

# Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding
# from gensim.models import KeyedVectors
# from keras.utils import to_categorical
# from nltk.corpus import stopwords

Using TensorFlow backend.


### Load in the files

In [0]:
import pickle
def load_tokenizer(filename = 'tokenizer.pickle'):
    """Unpickle and return the object stored in ``filename``.

    NOTE(review): unpickling can execute arbitrary code, so only load files
    that were produced by a trusted preprocessing run.
    """
    handle = open(filename, 'rb')
    try:
        return pickle.load(handle)
    finally:
        # Always release the file handle, even if unpickling fails.
        handle.close()
def load_df(filename):
    """Read the pickled pandas object stored at ``filename`` and return it."""
    frame = pd.read_pickle(filename)
    return frame
def load_embedding_matrix(filename = 'embedding_matrix.npy'):
    """Load the NumPy array saved at ``filename`` and return it."""
    matrix = np.load(filename)
    return matrix

In [0]:
# Artefacts written by the preprocessing step (default file names).
t = load_tokenizer()
embedding_matrix = load_embedding_matrix()

# Load the three splits and normalise them identically: empty strings become
# the literal token 'nan'. Previously only the training split was normalised,
# so validation/test rows with empty fields were encoded differently from
# training rows.
df_train = load_df('df_train.pkl').replace('', 'nan')
df_val = load_df('df_val.pkl').replace('', 'nan')
df_test = load_df('df_test.pkl').replace('', 'nan')

# Label name -> integer class id (six classes).
l2i = {'pants-fire':0, 'false':1, 'barely-true':2, 'half-true':3, 'mostly-true':4, 'true':5}

In [0]:
def df_to_input(df, tokenizer, max_len):
    """Encode ``df.p_statement`` as fixed-length rows of token ids.

    The statements are converted with ``tokenizer.texts_to_sequences`` and
    then padded and truncated at the end ('post') to ``max_len`` entries.
    """
    token_ids = tokenizer.texts_to_sequences(df.p_statement)
    padded = pad_sequences(token_ids, maxlen=max_len, padding='post', truncating='post')
    return padded
def df_to_label(df, num_classes=None):
    """One-hot encode ``df.label``.

    Parameters
    ----------
    df : object with a ``label`` attribute holding integer class ids.
    num_classes : int, optional
        Width of the one-hot matrix. Defaults to ``len(l2i)`` so that every
        split gets the same number of columns. Without an explicit class
        count ``to_categorical`` sizes the output from the largest label
        present, which yields a narrower matrix for a split that happens to
        lack the highest class.

    Returns
    -------
    The array produced by ``to_categorical``, one row per entry of
    ``df.label``.
    """
    if num_classes is None:
        num_classes = len(l2i)
    return to_categorical(df.label, num_classes=num_classes)

In [0]:
# Median of the training split's 'length' column.
median_length = np.median(df_train['length'])
print(median_length)

10.0


In [0]:
# Every statement is padded/truncated to this many token ids.
sentence_max_len = 15

# Token-id matrices for the three splits (train, validation, test).
X_train, X_val, X_test = (
    df_to_input(split, t, sentence_max_len)
    for split in (df_train, df_val, df_test)
)

# One-hot label matrices for the same splits, in the same order.
Y_train, Y_val, Y_test = (
    df_to_label(split)
    for split in (df_train, df_val, df_test)
)

### Model

In [0]:
# Hyper-parameter presets for the baseline CNN. They used to be written as two
# consecutive runs of assignments, so the "Liar" values were dead code that the
# "Github" values overwrote straight away. The choice is now explicit; the
# effective values are unchanged (the "github" preset).
baseline_presets = {
    'liar': {'filter_sizes': (2,3,4), 'n_filters': 128, 'batch_size': 64,
             'epochs': 10, 'dropout_prob': 0.8},
    'github': {'filter_sizes': (3,3,3), 'n_filters': 128, 'batch_size': 40,
               'epochs': 30, 'dropout_prob': 0.6},
}
active_preset = baseline_presets['github']

filter_sizes = active_preset['filter_sizes']
n_filters = active_preset['n_filters']
batch_size = active_preset['batch_size']
epochs = active_preset['epochs']
dropout_prob = active_preset['dropout_prob']

# Values shared by both presets.
sentence_max_len = X_train.shape[1]
vocab_size = len(t.word_index) + 1   # +1: index 0 is not in word_index
dimensions = 300
input_shape = (sentence_max_len,)
n_categories = len(l2i)

In [0]:
# Input layer: one row of `sentence_max_len` token ids per statement.
input_layer = Input(shape=input_shape)

# Embedding layer, initialised from `embedding_matrix` and kept frozen.
embedding_layer = Embedding(vocab_size, dimensions, weights=[embedding_matrix],
                            input_length=sentence_max_len, trainable=False,
                            name='embedding')(input_layer)

# One convolution + global max-pool branch per kernel size. The loop index is
# part of the layer name so that repeated kernel sizes still get unique names.
convolutional_blocks = []
for i, filter_size in enumerate(filter_sizes):
    layer_name = 'conv_layer' + str(i) + "_" + str(filter_size) + "_" + str(n_filters)
    convolution_layer = Convolution1D(filters=n_filters, kernel_size=filter_size,
                                      padding="valid", activation="relu", strides=1,
                                      name=layer_name)(embedding_layer)
    convolution_layer = GlobalMaxPool1D()(convolution_layer)
    convolutional_blocks.append(convolution_layer)

# Concatenate is only applied when there are several branches; a single
# branch is used as-is.
if len(convolutional_blocks) > 1:
    conv = Concatenate()(convolutional_blocks)
else:
    conv = convolutional_blocks[0]

dropout_layer = Dropout(dropout_prob, name="dropout")(conv)
# `n_filters` is a plain int; the former np.sum(n_filters) only wrapped it in
# a NumPy scalar and differed from how the hybrid models size this layer.
dense_layer = Dense(n_filters, activation='relu', name='dense')(dropout_layer)
output_layer = Dense(n_categories, activation='softmax', name='output_dense')(dense_layer)

model = Model(input_layer, output_layer)
model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=["categorical_accuracy"])

In [0]:
# Train the baseline CNN and evaluate on the validation split after each epoch.
model.fit(
    X_train,
    Y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, Y_val),
)

Train on 10240 samples, validate on 1284 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.callbacks.History at 0x7fa585a2ec88>

In [0]:
# model.predict(X_test, batch_size = batch_size)

In [0]:
# Test accuracy: fraction of rows whose arg-max class equals the true label.
prediction = model.predict(X_test, batch_size=batch_size)
Y_labels_classes = df_test.label.to_numpy()
predicted_classes = np.argmax(prediction, axis=1)
np.mean(predicted_classes == Y_labels_classes)

0.2430939226519337

### Hybrid CNN

In [0]:
def load_dep_meta_pos(path = "./features/dep_meta_pos/", feature = "POS" ):
    """Load the train/validation/test arrays of one feature.

    File names are built by plain string concatenation as
    ``path + "<Split>_" + feature + ".npy"`` for the splits ``Train``,
    ``Valid`` and ``Test``, so ``path`` has to end with a path separator.

    Returns the tuple ``(train, val, test)``.
    """
    suffix = "_" + feature + ".npy"
    train, val, test = [np.load(path + split + suffix) for split in ("Train", "Valid", "Test")]
    return train, val, test
def load_pos():
    """Return the (train, val, test) arrays of the "POS" feature."""
    pos_splits = load_dep_meta_pos(feature="POS")
    return pos_splits
def load_dep():
    """Return the (train, val, test) arrays of the "DEP" feature."""
    dep_splits = load_dep_meta_pos(feature="DEP")
    return dep_splits
def load_meta():
    """Return the (train, val, test) arrays of the "Meta" feature."""
    meta_splits = load_dep_meta_pos(feature="Meta")
    return meta_splits
# Sentiment feature name -> integer id that appears in the feature file names.
senti2int = {
    "textblob": 1,
    "vader": 2,
    "lr": 3,
    "svm": 4,
    "ft": 5,
    "flair_glove": 6,
    "flair_elmo": 7,
    "flair_bert": 8,
}
def load_senti(path = "./features/senti/", feature = "flair_glove" ):
    """Load the ``Senti`` and ``SentiClass`` arrays of one sentiment feature.

    ``feature`` is either a key of ``senti2int`` (matched case-insensitively)
    or the id itself. File names are built by string concatenation, e.g.
    ``path + "Train_Senti_" + <id> + ".npy"``, so ``path`` has to end with a
    path separator.

    Returns ``(train, train_class, val, val_class, test, test_class)``.
    """
    feature_id = feature
    if isinstance(feature_id, str):
        feature_id = senti2int[feature_id.lower()]
    feature_id = str(feature_id)

    arrays = []
    for split in ("Train", "Valid", "Test"):
        for kind in ("_Senti_", "_SentiClass_"):
            arrays.append(np.load(path + split + kind + feature_id + ".npy"))
    train, train_class, val, val_class, test, test_class = arrays
    return train, train_class, val, val_class, test, test_class

In [0]:
import pickle
def load_tokenizer(filename='tokenizer.pickle'):
    """Return the object unpickled from ``filename``.

    NOTE(review): this repeats the identically named helper defined earlier
    in the notebook; the later definition is the one in effect from here on.
    """
    with open(filename, mode='rb') as pickle_file:
        loaded = pickle.load(pickle_file)
        return loaded
def load_tokenizers(count=6, filename='tokenizer'):
    """Load ``count`` pickled tokenizers.

    The files are ``<filename>_0.pickle`` ... ``<filename>_<count-1>.pickle``;
    the returned list is ordered by that index.
    """
    return [
        load_tokenizer(filename=filename + "_" + str(index) + ".pickle")
        for index in range(count)
    ]
# Metadata columns fed to the model; position i is encoded with tokenizer i.
cols = [
    'subject',
    'speaker',
    'job',
    'state',
    'party',
    'venue',
]

In [0]:
# One tokenizer per metadata column. The count is taken from `cols` instead of
# the hard-coded default so the two cannot drift apart (tokenizer i is applied
# to cols[i] when the metadata is encoded).
ts = load_tokenizers(count=len(cols))

In [0]:
def df_to_meta_input(df, tokenizers, columns):
    """Encode each metadata column as one integer token id per row.

    Parameters
    ----------
    df : pandas.DataFrame containing every name in ``columns``.
    tokenizers : sequence of objects with ``texts_to_sequences``;
        ``tokenizers[i]`` encodes ``columns[i]``.
    columns : sequence of column names.

    Returns
    -------
    numpy.ndarray of ints with shape ``(len(df), len(columns))``.

    Raises
    ------
    ValueError
        If a value is tokenised into more than one id, because each metadata
        field must occupy exactly one slot.

    A value the tokenizer cannot map (empty sequence) is encoded as 0 instead
    of breaking the array assignment. The models built from ``ts`` size their
    embeddings as ``len(word_index) + 1``, so index 0 is a valid row there.
    """
    encoded = np.zeros((df.shape[0], len(columns)), dtype=int)
    for i, col in enumerate(columns):
        sequences = tokenizers[i].texts_to_sequences(df[col])
        ids = []
        for row, seq in enumerate(sequences):
            if len(seq) > 1:
                raise ValueError(
                    "column %r, row %d: expected a single token, got %d"
                    % (col, row, len(seq))
                )
            ids.append(seq[0] if len(seq) == 1 else 0)
        encoded[:, i] = ids
    return encoded

In [0]:
# Integer-encoded metadata for the train, validation and test splits.
X_meta_train, X_meta_val, X_meta_test = [
    df_to_meta_input(split, ts, cols)
    for split in (df_train, df_val, df_test)
]

In [0]:
# Pre-computed feature arrays, one (train, val, test) triple per feature.
(
    (train_pos, val_pos, test_pos),
    (train_dep, val_dep, test_dep),
    (train_meta, val_meta, test_meta),
) = load_pos(), load_dep(), load_meta()

# Sentiment arrays and their class arrays for feature id 8
# ("flair_bert" in senti2int).
senti_arrays = load_senti(feature = 8)
(train_senti, train_senti_class,
 val_senti, val_senti_class,
 test_senti, test_senti_class) = senti_arrays

In [0]:
# Liar
# --- Text branch and training settings ---
filter_sizes = (2,3,4)
n_filters = 128
dimensions = 300
dropout_prob = 0.8
batch_size, epochs = 64, 5

# --- Sizes derived from the prepared data ---
sentence_max_len = X_train.shape[1]
we_input_shape = (sentence_max_len,)
vocab_size = len(t.word_index) + 1
n_categories = len(l2i)

# --- Metadata branch ---
number_of_meta_features = X_meta_train.shape[1]
md_input_shape = (number_of_meta_features,)
md_dimensions = dimensions
md_filter_sizes = (3,6)
md_n_filters = 10
lstm_size = 3

# NOTE(review): the two settings below are only referenced from commented-out
# code in the visible part of the notebook.
md_embedding_input_size = 2
max_pooling_pool_size = 1

In [0]:
################################## Word Embeddings ~ we ##################################
# Text branch input: one row of `sentence_max_len` token ids per statement.
we_input = Input(shape=we_input_shape, name='we_input')

# Frozen embedding initialised from `embedding_matrix`.
we_embedding_layer = Embedding(vocab_size, dimensions, weights=[embedding_matrix],
                               input_length=sentence_max_len, trainable=False,
                               name='we_embedding')(we_input)

# One convolution + global max-pool block per kernel size.
convolutional_blocks = []
for i, filter_size in enumerate(filter_sizes):
    convolution_layer = Convolution1D(filters=n_filters, kernel_size=filter_size,
                                      padding="valid", activation="relu", strides=1,
                                      name='conv_layer' + str(i) + "_" + str(filter_size) + "_"
                                      + str(n_filters))(we_embedding_layer)
    convolution_layer = GlobalMaxPool1D()(convolution_layer)
    convolutional_blocks.append(convolution_layer)

################################## Meta Data ##################################
all_inputs = [we_input]

# A single input carries all metadata ids; it is split into one slice per
# metadata column so that each column gets its own embedding table.
md_input = Input(shape=md_input_shape, name="md_input")
all_inputs.append(md_input)
md_input_split = Lambda(lambda x: tf.split(x, number_of_meta_features, axis=1))(md_input)

meta_embedding_layers = []
for i, c in enumerate(cols):
    vocab_dim = len(ts[i].word_index) + 1
    meta_embedding_layer = Embedding(vocab_dim, md_dimensions)(md_input_split[i])
    meta_embedding_layers.append(meta_embedding_layer)
md_embed_concat = Concatenate(axis=1, name='md_embed_concat')(meta_embedding_layers)

# Convolutions over the stacked metadata embeddings. The layers get their own
# 'md_conv_layer' prefix and are labelled with md_n_filters: they used to share
# the text branch's 'conv_layer<i>_<size>_<n_filters>' scheme, which produces
# duplicate layer names as soon as both branches use the same kernel size at
# the same position.
md_convolutional_blocks = []
for i, filter_size in enumerate(md_filter_sizes):
    md_convolution_layer = Convolution1D(filters=md_n_filters, kernel_size=filter_size,
                                         padding="valid", activation="relu", strides=1,
                                         name='md_conv_layer' + str(i) + "_" + str(filter_size) + "_"
                                         + str(md_n_filters))(md_embed_concat)
    md_convolution_layer = GlobalMaxPool1D()(md_convolution_layer)
    md_convolutional_blocks.append(md_convolution_layer)

# Decide on the metadata blocks themselves (the old test looked at the text
# branch's `filter_sizes`, and its fallback referenced the misspelled,
# undefined name `m_d_convolution_layer`).
if len(md_convolutional_blocks) > 1:
    md_conv = Concatenate()(md_convolutional_blocks)
else:
    md_conv = md_convolutional_blocks[0]

# Each pooled block contributes md_n_filters values; present them to the LSTM
# as a sequence of that many one-dimensional steps.
reshape_md_conv = Reshape((md_n_filters * len(md_filter_sizes), 1))(md_conv)

md_bilstm = Bidirectional(LSTM(lstm_size, return_sequences=False))(reshape_md_conv)

################################## Concat ##################################
convolutional_blocks.append(md_bilstm)

# Merge text blocks and the metadata BiLSTM. The single-block fallback keeps
# whatever block exists instead of silently dropping the metadata branch.
if len(convolutional_blocks) > 1:
    conv = Concatenate()(convolutional_blocks)
else:
    conv = convolutional_blocks[0]

################################## Output ##################################
dropout_layer = Dropout(dropout_prob, name="dropout")(conv)

dense_layer = Dense(n_filters, activation='relu', name='dense')(dropout_layer)
output_layer = Dense(n_categories, activation='softmax', name='output_dense')(dense_layer)

model = Model(all_inputs, output_layer)
model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=["categorical_accuracy"])

In [0]:
# model.summary()

In [0]:
# Train the hybrid model; inputs are ordered as in `all_inputs` (text, metadata).
model.fit(
    [X_train, X_meta_train],
    Y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=([X_val, X_meta_val], Y_val),
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 10240 samples, validate on 1284 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x7fda3266feb8>

In [0]:
# Test accuracy of the hybrid model (arg-max class vs. true label).
prediction = model.predict([X_test, X_meta_test], batch_size=batch_size)
Y_labels_classes = df_test.label.to_numpy()
is_correct = prediction.argmax(axis=1) == Y_labels_classes
np.mean(is_correct)

0.24861878453038674

# Tuning - Hybrid CNN

In [0]:
#!pip install talos  # dependency
import talos

# Fixed settings for the hyper-parameter scan. n_filters, batch_size and
# epochs are not set here: they come from the `params` grid of each round.

# --- Text branch ---
filter_sizes = (2,3,4)
dimensions = 300
dropout_prob = 0.8

# --- Sizes derived from the prepared data ---
sentence_max_len = X_train.shape[1]
we_input_shape = (sentence_max_len,)
vocab_size = len(t.word_index) + 1
n_categories = len(l2i)

# --- Metadata branch ---
number_of_meta_features = X_meta_train.shape[1]
md_input_shape = (number_of_meta_features,)
md_dimensions = dimensions
md_filter_sizes = (3,6)
md_n_filters = 10
lstm_size = 3

# NOTE(review): the two settings below are only referenced from commented-out
# code in the visible part of the notebook.
md_embedding_input_size = 2
max_pooling_pool_size = 1

def fakeNewsModel(x_train, y_train, x_val, y_val, params):
    """Build, compile and train one hybrid CNN for a Talos scan round.

    Parameters
    ----------
    x_train, x_val : model inputs ordered as ``[text ids, metadata ids]``.
    y_train, y_val : one-hot label matrices.
    params : dict with the keys ``'n_filters'``, ``'batch_size'`` and
        ``'epochs'`` for this round.

    Returns
    -------
    (history, model) : the object returned by ``model.fit`` and the trained
        model.

    The data passed in by the caller is what gets trained on (previously the
    arguments were ignored in favour of the notebook globals), and
    ``params['n_filters']`` is used wherever the filter count is needed
    (previously the dense layer and the metadata layer names read whatever
    global ``n_filters`` an earlier cell had left behind).
    """
    n_filters_round = params['n_filters']

    ############################## Word embeddings ##############################
    we_input = Input(shape=we_input_shape, name='we_input')

    # Frozen embedding initialised from `embedding_matrix`.
    we_embedding_layer = Embedding(vocab_size, dimensions, weights=[embedding_matrix],
                                   input_length=sentence_max_len, trainable=False,
                                   name='we_embedding')(we_input)

    convolutional_blocks = []
    for i, filter_size in enumerate(filter_sizes):
        convolution_layer = Convolution1D(filters=n_filters_round, kernel_size=filter_size,
                                          padding="valid", activation="relu", strides=1,
                                          name='conv_layer' + str(i) + "_" + str(filter_size) + "_"
                                          + str(n_filters_round))(we_embedding_layer)
        convolution_layer = GlobalMaxPool1D()(convolution_layer)
        convolutional_blocks.append(convolution_layer)

    ################################# Meta data #################################
    all_inputs = [we_input]

    # One input for all metadata ids, split into one slice per column so each
    # column gets its own embedding table.
    md_input = Input(shape=md_input_shape, name="md_input")
    all_inputs.append(md_input)
    md_input_split = Lambda(lambda x: tf.split(x, number_of_meta_features, axis=1))(md_input)

    meta_embedding_layers = []
    for i, c in enumerate(cols):
        vocab_dim = len(ts[i].word_index) + 1
        meta_embedding_layer = Embedding(vocab_dim, md_dimensions)(md_input_split[i])
        meta_embedding_layers.append(meta_embedding_layer)
    md_embed_concat = Concatenate(axis=1, name='md_embed_concat')(meta_embedding_layers)

    # Metadata convolutions get their own name prefix so they can never clash
    # with the text branch's 'conv_layer...' names.
    md_convolutional_blocks = []
    for i, filter_size in enumerate(md_filter_sizes):
        md_convolution_layer = Convolution1D(filters=md_n_filters, kernel_size=filter_size,
                                             padding="valid", activation="relu", strides=1,
                                             name='md_conv_layer' + str(i) + "_" + str(filter_size) + "_"
                                             + str(md_n_filters))(md_embed_concat)
        md_convolution_layer = GlobalMaxPool1D()(md_convolution_layer)
        md_convolutional_blocks.append(md_convolution_layer)

    # Decide on the metadata blocks themselves (the old test looked at
    # `filter_sizes`, and its fallback used the undefined name
    # `m_d_convolution_layer`).
    if len(md_convolutional_blocks) > 1:
        md_conv = Concatenate()(md_convolutional_blocks)
    else:
        md_conv = md_convolutional_blocks[0]

    # Each pooled block contributes md_n_filters values; feed them to the LSTM
    # as a sequence of one-dimensional steps.
    reshape_md_conv = Reshape((md_n_filters * len(md_filter_sizes), 1))(md_conv)

    md_bilstm = Bidirectional(LSTM(lstm_size, return_sequences=False))(reshape_md_conv)

    ################################### Concat ##################################
    convolutional_blocks.append(md_bilstm)

    if len(convolutional_blocks) > 1:
        conv = Concatenate()(convolutional_blocks)
    else:
        conv = convolutional_blocks[0]

    ################################### Output ##################################
    dropout_layer = Dropout(dropout_prob, name="dropout")(conv)

    dense_layer = Dense(n_filters_round, activation='relu', name='dense')(dropout_layer)
    output_layer = Dense(n_categories, activation='softmax', name='output_dense')(dense_layer)

    model = Model(all_inputs, output_layer)
    model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=["categorical_accuracy"])
    out = model.fit(x_train, y_train,
                    batch_size=params['batch_size'],
                    epochs=params['epochs'],
                    validation_data=(x_val, y_val))
    return out, model


# Search space: every combination is trained once (3 x 4 x 3 = 36 rounds).
p = {
    'n_filters': [32, 64, 128],
    'batch_size': [10, 50, 64, 100],
    'epochs': [5, 10, 20],
}

# Explicit validation data is supplied alongside the training data.
scan_arguments = dict(
    x=[X_train, X_meta_train],
    y=Y_train,
    x_val=[X_val, X_meta_val],
    y_val=Y_val,
    params=p,
    model=fakeNewsModel,
    experiment_name="test",
)
scan_object = talos.Scan(**scan_arguments)


  0%|          | 0/36 [00:00<?, ?it/s][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5



  3%|▎         | 1/36 [05:42<3:20:04, 342.99s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5



  6%|▌         | 2/36 [11:22<3:13:42, 341.85s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5



  8%|▊         | 3/36 [17:14<3:09:43, 344.96s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



 11%|█         | 4/36 [27:50<3:50:37, 432.43s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



 14%|█▍        | 5/36 [38:53<4:19:06, 501.51s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



 17%|█▋        | 6/36 [50:51<4:43:08, 566.27s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20



 19%|█▉        | 7/36 [1:13:00<6:24:23, 795.31s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20



 22%|██▏       | 8/36 [1:35:08<7:25:38, 954.96s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20



 25%|██▌       | 9/36 [1:59:08<8:15:18, 1100.67s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5



 28%|██▊       | 10/36 [2:00:31<5:44:35, 795.22s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5



 31%|███       | 11/36 [2:01:56<4:02:31, 582.07s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5



 33%|███▎      | 12/36 [2:03:33<2:54:36, 436.53s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



 36%|███▌      | 13/36 [2:06:17<2:16:00, 354.81s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



 39%|███▉      | 14/36 [2:09:06<1:49:44, 299.28s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



 42%|████▏     | 15/36 [2:12:14<1:33:02, 265.84s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20



 44%|████▍     | 16/36 [2:17:21<1:32:40, 278.01s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20



 47%|████▋     | 17/36 [2:22:42<1:32:10, 291.07s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20



 50%|█████     | 18/36 [2:28:43<1:33:37, 312.06s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5



 53%|█████▎    | 19/36 [2:29:51<1:07:38, 238.74s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5



 56%|█████▌    | 20/36 [2:31:01<50:13, 188.32s/it]  [A

Train on 10240 samples, validate on 1284 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5



 58%|█████▊    | 21/36 [2:32:21<38:56, 155.79s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



 61%|██████    | 22/36 [2:34:29<34:22, 147.32s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



 64%|██████▍   | 23/36 [2:36:45<31:12, 144.06s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



 67%|██████▋   | 24/36 [2:39:21<29:30, 147.54s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20



 69%|██████▉   | 25/36 [2:43:35<32:53, 179.41s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20



 72%|███████▏  | 26/36 [2:48:04<34:24, 206.41s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20



 75%|███████▌  | 27/36 [2:53:21<35:56, 239.64s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5



 78%|███████▊  | 28/36 [2:54:12<24:23, 182.99s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5



 81%|████████  | 29/36 [2:55:07<16:52, 144.58s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5



 83%|████████▎ | 30/36 [2:56:11<12:01, 120.22s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



 86%|████████▌ | 31/36 [2:57:47<09:25, 113.01s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



 89%|████████▉ | 32/36 [2:59:29<07:19, 109.91s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



 92%|█████████▏| 33/36 [3:01:30<05:39, 113.25s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20



 94%|█████████▍| 34/36 [3:04:38<04:31, 135.57s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20



 97%|█████████▋| 35/36 [3:08:05<02:36, 156.91s/it][A

Train on 10240 samples, validate on 1284 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20



100%|██████████| 36/36 [3:12:01<00:00, 320.03s/it]


In [0]:
from talos import Reporting
# Open a tuning log and show its table.
# NOTE(review): the file name looks like the timestamp of one specific scan
# run -- confirm it exists (and is the intended run) before re-executing.
experiment_log = '051320043711.csv'
r = Reporting(experiment_log)
r.data

Unnamed: 0,round_epochs,val_loss,val_categorical_accuracy,loss,categorical_accuracy,batch_size,epochs,n_filters
0,5,1.775169,0.234424,1.555639,0.315039,10,5,32
1,5,1.744743,0.245327,1.559238,0.30918,10,5,64
2,5,1.726437,0.231308,1.597748,0.308398,10,5,128
3,10,2.068363,0.228972,1.362049,0.380273,10,10,32
4,10,1.810726,0.249221,1.422801,0.366309,10,10,64
5,10,1.872692,0.246106,1.392654,0.382715,10,10,128
6,20,2.94399,0.198598,1.070493,0.562695,10,20,32
7,20,2.392147,0.213396,1.141115,0.510254,10,20,64
8,20,2.153073,0.249221,1.128985,0.526855,10,20,128
9,5,1.719903,0.261682,1.568136,0.323145,50,5,32


In [0]:
from talos import  Predict
#from talos import best_model
# Report the scan's top value for the validation-accuracy column
# (presumably the best round -- see the Talos Reporting docs).
tuning_metric = 'val_categorical_accuracy'
r.high(tuning_metric)


0.2694703936576843

## Hybrid CNN with History

In [0]:
def get_history_columns(keys, history_cols=('barely_true_c', 'false_c', 'half_true_c', 'mostly_true_c', 'pants_on_fire_c')):
    """Return the positions of the history-count columns within ``keys``.

    Parameters
    ----------
    keys : iterable of column names (e.g. ``df.keys()``).
    history_cols : collection of the history columns to look up.

    Returns
    -------
    list of int
        One position per requested column, ordered by class id:
        pants_on_fire_c, false_c, barely_true_c, half_true_c, mostly_true_c,
        true_c. With the default ``history_cols`` this is the same
        five-element list as before.

    Raises
    ------
    ValueError
        If a requested column is not in ``keys``. Previously its slot kept the
        placeholder 0, so column 0 of the frame was silently used as a
        history feature.
    """
    class_order = ('pants_on_fire_c', 'false_c', 'barely_true_c',
                   'half_true_c', 'mostly_true_c', 'true_c')
    # Later duplicates win, matching the previous overwrite-in-loop behaviour.
    position_of = {name: index for index, name in enumerate(keys)}

    requested = [name for name in class_order if name in history_cols]
    missing = [name for name in requested if name not in position_of]
    if missing:
        raise ValueError("history columns not found: " + ", ".join(missing))
    return [position_of[name] for name in requested]

def df_hist(df):
    """Return the history counts with the current statement removed.

    The five history columns (ordered as returned by
    ``get_history_columns``) are read from ``df`` and the one-hot encoding of
    ``df.label`` over class ids 0-4 is subtracted. A label of 5 has no history
    column and therefore subtracts nothing.

    The one-hot matrix is built with NumPy instead of the Keras backend, so
    the result no longer depends on backend tensors being convertible with
    ``np.array``.

    Returns an integer array of shape ``(len(df), 5)``.
    """
    history_counts = np.array(df.iloc[:, get_history_columns(df.keys())])
    labels = np.asarray(df.label).astype(int)
    current_statement = (labels.reshape(-1, 1) == np.arange(5)).astype(int)
    return history_counts - current_statement

def hist_account_for_current(a, df):
    """Subtract the one-hot encoding of ``df.label`` (class ids 0-4) from ``a``.

    Parameters
    ----------
    a : array of shape ``(len(df), 5)``.
    df : object with a ``label`` attribute holding integer class ids.

    Returns
    -------
    numpy.ndarray: ``a`` minus the one-hot label matrix; a label of 5
    subtracts nothing.

    The previous body could never run: a misplaced parenthesis passed 5 to
    ``np.array`` as the dtype and called ``one_hot`` without a class count. It
    also printed the whole label column as a debugging leftover.
    """
    labels = np.asarray(df.label).astype(int)
    current_statement = (labels.reshape(-1, 1) == np.arange(5)).astype(int)
    return a - current_statement

In [0]:
def df_to_meta_input_with_H(df, tokenizers, columns):
    """Encode the metadata columns and append the history features.

    Returns the output of ``df_to_meta_input`` with the columns of
    ``df_hist(df)`` stacked to its right.
    """
    meta_ids = df_to_meta_input(df, tokenizers, columns)
    history = df_hist(df)
    return np.hstack((meta_ids, history))

In [0]:
# History features with the sentiment score appended as one extra column.
# (The plain df_hist(...) assignments that used to precede these lines were
# overwritten immediately and only computed df_hist twice per split.)
# To train without sentiment, assign df_hist(<split>) on its own instead.
X_h_train = np.hstack((df_hist(df_train), train_senti.reshape(-1,1)))
X_h_val = np.hstack((df_hist(df_val), val_senti.reshape(-1,1)))
X_h_test = np.hstack((df_hist(df_test), test_senti.reshape(-1,1)))

In [0]:
# Liar
# --- Text branch and training settings ---
filter_sizes = (2,3,4)
n_filters = 128
dimensions = 300
dropout_prob = 0.8
batch_size, epochs = 64, 5

# --- Sizes derived from the prepared data ---
sentence_max_len = X_train.shape[1]
we_input_shape = (sentence_max_len,)
vocab_size = len(t.word_index) + 1
n_categories = len(l2i)

# --- Metadata branch ---
number_of_meta_features = X_meta_train.shape[1]
md_input_shape = (number_of_meta_features,)
md_dimensions = dimensions
# More than two sizes are possible, e.g. (3, 4, 5); a kernel cannot be longer
# than the number of stacked feature rows.
md_filter_sizes = (3,6)
md_n_filters = 10
lstm_size = 3
# NOTE(review): only referenced from commented-out code in the visible notebook.
md_embedding_input_size = 2

# --- History branch ---
history_size = (X_h_train.shape[1],)
h_out_dim = dimensions


In [0]:
################################## Word Embeddings ~ we ##################################
# Text branch input: one row of `sentence_max_len` token ids per statement.
we_input = Input(shape=we_input_shape, name='we_input')

# Frozen embedding initialised from `embedding_matrix`.
we_embedding_layer = Embedding(vocab_size, dimensions, weights=[embedding_matrix],
                               input_length=sentence_max_len, trainable=False,
                               name='we_embedding')(we_input)

# One convolution + global max-pool block per kernel size.
convolutional_blocks = []
for i, filter_size in enumerate(filter_sizes):
    convolution_layer = Convolution1D(filters=n_filters, kernel_size=filter_size,
                                      padding="valid", activation="relu", strides=1,
                                      name='conv_layer' + str(i) + "_" + str(filter_size) + "_"
                                      + str(n_filters))(we_embedding_layer)
    convolution_layer = GlobalMaxPool1D()(convolution_layer)
    convolutional_blocks.append(convolution_layer)

################################## Meta Data ##################################
all_inputs = [we_input]

# One input for all metadata ids, split into one slice per column so each
# column gets its own embedding table.
md_input = Input(shape=md_input_shape, name="md_input")
all_inputs.append(md_input)
md_input_split = Lambda(lambda x: tf.split(x, number_of_meta_features, axis=1))(md_input)

################################## History ##################################
history_input = Input(shape=history_size, name="history_input")
all_inputs.append(history_input)

# Project the history vector to h_out_dim values and shape it as a single
# extra row that is stacked under the metadata embeddings.
dense_hist = Dense(h_out_dim, activation='tanh')(history_input)
reshape_dense_hist = Reshape((1, h_out_dim))(dense_hist)

meta_embedding_layers = []
for i, c in enumerate(cols):
    vocab_dim = len(ts[i].word_index) + 1
    meta_embedding_layer = Embedding(vocab_dim, md_dimensions)(md_input_split[i])
    meta_embedding_layers.append(meta_embedding_layer)

meta_embedding_layers.append(reshape_dense_hist)

md_embed_concat = Concatenate(axis=1, name='md_embed_concat')(meta_embedding_layers)

# Metadata convolutions get their own 'md_conv_layer' prefix and are labelled
# with md_n_filters: they used to share the text branch's
# 'conv_layer<i>_<size>_<n_filters>' scheme, which produces duplicate layer
# names as soon as both branches use the same kernel size at the same position.
md_convolutional_blocks = []
for i, filter_size in enumerate(md_filter_sizes):
    md_convolution_layer = Convolution1D(filters=md_n_filters, kernel_size=filter_size,
                                         padding="valid", activation="relu", strides=1,
                                         name='md_conv_layer' + str(i) + "_" + str(filter_size) + "_"
                                         + str(md_n_filters))(md_embed_concat)
    md_convolution_layer = GlobalMaxPool1D()(md_convolution_layer)
    md_convolutional_blocks.append(md_convolution_layer)

# Decide on the metadata blocks themselves (the old test looked at the text
# branch's `filter_sizes`, and its fallback referenced the misspelled,
# undefined name `m_d_convolution_layer`).
if len(md_convolutional_blocks) > 1:
    md_conv = Concatenate()(md_convolutional_blocks)
else:
    md_conv = md_convolutional_blocks[0]

# Each pooled block contributes md_n_filters values; present them to the LSTM
# as a sequence of that many one-dimensional steps.
reshape_md_conv = Reshape((md_n_filters * len(md_filter_sizes), 1))(md_conv)

md_bilstm = Bidirectional(LSTM(lstm_size, return_sequences=False))(reshape_md_conv)

################################## Concat ##################################
convolutional_blocks.append(md_bilstm)

# Merge text blocks and the metadata/history BiLSTM. The single-block fallback
# keeps whatever block exists instead of silently dropping that branch.
if len(convolutional_blocks) > 1:
    conv = Concatenate()(convolutional_blocks)
else:
    conv = convolutional_blocks[0]

################################## Output ##################################
dropout_layer = Dropout(dropout_prob, name="dropout")(conv)

dense_layer = Dense(n_filters, activation='relu', name='dense')(dropout_layer)
output_layer = Dense(n_categories, activation='softmax', name='output_dense')(dense_layer)

model = Model(all_inputs, output_layer)
model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=["categorical_accuracy"])

In [0]:
# Train the history model; inputs are ordered as in `all_inputs`
# (text, metadata, history).
model.fit(
    [X_train, X_meta_train, X_h_train],
    Y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=([X_val, X_meta_val, X_h_val], Y_val),
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 10240 samples, validate on 1284 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x7fa5835dfb00>

In [0]:
# Test accuracy of the history model (arg-max class vs. true label).
# NOTE(review): the recorded value for this cell is far below the other two
# models' test accuracy -- worth investigating the history features.
test_inputs = [X_test, X_meta_test, X_h_test]
prediction = model.predict(test_inputs, batch_size=batch_size)
Y_labels_classes = df_test.label.to_numpy()
matches = np.argmax(prediction, axis=1) == Y_labels_classes
np.mean(matches)

0.07261247040252565

In [0]:
# look into history

# Tuning - Hybrid CNN with History

In [0]:
#pip install talos  # dependency
import talos

# Fixed settings derived from the prepared data. Every tuned value
# (n_filters, filter_sizes, dimensions, batch_size, epochs, dropout_prob,
# md_filter_sizes, md_n_filters) is read from `params` inside fakeNewsModel:
# `params` only exists while Talos is calling the model function, so it must
# not be referenced at module level.
sentence_max_len = X_train.shape[1]
vocab_size = len(t.word_index) + 1
we_input_shape = (sentence_max_len,)
n_categories = len(l2i)

number_of_meta_features = X_meta_train.shape[1]
md_input_shape = (number_of_meta_features,)
md_embedding_input_size = 2
lstm_size = 3

history_size = (X_h_train.shape[1],)


def _merge_blocks(blocks):
    """Concatenate a list of tensors, or return the only tensor if there is just one."""
    if len(blocks) > 1:
        return Concatenate()(blocks)
    return blocks[0]


def fakeNewsModel(x_train, y_train, x_val, y_val, params):
    """Build, compile and fit the hybrid CNN for one Talos hyper-parameter permutation.

    The network has three inputs: padded statement token ids, one id per
    metadata column, and the history feature vector. Statement tokens go
    through the frozen pre-trained embedding and parallel 1-D convolutions.
    Metadata ids are embedded per column, stacked with a dense projection of
    the history vector, convolved, and summarised by a bidirectional LSTM.
    Both branches are concatenated and classified with a softmax layer.

    Relies on module-level names: `embedding_matrix`, `cols`, `ts`
    (presumably one fitted tokenizer per entry of `cols`, in the same order
    -- verify where `ts` is built) and the fixed settings defined above.

    Parameters
    ----------
    x_train, x_val : sequence of three arrays
        [statement tokens, metadata ids, history features], in that order.
    y_train, y_val : array
        One-hot encoded labels.
    params : dict
        One value per key for this permutation:
        'n_filters'       -- filters per text convolution and width of the hidden dense layer
        'filter_sizes'    -- kernel sizes of the text convolutions
        'dimensions'      -- width of the metadata embeddings and of the history projection
        'batch_size', 'epochs' -- passed to `model.fit`
        'dropout_prob'    -- dropout rate before the hidden dense layer
        'md_filter_sizes' -- kernel sizes of the metadata convolutions
        'md_n_filters'    -- filters per metadata convolution

    Returns
    -------
    (History, Model)
        The fit history and the trained model, as Talos expects.
    """
    n_filters = params['n_filters']
    md_n_filters = params['md_n_filters']
    # Metadata embeddings and the history projection are concatenated along
    # the step axis below, so they must share the same feature width.
    md_dimensions = params['dimensions']

    ############################## Statement branch ##############################
    we_input = Input(shape=we_input_shape, name='we_input')
    # The layer is initialised with (and frozen to) the pre-trained matrix, so
    # its output width has to be the matrix width rather than a tuned value.
    we_embedding_layer = Embedding(vocab_size, embedding_matrix.shape[1],
                                   weights=[embedding_matrix],
                                   input_length=sentence_max_len,
                                   trainable=False, name='we_embedding')(we_input)

    convolutional_blocks = []
    for i, filter_size in enumerate(params['filter_sizes']):
        convolution_layer = Convolution1D(
            filters=n_filters, kernel_size=filter_size,
            padding="valid", activation="relu", strides=1,
            name='conv_layer' + str(i) + "_" + str(filter_size) + "_" + str(n_filters),
        )(we_embedding_layer)
        convolutional_blocks.append(GlobalMaxPool1D()(convolution_layer))

    ########################## Metadata + history branch #########################
    md_input = Input(shape=md_input_shape, name="md_input")
    # One (batch, 1) slice per metadata column so each gets its own embedding.
    md_input_split = Lambda(lambda x: tf.split(x, number_of_meta_features, axis=1))(md_input)

    history_input = Input(shape=history_size, name="history_input")
    all_inputs = [we_input, md_input, history_input]

    dense_hist = Dense(md_dimensions, activation='tanh')(history_input)
    # Add a step axis so the history projection can be stacked with the embeddings.
    reshape_dense_hist = Reshape((1, md_dimensions))(dense_hist)

    meta_embedding_layers = []
    for i in range(len(cols)):
        vocab_dim = len(ts[i].word_index) + 1
        meta_embedding_layers.append(Embedding(vocab_dim, md_dimensions)(md_input_split[i]))
    meta_embedding_layers.append(reshape_dense_hist)

    md_embed_concat = Concatenate(axis=1, name='md_embed_concat')(meta_embedding_layers)

    md_convolutional_blocks = []
    for i, filter_size in enumerate(params['md_filter_sizes']):
        # Distinct name prefix: Keras requires unique layer names within a
        # model, and the text branch already uses 'conv_layer...'.
        md_convolution_layer = Convolution1D(
            filters=md_n_filters, kernel_size=filter_size,
            padding="valid", activation="relu", strides=1,
            name='md_conv_layer' + str(i) + "_" + str(filter_size) + "_" + str(md_n_filters),
        )(md_embed_concat)
        md_convolutional_blocks.append(GlobalMaxPool1D()(md_convolution_layer))

    md_conv = _merge_blocks(md_convolutional_blocks)

    # (batch, features) -> (batch, features, 1): a 3-D sequence input for the BiLSTM.
    reshape_md_conv = Reshape((int(md_conv.shape[1]), 1))(md_conv)
    md_bilstm = Bidirectional(LSTM(lstm_size, return_sequences=False))(reshape_md_conv)

    ################################## Concat ##################################
    convolutional_blocks.append(md_bilstm)
    conv = _merge_blocks(convolutional_blocks)

    ################################## Output ##################################
    dropout_layer = Dropout(params['dropout_prob'], name="dropout")(conv)

    dense_layer = Dense(n_filters, activation='relu', name='dense')(dropout_layer)
    output_layer = Dense(n_categories, activation='softmax', name='output_dense')(dense_layer)

    model = Model(all_inputs, output_layer)
    model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=["categorical_accuracy"])
    out = model.fit([x_train[0], x_train[1], x_train[2]], y_train,
                    batch_size=params['batch_size'],
                    epochs=params['epochs'],
                    validation_data=([x_val[0], x_val[1], x_val[2]], y_val))

    return out, model

# Search space handed to Talos: each key maps to the candidate values that
# fakeNewsModel reads from `params`.
# NOTE(review): the full grid is 3*6*3*3*3*2*2*3 = 5832 permutations --
# confirm that scanning all of them is intended.
p = dict(
    n_filters=[32, 64, 128],
    filter_sizes=[(2, 3, 4), (2, 2, 2), (3, 3, 3), (4, 4, 4), (5, 5, 5), (2, 3, 5)],
    dimensions=[100, 200, 300],
    batch_size=[50, 64, 100],
    epochs=[5, 10, 20],
    dropout_prob=[0.5, 0.8],
    md_filter_sizes=[(3, 6), (3, 4, 5)],
    md_n_filters=[10, 15, 20],
)

# Run the scan with the explicit validation split instead of letting Talos
# carve one out of the training data.
scan_object = talos.Scan(
    model=fakeNewsModel,
    params=p,
    experiment_name="test",
    x=[X_train, X_meta_train, X_h_train],
    y=Y_train,
    x_val=[X_val, X_meta_val, X_h_val],
    y_val=Y_val,
)

  0%|          | 0/3 [00:00<?, ?it/s]

Train on 10240 samples, validate on 1284 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


 33%|███▎      | 1/3 [01:04<02:09, 64.90s/it]

Train on 10240 samples, validate on 1284 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


 67%|██████▋   | 2/3 [02:11<01:05, 65.30s/it]

Train on 10240 samples, validate on 1284 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


100%|██████████| 3/3 [03:26<00:00, 68.80s/it]
