In [1]:
# Mount Google Drive at /content/drive so files stored there can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import keras.backend as K
import tensorflow as tf

from scipy.spatial.distance import jensenshannon
from numpy import asarray

# Shared Keras KL-divergence loss object, built with default constructor
# arguments; js_divergence below calls it for both KL terms.
kl_div = tf.keras.losses.KLDivergence()
 
# calculate the js divergence
def js_divergence(p, q):
  """Return the Jensen-Shannon divergence between `p` and `q`.

  Each input is compared against the midpoint 0.5 * (p + q) with the
  module-level `kl_div` loss object, and the two KL terms are averaged.

  NOTE(review): `kl_div` is constructed with default arguments, so the value
  returned here is presumably already reduced over the batch -- confirm
  against the Keras loss-reduction documentation.
  """
  midpoint = 0.5 * (p + q)
  kl_p_to_midpoint = kl_div(p, midpoint)
  kl_q_to_midpoint = kl_div(q, midpoint)
  return 0.5 * kl_p_to_midpoint + 0.5 * kl_q_to_midpoint

def js_distance(y_true, y_pred):
  """Keras metric: square root of `js_divergence(y_true, y_pred)`."""
  divergence = js_divergence(y_true, y_pred)
  return K.sqrt(divergence)


Using TensorFlow backend.


# Load Data

In [0]:
import pandas as pd
import numpy as np

# Dataset loaded when no explicit location is given.
# Alternative: 'https://raw.githubusercontent.com/jordanchtan/EvaluationData/master/ReactDataCounts/2_No_Likes.csv'
_DEFAULT_DATA_URL = 'https://raw.githubusercontent.com/jordanchtan/EvaluationData/master/ReactDataCountsPre/2_No_Likes.csv'


def load_data(url=_DEFAULT_DATA_URL):
  """Load the texts and their numeric label columns from a UTF-16 CSV.

  Args:
    url: URL or local path of the CSV file. It must contain a 'name' column
      (the input text); every numeric column is treated as a label.

  Returns:
    A `(data, labels)` tuple of NumPy arrays: `data` holds the values of the
    'name' column and `labels` is a 2-D array with one column per numeric
    column of the file, in file order.
  """
  df = pd.read_csv(url, encoding='utf16')

  data = df['name'].values
  labels = df.select_dtypes(include=[np.number]).values

  return data, labels

# Create Model

In [0]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Conv1D, MaxPooling1D, Flatten, Embedding
from keras.utils import plot_model

# Metrics passed to model.compile(); js_distance is the custom metric function defined earlier in this notebook.
metrics = ['mean_squared_error', 'mean_absolute_error', js_distance]

def create_model(embedding_layer):
  """Build and compile the network stacked on top of `embedding_layer`.

  Architecture: embedding -> Dense(300, relu) -> Flatten -> Dense(5, relu).
  The model is compiled with mean-squared-error loss, the Adam optimizer and
  the module-level `metrics` list.

  Side effects: prints the layer summary and writes an architecture diagram
  to '5_EWE.png' in the current working directory.

  Args:
    embedding_layer: Keras layer used as the first layer of the model.

  Returns:
    The compiled `Sequential` model.
  """
  model = Sequential()
  model.add(embedding_layer)
  model.add(Dense(units=300, activation='relu'))
  model.add(Flatten())
  # NOTE(review): the labels fed to this model are normalised to sum to 1 in
  # the evaluation cells, but a relu output is not constrained to a
  # distribution -- consider whether 'softmax' is intended here.
  model.add(Dense(units=5, activation='relu'))

  # Alternative losses tried previously: 'kullback_leibler_divergence' and js_divergence.
  model.compile(loss='mean_squared_error', optimizer='adam', metrics=metrics)

  # summary() prints the table itself and returns None; wrapping it in
  # print() only adds a stray "None" line to the output.
  model.summary()
  # plot_model is already imported at the top of this cell.
  plot_model(model, to_file='5_EWE.png', show_shapes=True, show_layer_names=True)

  return model


# Train and Evaluate Model

In [5]:
import nltk
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split


def train_and_evaluate_model(model, data_train, labels_train, data_test, labels_test,
                             epochs=2, batch_size=128, validation_size=0.2):
  """Fit `model` on the training data and score it on the test data.

  A shuffled fraction of the training data is held out and passed to
  `model.fit` as validation data; the remainder is used for fitting.

  Args:
    model: compiled Keras model exposing `fit`, `evaluate` and `metrics_names`.
    data_train: training inputs.
    labels_train: training targets, aligned with `data_train`.
    data_test: test inputs.
    labels_test: test targets, aligned with `data_test`.
    epochs: number of training epochs (default 2, the previous fixed value).
    batch_size: mini-batch size (default 128, the previous fixed value).
    validation_size: fraction of the training data held out for validation
      (default 0.2, the previous fixed value).

  Returns:
    The value returned by `model.evaluate` on the test data.
  """
  print("Training:")
  data_train, data_val, labels_train, labels_val = train_test_split(
      data_train, labels_train, test_size=validation_size, shuffle=True)

  model.fit(data_train, labels_train,
            epochs=epochs, batch_size=batch_size, verbose=1, shuffle=True,
            validation_data=(data_val, labels_val))

  print("Evaluating:")
  scores = model.evaluate(data_test, labels_test, verbose=1)
  print("Final scores for fold:")
  print(model.metrics_names, scores)
  return scores

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Run Evaluation

In [6]:
# Load the full dataset once; the evaluation cells below split it themselves.
data, labels = load_data()
print(len(data))
# Evaluation strategy switch: False runs the K-fold cell, True runs the holdout cell.
useHoldout = False

# Minimum per-row label sum a training row needs in order to be kept.
min_reacts = 1
# if (len(data) > 10000):
#   useHoldout = True

155696


# Prep Embeddings

In [7]:
from gensim.models.keyedvectors import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding


# Word2vec-format text file holding the pre-trained word vectors.
path = './drive/My Drive/Colab Notebooks/Store/ewe_uni_w2v_model.txt'

embedding_dim = 300

# Fit the tokenizer on the raw texts and integer-encode every document.
t = Tokenizer()
t.fit_on_texts(data)
word_index = t.word_index
# One extra row: Keras Tokenizer word ids start at 1, index 0 is the padding value.
vocab_size = len(word_index) + 1
encoded_data = t.texts_to_sequences(data)

# Pad every sequence at the end up to the length of the longest document.
max_length = len(max(encoded_data, key=len))
padded_data = pad_sequences(encoded_data, maxlen=max_length, padding='post')
# NOTE: `data` is rebound from raw texts to the padded id matrix, so this cell
# is not idempotent -- re-run the data-loading cell before re-running it.
data = padded_data

word2vec = KeyedVectors.load_word2vec_format(path, binary=False)

nb_words = vocab_size

# Rows of words missing from the pre-trained vectors stay all-zero.
embedding_matrix = np.zeros((nb_words, embedding_dim))
for word, word_id in word_index.items():
    # `in` / `[]` avoid the version-specific `.vocab` and `.word_vec` accessors.
    if word in word2vec:
        embedding_matrix[word_id] = word2vec[word]
# print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

# Frozen embedding layer initialised from the pre-trained vectors.
embedding_layer = Embedding(embedding_matrix.shape[0],
                            embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [8]:
# Sanity check: shape of the padded id matrix, then the padded sequence length.
print(data.shape, max_length, sep='\n')

(155696, 26)
26


# K-Fold

In [9]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import normalize

if not useHoldout:
  print("KFOLD")
  n_folds = 5
  kf = KFold(n_folds, shuffle=True)

  # Per-fold results; each entry is the list returned by model.evaluate().
  scores_per_fold = []

  for fold_number, (train_index, test_index) in enumerate(kf.split(data), start=1):
    print("Running Fold", fold_number, "/", n_folds)
    data_train, data_test = data[train_index], data[test_index]
    labels_train, labels_test = labels[train_index], labels[test_index]

    # Train only on rows whose label counts reach `min_reacts`. Rows summing
    # to zero are always dropped because they cannot be normalised (0/0 -> NaN).
    labels_train_sums = labels_train.sum(axis=1)
    has_min_reacts = (labels_train_sums >= min_reacts) & (labels_train_sums > 0)
    data_train = data_train[has_min_reacts]
    labels_train = labels_train[has_min_reacts]

    # The test split keeps every row that can be normalised; without this
    # filter a single all-zero row turns the evaluation metrics into NaN.
    has_test_reacts = labels_test.sum(axis=1) > 0
    data_test = data_test[has_test_reacts]
    labels_test = labels_test[has_test_reacts]

    # Convert raw counts into per-row distributions that sum to 1.
    labels_train = labels_train / labels_train.sum(axis=1, keepdims=True)
    labels_test = labels_test / labels_test.sum(axis=1, keepdims=True)

    model = None  # Drop the previous fold's model before building a fresh one.
    model = create_model(embedding_layer)

    scores = train_and_evaluate_model(model, data_train, labels_train, data_test, labels_test)
    scores_per_fold.append(scores)

  

KFOLD
Running Fold 1 / 5
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 26, 300)           12471900  
_________________________________________________________________
dense_1 (Dense)              (None, 26, 300)           90300     
_________________________________________________________________
flatten_1 (Flatten)          (None, 7800)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 39005     
Total params: 12,601,205
Trainable params: 129,305
Non-trainable params: 12,471,900
_________________________________________________________________
None
Training:
Train on 99644 samples, validate on 24912 samples
Epoch 1/2
Epoch 2/2
Evaluating:
Final scores for fold:
['loss', 'mean_squared_error', 'mean_absolute_error', 'js_distance'] [0.054481296864797, 

KeyboardInterrupt: ignored

In [0]:
if not useHoldout:

  print('Average scores across all folds:')
  # Average over the folds that actually finished: an interrupted run leaves
  # fewer than n_folds entries, and dividing by n_folds would understate them.
  completed_folds = len(scores_per_fold)
  if completed_folds == 0:
    print('No completed folds to average.')
  else:
    for metric_index, metric in enumerate(metrics):
      # Metrics may be strings or functions; show the function's name, not its repr.
      metric_name = getattr(metric, '__name__', metric)
      # Each score list starts with the loss, so metric k sits at position k + 1.
      metric_total = sum(scores[metric_index + 1] for scores in scores_per_fold)
      print(metric_name, metric_total / completed_folds)
  print(scores_per_fold)
  

# Holdout

In [0]:
from sklearn.model_selection import train_test_split

if useHoldout:
  print("HOLDOUT")

  # Single shuffled 80/20 train/test split of the whole dataset.
  data_train, data_test, labels_train, labels_test = train_test_split(data, labels, test_size=0.2, shuffle=True)

  # Train only on rows whose label counts reach `min_reacts`. Rows summing to
  # zero are always dropped because they cannot be normalised (0/0 -> NaN).
  labels_train_sums = labels_train.sum(axis=1)
  has_min_reacts = (labels_train_sums >= min_reacts) & (labels_train_sums > 0)
  data_train = data_train[has_min_reacts]
  labels_train = labels_train[has_min_reacts]

  # The test split keeps every row that can be normalised; without this
  # filter a single all-zero row turns the evaluation metrics into NaN.
  has_test_reacts = labels_test.sum(axis=1) > 0
  data_test = data_test[has_test_reacts]
  labels_test = labels_test[has_test_reacts]

  # Convert raw counts into per-row distributions that sum to 1.
  labels_train = labels_train / labels_train.sum(axis=1, keepdims=True)
  labels_test = labels_test / labels_test.sum(axis=1, keepdims=True)

  model = None  # Drop any previously built model before building a fresh one.
  model = create_model(embedding_layer)

  # Previously commented out, which left this branch building a model without
  # ever training or scoring it. The helper prints the metric names and scores.
  scores = train_and_evaluate_model(model, data_train, labels_train, data_test, labels_test)