In [0]:
# pip install scikit-learn   (the PyPI name `sklearn` is a deprecated alias)

In [2]:
import keras.backend as K
import tensorflow as tf

from scipy.spatial.distance import jensenshannon
from numpy import asarray

# Shared Keras KL-divergence loss object; js_divergence() below calls it as
# kl_div(distribution, mixture).
kl_div = tf.keras.losses.KLDivergence()
 
# calculate the js divergence
def js_divergence(p, q):
  """Jensen-Shannon divergence between ``p`` and ``q``.

  Builds the element-wise midpoint ``m`` of the two inputs and returns the
  equally weighted sum of ``kl_div(p, m)`` and ``kl_div(q, m)``, where
  ``kl_div`` is the module-level Keras ``KLDivergence`` loss object.
  """
  mixture = (p + q) * 0.5
  kl_p = kl_div(p, mixture)
  kl_q = kl_div(q, mixture)
  return 0.5 * kl_p + 0.5 * kl_q

def js_distance(y_true, y_pred):
  """Return the square root of ``js_divergence(y_true, y_pred)``.

  Takes ``(y_true, y_pred)`` so it can be listed among the metrics handed to
  ``model.compile``.
  """
  divergence = js_divergence(y_true, y_pred)
  return K.sqrt(divergence)


Using TensorFlow backend.


# Load Data

In [0]:
import pandas as pd
import numpy as np

def load_data(url=None, percent=100):
  """Load the text column and numeric label columns from a UTF-16 CSV.

  Args:
    url: Path or URL of the CSV file. When ``None`` (the default) the
      pre-processed ``ReactDataCountsPre/2_No_Likes.csv`` dataset is used.
      The un-preprocessed variant lives under ``ReactDataCounts/`` in the
      same repository.
    percent: Percentage (0-100) of rows to keep, counted from the top of the
      file. The default of 100 keeps every row.

  Returns:
    A ``(data, labels)`` tuple: ``data`` is the ``'name'`` column as a NumPy
    array and ``labels`` is a 2-D NumPy array holding every numeric column.

  Raises:
    ValueError: If ``percent`` is outside the range [0, 100].
  """
  if url is None:
    url = 'https://raw.githubusercontent.com/jordanchtan/EvaluationData/master/ReactDataCountsPre/2_No_Likes.csv'
  if not 0 <= percent <= 100:
    raise ValueError('percent must be between 0 and 100, got %r' % (percent,))

  df = pd.read_csv(url, encoding='utf16')

  # Floor division keeps the row count exact; going through the float
  # ``percent / 100`` can land one row short for some percentages.
  n_rows = int(len(df) * percent // 100)
  df = df.head(n_rows)

  data = df['name'].values
  labels = df.select_dtypes(include=[np.number]).values

  return data, labels

# Create Model

In [0]:
from keras.models import Sequential
from keras.layers import Dense

# Metrics reported alongside the loss by every compiled model.
metrics = [
    'mean_squared_error',
    'mean_absolute_error',
    'categorical_crossentropy',
    js_distance,
]

def create_model(input_dim, plot_file='2_No_Likes.png'):
  """Build and compile the two-layer dense network.

  The network is ``Dense(500, relu) -> Dense(5, relu)``, compiled with the
  Adam optimizer, mean-squared-error loss and the module-level ``metrics``.

  Args:
    input_dim: Number of input features per sample.
    plot_file: Path the architecture diagram is written to. Pass ``None`` or
      an empty string to skip plotting.

  Returns:
    The compiled Keras ``Sequential`` model.
  """
  model = Sequential()
  model.add(Dense(units=500, activation='relu', input_dim=input_dim))
  model.add(Dense(units=5, activation='relu'))

  # Alternative losses that were tried here: 'kullback_leibler_divergence'
  # and js_divergence.
  model.compile(loss='mean_squared_error', optimizer='adam', metrics=metrics)

  # summary() prints the table itself and returns None, so wrapping it in
  # print() only added a stray "None" line to the output.
  model.summary()

  if plot_file:
    from keras.utils.vis_utils import plot_model
    plot_model(model, to_file=plot_file, show_shapes=True, show_layer_names=True)

  return model


# Train and Evaluate Model

In [5]:
import nltk
# Download the NLTK 'stopwords' corpus so the `stopwords` reader imported below can load it.
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import sklearn


def train_and_evaluate_model(model, data_train, labels_train, data_test, labels_test):
  """Fit ``model`` on the training data and score it on the test data.

  20% of the training data is split off (shuffled) as a validation set, the
  model is trained for 2 epochs with batch size 128, and it is then evaluated
  on the test set. The compiled metrics and a per-output-column mean squared
  error are printed.

  Args:
    model: Compiled Keras model exposing ``fit``, ``evaluate``, ``predict``
      and ``metrics_names``.
    data_train: Training features.
    labels_train: Training targets, one row per training sample.
    data_test: Test features.
    labels_test: 2-D test targets with one column per model output.

  Returns:
    The value returned by ``model.evaluate`` on the test set.
  """
  # Import explicitly: a bare ``import sklearn`` does not guarantee that the
  # ``sklearn.metrics`` submodule is loaded.
  from sklearn.metrics import mean_squared_error

  print("Training:")
  data_train, data_val, labels_train, labels_val = train_test_split(
      data_train, labels_train, test_size=0.2, shuffle=True)

  model.fit(data_train, labels_train,
            epochs=2, batch_size=128, verbose=1, shuffle=True,
            validation_data=(data_val, labels_val))

  print("Evaluating:")
  scores = model.evaluate(data_test, labels_test, verbose=1)
  print("Final scores for fold:")
  print(model.metrics_names, scores)
  y_pred = model.predict(data_test)
  print(y_pred.shape)

  # Loop over the columns the model actually produced. The previous hard-coded
  # list went up to index 5, which is out of bounds for a 5-column output.
  for col in range(y_pred.shape[1]):
    print('col', col, mean_squared_error(labels_test[:, col], y_pred[:, col]))
  return scores

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Run Evaluation

In [6]:
# Load the dataset once and report how many samples it holds.
data, labels = load_data()
print(len(data))

# Minimum row sum of `labels` a training sample needs in order to be kept.
min_reacts = 1

# Datasets with more than 10000 samples use the holdout cell; smaller ones
# use the k-fold cell.
useHoldout = len(data) > 10000

155696


In [7]:
# Show the label array returned by load_data(), before the split cells normalise it.
print(labels)

[[  88    5    6   45    0]
 [ 109  186    1  499   44]
 [6634 5509 2854   19   10]
 ...
 [   2    0    1    0    0]
 [   0   11    1    0    0]
 [ 457 1109 3816   11   46]]


# K-Fold Cross-Validation

In [0]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import normalize


# K-fold evaluation: used when the dataset is small enough that the holdout
# path is not selected.
if not useHoldout:
  print("KFOLD")
  n_folds = 5
  kf = KFold(n_folds, shuffle=True)

  # Define per-fold score containers
  scores_per_fold = []

  for fold, (train_index, test_index) in enumerate(kf.split(data), start=1):
    print("Running Fold", fold, "/", n_folds)
    data_train, data_test = data[train_index], data[test_index]
    labels_train, labels_test = labels[train_index], labels[test_index]

    # Keep training rows with at least `min_reacts` total counts. The floor
    # of 1 guarantees the normalisation below never divides by zero.
    has_min_reacts = labels_train.sum(axis=1) >= max(min_reacts, 1)
    data_train = data_train[has_min_reacts]
    labels_train = labels_train[has_min_reacts]

    # A test row with no counts at all has no defined target distribution;
    # normalising it would give NaN targets, so such rows are dropped.
    has_reacts = labels_test.sum(axis=1) > 0
    data_test = data_test[has_reacts]
    labels_test = labels_test[has_reacts]

    # Turn raw counts into per-row proportions that sum to 1.
    labels_train = labels_train/labels_train.sum(axis=1, keepdims=True)
    labels_test = labels_test/labels_test.sum(axis=1, keepdims=True)

    # Bag-of-words features; the vocabulary is fitted on the training fold only.
    vectorizer = CountVectorizer(max_features=20000, binary=False)
    # vectorizer = CountVectorizer(binary=True, stop_words=stopwords.words('english'),
    #                             lowercase=True, min_df=3, max_df=0.9, max_features=5000)
    data_train = vectorizer.fit_transform(data_train.astype('U'))
    data_test = vectorizer.transform(data_test.astype('U'))

    # The column count of the vectorised matrix is the vocabulary size. This
    # replaces get_feature_names(), which newer scikit-learn no longer has.
    model = create_model(data_train.shape[1])

    scores = train_and_evaluate_model(model, data_train, labels_train, data_test, labels_test)
    scores_per_fold.append(scores)

  

In [0]:
# Average every compiled metric over the folds recorded in `scores_per_fold`.
if not useHoldout:

  print('Average scores across all folds:')
  if scores_per_fold:
    for metric_index, metric in enumerate(metrics):
      # Metrics may be given as callables (e.g. js_distance); print their name
      # rather than the function object's repr.
      metric_name = getattr(metric, '__name__', metric)
      # scores[0] is the loss, so compiled metrics start at index 1.
      fold_values = [fold_scores[metric_index + 1] for fold_scores in scores_per_fold]
      # Divide by the number of folds actually recorded, not the planned count.
      print(metric_name, sum(fold_values)/len(fold_values))
  print(scores_per_fold)
  

# Holdout Evaluation

In [10]:
from sklearn.model_selection import train_test_split

# Holdout evaluation: a single 80/20 train/test split, used for large datasets.
if useHoldout:
  print("HOLDOUT")

  data_train, data_test, labels_train, labels_test = train_test_split(data, labels, test_size=0.2, shuffle=True)

  # Keep training rows with at least `min_reacts` total counts. The floor of 1
  # guarantees the normalisation below never divides by zero.
  has_min_reacts = labels_train.sum(axis=1) >= max(min_reacts, 1)
  data_train = data_train[has_min_reacts]
  labels_train = labels_train[has_min_reacts]

  # A test row with no counts at all has no defined target distribution;
  # normalising it would give NaN targets, so such rows are dropped.
  has_reacts = labels_test.sum(axis=1) > 0
  data_test = data_test[has_reacts]
  labels_test = labels_test[has_reacts]

  # Turn raw counts into per-row proportions that sum to 1.
  labels_train = labels_train/labels_train.sum(axis=1, keepdims=True)
  labels_test = labels_test/labels_test.sum(axis=1, keepdims=True)

  # Binary bag-of-words features; the vocabulary is fitted on the training split only.
  vectorizer = CountVectorizer(max_features=5000, binary=True)
  # vectorizer = CountVectorizer(binary=True, stop_words=stopwords.words('english'),
  #                             lowercase=True, min_df=3, max_df=0.9, max_features=5000)
  data_train = vectorizer.fit_transform(data_train.astype('U'))
  data_test = vectorizer.transform(data_test.astype('U'))

  # The column count of the vectorised matrix is the vocabulary size. This
  # replaces get_feature_names(), which newer scikit-learn no longer has.
  model = create_model(data_train.shape[1])

  # Training/evaluation is currently disabled in this cell; uncomment to run it.
  # scores = train_and_evaluate_model(model, data_train, labels_train, data_test, labels_test)
  # print(model.metrics_names, scores) 
# ['loss', 'mean_squared_error', 'mean_absolute_error', 'js_distance'] [0.051870411047777766, 0.0518704317510128, 0.15690724551677704, 0.38542553782463074]
# softmax cce
# ['loss', 'mean_squared_error', 'mean_absolute_error', 'categorical_crossentropy', 'js_distance'] [1.3029636577680928, 0.0507490374147892, 0.1581949144601822, 1.3029628992080688, 0.38368284702301025]
# col 0 0.08069338785648082
# col 1 0.04315388244847084
# col 2 0.045754739576135
# col 3 0.04100759413744935
# col 4 0.04474218285218102
# ---------------------------

HOLDOUT
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 500)               2500500   
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 2505      
Total params: 2,503,005
Trainable params: 2,503,005
Non-trainable params: 0
_________________________________________________________________
None
