In [0]:
# Vectorization parameters
# Range (inclusive) of n-gram sizes for tokenizing text.
NGRAM_RANGE = (1, 2)

# Limit on the number of features. We use the top 20K features.
TOP_K = 5000

# Whether text should be split into word or character n-grams.
# One of 'word', 'char'.
TOKEN_MODE = 'word'

# Minimum document/corpus frequency below which a token will be discarded.
MIN_DOCUMENT_FREQUENCY = 2

In [2]:
import keras.backend as K
import tensorflow as tf

from scipy.spatial.distance import jensenshannon
from numpy import asarray

kl_div = tf.keras.losses.KLDivergence()
 
# calculate the js divergence
def js_divergence(p, q):
	m = 0.5 * (p + q)
	return 0.5 * kl_div(p, m) + 0.5 * kl_div(q, m)

def js_distance(y_true, y_pred):
  return K.sqrt(js_divergence(y_true, y_pred))


Using TensorFlow backend.


# Load Data

In [0]:
import pandas as pd
import numpy as np

def load_data():
  # load your data using this function
  # url = 'https://raw.githubusercontent.com/jordanchtan/EvaluationData/master/ReactDataCounts/2_No_Likes.csv'
  url = 'https://raw.githubusercontent.com/jordanchtan/EvaluationData/master/ReactDataCountsPre/2_No_Likes.csv'
  df = pd.read_csv(url, encoding='utf16')

  data = df['name']
  labels = df.select_dtypes(include=[np.number])

  data = data.values
  labels = labels.values

  return data, labels

# Create Model

In [0]:
# from keras.models import Sequential
# from keras.layers import Dense

# metrics = ['mean_squared_error', 'mean_absolute_error', js_distance]

# def create_model(input_dim):
#   model = Sequential()
#   model.add(Dense(units=500, activation='relu', input_dim=input_dim))
#   model.add(Dense(units=5, activation='relu'))
  
#   # model.compile(loss='kullback_leibler_divergence', optimizer='adam', metrics=metrics)
#   # model.compile(loss=js_divergence, optimizer='adam', metrics=metrics)
#   model.compile(loss='mean_squared_error', optimizer='adam', metrics=metrics)
#   # model.summary()

#   return model

from keras.models import Sequential
from keras.layers import Dense, Dropout

metrics = ['mean_squared_error', 'mean_absolute_error', js_distance]

layers = 4
units = 128
dropout_rate = 0.2
def create_model(input_dim):
  model = Sequential()
  # model.add(Dropout(rate=dropout_rate, input_shape=(input_dim,1)))
  for x in range(layers - 1):
    model.add(Dense(units=units, activation='relu', input_dim=input_dim))
    model.add(Dropout(rate=dropout_rate))
  
  model.add(Dense(units=5, activation='relu'))
  
  
  # model.compile(loss='kullback_leibler_divergence', optimizer='adam', metrics=metrics)
  # model.compile(loss=js_divergence, optimizer='adam', metrics=metrics)
  model.compile(loss='mean_squared_error', optimizer='adam', metrics=metrics)
  
  print(model.summary())
  from keras.utils.vis_utils import plot_model
  plot_model(model, to_file='2_No_Likes_TFIDF.png', show_shapes=True, show_layer_names=True)

  return model


# Train and Evaluate Model

In [5]:
import nltk
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split


def train_and_evaluate_model(model, data_train, labels_train, data_test, labels_test):
  print("Training:")
  data_train, data_val, labels_train, labels_val = train_test_split(data_train, labels_train, test_size=0.2, shuffle=True)
  # data_test, data_val, labels_test, labels_val = train_test_split(data_test, labels_test, test_size=0.5, shuffle=True)

  model.fit(data_train, labels_train, 
        epochs=1, batch_size=128, verbose=1, shuffle=True,
        validation_data=(data_val, labels_val))
  
  print("Evaluating:")
  scores = model.evaluate(data_test, labels_test, verbose=1)
  print("Final scores for fold:")
  print(model.metrics_names, scores) 
  return scores

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Run Evaluation

In [6]:
data, labels = load_data()
print(len(data))
useHoldout = False

min_reacts = 1
if (len(data) > 10000):
  useHoldout = True

155696


In [7]:
print(labels)

[[  88    5    6   45    0]
 [ 109  186    1  499   44]
 [6634 5509 2854   19   10]
 ...
 [   2    0    1    0    0]
 [   0   11    1    0    0]
 [ 457 1109 3816   11   46]]


K-Fold

In [0]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import normalize


if not useHoldout:
  print("KFOLD")  
  n_folds = 5
  kf = KFold(n_folds, shuffle=True)
  i = 0

  # Define per-fold score containers
  scores_per_fold = []

  for train_index, test_index in kf.split(data):
    print("Running Fold", i+1, "/", n_folds)
    data_train, data_test = data[train_index], data[test_index]
    labels_train, labels_test = labels[train_index], labels[test_index]

    labels_train_sums = labels_train.sum(axis = 1)
    has_min_reacts = labels_train_sums >= min_reacts
    data_train = data_train[has_min_reacts]
    labels_train = labels_train[has_min_reacts]

    labels_train = labels_train/labels_train.sum(axis=1, keepdims=True)
    labels_test = labels_test/labels_test.sum(axis=1, keepdims=True)


    #process
    vectorizer = TfidfVectorizer(max_features=5000)
    # vectorizer = CountVectorizer(binary=True, stop_words=stopwords.words('english'), 
    #                             lowercase=True, min_df=3, max_df=0.9, max_features=5000)
    data_train = vectorizer.fit_transform(data_train.astype('U'))

    data_test = vectorizer.transform(data_test.astype('U'))
    # end
    
    model = None # Clearing the NN.
    model = create_model(len(vectorizer.get_feature_names()))

    scores = train_and_evaluate_model(model, data_train, labels_train, data_test, labels_test)
    scores_per_fold.append(scores)

    i += 1

  

In [0]:
if not useHoldout:

  print('Average scores across all folds:')
  for metric_index, metric_name in enumerate(metrics):
    metric_total = 0
    for scores in scores_per_fold:
      metric_total += scores[metric_index + 1]
    print(metric_name, metric_total/n_folds )
  print(scores_per_fold)
  

Holdout

In [10]:
from sklearn.model_selection import train_test_split

if useHoldout:
  print("HOLDOUT")

  data_train, data_test, labels_train, labels_test = train_test_split(data, labels, test_size=0.2, shuffle=True)

  labels_train_sums = labels_train.sum(axis = 1)
  has_min_reacts = labels_train_sums >= min_reacts
  data_train = data_train[has_min_reacts]
  labels_train = labels_train[has_min_reacts]
  
  labels_train = labels_train/labels_train.sum(axis=1, keepdims=True)
  labels_test = labels_test/labels_test.sum(axis=1, keepdims=True)

  #process
  kwargs = {
        'ngram_range': NGRAM_RANGE,  # Use 1-grams + 2-grams.
        'dtype': 'int32',
        'strip_accents': 'unicode',
        'decode_error': 'replace',
        'analyzer': TOKEN_MODE,  # Split text into word tokens.
        'min_df': MIN_DOCUMENT_FREQUENCY,
  }
  vectorizer = TfidfVectorizer(**kwargs)
  # vectorizer = CountVectorizer(binary=True, stop_words=stopwords.words('english'), 
  #                             lowercase=True, min_df=3, max_df=0.9, max_features=5000)
  data_train = vectorizer.fit_transform(data_train.astype('U'))

  data_test = vectorizer.transform(data_test.astype('U'))
  # end

  model = None # Clearing the NN.
  model = create_model(len(vectorizer.get_feature_names()))

  # scores = train_and_evaluate_model(model, data_train, labels_train, data_test, labels_test)

  #only 20k
# ['loss', 'mean_squared_error', 'mean_absolute_error', 'js_distance'] [0.049731985330294555, 0.04973198473453522, 0.15381042659282684, 0.3792986273765564]

HOLDOUT




Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 128)               18617856  
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 5)                

In [0]:
# filename = './drive/My Drive/Colab Notebooks/express_model.h5'
# model.save(filename)