In [1]:
# Mount Google Drive into this Colab runtime at /content/drive (prompts for an
# authorization code the first time it runs).
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
import keras.backend as K
import tensorflow as tf

from scipy.spatial.distance import jensenshannon
from numpy import asarray

# Single Keras KL-divergence loss object, reused for both halves of the JS sum.
kl_div = tf.keras.losses.KLDivergence()


def js_divergence(p, q):
  """Return the Jensen-Shannon divergence between tensors ``p`` and ``q``.

  Computed as the equally weighted sum of KL(p || m) and KL(q || m), where
  ``m`` is the element-wise average of the two inputs.

  NOTE(review): KL divergence presumes probability-like inputs; nothing here
  normalises ``p`` or ``q`` -- confirm the callers pass suitable values.
  """
  mixture = 0.5 * (p + q)
  kl_p_to_mixture = kl_div(p, mixture)
  kl_q_to_mixture = kl_div(q, mixture)
  return 0.5 * kl_p_to_mixture + 0.5 * kl_q_to_mixture

def js_distance(y_true, y_pred):
  """Keras-style metric: square root of the JS divergence of targets vs. predictions."""
  divergence = js_divergence(y_true, y_pred)
  return K.sqrt(divergence)


Using TensorFlow backend.


# Load Data

In [0]:
import pandas as pd
import numpy as np

def load_data():
  """Download the evaluation CSV and return ``(texts, labels)`` as NumPy arrays.

  Rows whose ``text`` cell is not a Python string are dropped. ``texts`` holds
  the values of the ``text`` column and ``labels`` the values of the ``V``,
  ``A`` and ``D`` columns, in that order.
  """
  # Alternative dataset previously used with this loader:
  # https://raw.githubusercontent.com/jordanchtan/EvaluationData/master/ExpressData/2_No_Likes.csv
  url = 'https://raw.githubusercontent.com/jordanchtan/EvaluationData/master/ExpressDataPre/EmoBank_Writer_All.csv'
  frame = pd.read_csv(url, encoding='utf8')

  # Keep only the rows that actually carry a string in the text column.
  has_text = frame['text'].apply(lambda cell: isinstance(cell, str))
  frame = frame[has_text]

  texts = frame['text'].values
  vad_labels = frame[['V', 'A', 'D']].values
  return texts, vad_labels

# Create Model

In [0]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Conv1D, MaxPooling1D, Flatten, Embedding
from keras.utils import plot_model

# Metrics tracked next to the loss: two built-in Keras metrics referenced by
# name, plus the custom js_distance callable.
metrics = ['mean_squared_error', 'mean_absolute_error', js_distance]


def create_model(embedding_layer):
  """Build and compile the regression network on top of ``embedding_layer``.

  Architecture: the supplied embedding layer, a 300-unit ReLU Dense layer, a
  Flatten, and a 3-unit ReLU Dense output. The model is compiled with
  mean-squared-error loss, the Adam optimizer and the module-level ``metrics``.

  Side effects: prints the model summary and writes an architecture diagram
  to ``Express_Google.png`` in the working directory.

  Args:
    embedding_layer: Keras layer used as the first layer of the model.

  Returns:
    The compiled ``Sequential`` model.
  """
  model = Sequential()
  model.add(embedding_layer)
  model.add(Dense(units=300, activation='relu'))
  model.add(Flatten())
  model.add(Dense(units=3, activation='relu'))

  model.compile(loss='mean_squared_error', optimizer='adam', metrics=metrics)

  # summary() prints the table itself and returns None, so wrapping it in
  # print() only appended a stray "None" line to the output.
  model.summary()
  # plot_model is already imported at the top of this cell.
  plot_model(model, to_file='Express_Google.png', show_shapes=True, show_layer_names=True)

  return model


# Train and Evaluate Model

In [5]:
import nltk
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split


def train_and_evaluate_model(model, data_train, labels_train, data_test, labels_test,
                             epochs=10, batch_size=128, random_state=None):
  """Fit ``model`` on the training data and score it on held-out data.

  The supplied test partition is split in half: one half is used as the
  validation set during fitting, the other half is used for the final
  evaluation.

  Args:
    model: compiled Keras model to train.
    data_train, labels_train: training inputs and targets.
    data_test, labels_test: held-out inputs and targets (split 50/50 into
      evaluation and validation halves).
    epochs: number of training epochs (default 10, the original setting).
    batch_size: mini-batch size (default 128, the original setting).
    random_state: optional seed for the test/validation split; ``None`` keeps
      the original unseeded behaviour.

  Returns:
    The value returned by ``model.evaluate`` on the evaluation half.
  """
  print("Training:")
  data_test, data_val, labels_test, labels_val = train_test_split(
      data_test, labels_test, test_size=0.5, shuffle=True, random_state=random_state)

  model.fit(data_train, labels_train,
            epochs=epochs, batch_size=batch_size, verbose=1, shuffle=True,
            validation_data=(data_val, labels_val))

  print("Evaluating:")
  scores = model.evaluate(data_test, labels_test, verbose=1)
  print("Final scores for fold:")
  print(model.metrics_names, scores)
  return scores

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Run Evaluation

In [6]:
# Load the corpus and show its size plus the first sample as a sanity check.
data, labels = load_data()
print(len(data))
print(data[0])

min_reacts = 1

# Datasets with more than 10000 samples take the holdout path; smaller ones
# take the K-fold path.
useHoldout = len(data) > 10000

10277
today i kept it simple .


Prep embeddings

In [7]:
from gensim.models.keyedvectors import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding

# Location of the pretrained word2vec binary on the mounted Drive.
path = './drive/My Drive/Colab Notebooks/Store/GoogleNews-vectors-negative300.bin.gz'

embedding_dim = 300

# Build the vocabulary from the raw texts. The +1 makes room for the highest
# word index, which equals len(word_index).
t = Tokenizer()
t.fit_on_texts(data)
word_index = t.word_index
vocab_size = len(word_index) + 1

# Integer-encode the documents and pad them at the end up to the longest one.
encoded_data = t.texts_to_sequences(data)
max_length = max(len(sequence) for sequence in encoded_data)
padded_data = pad_sequences(encoded_data, maxlen=max_length, padding='post')
# NOTE: `data` is overwritten with the padded integer sequences, so this cell
# must not be re-run without re-running load_data() first.
data = padded_data

word2vec = KeyedVectors.load_word2vec_format(path, binary=True)

# One embedding row per word index (same size as vocab_size).
nb_words = vocab_size

# Words absent from the pretrained vectors keep an all-zero row.
embedding_matrix = np.zeros((nb_words, embedding_dim))
for word, i in word_index.items():
  # Membership test and item access work on both gensim 3.x and 4.x, unlike
  # the `.vocab` attribute and `.word_vec()` method.
  if word in word2vec:
    embedding_matrix[i] = word2vec[word]

# Frozen embedding layer initialised from the pretrained matrix.
embedding_layer = Embedding(embedding_matrix.shape[0],
                            embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [8]:
# Show the padded data shape and the padded sequence length, one per line.
print(data.shape, max_length, sep='\n')

(10277, 124)
124


K-Fold

In [0]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import normalize


if not useHoldout:
  print("KFOLD")
  n_folds = 5
  kf = KFold(n_folds, shuffle=True)

  # Collects the score list returned for each fold.
  scores_per_fold = []

  for i, (train_index, test_index) in enumerate(kf.split(data)):
    print("Running Fold", i+1, "/", n_folds)
    data_train = data[train_index]
    data_test = data[test_index]
    labels_train = labels[train_index]
    labels_test = labels[test_index]

    # Drop the previous fold's network before building a fresh one.
    model = None
    model = create_model(embedding_layer)

    fold_scores = train_and_evaluate_model(model, data_train, labels_train, data_test, labels_test)
    scores_per_fold.append(fold_scores)

  

In [0]:
if not useHoldout:

  # Report the mean of every tracked metric over the folds.
  print('Average scores across all folds:')
  for metric_index, metric in enumerate(metrics):
    # `metrics` mixes metric names (str) with callables such as js_distance;
    # print the callable's name rather than its function repr.
    metric_name = metric if isinstance(metric, str) else getattr(metric, '__name__', repr(metric))
    # The +1 skips the first entry of each score list, which model.evaluate
    # reserves for the loss value.
    metric_total = sum(scores[metric_index + 1] for scores in scores_per_fold)
    print(metric_name, metric_total/n_folds )
  print(scores_per_fold)
  

Holdout

In [11]:
from sklearn.model_selection import train_test_split

if useHoldout:
  print("HOLDOUT")

  # Reserve 20% of the shuffled samples as the held-out partition.
  holdout_split = train_test_split(data, labels, test_size=0.2, shuffle=True)
  data_train, data_test, labels_train, labels_test = holdout_split

  # Drop any previously built network before constructing a fresh one.
  model = None
  model = create_model(embedding_layer)

  # NOTE(review): training and evaluation are disabled below, so `model` is
  # left with its initial weights after this cell -- confirm this is intended.
  # scores = train_and_evaluate_model(model, data_train, labels_train, data_test, labels_test)
  # print(model.metrics_names, scores)

HOLDOUT
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 124, 300)          4896300   
_________________________________________________________________
dense_1 (Dense)              (None, 124, 300)          90300     
_________________________________________________________________
flatten_1 (Flatten)          (None, 37200)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 111603    
Total params: 5,098,203
Trainable params: 201,903
Non-trainable params: 4,896,300
_________________________________________________________________
None


In [12]:
# Spot-check the current model on two hand-written sentences.
example_texts = ['fuck you', 'i love you']
print(example_texts)

# Encode and pad with the same tokenizer and length used for the corpus.
encoded_example_data = t.texts_to_sequences(example_texts)
padded_example_data = pad_sequences(encoded_example_data, maxlen=max_length, padding='post')
example_data = padded_example_data

example_predictions = model.predict(example_data, verbose=0)
print(example_predictions)


['fuck you', 'i love you']
[[0.00307089 0.00742555 0.00421632]
 [0.         0.0025522  0.00883772]]
