## Read clean data

In [1]:
import pandas as pd

In [2]:
# Load each cleaned sentiment class and keep only its "text" column as a plain
# Python list, which is the form later handed to the tokenizer.
positive = list(
    pd.read_csv("data/clean/positive.csv", encoding="utf-8")["text"])
slightly_positive = list(
    pd.read_csv("data/clean/slightly_positive.csv", encoding="utf-8")["text"])
slightly_negative = list(
    pd.read_csv("data/clean/slightly_negative.csv", encoding="utf-8")["text"])
negative = list(
    pd.read_csv("data/clean/negative.csv", encoding="utf-8")["text"])

## Use DeepMoji to encode texts into emotional feature vectors

In [3]:
from __future__ import print_function, division
import example_helper
import json
import csv
import numpy as np
from deepmoji.sentence_tokenizer import SentenceTokenizer
from deepmoji.model_def import deepmoji_feature_encoding
from deepmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH

Using TensorFlow backend.


In [4]:
# `maxlen` is shared by the tokenizer built here and by the model built in the
# next cell. `batch_size` is defined here but not referenced by any later cell
# in this notebook.
maxlen, batch_size = 30, 32

print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
# The vocabulary file is JSON; its parsed content is passed to the tokenizer.
with open(VOCAB_PATH, 'r') as vocab_file:
    vocabulary = json.load(vocab_file)
st = SentenceTokenizer(vocabulary, maxlen)

Tokenizing using dictionary from /Users/xjh/Github/DeepMoji/model/vocabulary.json


In [5]:
# Build the DeepMoji feature-encoding model from the pretrained weights file.
# The log recorded below this cell reports "Ignoring weights for softmax", i.e.
# the classification head's weights are not loaded.
print('Loading model from {}.'.format(PRETRAINED_PATH))
# `maxlen` must be the same value the tokenizer was created with; the summary
# below shows the input layer as (None, 30).
model = deepmoji_feature_encoding(maxlen, PRETRAINED_PATH)
# Print the layer-by-layer architecture for reference.
model.summary()

Loading model from /Users/xjh/Github/DeepMoji/model/deepmoji_weights.hdf5.



Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Loading weights for embedding


Loading weights for bi_lstm_0
Loading weights for bi_lstm_1
Loading weights for attlayer
Ignoring weights for softmax
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 30)           0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 30, 256)      12800000  

In [7]:
def _encode_bucket(texts, shape_message, done_message):
    """Tokenize one class of texts and encode it with the DeepMoji model.

    Args:
        texts: list of sentences belonging to one sentiment class.
        shape_message: format string printed with the encoding's shape.
        done_message: message printed once the class has been encoded.

    Returns:
        A ``(tokens, features)`` pair: the first value returned by
        ``st.tokenize_sentences`` and the output of ``model.predict`` on it.
    """
    # tokenize_sentences returns three values; only the first is needed here.
    tokens, _, _ = st.tokenize_sentences(texts)
    features = model.predict(tokens)
    print(shape_message.format(features.shape))
    print(done_message)
    return tokens, features


print('Encoding texts..')

# One call per sentiment class, in the same order as the original cell so the
# printed progress log is unchanged.
tokenized_positive, encoding_positive = _encode_bucket(
    positive, "Positive shape {}", "Positive done")

tokenized_slightly_positive, encoding_slightly_positive = _encode_bucket(
    slightly_positive, "Slightly positive shape {}", "Slightly positive done")

tokenized_slightly_negative, encoding_slightly_negative = _encode_bucket(
    slightly_negative, "Slightly negative shape {}", "Slightly negative done")

tokenized_negative, encoding_negative = _encode_bucket(
    negative, "Negative shape {}", "Negative done")

Encoding texts..
Positive shape (31549, 2304)
Positive done
Slightly positive shape (34806, 2304)
Slightly positive done
Slightly negative shape (28486, 2304)
Slightly negative done
Negative shape (37283, 2304)
Negative done


## Train, Validation and Test Sets Split

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Fractions of each class assigned to the train / validation / test splits.
train_size, valid_size, test_size = 0.8, 0.1, 0.1

# Seed passed to every train_test_split call so the split is reproducible.
random_state = 888

# Number of sentiment classes; also the width of each one-hot label row.
total_labels = 4

In [10]:
def train_valid_test_split(input_matrix, label, train_size, valid_size, test_size,
                           random_state, total_labels):
    """Split one class's feature matrix into train / validation / test parts.

    Row positions ``0 .. n-1`` are first divided into a training part and a
    hold-out part of relative size ``valid_size + test_size``; the hold-out
    part is then divided again into validation and test rows. Every row of a
    split receives the same one-hot label vector for ``label``.

    Args:
        input_matrix: array whose first axis indexes samples; it is indexed
            with lists of row positions.
        label: integer class id used for the one-hot label rows.
        train_size: accepted for symmetry with the other sizes but never read;
            the training share is whatever remains after the hold-out split.
        valid_size: fraction of rows intended for the validation split.
        test_size: fraction of rows intended for the test split.
        random_state: seed passed to both ``train_test_split`` calls.
        total_labels: number of classes (width of the one-hot rows).

    Returns:
        dict with the keys ``X_train``, ``Y_train``, ``X_valid``, ``Y_valid``,
        ``X_test``, ``Y_test`` (arrays) and ``train_idx``, ``valid_idx``,
        ``test_idx`` (the row positions selected for each split).
    """
    holdout_fraction = valid_size + test_size
    all_rows = list(range(input_matrix.shape[0]))

    # Stage 1: training rows versus everything that is held out.
    train_rows, holdout_rows = train_test_split(all_rows,
                                                test_size=holdout_fraction,
                                                random_state=random_state)
    # Stage 2: divide the held-out rows between validation and test.
    valid_rows, test_rows = train_test_split(holdout_rows,
                                             test_size=test_size / holdout_fraction,
                                             random_state=random_state)

    identity = np.eye(total_labels, dtype=int)

    def one_hot_rows(count):
        # `count` copies of the identity row that encodes `label`.
        return identity[np.repeat(label, count)]

    return {"X_train": input_matrix[train_rows],
            "Y_train": one_hot_rows(len(train_rows)),
            "X_valid": input_matrix[valid_rows],
            "Y_valid": one_hot_rows(len(valid_rows)),
            "X_test": input_matrix[test_rows],
            "Y_test": one_hot_rows(len(test_rows)),
            "train_idx": train_rows,
            "valid_idx": valid_rows,
            "test_idx": test_rows}


In [11]:
splitted_data_0 = train_valid_test_split(encoding_positive, 
                                         0, train_size, valid_size, test_size, 
                                         random_state, total_labels)
splitted_data_1 = train_valid_test_split(encoding_slightly_positive, 
                                         1, train_size, valid_size, test_size, 
                                         random_state, total_labels)
splitted_data_2 = train_valid_test_split(encoding_slightly_negative, 
                                         2, train_size, valid_size, test_size, 
                                         random_state, total_labels)
splitted_data_3 = train_valid_test_split(encoding_negative, 
                                         3, train_size, valid_size, test_size, 
                                         random_state, total_labels)

## Save training data set

In [12]:
# Stack the per-class training splits in class order (0, 1, 2, 3) so that the
# rows of the data, label and index arrays line up with one another.
class_splits = (splitted_data_0, splitted_data_1, splitted_data_2, splitted_data_3)

train_data = np.concatenate([split["X_train"] for split in class_splits], axis=0)
train_label = np.concatenate([split["Y_train"] for split in class_splits], axis=0)
# Each index row is [row position inside that class's encoding matrix, class id].
train_index = np.concatenate(
    [np.array([[row, class_id] for row in split["train_idx"]])
     for class_id, split in enumerate(class_splits)],
    axis=0)

np.save("data/train_data.npy", train_data)
np.save("data/train_label.npy", train_label)
np.save("data/train_index.npy", train_index)

## Save validation data set

In [13]:
# Stack the per-class validation splits in class order (0, 1, 2, 3) so that the
# rows of the data, label and index arrays line up with one another.
class_splits = (splitted_data_0, splitted_data_1, splitted_data_2, splitted_data_3)

valid_data = np.concatenate([split["X_valid"] for split in class_splits], axis=0)
valid_label = np.concatenate([split["Y_valid"] for split in class_splits], axis=0)
# Each index row is [row position inside that class's encoding matrix, class id].
valid_index = np.concatenate(
    [np.array([[row, class_id] for row in split["valid_idx"]])
     for class_id, split in enumerate(class_splits)],
    axis=0)

np.save("data/valid_data.npy", valid_data)
np.save("data/valid_label.npy", valid_label)
np.save("data/valid_index.npy", valid_index)

## Save test data set

In [14]:
# Stack the per-class test splits in class order (0, 1, 2, 3) so that the rows
# of the data, label and index arrays line up with one another.
test_data = np.concatenate((splitted_data_0.get("X_test"),
                            splitted_data_1.get("X_test"),
                            splitted_data_2.get("X_test"),
                            splitted_data_3.get("X_test")), axis=0)
test_label = np.concatenate((splitted_data_0.get("Y_test"),
                             splitted_data_1.get("Y_test"),
                             splitted_data_2.get("Y_test"),
                             splitted_data_3.get("Y_test")), axis=0)
# Each index row is [row position inside that class's encoding matrix, class id].
# The positions must come from "test_idx": reading "train_idx" here would save
# the training rows' positions and give an array whose length does not match
# test_data / test_label.
test_index = np.concatenate((np.array(list(map(lambda x: [x,0], splitted_data_0.get("test_idx")))),
                             np.array(list(map(lambda x: [x,1], splitted_data_1.get("test_idx")))),
                             np.array(list(map(lambda x: [x,2], splitted_data_2.get("test_idx")))),
                             np.array(list(map(lambda x: [x,3], splitted_data_3.get("test_idx"))))), axis=0)
np.save("data/test_data.npy", test_data)
np.save("data/test_label.npy", test_label)
np.save("data/test_index.npy", test_index)