## Data Loading

In [1]:
import pandas as pd
import numpy as np
import nltk

pd.set_option('max_colwidth', 500)

In [46]:
df = pd.read_csv('../data/emotion.csv', sep=";")

### Emotion Mapping

+ Happiness: 0
+ Sadness: 1
+ Fear: 2
+ Anger: 3
+ Surprise: 4
+ Disgust: 5
+ Undefined: 10

### Data description

In [47]:
df.count()

text       11342
emotion    11342
dtype: int64

In [48]:
df.groupby(['emotion']).count()

Unnamed: 0_level_0,text
emotion,Unnamed: 1_level_1
0,2557
1,356
2,147
3,1338
4,3253
5,3420
10,271


In [50]:
df[df.text.str.contains('bachelet', case=False)].count()

text       957
emotion    957
dtype: int64

## Text Cleaning

### Text preprocessing utils

In [5]:
import sys
sys.path.append('..')

from utils import text_preprocessing

In [6]:
df['text'] = df['text'].map(text_preprocessing.normalize)
df.head()

Unnamed: 0,text,emotion
0,abogado michelle bachelet otorgó asesoría jurídica mujer realizó denuncia,4
1,faltan dias acabe pesadilla llamada michelle bachelet,5
2,alcalde pozo almonte josé fernando muñoz junto presidenta michelle bachelet intendenta tarapacá,4
3,presidenta michelle bachelet promulga ley equidad tarifaria servicios eléctricos,0
4,serio nombre completo michelle bachelet queen,4


### Remove texts with only one word

In [7]:
df = df[df.apply(lambda r: len(r.text.split()) > 1, axis=1)]
df.count()

text       11156
emotion    11156
dtype: int64

## Dataset Split

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
train, test = train_test_split(df, test_size=0.1, random_state=42)

In [10]:
train.count()

text       10040
emotion    10040
dtype: int64

In [11]:
test.count()

text       1116
emotion    1116
dtype: int64

## Scikit Learn Models

In [12]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline

In [13]:
X_train = train.text.values
y_train = train.emotion.values
X_test = test.text.values
y_test = test.emotion.values

### Class weights balancing

In [14]:
from sklearn.utils import class_weight

In [15]:
classes = np.unique(y_train)
classes

array([ 0,  1,  2,  3,  4,  5, 10])

In [16]:
class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
class_weights

array([ 0.6366115 ,  4.61185117, 11.11849391,  1.20124432,  0.49853518,
        0.47133937,  6.15573268])

### Naive Bayes

In [17]:
from sklearn.naive_bayes import MultinomialNB

In [18]:
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
])
nb.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [19]:
y_pred = nb.predict(X_test)

In [20]:
print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.532258064516129


In [21]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.48      0.57       244
           1       0.00      0.00      0.00        35
           2       0.00      0.00      0.00        17
           3       0.00      0.00      0.00       131
           4       0.55      0.65      0.60       321
           5       0.47      0.78      0.59       344
          10       0.00      0.00      0.00        24

    accuracy                           0.53      1116
   macro avg       0.25      0.27      0.25      1116
weighted avg       0.46      0.53      0.48      1116



  'precision', 'predicted', average, warn_for)


### Support Vector Machine

In [22]:
from sklearn.svm import SVC

In [23]:
svm_rbf = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SVC(class_weight='balanced')),
               ])
svm_rbf.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
                     decision_function_shape='ovr', degree=3,
           

In [24]:
y_pred = nb.predict(X_test)

In [25]:
print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.532258064516129


In [26]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.48      0.57       244
           1       0.00      0.00      0.00        35
           2       0.00      0.00      0.00        17
           3       0.00      0.00      0.00       131
           4       0.55      0.65      0.60       321
           5       0.47      0.78      0.59       344
          10       0.00      0.00      0.00        24

    accuracy                           0.53      1116
   macro avg       0.25      0.27      0.25      1116
weighted avg       0.46      0.53      0.48      1116



## Deep Learning (Keras)

In [60]:
BATCH_SIZE = 32
EPOCHS = 10
CHECKPOINTS_PATH = '../model_checkpoints/'

### Integer labels to categorical

In [28]:
from keras.utils import to_categorical

Using TensorFlow backend.


In [29]:
y_train = pd.get_dummies(train.emotion).values
y_test = pd.get_dummies(test.emotion).values

### Encoding

In [30]:
from keras.preprocessing.text import Tokenizer

In [31]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

In [32]:
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index
max_sequence_length = max(len(t) for t in X_train_sequences)

In [33]:
vocab_size

25852

In [34]:
max_sequence_length

578

### Padding sequences

In [35]:
from keras.preprocessing.sequence import pad_sequences

In [36]:
X_train_padded_sequences = pad_sequences(X_train_sequences, padding='post', maxlen=max_sequence_length)
X_test_padded_sequences = pad_sequences(X_test_sequences, padding='post', maxlen=max_sequence_length)

### Loading Embeddings

In [37]:
from gensim.models.keyedvectors import KeyedVectors

In [38]:
glove_vectors = KeyedVectors.load_word2vec_format('../embeddings/glove-sbwc.i25.vec')

In [39]:
EMBEDDING_DIM = 300

In [40]:
glove_embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in tokenizer.word_index.items():
    if i >= vocab_size:
        continue
    try:
        embedding_vector = glove_vectors[word]
        glove_embedding_matrix[i] = embedding_vector
    except KeyError:
        glove_embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), EMBEDDING_DIM)

### CNN

In [57]:
from datetime import datetime
from keras.models import Sequential
from keras import layers
from keras.callbacks import ModelCheckpoint, EarlyStopping

In [44]:
model = Sequential([
    layers.Embedding(
        input_dim=vocab_size, 
        output_dim=EMBEDDING_DIM, 
        input_length=max_sequence_length, 
        weights=[glove_embedding_matrix], 
        trainable=True
    ),
    layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
    layers.MaxPooling1D(pool_size=5),
    layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
    layers.MaxPooling1D(pool_size=5),
    layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dense(7, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 578, 300)          7755600   
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 574, 128)          192128    
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 114, 128)          0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 110, 128)          82048     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 22, 128)           0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 18, 128)           82048     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
__________

In [56]:
model_name = 'cnn-yk'
filepath = CHECKPOINTS_PATH + model_name + '-{}.hdf5'.format(datetime.today().isoformat())
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
es = EarlyStopping(monitor='val_acc', baseline=0.5, patience=5)

history = model.fit(
    X_train_padded_sequences, y_train,
    epochs=10,
    verbose=True,
    validation_data=(X_test_padded_sequences, y_test),
    batch_size=BATCH_SIZE,
    callbacks=[checkpoint, es]
)

Train on 10040 samples, validate on 1116 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.50000, saving model to ../model_checkpoints/cnn-yk-best.hdf5
Epoch 2/10

Epoch 00002: val_acc improved from 0.50000 to 0.50448, saving model to ../model_checkpoints/cnn-yk-best.hdf5
Epoch 3/10

Epoch 00003: val_acc improved from 0.50448 to 0.50896, saving model to ../model_checkpoints/cnn-yk-best.hdf5
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.50896
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.50896
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.50896
Epoch 7/10

Epoch 00007: val_acc did not improve from 0.50896
Epoch 8/10

Epoch 00008: val_acc improved from 0.50896 to 0.50986, saving model to ../model_checkpoints/cnn-yk-best.hdf5
Epoch 9/10

Epoch 00009: val_acc did not improve from 0.50986
Epoch 10/10

Epoch 00010: val_acc did not improve from 0.50986


### BiGru + Glove

In [63]:
model = Sequential([
    layers.Embedding(
        input_dim=vocab_size, 
        output_dim=EMBEDDING_DIM, 
        input_length=max_sequence_length,
        weights=[glove_embedding_matrix], 
        trainable=True
    ),
    layers.Bidirectional(layers.GRU(64, return_sequences=True)),
    layers.GlobalMaxPool1D(),
    layers.Dense(7, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 578, 300)          7755600   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 578, 128)          140160    
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 128)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 7)                 903       
Total params: 7,896,663
Trainable params: 7,896,663
Non-trainable params: 0
_________________________________________________________________


In [65]:
model_name = 'bigru'
filepath = CHECKPOINTS_PATH + model_name + '-{}.hdf5'.format(datetime.today().isoformat())
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
es = EarlyStopping(monitor='val_acc', baseline=0.5, patience=5)

history = model.fit(
    X_train_padded_sequences, y_train,
    epochs=10,
    verbose=True,
    validation_data=(X_test_padded_sequences, y_test),
    batch_size=BATCH_SIZE,
    callbacks=[checkpoint, es]
)

Train on 10040 samples, validate on 1116 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.49821, saving model to ../model_checkpoints/bigru-2019-05-30T13:33:50.624456.hdf5
Epoch 2/10

Epoch 00002: val_acc improved from 0.49821 to 0.50179, saving model to ../model_checkpoints/bigru-2019-05-30T13:33:50.624456.hdf5
Epoch 3/10

Epoch 00003: val_acc improved from 0.50179 to 0.50538, saving model to ../model_checkpoints/bigru-2019-05-30T13:33:50.624456.hdf5
Epoch 4/10

Epoch 00004: val_acc improved from 0.50538 to 0.50538, saving model to ../model_checkpoints/bigru-2019-05-30T13:33:50.624456.hdf5
Epoch 5/10

Epoch 00005: val_acc improved from 0.50538 to 0.50896, saving model to ../model_checkpoints/bigru-2019-05-30T13:33:50.624456.hdf5
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.50896
Epoch 7/10

Epoch 00007: val_acc did not improve from 0.50896
Epoch 8/10

Epoch 00008: val_acc did not improve from 0.50896
Epoch 9/10

Epoch 00009: val_acc did not improve from 0.50

### BiGRU + CNN + Glove

In [69]:
model = Sequential([
    layers.Embedding(
        input_dim=vocab_size, 
        output_dim=EMBEDDING_DIM, 
        input_length=max_sequence_length,
        weights=[glove_embedding_matrix], 
        trainable=True
    ),
    layers.Bidirectional(layers.GRU(64, return_sequences=True)),
    layers.Conv1D(filters=64, kernel_size=5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(64, activation='relu'),
    layers.Dense(7, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 578, 300)          7755600   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 578, 128)          140160    
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 574, 64)           41024     
_________________________________________________________________
global_max_pooling1d_6 (Glob (None, 64)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_10 (Dense)             (None, 7)                 455       
Total params: 7,941,399
Trainable params: 7,941,399
Non-trainable params: 0
_________________________________________________________________


In [70]:
model_name = 'bigru-cnn'
filepath = CHECKPOINTS_PATH + model_name + '-{}.hdf5'.format(datetime.today().isoformat())
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
es = EarlyStopping(monitor='val_acc', baseline=0.5, patience=5)

history = model.fit(
    X_train_padded_sequences, y_train,
    epochs=10,
    verbose=True,
    validation_data=(X_test_padded_sequences, y_test),
    batch_size=BATCH_SIZE,
    callbacks=[checkpoint, es]
)

Train on 10040 samples, validate on 1116 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.54301, saving model to ../model_checkpoints/bigru-cnn-2019-05-30T15:26:29.718466.hdf5
Epoch 2/10

Epoch 00002: val_acc improved from 0.54301 to 0.55287, saving model to ../model_checkpoints/bigru-cnn-2019-05-30T15:26:29.718466.hdf5
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.55287
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.55287
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.55287
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.55287
Epoch 7/10

Epoch 00007: val_acc did not improve from 0.55287
