## Data Loading

In [1]:
import pandas as pd
import numpy as np
import nltk

In [2]:
pd.set_option('max_colwidth', 500)

In [3]:
df = pd.read_csv('../data/polarity.csv', sep=";")
df.polarity.unique()

array([ 1, -1,  0])

In [4]:
df.count()

text        12627
polarity    12627
dtype: int64

In [5]:
df.groupby(['polarity']).count()

Unnamed: 0_level_0,text
polarity,Unnamed: 1_level_1
-1,5485
0,4645
1,2497


## Text Cleaning

### Text preprocessing utils

In [6]:
import sys
sys.path.append('..')

from utils import text_preprocessing

In [7]:
df['text'] = df['text'].map(text_preprocessing.normalize)

In [8]:
df.head()

Unnamed: 0,text,polarity
0,abogado michelle bachelet otorgó asesoría jurídica mujer realizó denuncia,1
1,faltan dias acabe pesadilla llamada michelle bachelet,-1
2,michelle bachelet trotando forma michelle bachelet tratando aprobar reformas chavo,0
3,hace pocos días amigos masificaron imagen supuesto vino michelle bachelet,-1
4,alcalde pozo almonte josé fernando muñoz junto presidenta michelle bachelet intendenta tarapacá,0


### Remove texts with only one word

In [9]:
df = df[df.apply(lambda r: len(r.text.split()) > 1, axis=1)]

In [10]:
df.count()

text        12443
polarity    12443
dtype: int64

## Dataset split

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
train, test = train_test_split(df, test_size=0.1, random_state=42)

In [13]:
train.count()

text        11198
polarity    11198
dtype: int64

In [14]:
test.count()

text        1245
polarity    1245
dtype: int64

## Scikit Learn Models

In [15]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline

In [16]:
X_train = train.text.values
y_train = train.polarity.values
X_test = test.text.values
y_test = test.polarity.values

### Naive Bayes

In [17]:
from sklearn.naive_bayes import MultinomialNB

In [18]:
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
])
nb.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [19]:
y_pred = nb.predict(X_test)

In [20]:
print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.6


In [21]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.59      0.87      0.70       560
           0       0.60      0.48      0.53       461
           1       0.80      0.17      0.29       224

    accuracy                           0.60      1245
   macro avg       0.66      0.51      0.51      1245
weighted avg       0.63      0.60      0.56      1245



### Support Vector Machine

In [22]:
from sklearn.linear_model import SGDClassifier

In [23]:
sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                ('clf',
                 SGDClassifier(alpha=0.001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                      

In [24]:
y_pred = sgd.predict(X_test)

In [25]:
print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.5943775100401606


In [26]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.58      0.90      0.70       560
           0       0.66      0.35      0.46       461
           1       0.57      0.33      0.42       224

    accuracy                           0.59      1245
   macro avg       0.60      0.53      0.53      1245
weighted avg       0.61      0.59      0.56      1245



### Regresión Logística

In [27]:
from sklearn.linear_model import LogisticRegression

In [28]:
logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
logreg.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=100000.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=

In [29]:
y_pred = logreg.predict(X_test)

In [30]:
print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.58714859437751


In [31]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.66      0.68      0.67       560
           0       0.53      0.52      0.52       461
           1       0.50      0.51      0.51       224

    accuracy                           0.59      1245
   macro avg       0.57      0.57      0.57      1245
weighted avg       0.59      0.59      0.59      1245



### Naive Bayes - solo positivos y negativos

In [32]:
no_neutral_df = df[~(df.polarity == 0)].copy()

In [33]:
no_neutral_df.groupby(['polarity']).count()

Unnamed: 0_level_0,text
polarity,Unnamed: 1_level_1
-1,5434
1,2449


In [34]:
train_no_neutral, test_no_neutral = train_test_split(no_neutral_df, test_size=0.2, random_state=42)
X_train_no_neutral = train_no_neutral.text.values
X_test_no_neutral = test_no_neutral.text.values
y_train_no_neutral = train_no_neutral.polarity.values
y_test_no_neutral = test_no_neutral.polarity.values

In [35]:
nb_no_neutral = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
])
nb_no_neutral.fit(X_train_no_neutral, y_train_no_neutral)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [36]:
y_pred_no_neutral = nb_no_neutral.predict(X_test_no_neutral)

In [37]:
print('accuracy %s' % accuracy_score(y_pred_no_neutral, y_test_no_neutral))

accuracy 0.7805960684844642


In [38]:
print(classification_report(y_test_no_neutral, y_pred_no_neutral))

              precision    recall  f1-score   support

          -1       0.76      0.99      0.86      1096
           1       0.94      0.30      0.46       481

    accuracy                           0.78      1577
   macro avg       0.85      0.65      0.66      1577
weighted avg       0.82      0.78      0.74      1577



### Naive Bayes - Ngrams

In [39]:
nb = Pipeline([('vect', CountVectorizer(ngram_range=(1, 3))),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
])
nb.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 3), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [40]:
y_pred = nb.predict(X_test)

In [41]:
print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.6040160642570281


## Keras Deep Learning Models

In [42]:
BATCH_SIZE = 32
EPOCHS = 5

### Integer labels to categorical

In [43]:
from keras.utils import to_categorical

Using TensorFlow backend.


In [44]:
y_train = pd.get_dummies(train.polarity).values
y_test = pd.get_dummies(test.polarity).values

### Encoding

In [45]:
from keras.preprocessing.text import Tokenizer

In [46]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [47]:
X_train_sequences = tokenizer.texts_to_sequences(X_train)

In [48]:
X_train[2]

'grande pije queridovamos temuco mierrrr'

In [49]:
X_train_sequences[2]

[158, 10797, 10798, 2197, 10799]

In [50]:
X_test_sequences = tokenizer.texts_to_sequences(X_test)

In [51]:
for word in ['bachelet', 'final', 'orrego']:
    print('{}: {}'.format(word, tokenizer.word_index[word]))

bachelet: 4
final: 253
orrego: 8188


In [52]:
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

Dado que las secuencias generadas con texts_to_sequences no poseen un largo uniforme, se utiliza pad_sequence para remediar dicho resultado mediante la adición de ceros a las secuencias hasta homogeneizar el largo de estas últimas.

In [53]:
max_sequence_length = max(len(t) for t in X_train_sequences)
max_sequence_length

578

In [54]:
from keras.preprocessing.sequence import pad_sequences

In [55]:
X_train_padded_sequences = pad_sequences(X_train_sequences, padding='post', maxlen=max_sequence_length)
X_test_padded_sequences = pad_sequences(X_test_sequences, padding='post', maxlen=max_sequence_length)

In [56]:
X_train_padded_sequences.shape

(11198, 578)

In [57]:
X_train_padded_sequences[1,:]

array([  561, 10795,  4145,   349,  1093, 10796,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

### Embeding Layer

+ [Artículo relevante](https://www.kaggle.com/rajmehra03/a-detailed-explanation-of-keras-embedding-layer)
+ [Documentación embedding layer](https://keras.io/layers/embeddings/)

In [58]:
from keras.models import Sequential
from keras import layers

In [59]:
embedding_dim = 50

In [60]:
model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=max_sequence_length))
model.add(layers.Flatten())
model.add(layers.Dense(100, activation='relu'))
model.add(layers.Dense(3, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 578, 50)           1405300   
_________________________________________________________________
flatten_1 (Flatten)          (None, 28900)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               2890100   
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 303       
Total params: 4,295,703
Trainable params: 4,295,703
Non-trainable params: 0
_________________________________________________________________


In [61]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=EPOCHS,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=BATCH_SIZE)

Instructions for updating:
Use tf.cast instead.
Train on 11198 samples, validate on 1245 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [62]:
loss, accuracy = model.evaluate(X_train_padded_sequences, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_padded_sequences, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.9931
Testing Accuracy:  0.5960


## Embeddings Preentrenados

### Word2Vec

In [63]:
from gensim.models.keyedvectors import KeyedVectors

Características del embedding: 
+ #dimensions = 300
+ #vectors = 1000653

In [64]:
wordvectors_file_vec = '../embeddings/SBW-vectors-300-min5.txt'
wordvectors = KeyedVectors.load_word2vec_format(wordvectors_file_vec)

### Palabra dentro del vocabulario

In [65]:
wordvectors['de']

array([-2.96480e-02,  1.13360e-02,  1.99490e-02, -8.88320e-02,
       -2.52250e-02,  5.68440e-02,  2.54730e-02,  1.40680e-02,
        1.63694e-01, -6.71540e-02,  1.47380e-02,  2.71340e-02,
        6.64430e-02, -4.48460e-02, -4.49870e-02, -4.08980e-02,
        3.03110e-02,  3.41960e-02, -4.92400e-02,  8.53700e-03,
       -6.80910e-02, -8.79380e-02,  3.53000e-02,  1.49385e-01,
       -1.23500e-02,  1.26130e-02,  2.93500e-02,  6.95960e-02,
        3.91110e-02,  5.76520e-02,  6.99540e-02, -6.62170e-02,
       -4.17840e-02,  2.86230e-02,  2.67720e-02, -6.63920e-02,
        2.95300e-03, -1.21880e-02, -3.03630e-02,  4.02220e-02,
        3.48580e-02,  2.74690e-02, -2.90340e-02, -4.87480e-02,
       -3.85820e-02, -5.15530e-02, -3.35010e-02, -1.90080e-02,
        3.04300e-03,  1.10712e-01, -2.50960e-02,  1.11082e-01,
        3.52440e-02,  1.14207e-01,  1.01950e-02,  5.15110e-02,
       -4.06490e-02, -1.13944e-01,  4.48730e-02,  5.20110e-02,
        6.73600e-02,  4.90540e-02, -1.27085e-01, -3.184

### Embedding matrix creation

In [66]:
EMBEDDING_DIM = 300
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
embedding_matrix.shape

(28106, 300)

Obtención de los vectores para el vocabulario del corpus de entrenamiento, desde el modelo word2vect preentrenado. Si no se encuentra el vector para alguna palabra (Out of Vocabulary Word), se genera uno aleatorio. 

In [67]:
for word, i in tokenizer.word_index.items():
    if i >= vocab_size:
        continue
    try:
        embedding_vector = wordvectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), EMBEDDING_DIM)

In [68]:
embedding_matrix.shape

(28106, 300)

In [69]:
del(wordvectors)

### FFN + word2vector

In [70]:
model = Sequential()
model.add(layers.Embedding(
                    input_dim=vocab_size, 
                    output_dim=EMBEDDING_DIM, 
                    input_length=max_sequence_length,
                    weights=[embedding_matrix],
                    trainable=True
                )
)
model.add(layers.Flatten())
model.add(layers.Dense(100, activation='relu'))
model.add(layers.Dense(3, activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 578, 300)          8431800   
_________________________________________________________________
flatten_2 (Flatten)          (None, 173400)            0         
_________________________________________________________________
dense_3 (Dense)              (None, 100)               17340100  
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 303       
Total params: 25,772,203
Trainable params: 25,772,203
Non-trainable params: 0
_________________________________________________________________


In [71]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=EPOCHS,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=BATCH_SIZE)

Train on 11198 samples, validate on 1245 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [72]:
loss, accuracy = model.evaluate(X_train_padded_sequences, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_padded_sequences, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.9935
Testing Accuracy:  0.6080


### CNN - Yoon Kim Model + word2vec

[model reference](https://arxiv.org/abs/1408.5882)

In [73]:
model = Sequential([
    layers.Embedding(
        input_dim=vocab_size, 
        output_dim=EMBEDDING_DIM, 
        input_length=max_sequence_length, 
        weights=[embedding_matrix], 
        trainable=True
    ),
    layers.Conv1D(filters=128, kernel_size=3, activation='relu'),
    layers.MaxPooling1D(pool_size=3),
    layers.Conv1D(filters=128, kernel_size=4, activation='relu'),
    layers.MaxPooling1D(pool_size=3),
    layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 578, 300)          8431800   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 576, 128)          115328    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 192, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 189, 128)          65664     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 63, 128)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 59, 128)           82048     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
__________

In [74]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=EPOCHS,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=BATCH_SIZE)

Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 11198 samples, validate on 1245 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [75]:
loss, accuracy = model.evaluate(X_train_padded_sequences, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_padded_sequences, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.9910
Testing Accuracy:  0.5904


### CNN Yoon Kim model (padding = 'same') + word2vec

In [77]:
model = Sequential([
    layers.Embedding(
        input_dim=vocab_size, 
        output_dim=EMBEDDING_DIM, 
        input_length=max_sequence_length, 
        weights=[embedding_matrix], 
        trainable=True
    ),
    layers.Conv1D(filters=128, kernel_size=3, activation='relu', padding='same'),
    layers.MaxPooling1D(pool_size=3),
    layers.Conv1D(filters=128, kernel_size=4, activation='relu', padding='same'),
    layers.MaxPooling1D(pool_size=3),
    layers.Conv1D(filters=128, kernel_size=5, activation='relu', padding='same'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 578, 300)          8165700   
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 578, 128)          115328    
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 192, 128)          0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 192, 128)          65664     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 64, 128)           0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 64, 128)           82048     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
__________

In [80]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=EPOCHS,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=BATCH_SIZE)

Train on 11121 samples, validate on 1236 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
  736/11121 [>.............................] - ETA: 1:47 - loss: 0.0123 - acc: 0.9905

KeyboardInterrupt: 

### CNN Yoon Kim model + Glove

In [76]:
glove_vectors = KeyedVectors.load_word2vec_format('../embeddings/glove-sbwc.i25.vec')

In [77]:
glove_embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in tokenizer.word_index.items():
    if i >= vocab_size:
        continue
    try:
        embedding_vector = glove_vectors[word]
        glove_embedding_matrix[i] = embedding_vector
    except KeyError:
        glove_embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), EMBEDDING_DIM)

In [78]:
del(glove_vectors)

In [79]:
model = Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, input_length=max_sequence_length, weights=[glove_embedding_matrix], trainable=True),
    layers.Conv1D(filters=128, kernel_size=3, activation='relu'),
    layers.MaxPooling1D(pool_size=3),
    layers.Conv1D(filters=128, kernel_size=4, activation='relu'),
    layers.MaxPooling1D(pool_size=3),
    layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 578, 300)          8431800   
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 576, 128)          115328    
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 192, 128)          0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 189, 128)          65664     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 63, 128)           0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 59, 128)           82048     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
__________

In [80]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=EPOCHS,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=BATCH_SIZE)

Train on 11198 samples, validate on 1245 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [81]:
loss, accuracy = model.evaluate(X_train_padded_sequences, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_padded_sequences, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.9914
Testing Accuracy:  0.6112


### CNN (padding 'same') + Glove

In [105]:
model = Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, input_length=max_sequence_length, weights=[glove_embedding_matrix], trainable=True),
    layers.Conv1D(filters=128, kernel_size=3, activation='relu', padding='same'),
    layers.MaxPooling1D(pool_size=3),
    layers.Conv1D(filters=128, kernel_size=4, activation='relu', padding='same'),
    layers.MaxPooling1D(pool_size=3),
    layers.Conv1D(filters=128, kernel_size=5, activation='relu', padding='same'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 578, 300)          8346900   
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 578, 128)          115328    
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 192, 128)          0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 192, 128)          65664     
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 64, 128)           0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 64, 128)           82048     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 128)               0         
__________

In [106]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=EPOCHS,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=BATCH_SIZE)

Train on 11192 samples, validate on 1244 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### LSTM

In [109]:
model = Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, input_length=max_sequence_length),
    layers.LSTM(100),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 578, 300)          8346900   
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense_19 (Dense)             (None, 3)                 303       
Total params: 8,507,603
Trainable params: 8,507,603
Non-trainable params: 0
_________________________________________________________________


In [110]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=EPOCHS,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=BATCH_SIZE)

Train on 11192 samples, validate on 1244 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### LSTM + Glove

In [95]:
model = Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, input_length=max_sequence_length, weights=[glove_embedding_matrix], trainable=True),
    layers.LSTM(100),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 578, 300)          7717800   
_________________________________________________________________
lstm_5 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense_14 (Dense)             (None, 3)                 303       
Total params: 7,878,503
Trainable params: 7,878,503
Non-trainable params: 0
_________________________________________________________________


In [96]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=5,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=64)

Train on 9948 samples, validate on 2488 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [97]:
loss, accuracy = model.evaluate(X_train_padded_sequences, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_padded_sequences, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.4344
Testing Accuracy:  0.4469


### BiLSTM

In [82]:
model = Sequential([
    layers.Embedding(
        input_dim=vocab_size, 
        output_dim=EMBEDDING_DIM, 
        input_length=max_sequence_length,
    ),
    layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
    layers.GlobalMaxPool1D(),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 578, 300)          8431800   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 578, 128)          186880    
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 128)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 3)                 387       
Total params: 8,619,067
Trainable params: 8,619,067
Non-trainable params: 0
_________________________________________________________________


In [83]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=EPOCHS,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=BATCH_SIZE)

Train on 11198 samples, validate on 1245 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### BiLSTM + word2vec

In [84]:
model = Sequential([
    layers.Embedding(
        input_dim=vocab_size, 
        output_dim=EMBEDDING_DIM, 
        input_length=max_sequence_length,
        weights=[embedding_matrix], 
        trainable=True
    ),
    layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
    layers.GlobalMaxPool1D(),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 578, 300)          8431800   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 578, 128)          186880    
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 128)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 3)                 387       
Total params: 8,619,067
Trainable params: 8,619,067
Non-trainable params: 0
_________________________________________________________________


In [85]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=EPOCHS,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=BATCH_SIZE)

Train on 11198 samples, validate on 1245 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### BiLSTM + Glove

In [86]:
model = Sequential([
    layers.Embedding(
        input_dim=vocab_size, 
        output_dim=EMBEDDING_DIM, 
        input_length=max_sequence_length,
        weights=[glove_embedding_matrix], 
        trainable=True
    ),
    layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
    layers.GlobalMaxPool1D(),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 578, 300)          8431800   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 578, 128)          186880    
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 128)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 3)                 387       
Total params: 8,619,067
Trainable params: 8,619,067
Non-trainable params: 0
_________________________________________________________________


In [87]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=EPOCHS,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=BATCH_SIZE)

Train on 11198 samples, validate on 1245 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### GRU

In [93]:
model = Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, input_length=max_sequence_length),
    layers.GRU(100),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 578, 300)          7717800   
_________________________________________________________________
gru_1 (GRU)                  (None, 100)               120300    
_________________________________________________________________
dense_13 (Dense)             (None, 3)                 303       
Total params: 7,838,403
Trainable params: 7,838,403
Non-trainable params: 0
_________________________________________________________________


In [94]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=5,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=64)

Train on 9948 samples, validate on 2488 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### GRU + Glove

In [105]:
model = Sequential([
    layers.Embedding(
        input_dim=vocab_size, 
        output_dim=EMBEDDING_DIM, 
        input_length=max_sequence_length,
        weights=[glove_embedding_matrix], 
        trainable=True
    ),
    layers.SpatialDropout1D(0.2),
    layers.GRU(64),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 578, 300)          7717800   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 578, 300)          0         
_________________________________________________________________
gru_3 (GRU)                  (None, 100)               120300    
_________________________________________________________________
dense_18 (Dense)             (None, 3)                 303       
Total params: 7,838,403
Trainable params: 7,838,403
Non-trainable params: 0
_________________________________________________________________


In [107]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=5,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=32)

Train on 9948 samples, validate on 2488 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### BiGRU + word2vec

In [90]:
model = Sequential([
    layers.Embedding(
        input_dim=vocab_size, 
        output_dim=EMBEDDING_DIM, 
        input_length=max_sequence_length,
        weights=[embedding_matrix], 
        trainable=True
    ),
    layers.Bidirectional(layers.GRU(64, return_sequences=True)),
    layers.GlobalMaxPool1D(),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 578, 300)          8431800   
_________________________________________________________________
bidirectional_5 (Bidirection (None, 578, 128)          140160    
_________________________________________________________________
global_max_pooling1d_7 (Glob (None, 128)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 3)                 387       
Total params: 8,572,347
Trainable params: 8,572,347
Non-trainable params: 0
_________________________________________________________________


In [91]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=EPOCHS,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=BATCH_SIZE)

Train on 11198 samples, validate on 1245 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### BiGRU + Glove

In [88]:
model = Sequential([
    layers.Embedding(
        input_dim=vocab_size, 
        output_dim=EMBEDDING_DIM, 
        input_length=max_sequence_length,
        weights=[glove_embedding_matrix], 
        trainable=True
    ),
    layers.Bidirectional(layers.GRU(64, return_sequences=True)),
    layers.GlobalMaxPool1D(),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 578, 300)          8431800   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 578, 128)          140160    
_________________________________________________________________
global_max_pooling1d_6 (Glob (None, 128)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 3)                 387       
Total params: 8,572,347
Trainable params: 8,572,347
Non-trainable params: 0
_________________________________________________________________


In [89]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=EPOCHS,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=BATCH_SIZE)

Train on 11198 samples, validate on 1245 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### CNN +  LSTM + Glove

In [118]:
model = Sequential([
    layers.Embedding(
        input_dim=vocab_size, 
        output_dim=EMBEDDING_DIM, 
        input_length=max_sequence_length,
        weights=[glove_embedding_matrix], 
        trainable=True
    ),
    layers.Conv1D(filters=64, kernel_size=5, activation='relu'),
    layers.MaxPooling1D(pool_size=4),
    layers.LSTM(64),
    layers.Dense(64),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_21 (Embedding)     (None, 578, 300)          7717800   
_________________________________________________________________
conv1d_17 (Conv1D)           (None, 574, 64)           96064     
_________________________________________________________________
max_pooling1d_14 (MaxPooling (None, 143, 64)           0         
_________________________________________________________________
lstm_10 (LSTM)               (None, 50)                23000     
_________________________________________________________________
dense_27 (Dense)             (None, 64)                3264      
_________________________________________________________________
dense_28 (Dense)             (None, 3)                 195       
Total params: 7,840,323
Trainable params: 7,840,323
Non-trainable params: 0
_________________________________________________________________


In [119]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=EPOCHS,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=BATCH_SIZE)

Train on 9948 samples, validate on 2488 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### BiGRU + CNN + word2vec

In [94]:
model = Sequential([
    layers.Embedding(
        input_dim=vocab_size, 
        output_dim=EMBEDDING_DIM, 
        input_length=max_sequence_length,
        weights=[embedding_matrix], 
        trainable=True
    ),
    layers.Bidirectional(layers.GRU(64, return_sequences=True)),
    layers.Conv1D(filters=64, kernel_size=5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(64, activation='relu'),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 578, 300)          8431800   
_________________________________________________________________
bidirectional_7 (Bidirection (None, 578, 128)          140160    
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 574, 64)           41024     
_________________________________________________________________
global_max_pooling1d_9 (Glob (None, 64)                0         
_________________________________________________________________
dense_16 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_17 (Dense)             (None, 3)                 195       
Total params: 8,617,339
Trainable params: 8,617,339
Non-trainable params: 0
_________________________________________________________________


In [95]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=EPOCHS,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=BATCH_SIZE)

Train on 11198 samples, validate on 1245 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### BiGRU + CNN + Glove

In [92]:
model = Sequential([
    layers.Embedding(
        input_dim=vocab_size, 
        output_dim=EMBEDDING_DIM, 
        input_length=max_sequence_length,
        weights=[glove_embedding_matrix], 
        trainable=True
    ),
    layers.Bidirectional(layers.GRU(64, return_sequences=True)),
    layers.Conv1D(filters=64, kernel_size=5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(64, activation='relu'),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 578, 300)          8431800   
_________________________________________________________________
bidirectional_6 (Bidirection (None, 578, 128)          140160    
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 574, 64)           41024     
_________________________________________________________________
global_max_pooling1d_8 (Glob (None, 64)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_15 (Dense)             (None, 3)                 195       
Total params: 8,617,339
Trainable params: 8,617,339
Non-trainable params: 0
_________________________________________________________________


In [93]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=EPOCHS,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=BATCH_SIZE)

Train on 11198 samples, validate on 1245 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Very deep convolutional neural network (VDCNN)

[Paper](https://arxiv.org/abs/1606.01781)