## Data Loading

In [1]:
import pandas as pd
import numpy as np
import nltk

In [2]:
pd.set_option('max_colwidth', 500)

In [3]:
df = pd.read_csv('../data/polarity.csv', sep=";")
df.polarity.unique()

array([ 1, -1,  0])

In [4]:
df.count()

text        12627
polarity    12627
dtype: int64

In [5]:
df.groupby(['polarity']).count()

Unnamed: 0_level_0,text
polarity,Unnamed: 1_level_1
-1,5485
0,4645
1,2497


## Text Cleaning

### Text preprocessing utils

In [6]:
import sys
sys.path.append('..')

from utils import text_preprocessing

In [7]:
df['text'] = df['text'].map(text_preprocessing.normalize)

In [8]:
df.head()

Unnamed: 0,text,polarity
0,abogado michelle bachelet otorgó asesoría jurídica mujer realizó denuncia,1
1,faltan dias acabe pesadilla llamada michelle bachelet,-1
2,michelle bachelet trotando forma michelle bachelet tratando aprobar reformas chavo,0
3,hace pocos días amigos masificaron imagen supuesto vino michelle bachelet,-1
4,alcalde pozo almonte josé fernando muñoz junto presidenta michelle bachelet intendenta tarapacá,0


### Remove texts with only one word

In [9]:
df = df[df.apply(lambda r: len(r.text.split()) > 1, axis=1)]

In [10]:
df.count()

text        12436
polarity    12436
dtype: int64

## Dataset split

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
train, test = train_test_split(df, test_size=0.2, random_state=42)

## Scikit Learn Models

In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline

In [15]:
X_train = train.text.values
y_train = train.polarity.values
X_test = test.text.values
y_test = test.polarity.values

### Naive Bayes

In [16]:
from sklearn.naive_bayes import MultinomialNB

In [17]:
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
])
nb.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [18]:
y_pred = nb.predict(X_test)

In [19]:
print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.610128617363344


In [20]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.60      0.87      0.71      1112
           0       0.59      0.53      0.56       881
           1       0.85      0.17      0.29       495

    accuracy                           0.61      2488
   macro avg       0.68      0.52      0.52      2488
weighted avg       0.65      0.61      0.57      2488



### Support Vector Machine

In [21]:
from sklearn.linear_model import SGDClassifier

In [22]:
sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                ('clf',
                 SGDClassifier(alpha=0.001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                      

In [23]:
y_pred = sgd.predict(X_test)

In [24]:
print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.6089228295819936


In [25]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.59      0.89      0.71      1112
           0       0.63      0.41      0.50       881
           1       0.67      0.34      0.45       495

    accuracy                           0.61      2488
   macro avg       0.63      0.55      0.55      2488
weighted avg       0.62      0.61      0.58      2488



### Regresión Logística

In [26]:
from sklearn.linear_model import LogisticRegression

In [27]:
logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
logreg.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=100000.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=

In [28]:
y_pred = logreg.predict(X_test)

In [29]:
print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.5727491961414791


In [30]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.66      0.66      0.66      1112
           0       0.50      0.54      0.52       881
           1       0.51      0.43      0.47       495

    accuracy                           0.57      2488
   macro avg       0.56      0.54      0.55      2488
weighted avg       0.57      0.57      0.57      2488



### Naive Bayes - solo positivos y negativos

In [31]:
no_neutral_df = df[~(df.polarity == 0)].copy()

In [32]:
no_neutral_df.groupby(['polarity']).count()

Unnamed: 0_level_0,text
polarity,Unnamed: 1_level_1
-1,5433
1,2446


In [33]:
train_no_neutral, test_no_neutral = train_test_split(no_neutral_df, test_size=0.2, random_state=42)
X_train_no_neutral = train_no_neutral.text.values
X_test_no_neutral = test_no_neutral.text.values
y_train_no_neutral = train_no_neutral.polarity.values
y_test_no_neutral = test_no_neutral.polarity.values

In [34]:
nb_no_neutral = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
])
nb_no_neutral.fit(X_train_no_neutral, y_train_no_neutral)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [35]:
y_pred_no_neutral = nb_no_neutral.predict(X_test_no_neutral)

In [36]:
print('accuracy %s' % accuracy_score(y_pred_no_neutral, y_test_no_neutral))

accuracy 0.7848984771573604


In [37]:
print(classification_report(y_test_no_neutral, y_pred_no_neutral))

              precision    recall  f1-score   support

          -1       0.77      0.99      0.87      1106
           1       0.93      0.30      0.45       470

    accuracy                           0.78      1576
   macro avg       0.85      0.65      0.66      1576
weighted avg       0.82      0.78      0.74      1576



### Naive Bayes - Ngrams

In [38]:
nb = Pipeline([('vect', CountVectorizer(ngram_range=(1, 3))),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
])
nb.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 3), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [39]:
y_pred = nb.predict(X_test)

In [40]:
print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.5928456591639871


### Random Forest

In [41]:
from sklearn.ensemble import RandomForestClassifier

In [42]:
rf = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', RandomForestClassifier(n_estimators=100)),
])
rf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0

In [43]:
y_pred = nb.predict(X_test)

In [44]:
print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.5928456591639871


## Keras Deep Learning Models

### Integer labels to categorical

In [45]:
from keras.utils import to_categorical

Using TensorFlow backend.


In [46]:
y_train = pd.get_dummies(train.polarity).values
y_test = pd.get_dummies(test.polarity).values

### Encoding

In [47]:
from keras.preprocessing.text import Tokenizer

In [48]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [49]:
X_train_sequences = tokenizer.texts_to_sequences(X_train)

In [50]:
X_train[2]

'final nefasto gobierno bachelet vamos terminar comiéndonos mocos orrego'

In [51]:
X_train_sequences[2]

[262, 826, 15, 3, 107, 473, 9910, 6360, 6361]

In [52]:
X_test_sequences = tokenizer.texts_to_sequences(X_test)

In [53]:
for word in ['bachelet', 'final', 'orrego']:
    print('{}: {}'.format(word, tokenizer.word_index[word]))

bachelet: 3
final: 262
orrego: 6361


In [54]:
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

Dado que las secuencias generadas con texts_to_sequences no poseen un largo uniforme, se utiliza pad_sequence para remediar dicho resultado mediante la adición de ceros a las secuencias hasta homogeneizar el largo de estas últimas.

In [55]:
max_sequence_length = max(len(t) for t in X_train_sequences)
max_sequence_length

578

In [56]:
from keras.preprocessing.sequence import pad_sequences

In [57]:
X_train_padded_sequences = pad_sequences(X_train_sequences, padding='post', maxlen=max_sequence_length)
X_test_padded_sequences = pad_sequences(X_test_sequences, padding='post', maxlen=max_sequence_length)

In [58]:
X_train_padded_sequences.shape

(9948, 578)

In [59]:
X_train_padded_sequences[1,:]

array([   3,  530,   57,  128,   50,    1,   77, 9909,   23,  222,  195,
         94,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

### Embeding Layer

+ [Artículo relevante](https://www.kaggle.com/rajmehra03/a-detailed-explanation-of-keras-embedding-layer)
+ [Documentación embedding layer](https://keras.io/layers/embeddings/)

In [60]:
from keras.models import Sequential
from keras import layers

In [61]:
embedding_dim = 50

In [62]:
model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=max_sequence_length))
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(3, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 578, 50)           1286300   
_________________________________________________________________
flatten_1 (Flatten)          (None, 28900)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                289010    
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 33        
Total params: 1,575,343
Trainable params: 1,575,343
Non-trainable params: 0
_________________________________________________________________


In [63]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=10,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=10)

Instructions for updating:
Use tf.cast instead.
Train on 9948 samples, validate on 2488 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [124]:
loss, accuracy = model.evaluate(X_train_padded_sequences, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_padded_sequences, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.9953
Testing Accuracy:  0.5856


## Embeddings Preentrenados

### Word2Vec

In [64]:
from gensim.models.keyedvectors import KeyedVectors

Características del embedding: 
+ #dimensions = 300
+ #vectors = 1000653

In [65]:
wordvectors_file_vec = '../embeddings/SBW-vectors-300-min5.txt'
wordvectors = KeyedVectors.load_word2vec_format(wordvectors_file_vec)

### Palabra dentro del vocabulario

In [66]:
wordvectors['de']

array([-2.96480e-02,  1.13360e-02,  1.99490e-02, -8.88320e-02,
       -2.52250e-02,  5.68440e-02,  2.54730e-02,  1.40680e-02,
        1.63694e-01, -6.71540e-02,  1.47380e-02,  2.71340e-02,
        6.64430e-02, -4.48460e-02, -4.49870e-02, -4.08980e-02,
        3.03110e-02,  3.41960e-02, -4.92400e-02,  8.53700e-03,
       -6.80910e-02, -8.79380e-02,  3.53000e-02,  1.49385e-01,
       -1.23500e-02,  1.26130e-02,  2.93500e-02,  6.95960e-02,
        3.91110e-02,  5.76520e-02,  6.99540e-02, -6.62170e-02,
       -4.17840e-02,  2.86230e-02,  2.67720e-02, -6.63920e-02,
        2.95300e-03, -1.21880e-02, -3.03630e-02,  4.02220e-02,
        3.48580e-02,  2.74690e-02, -2.90340e-02, -4.87480e-02,
       -3.85820e-02, -5.15530e-02, -3.35010e-02, -1.90080e-02,
        3.04300e-03,  1.10712e-01, -2.50960e-02,  1.11082e-01,
        3.52440e-02,  1.14207e-01,  1.01950e-02,  5.15110e-02,
       -4.06490e-02, -1.13944e-01,  4.48730e-02,  5.20110e-02,
        6.73600e-02,  4.90540e-02, -1.27085e-01, -3.184

### palabra fuera del vocabulario

In [67]:
wordvectors['bachelet']

KeyError: "word 'bachelet' not in vocabulary"

### Embedding matrix creation

In [68]:
EMBEDDING_DIM = 300
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
embedding_matrix.shape

(25726, 300)

Obtención de los vectores para el vocabulario del corpus de entrenamiento, desde el modelo word2vect preentrenado. Si no se encuentra el vector para alguna palabra (Out of Vocabulary Word), se genera uno aleatorio. 

In [69]:
for word, i in tokenizer.word_index.items():
    if i >= vocab_size:
        continue
    try:
        embedding_vector = wordvectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), EMBEDDING_DIM)

In [70]:
embedding_matrix.shape

(25726, 300)

In [71]:
del(wordvectors)

### Feed Forward Network

In [72]:
model = Sequential()
model.add(layers.Embedding(
                    input_dim=vocab_size, 
                    output_dim=EMBEDDING_DIM, 
                    input_length=max_sequence_length,
                    weights=[embedding_matrix],
                    trainable=True
                )
)
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(3, activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 578, 300)          7717800   
_________________________________________________________________
flatten_2 (Flatten)          (None, 173400)            0         
_________________________________________________________________
dense_3 (Dense)              (None, 10)                1734010   
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 33        
Total params: 9,451,843
Trainable params: 9,451,843
Non-trainable params: 0
_________________________________________________________________


In [73]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=10,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=10)

Train on 9948 samples, validate on 2488 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [74]:
loss, accuracy = model.evaluate(X_train_padded_sequences, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_padded_sequences, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.9948
Testing Accuracy:  0.5740


### Convolutional Neural Networks

[model reference](https://arxiv.org/abs/1408.5882)

In [75]:
model = Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, input_length=max_sequence_length, weights=[embedding_matrix], trainable=True),
    layers.Conv1D(filters=128, kernel_size=3, activation='relu'),
    layers.MaxPooling1D(pool_size=3),
    layers.Conv1D(filters=128, kernel_size=4, activation='relu'),
    layers.MaxPooling1D(pool_size=3),
    layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 578, 300)          7717800   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 576, 128)          115328    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 192, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 189, 128)          65664     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 63, 128)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 59, 128)           82048     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
__________

In [76]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=10,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=50)

Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 9948 samples, validate on 2488 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [77]:
loss, accuracy = model.evaluate(X_train_padded_sequences, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_padded_sequences, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.9950
Testing Accuracy:  0.5920


### CNN + Glove

In [78]:
glove_vectors = KeyedVectors.load_word2vec_format('../embeddings/glove-sbwc.i25.vec')

In [79]:
glove_embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in tokenizer.word_index.items():
    if i >= vocab_size:
        continue
    try:
        embedding_vector = glove_vectors[word]
        glove_embedding_matrix[i] = embedding_vector
    except KeyError:
        glove_embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), EMBEDDING_DIM)

In [81]:
model = Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, input_length=max_sequence_length, weights=[glove_embedding_matrix], trainable=True),
    layers.Conv1D(filters=128, kernel_size=3, activation='relu'),
    layers.MaxPooling1D(pool_size=3),
    layers.Conv1D(filters=128, kernel_size=4, activation='relu'),
    layers.MaxPooling1D(pool_size=3),
    layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 578, 300)          7717800   
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 576, 128)          115328    
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 192, 128)          0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 189, 128)          65664     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 63, 128)           0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 59, 128)           82048     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
__________

In [82]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=10,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=50)

Train on 9948 samples, validate on 2488 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [83]:
loss, accuracy = model.evaluate(X_train_padded_sequences, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_padded_sequences, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.9956
Testing Accuracy:  0.6141


### CNN (padding 'same') + Glove

In [108]:
model = Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, input_length=max_sequence_length, weights=[glove_embedding_matrix], trainable=True),
    layers.Conv1D(filters=128, kernel_size=3, activation='relu', padding='same'),
    layers.MaxPooling1D(pool_size=3),
    layers.Conv1D(filters=128, kernel_size=4, activation='relu', padding='same'),
    layers.MaxPooling1D(pool_size=3),
    layers.Conv1D(filters=128, kernel_size=5, activation='relu', padding='same'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, 578, 300)          7717800   
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 578, 128)          115328    
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 192, 128)          0         
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 192, 128)          65664     
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 64, 128)           0         
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 64, 128)           82048     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 128)               0         
__________

In [109]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=10,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=50)

Train on 9948 samples, validate on 2488 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### LSTM

In [90]:
model = Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, input_length=max_sequence_length),
    layers.LSTM(100),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 578, 300)          7717800   
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense_12 (Dense)             (None, 3)                 303       
Total params: 7,878,503
Trainable params: 7,878,503
Non-trainable params: 0
_________________________________________________________________


In [91]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=5,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=64)

Train on 9948 samples, validate on 2488 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [92]:
loss, accuracy = model.evaluate(X_train_padded_sequences, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_padded_sequences, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.4344
Testing Accuracy:  0.4469


### LSTM with pretrained embeddings

In [95]:
model = Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, input_length=max_sequence_length, weights=[glove_embedding_matrix], trainable=True),
    layers.LSTM(100),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 578, 300)          7717800   
_________________________________________________________________
lstm_5 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense_14 (Dense)             (None, 3)                 303       
Total params: 7,878,503
Trainable params: 7,878,503
Non-trainable params: 0
_________________________________________________________________


In [96]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=5,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=64)

Train on 9948 samples, validate on 2488 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [97]:
loss, accuracy = model.evaluate(X_train_padded_sequences, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_padded_sequences, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.4344
Testing Accuracy:  0.4469


### GRU

In [93]:
model = Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, input_length=max_sequence_length),
    layers.GRU(100),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 578, 300)          7717800   
_________________________________________________________________
gru_1 (GRU)                  (None, 100)               120300    
_________________________________________________________________
dense_13 (Dense)             (None, 3)                 303       
Total params: 7,838,403
Trainable params: 7,838,403
Non-trainable params: 0
_________________________________________________________________


In [94]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=5,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=64)

Train on 9948 samples, validate on 2488 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### GRU with pretrained embeddings

In [105]:
model = Sequential([
    layers.Embedding(
        input_dim=vocab_size, 
        output_dim=EMBEDDING_DIM, 
        input_length=max_sequence_length,
        weights=[glove_embedding_matrix], 
        trainable=True
    ),
    layers.SpatialDropout1D(0.2),
    layers.GRU(100),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 578, 300)          7717800   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 578, 300)          0         
_________________________________________________________________
gru_3 (GRU)                  (None, 100)               120300    
_________________________________________________________________
dense_18 (Dense)             (None, 3)                 303       
Total params: 7,838,403
Trainable params: 7,838,403
Non-trainable params: 0
_________________________________________________________________


In [107]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=5,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=32)

Train on 9948 samples, validate on 2488 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### CNN +  LSTM

In [111]:
model = Sequential([
    layers.Embedding(
        input_dim=vocab_size, 
        output_dim=EMBEDDING_DIM, 
        input_length=max_sequence_length,
        weights=[glove_embedding_matrix], 
        trainable=True
    ),
    layers.Conv1D(filters=64, kernel_size=5, activation='relu'),
    layers.MaxPooling1D(pool_size=4),
    layers.LSTM(50),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_17 (Embedding)     (None, 578, 300)          7717800   
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 574, 64)           96064     
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 143, 64)           0         
_________________________________________________________________
lstm_9 (LSTM)                (None, 50)                23000     
_________________________________________________________________
dense_22 (Dense)             (None, 3)                 153       
Total params: 7,837,017
Trainable params: 7,837,017
Non-trainable params: 0
_________________________________________________________________


In [None]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=5,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=32)

Train on 9948 samples, validate on 2488 samples
Epoch 1/5
Epoch 2/5