## Data Loading

In [1]:
import pandas as pd
import numpy as np
import nltk

In [2]:
pd.set_option('max_colwidth', 500)

In [24]:
df = pd.read_csv('../data/polarity-extended.csv', sep=";")
df.polarity.unique()

array([ 1, -1,  0, 10])

In [25]:
df.head()

Unnamed: 0,text,polarity
0,Abogado de Michelle Bachelet otorgó asesoría jurídica a mujer que realizó la denuncia. https://t.co/lV5gnWcfmm,1
1,RT @Alitop_: Faltan 635 dias para que se acabe esta pesadilla llamada Michelle Bachelet #CuentaRegresiva #ChaoBachelet,-1
2,Michelle Bachelet está trotando para estar en forma. Michelle Bachelet está tratando de aprobar sus reformas chavo!! https://t.co/0QAX2Hu2Gh,0
3,"RT @ElLibido: 2/15 Hace pocos días, los “amigos” de @derechatuitera masificaron imagen, sobre supuesto vino de Michelle Bachelet. https://t…",-1
4,"Alcalde de Pozo Almonte, José Fernando Muñoz junto a la Presidenta, Michelle Bachelet e Intendenta de Tarapacá. https://t.co/v1IxZ4D3aG",0


In [26]:
df.count()

text        13691
polarity    13691
dtype: int64

In [27]:
df.groupby(['polarity']).count()

Unnamed: 0_level_0,text
polarity,Unnamed: 1_level_1
-1,5903
0,4778
1,2815
10,195


## Text Cleaning

### Text preprocessing utils

In [28]:
import sys
sys.path.append('..')

from utils import text_preprocessing

In [29]:
df['text'] = df['text'].apply(text_preprocessing.normalize, no_tweet_hashtags=True, no_camel_case=False)

In [30]:
df.head()

Unnamed: 0,text,polarity
0,abogado michelle bachelet otorgó asesoría jurídica mujer realizó denuncia,1
1,faltan dias acabe pesadilla llamada michelle bachelet,-1
2,michelle bachelet trotando forma michelle bachelet tratando aprobar reformas chavo,0
3,hace pocos días amigos masificaron imagen supuesto vino michelle bachelet,-1
4,alcalde pozo almonte josé fernando muñoz junto presidenta michelle bachelet intendenta tarapacá,0


### Remove texts with only one word

In [31]:
df = df[df.apply(lambda r: len(r.text.split()) > 1, axis=1)]

In [32]:
df.count()

text        13475
polarity    13475
dtype: int64

## Dataset split

In [33]:
from sklearn.model_selection import train_test_split

In [34]:
train, test = train_test_split(df, test_size=0.1, random_state=42)

In [35]:
train.count()

text        12127
polarity    12127
dtype: int64

In [36]:
test.count()

text        1348
polarity    1348
dtype: int64

## Scikit Learn Models

In [37]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline

In [38]:
X_train = train.text.values
y_train = train.polarity.values
X_test = test.text.values
y_test = test.polarity.values

### Naive Bayes

In [39]:
from sklearn.naive_bayes import MultinomialNB

In [40]:
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
])
nb.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [41]:
y_pred = nb.predict(X_test)

In [42]:
print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.6016320474777448


In [21]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.58      0.90      0.71       609
           0       0.61      0.45      0.51       453
           1       0.78      0.23      0.36       265
          10       0.00      0.00      0.00        21

    accuracy                           0.60      1348
   macro avg       0.49      0.39      0.40      1348
weighted avg       0.62      0.60      0.56      1348



  'precision', 'predicted', average, warn_for)


### Support Vector Machine

In [22]:
from sklearn.linear_model import SGDClassifier

In [23]:
sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                ('clf',
                 SGDClassifier(alpha=0.001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                      

In [24]:
y_pred = sgd.predict(X_test)

In [25]:
print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.6023738872403561


In [26]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.59      0.89      0.71       609
           0       0.65      0.38      0.47       453
           1       0.61      0.37      0.46       265
          10       0.00      0.00      0.00        21

    accuracy                           0.60      1348
   macro avg       0.46      0.41      0.41      1348
weighted avg       0.60      0.60      0.57      1348



### Regresión Logística

In [27]:
from sklearn.linear_model import LogisticRegression

In [28]:
logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
logreg.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=100000.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=

In [29]:
y_pred = logreg.predict(X_test)

In [30]:
print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.5719584569732937


In [31]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.67      0.66      0.67       609
           0       0.50      0.53      0.52       453
           1       0.50      0.48      0.49       265
          10       0.00      0.00      0.00        21

    accuracy                           0.57      1348
   macro avg       0.42      0.42      0.42      1348
weighted avg       0.57      0.57      0.57      1348



### Naive Bayes - solo positivos y negativos

In [32]:
no_neutral_df = df[~(df.polarity == 0)].copy()

In [33]:
no_neutral_df.groupby(['polarity']).count()

Unnamed: 0_level_0,text
polarity,Unnamed: 1_level_1
-1,5846
1,2758
10,185


In [34]:
train_no_neutral, test_no_neutral = train_test_split(no_neutral_df, test_size=0.2, random_state=42)
X_train_no_neutral = train_no_neutral.text.values
X_test_no_neutral = test_no_neutral.text.values
y_train_no_neutral = train_no_neutral.polarity.values
y_test_no_neutral = test_no_neutral.polarity.values

In [35]:
nb_no_neutral = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
])
nb_no_neutral.fit(X_train_no_neutral, y_train_no_neutral)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [36]:
y_pred_no_neutral = nb_no_neutral.predict(X_test_no_neutral)

In [37]:
print('accuracy %s' % accuracy_score(y_pred_no_neutral, y_test_no_neutral))

accuracy 0.754835039817975


In [38]:
print(classification_report(y_test_no_neutral, y_pred_no_neutral))

              precision    recall  f1-score   support

          -1       0.74      0.98      0.84      1180
           1       0.89      0.30      0.45       544
          10       0.00      0.00      0.00        34

    accuracy                           0.75      1758
   macro avg       0.54      0.43      0.43      1758
weighted avg       0.77      0.75      0.71      1758



  'precision', 'predicted', average, warn_for)


### Naive Bayes - Ngrams

In [39]:
nb = Pipeline([('vect', CountVectorizer(ngram_range=(1, 3))),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
])
nb.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 3), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [40]:
y_pred = nb.predict(X_test)

In [41]:
print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.5838278931750742


## Keras Deep Learning Models

In [42]:
BATCH_SIZE = 32
EPOCHS = 10
CHECKPOINTS_PATH = '../model_checkpoints/polarity/'

### Integer labels to categorical

In [43]:
from keras.utils import to_categorical

Using TensorFlow backend.


In [44]:
y_train = pd.get_dummies(train.polarity).values
y_test = pd.get_dummies(test.polarity).values

### Encoding

In [45]:
from keras.preprocessing.text import Tokenizer

In [46]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [47]:
X_train_sequences = tokenizer.texts_to_sequences(X_train)

In [48]:
X_train[2]

'ahora mismo dicen oye lista clinton atenta libertad expresión'

In [49]:
X_train_sequences[2]

[19, 40, 149, 1160, 457, 28, 7252, 371, 2607]

In [50]:
X_test_sequences = tokenizer.texts_to_sequences(X_test)

In [51]:
for word in ['bachelet', 'final', 'orrego']:
    print('{}: {}'.format(word, tokenizer.word_index[word]))

bachelet: 4
final: 218
orrego: 9228


In [52]:
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

Dado que las secuencias generadas con texts_to_sequences no poseen un largo uniforme, se utiliza pad_sequence para remediar dicho resultado mediante la adición de ceros a las secuencias hasta homogeneizar el largo de estas últimas.

In [53]:
max_sequence_length = max(len(t) for t in X_train_sequences)
max_sequence_length

578

### Padding sequences

In [54]:
from keras.preprocessing.sequence import pad_sequences

In [55]:
X_train_padded_sequences = pad_sequences(X_train_sequences, padding='post', maxlen=max_sequence_length)
X_test_padded_sequences = pad_sequences(X_test_sequences, padding='post', maxlen=max_sequence_length)

In [56]:
X_train_padded_sequences.shape

(12127, 578)

In [57]:
X_train_padded_sequences[1,:]

array([ 295,  524, 1414, 1237, 2606,  546, 3623,  444, 1849, 1238,    3,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

### Embeding Layer

+ [Artículo relevante](https://www.kaggle.com/rajmehra03/a-detailed-explanation-of-keras-embedding-layer)
+ [Documentación embedding layer](https://keras.io/layers/embeddings/)

In [58]:
from datetime import datetime
from keras.models import Sequential
from keras import layers
from keras.callbacks import ModelCheckpoint, EarlyStopping

In [59]:
embedding_dim = 50

In [60]:
model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=max_sequence_length))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(4, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 578, 50)           1450700   
_________________________________________________________________
flatten_1 (Flatten)          (None, 28900)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                1849664   
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 260       
Total params: 3,300,624
Trainable params: 3,300,624
Non-trainable params: 0
_________________________________________________________________


In [63]:
model_name = 'ffn'
filepath = CHECKPOINTS_PATH + model_name + '-{}.hdf5'.format(datetime.today().isoformat())
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
es = EarlyStopping(monitor='val_acc', baseline=0.61, patience=4)

history = model.fit(
    X_train_padded_sequences, y_train,
    epochs=EPOCHS,
    verbose=True,
    validation_data=(X_test_padded_sequences, y_test),
    batch_size=BATCH_SIZE,
    callbacks=[checkpoint, es]
)

Train on 12127 samples, validate on 1348 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.59718, saving model to ../model_checkpoints/polarity/ffn-2019-06-04T11:24:47.043451.hdf5
Epoch 2/10

Epoch 00002: val_acc improved from 0.59718 to 0.60237, saving model to ../model_checkpoints/polarity/ffn-2019-06-04T11:24:47.043451.hdf5
Epoch 3/10

Epoch 00003: val_acc improved from 0.60237 to 0.60460, saving model to ../model_checkpoints/polarity/ffn-2019-06-04T11:24:47.043451.hdf5
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.60460


In [64]:
loss, accuracy = model.evaluate(X_train_padded_sequences, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_padded_sequences, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.9938
Testing Accuracy:  0.5979


## Embeddings Preentrenados

### Word2Vec

In [65]:
from gensim.models.keyedvectors import KeyedVectors

Características del embedding: 
+ #dimensions = 300
+ #vectors = 1000653

In [66]:
wordvectors_file_vec = '../embeddings/SBW-vectors-300-min5.txt'
wordvectors = KeyedVectors.load_word2vec_format(wordvectors_file_vec)

### Palabra dentro del vocabulario

In [67]:
wordvectors['de']

array([-2.96480e-02,  1.13360e-02,  1.99490e-02, -8.88320e-02,
       -2.52250e-02,  5.68440e-02,  2.54730e-02,  1.40680e-02,
        1.63694e-01, -6.71540e-02,  1.47380e-02,  2.71340e-02,
        6.64430e-02, -4.48460e-02, -4.49870e-02, -4.08980e-02,
        3.03110e-02,  3.41960e-02, -4.92400e-02,  8.53700e-03,
       -6.80910e-02, -8.79380e-02,  3.53000e-02,  1.49385e-01,
       -1.23500e-02,  1.26130e-02,  2.93500e-02,  6.95960e-02,
        3.91110e-02,  5.76520e-02,  6.99540e-02, -6.62170e-02,
       -4.17840e-02,  2.86230e-02,  2.67720e-02, -6.63920e-02,
        2.95300e-03, -1.21880e-02, -3.03630e-02,  4.02220e-02,
        3.48580e-02,  2.74690e-02, -2.90340e-02, -4.87480e-02,
       -3.85820e-02, -5.15530e-02, -3.35010e-02, -1.90080e-02,
        3.04300e-03,  1.10712e-01, -2.50960e-02,  1.11082e-01,
        3.52440e-02,  1.14207e-01,  1.01950e-02,  5.15110e-02,
       -4.06490e-02, -1.13944e-01,  4.48730e-02,  5.20110e-02,
        6.73600e-02,  4.90540e-02, -1.27085e-01, -3.184

### Embedding matrix creation

In [68]:
EMBEDDING_DIM = 300
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
embedding_matrix.shape

(29014, 300)

Obtención de los vectores para el vocabulario del corpus de entrenamiento, desde el modelo word2vect preentrenado. Si no se encuentra el vector para alguna palabra (Out of Vocabulary Word), se genera uno aleatorio. 

In [69]:
for word, i in tokenizer.word_index.items():
    if i >= vocab_size:
        continue
    try:
        embedding_vector = wordvectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), EMBEDDING_DIM)

In [70]:
embedding_matrix.shape

(29014, 300)

In [71]:
del(wordvectors)

### FFN + word2vector

In [72]:
model = Sequential()
model.add(layers.Embedding(
                    input_dim=vocab_size, 
                    output_dim=EMBEDDING_DIM, 
                    input_length=max_sequence_length,
                    weights=[embedding_matrix],
                    trainable=True
                )
)
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(4, activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 578, 300)          8704200   
_________________________________________________________________
flatten_2 (Flatten)          (None, 173400)            0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                11097664  
_________________________________________________________________
dense_4 (Dense)              (None, 4)                 260       
Total params: 19,802,124
Trainable params: 19,802,124
Non-trainable params: 0
_________________________________________________________________


In [73]:
model_name = 'ffn-w2v'
filepath = CHECKPOINTS_PATH + model_name + '-{}.hdf5'.format(datetime.today().isoformat())
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
es = EarlyStopping(monitor='val_acc', baseline=0.61, patience=4)


history = model.fit(
    X_train_padded_sequences, y_train,
    epochs=EPOCHS,
    verbose=True,
    validation_data=(X_test_padded_sequences, y_test),
    batch_size=BATCH_SIZE,
    callbacks=[checkpoint, es]
)

Train on 12127 samples, validate on 1348 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.61499, saving model to ../model_checkpoints/polarity/ffn-w2v-2019-06-04T11:33:18.613130.hdf5
Epoch 2/10

Epoch 00002: val_acc did not improve from 0.61499
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.61499
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.61499
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.61499


In [89]:
y_pred = model.predict(X_test_padded_sequences)
y_test

array([[1, 0, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 1],
       ...,
       [1, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0]], dtype=uint8)

### CNN - Yoon Kim Model + word2vec

[model reference](https://arxiv.org/abs/1408.5882)

In [74]:
model = Sequential([
    layers.Embedding(
        input_dim=vocab_size, 
        output_dim=EMBEDDING_DIM, 
        input_length=max_sequence_length, 
        weights=[embedding_matrix], 
        trainable=True
    ),
    layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
    layers.MaxPooling1D(pool_size=3),
    layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
    layers.MaxPooling1D(pool_size=5),
    layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dense(4, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 578, 300)          8704200   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 574, 128)          192128    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 191, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 187, 128)          82048     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 37, 128)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 33, 128)           82048     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
__________

In [75]:
model_name = 'cnn-w2v'
filepath = CHECKPOINTS_PATH + model_name + '-{}.hdf5'.format(datetime.today().isoformat())
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
es = EarlyStopping(monitor='val_acc', baseline=0.61, patience=4)


history = model.fit(
    X_train_padded_sequences, y_train,
    epochs=EPOCHS,
    verbose=True,
    validation_data=(X_test_padded_sequences, y_test),
    batch_size=BATCH_SIZE,
    callbacks=[checkpoint, es]
)

Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 12127 samples, validate on 1348 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.59941, saving model to ../model_checkpoints/polarity/cnn-w2v-2019-06-04T11:42:59.698089.hdf5
Epoch 2/10

Epoch 00002: val_acc improved from 0.59941 to 0.61721, saving model to ../model_checkpoints/polarity/cnn-w2v-2019-06-04T11:42:59.698089.hdf5
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.61721
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.61721
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.61721
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.61721


In [76]:
loss, accuracy = model.evaluate(X_train_padded_sequences, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_padded_sequences, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.9917
Testing Accuracy:  0.5675


### CNN Yoon Kim model (padding = 'same') + word2vec

In [79]:
model = Sequential([
    layers.Embedding(
        input_dim=vocab_size, 
        output_dim=EMBEDDING_DIM, 
        input_length=max_sequence_length, 
        weights=[embedding_matrix], 
        trainable=True
    ),
    layers.Conv1D(filters=128, kernel_size=3, activation='relu', padding='same'),
    layers.MaxPooling1D(pool_size=3),
    layers.Conv1D(filters=128, kernel_size=4, activation='relu', padding='same'),
    layers.MaxPooling1D(pool_size=3),
    layers.Conv1D(filters=128, kernel_size=5, activation='relu', padding='same'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dense(4, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 578, 300)          8704200   
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 578, 128)          115328    
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 192, 128)          0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 192, 128)          65664     
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 64, 128)           0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 64, 128)           82048     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 128)               0         
__________

In [80]:
model_name = 'cnn-w2v-padding-same'
filepath = CHECKPOINTS_PATH + model_name + '-{}.hdf5'.format(datetime.today().isoformat())
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
es = EarlyStopping(monitor='val_acc', baseline=0.61, patience=4)


history = model.fit(
    X_train_padded_sequences, y_train,
    epochs=EPOCHS,
    verbose=True,
    validation_data=(X_test_padded_sequences, y_test),
    batch_size=BATCH_SIZE,
    callbacks=[checkpoint, es]
)

Train on 12127 samples, validate on 1348 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.63131, saving model to ../model_checkpoints/polarity/cnn-w2v-padding-same-2019-06-04T12:06:03.234815.hdf5
Epoch 2/10

Epoch 00002: val_acc did not improve from 0.63131
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.63131
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.63131
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.63131


### CNN Yoon Kim model + Glove

In [81]:
glove_vectors = KeyedVectors.load_word2vec_format('../embeddings/glove-sbwc.i25.vec')

In [82]:
glove_embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in tokenizer.word_index.items():
    if i >= vocab_size:
        continue
    try:
        embedding_vector = glove_vectors[word]
        glove_embedding_matrix[i] = embedding_vector
    except KeyError:
        glove_embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), EMBEDDING_DIM)

In [83]:
del(glove_vectors)

In [84]:
model = Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, input_length=max_sequence_length, weights=[glove_embedding_matrix], trainable=True),
    layers.Conv1D(filters=128, kernel_size=3, activation='relu'),
    layers.MaxPooling1D(pool_size=3),
    layers.Conv1D(filters=128, kernel_size=4, activation='relu'),
    layers.MaxPooling1D(pool_size=3),
    layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dense(4, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 578, 300)          8704200   
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 576, 128)          115328    
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 192, 128)          0         
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 189, 128)          65664     
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 63, 128)           0         
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 59, 128)           82048     
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 128)               0         
__________

In [85]:
model_name = 'cnn-glove'
filepath = CHECKPOINTS_PATH + model_name + '-{}.hdf5'.format(datetime.today().isoformat())
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
es = EarlyStopping(monitor='val_acc', baseline=0.61, patience=4)


history = model.fit(
    X_train_padded_sequences, y_train,
    epochs=EPOCHS,
    verbose=True,
    validation_data=(X_test_padded_sequences, y_test),
    batch_size=BATCH_SIZE,
    callbacks=[checkpoint, es]
)

Train on 12127 samples, validate on 1348 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.64763, saving model to ../model_checkpoints/polarity/cnn-glove-2019-06-04T12:25:03.286500.hdf5
Epoch 2/10

Epoch 00002: val_acc did not improve from 0.64763
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.64763
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.64763
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.64763


In [86]:
loss, accuracy = model.evaluate(X_train_padded_sequences, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_padded_sequences, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.9898
Testing Accuracy:  0.6187


### CNN (padding 'same') + Glove

In [87]:
model = Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, input_length=max_sequence_length, weights=[glove_embedding_matrix], trainable=True),
    layers.Conv1D(filters=128, kernel_size=3, activation='relu', padding='same'),
    layers.MaxPooling1D(pool_size=3),
    layers.Conv1D(filters=128, kernel_size=4, activation='relu', padding='same'),
    layers.MaxPooling1D(pool_size=3),
    layers.Conv1D(filters=128, kernel_size=5, activation='relu', padding='same'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dense(4, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 578, 300)          8704200   
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 578, 128)          115328    
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 192, 128)          0         
_________________________________________________________________
conv1d_14 (Conv1D)           (None, 192, 128)          65664     
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 64, 128)           0         
_________________________________________________________________
conv1d_15 (Conv1D)           (None, 64, 128)           82048     
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 128)               0         
__________

In [88]:
model_name = 'cnn-glove-padding-same'
filepath = CHECKPOINTS_PATH + model_name + '-{}.hdf5'.format(datetime.today().isoformat())
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
es = EarlyStopping(monitor='val_acc', baseline=0.61, patience=4)

history = model.fit(
    X_train_padded_sequences, y_train,
    epochs=EPOCHS,
    verbose=True,
    validation_data=(X_test_padded_sequences, y_test),
    batch_size=BATCH_SIZE,
    callbacks=[checkpoint, es]
)

Train on 12127 samples, validate on 1348 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.63724, saving model to ../model_checkpoints/polarity/cnn-glove-padding-same-2019-06-04T12:41:03.316601.hdf5
Epoch 2/10

Epoch 00002: val_acc improved from 0.63724 to 0.64466, saving model to ../model_checkpoints/polarity/cnn-glove-padding-same-2019-06-04T12:41:03.316601.hdf5
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.64466
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.64466
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.64466
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.64466


### LSTM

In [109]:
model = Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, input_length=max_sequence_length),
    layers.LSTM(100),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 578, 300)          8346900   
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense_19 (Dense)             (None, 3)                 303       
Total params: 8,507,603
Trainable params: 8,507,603
Non-trainable params: 0
_________________________________________________________________


In [110]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=EPOCHS,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=BATCH_SIZE)

Train on 11192 samples, validate on 1244 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### LSTM + Glove

In [95]:
model = Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, input_length=max_sequence_length, weights=[glove_embedding_matrix], trainable=True),
    layers.LSTM(100),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 578, 300)          7717800   
_________________________________________________________________
lstm_5 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense_14 (Dense)             (None, 3)                 303       
Total params: 7,878,503
Trainable params: 7,878,503
Non-trainable params: 0
_________________________________________________________________


In [96]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=5,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=64)

Train on 9948 samples, validate on 2488 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [97]:
loss, accuracy = model.evaluate(X_train_padded_sequences, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_padded_sequences, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.4344
Testing Accuracy:  0.4469


### BiLSTM

In [89]:
model = Sequential([
    layers.Embedding(
        input_dim=vocab_size, 
        output_dim=EMBEDDING_DIM, 
        input_length=max_sequence_length,
    ),
    layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
    layers.GlobalMaxPool1D(),
    layers.Dense(4, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 578, 300)          8704200   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 578, 128)          186880    
_________________________________________________________________
global_max_pooling1d_6 (Glob (None, 128)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 4)                 516       
Total params: 8,891,596
Trainable params: 8,891,596
Non-trainable params: 0
_________________________________________________________________


In [90]:
model_name = 'biLSTM'
filepath = CHECKPOINTS_PATH + model_name + '-{}.hdf5'.format(datetime.today().isoformat())
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
es = EarlyStopping(monitor='val_acc', baseline=0.61, patience=4)

history = model.fit(
    X_train_padded_sequences, y_train,
    epochs=EPOCHS,
    verbose=True,
    validation_data=(X_test_padded_sequences, y_test),
    batch_size=BATCH_SIZE,
    callbacks=[checkpoint, es]
)

Train on 12127 samples, validate on 1348 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.63947, saving model to ../model_checkpoints/polarity/biLSTM-2019-06-04T12:57:07.503644.hdf5
Epoch 2/10

Epoch 00002: val_acc did not improve from 0.63947
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.63947
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.63947
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.63947


### BiLSTM + word2vec

In [91]:
model = Sequential([
    layers.Embedding(
        input_dim=vocab_size, 
        output_dim=EMBEDDING_DIM, 
        input_length=max_sequence_length,
        weights=[embedding_matrix], 
        trainable=True
    ),
    layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
    layers.GlobalMaxPool1D(),
    layers.Dense(4, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 578, 300)          8704200   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 578, 128)          186880    
_________________________________________________________________
global_max_pooling1d_7 (Glob (None, 128)               0         
_________________________________________________________________
dense_16 (Dense)             (None, 4)                 516       
Total params: 8,891,596
Trainable params: 8,891,596
Non-trainable params: 0
_________________________________________________________________


In [92]:
model_name = 'bilstm-w2v'
filepath = CHECKPOINTS_PATH + model_name + '-{}.hdf5'.format(datetime.today().isoformat())
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
es = EarlyStopping(monitor='val_acc', baseline=0.61, patience=4)

history = model.fit(
    X_train_padded_sequences, y_train,
    epochs=EPOCHS,
    verbose=True,
    validation_data=(X_test_padded_sequences, y_test),
    batch_size=BATCH_SIZE,
    callbacks=[checkpoint, es]
)

Train on 12127 samples, validate on 1348 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.64318, saving model to ../model_checkpoints/polarity/bilstm-w2v-2019-06-04T13:26:10.587398.hdf5
Epoch 2/10

Epoch 00002: val_acc did not improve from 0.64318
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.64318
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.64318
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.64318


### BiLSTM + Glove

In [93]:
model = Sequential([
    layers.Embedding(
        input_dim=vocab_size, 
        output_dim=EMBEDDING_DIM, 
        input_length=max_sequence_length,
        weights=[glove_embedding_matrix], 
        trainable=True
    ),
    layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
    layers.GlobalMaxPool1D(),
    layers.Dense(4, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 578, 300)          8704200   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 578, 128)          186880    
_________________________________________________________________
global_max_pooling1d_8 (Glob (None, 128)               0         
_________________________________________________________________
dense_17 (Dense)             (None, 4)                 516       
Total params: 8,891,596
Trainable params: 8,891,596
Non-trainable params: 0
_________________________________________________________________


In [94]:
model_name = 'bilstm-glove'
filepath = CHECKPOINTS_PATH + model_name + '-{}.hdf5'.format(datetime.today().isoformat())
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
es = EarlyStopping(monitor='val_acc', baseline=0.61, patience=4)


history = model.fit(
    X_train_padded_sequences, y_train,
    epochs=EPOCHS,
    verbose=True,
    validation_data=(X_test_padded_sequences, y_test),
    batch_size=BATCH_SIZE,
    callbacks=[checkpoint, es]
)

Train on 12127 samples, validate on 1348 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.64911, saving model to ../model_checkpoints/polarity/bilstm-glove-2019-06-04T13:57:37.018938.hdf5
Epoch 2/10

Epoch 00002: val_acc improved from 0.64911 to 0.66766, saving model to ../model_checkpoints/polarity/bilstm-glove-2019-06-04T13:57:37.018938.hdf5
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.66766
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.66766
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.66766
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.66766


### GRU

In [93]:
model = Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, input_length=max_sequence_length),
    layers.GRU(100),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 578, 300)          7717800   
_________________________________________________________________
gru_1 (GRU)                  (None, 100)               120300    
_________________________________________________________________
dense_13 (Dense)             (None, 3)                 303       
Total params: 7,838,403
Trainable params: 7,838,403
Non-trainable params: 0
_________________________________________________________________


In [94]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=5,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=64)

Train on 9948 samples, validate on 2488 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### GRU + Glove

In [105]:
model = Sequential([
    layers.Embedding(
        input_dim=vocab_size, 
        output_dim=EMBEDDING_DIM, 
        input_length=max_sequence_length,
        weights=[glove_embedding_matrix], 
        trainable=True
    ),
    layers.SpatialDropout1D(0.2),
    layers.GRU(64),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 578, 300)          7717800   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 578, 300)          0         
_________________________________________________________________
gru_3 (GRU)                  (None, 100)               120300    
_________________________________________________________________
dense_18 (Dense)             (None, 3)                 303       
Total params: 7,838,403
Trainable params: 7,838,403
Non-trainable params: 0
_________________________________________________________________


In [107]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=5,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=32)

Train on 9948 samples, validate on 2488 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### BiGRU + word2vec

In [95]:
model = Sequential([
    layers.Embedding(
        input_dim=vocab_size, 
        output_dim=EMBEDDING_DIM, 
        input_length=max_sequence_length,
        weights=[embedding_matrix], 
        trainable=True
    ),
    layers.Bidirectional(layers.GRU(64, return_sequences=True)),
    layers.GlobalMaxPool1D(),
    layers.Dense(4, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 578, 300)          8704200   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 578, 128)          140160    
_________________________________________________________________
global_max_pooling1d_9 (Glob (None, 128)               0         
_________________________________________________________________
dense_18 (Dense)             (None, 4)                 516       
Total params: 8,844,876
Trainable params: 8,844,876
Non-trainable params: 0
_________________________________________________________________


In [96]:
model_name = 'bigru-w2v'
filepath = CHECKPOINTS_PATH + model_name + '-{}.hdf5'.format(datetime.today().isoformat())
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
es = EarlyStopping(monitor='val_acc', baseline=0.61, patience=4)


history = model.fit(
    X_train_padded_sequences, y_train,
    epochs=EPOCHS,
    verbose=True,
    validation_data=(X_test_padded_sequences, y_test),
    batch_size=BATCH_SIZE,
    callbacks=[checkpoint, es]
)

Train on 12127 samples, validate on 1348 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.63353, saving model to ../model_checkpoints/polarity/bigru-w2v-2019-06-04T14:32:32.358771.hdf5
Epoch 2/10

Epoch 00002: val_acc improved from 0.63353 to 0.64243, saving model to ../model_checkpoints/polarity/bigru-w2v-2019-06-04T14:32:32.358771.hdf5
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.64243
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.64243
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.64243
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.64243


### BiGRU + Glove

In [97]:
model = Sequential([
    layers.Embedding(
        input_dim=vocab_size, 
        output_dim=EMBEDDING_DIM, 
        input_length=max_sequence_length,
        weights=[glove_embedding_matrix], 
        trainable=True
    ),
    layers.Bidirectional(layers.GRU(64, return_sequences=True)),
    layers.GlobalMaxPool1D(),
    layers.Dense(4, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 578, 300)          8704200   
_________________________________________________________________
bidirectional_5 (Bidirection (None, 578, 128)          140160    
_________________________________________________________________
global_max_pooling1d_10 (Glo (None, 128)               0         
_________________________________________________________________
dense_19 (Dense)             (None, 4)                 516       
Total params: 8,844,876
Trainable params: 8,844,876
Non-trainable params: 0
_________________________________________________________________


In [98]:
model_name = 'bigru-glove'
filepath = CHECKPOINTS_PATH + model_name + '-{}.hdf5'.format(datetime.today().isoformat())
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
es = EarlyStopping(monitor='val_acc', baseline=0.61, patience=4)


history = model.fit(
    X_train_padded_sequences, y_train,
    epochs=EPOCHS,
    verbose=True,
    validation_data=(X_test_padded_sequences, y_test),
    batch_size=BATCH_SIZE,
    callbacks=[checkpoint, es]
)

Train on 12127 samples, validate on 1348 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.64837, saving model to ../model_checkpoints/polarity/bigru-glove-2019-06-04T15:49:00.634142.hdf5
Epoch 2/10

Epoch 00002: val_acc did not improve from 0.64837
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.64837
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.64837
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.64837


### CNN +  LSTM + Glove

In [118]:
model = Sequential([
    layers.Embedding(
        input_dim=vocab_size, 
        output_dim=EMBEDDING_DIM, 
        input_length=max_sequence_length,
        weights=[glove_embedding_matrix], 
        trainable=True
    ),
    layers.Conv1D(filters=64, kernel_size=5, activation='relu'),
    layers.MaxPooling1D(pool_size=4),
    layers.LSTM(64),
    layers.Dense(64),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_21 (Embedding)     (None, 578, 300)          7717800   
_________________________________________________________________
conv1d_17 (Conv1D)           (None, 574, 64)           96064     
_________________________________________________________________
max_pooling1d_14 (MaxPooling (None, 143, 64)           0         
_________________________________________________________________
lstm_10 (LSTM)               (None, 50)                23000     
_________________________________________________________________
dense_27 (Dense)             (None, 64)                3264      
_________________________________________________________________
dense_28 (Dense)             (None, 3)                 195       
Total params: 7,840,323
Trainable params: 7,840,323
Non-trainable params: 0
_________________________________________________________________


In [119]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=EPOCHS,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=BATCH_SIZE)

Train on 9948 samples, validate on 2488 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### BiGRU + CNN + word2vec

In [99]:
model = Sequential([
    layers.Embedding(
        input_dim=vocab_size, 
        output_dim=EMBEDDING_DIM, 
        input_length=max_sequence_length,
        weights=[embedding_matrix], 
        trainable=True
    ),
    layers.Bidirectional(layers.GRU(64, return_sequences=True)),
    layers.Conv1D(filters=64, kernel_size=5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(64, activation='relu'),
    layers.Dense(4, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 578, 300)          8704200   
_________________________________________________________________
bidirectional_6 (Bidirection (None, 578, 128)          140160    
_________________________________________________________________
conv1d_16 (Conv1D)           (None, 574, 64)           41024     
_________________________________________________________________
global_max_pooling1d_11 (Glo (None, 64)                0         
_________________________________________________________________
dense_20 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_21 (Dense)             (None, 4)                 260       
Total params: 8,889,804
Trainable params: 8,889,804
Non-trainable params: 0
_________________________________________________________________


In [100]:
model_name = 'bigru-cnn-w2v'
filepath = CHECKPOINTS_PATH + model_name + '-{}.hdf5'.format(datetime.today().isoformat())
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
es = EarlyStopping(monitor='val_acc', baseline=0.61, patience=4)


history = model.fit(
    X_train_padded_sequences, y_train,
    epochs=EPOCHS,
    verbose=True,
    validation_data=(X_test_padded_sequences, y_test),
    batch_size=BATCH_SIZE,
    callbacks=[checkpoint, es]
)

Train on 12127 samples, validate on 1348 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.61795, saving model to ../model_checkpoints/polarity/bigru-cnn-w2v-2019-06-04T16:15:32.057807.hdf5
Epoch 2/10

Epoch 00002: val_acc improved from 0.61795 to 0.61944, saving model to ../model_checkpoints/polarity/bigru-cnn-w2v-2019-06-04T16:15:32.057807.hdf5
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.61944
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.61944
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.61944
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.61944


### BiGRU + CNN + Glove

In [101]:
model = Sequential([
    layers.Embedding(
        input_dim=vocab_size, 
        output_dim=EMBEDDING_DIM, 
        input_length=max_sequence_length,
        weights=[glove_embedding_matrix], 
        trainable=True
    ),
    layers.Bidirectional(layers.GRU(64, return_sequences=True)),
    layers.Conv1D(filters=64, kernel_size=5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(64, activation='relu'),
    layers.Dense(4, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 578, 300)          8704200   
_________________________________________________________________
bidirectional_7 (Bidirection (None, 578, 128)          140160    
_________________________________________________________________
conv1d_17 (Conv1D)           (None, 574, 64)           41024     
_________________________________________________________________
global_max_pooling1d_12 (Glo (None, 64)                0         
_________________________________________________________________
dense_22 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_23 (Dense)             (None, 4)                 260       
Total params: 8,889,804
Trainable params: 8,889,804
Non-trainable params: 0
_________________________________________________________________


In [102]:
model_name = 'bigru-cnn-glove'
filepath = CHECKPOINTS_PATH + model_name + '-{}.hdf5'.format(datetime.today().isoformat())
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
es = EarlyStopping(monitor='val_acc', baseline=0.61, patience=4)


history = model.fit(
    X_train_padded_sequences, y_train,
    epochs=EPOCHS,
    verbose=True,
    validation_data=(X_test_padded_sequences, y_test),
    batch_size=BATCH_SIZE,
    callbacks=[checkpoint, es]
)

Train on 12127 samples, validate on 1348 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.64540, saving model to ../model_checkpoints/polarity/bigru-cnn-glove-2019-06-04T16:48:58.751815.hdf5
Epoch 2/10

Epoch 00002: val_acc improved from 0.64540 to 0.65059, saving model to ../model_checkpoints/polarity/bigru-cnn-glove-2019-06-04T16:48:58.751815.hdf5
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.65059
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.65059
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.65059
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.65059


## Query Based

In [53]:
df = pd.read_csv('../data/polarity-extended.csv', sep=";")
df.polarity.unique()

array([ 1, -1,  0, 10])

In [54]:
df.head()

Unnamed: 0,text,polarity
0,Abogado de Michelle Bachelet otorgó asesoría jurídica a mujer que realizó la denuncia. https://t.co/lV5gnWcfmm,1
1,RT @Alitop_: Faltan 635 dias para que se acabe esta pesadilla llamada Michelle Bachelet #CuentaRegresiva #ChaoBachelet,-1
2,Michelle Bachelet está trotando para estar en forma. Michelle Bachelet está tratando de aprobar sus reformas chavo!! https://t.co/0QAX2Hu2Gh,0
3,"RT @ElLibido: 2/15 Hace pocos días, los “amigos” de @derechatuitera masificaron imagen, sobre supuesto vino de Michelle Bachelet. https://t…",-1
4,"Alcalde de Pozo Almonte, José Fernando Muñoz junto a la Presidenta, Michelle Bachelet e Intendenta de Tarapacá. https://t.co/v1IxZ4D3aG",0


In [55]:
import sys
sys.path.append('..')

from utils import text_preprocessing

In [56]:
df['text'] = df['text'].apply(text_preprocessing.normalize, no_tweet_hashtags=True, no_camel_case=False)
df.head()

Unnamed: 0,text,polarity
0,abogado michelle bachelet otorgó asesoría jurídica mujer realizó denuncia,1
1,faltan dias acabe pesadilla llamada michelle bachelet,-1
2,michelle bachelet trotando forma michelle bachelet tratando aprobar reformas chavo,0
3,hace pocos días amigos masificaron imagen supuesto vino michelle bachelet,-1
4,alcalde pozo almonte josé fernando muñoz junto presidenta michelle bachelet intendenta tarapacá,0


In [57]:
df = df[df.text.str.contains('bachelet')]
df.count()

text        993
polarity    993
dtype: int64

In [58]:
from sklearn.model_selection import train_test_split

In [59]:
train, test = train_test_split(df, test_size=0.25, random_state=42)

In [60]:
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
])
nb.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [61]:
y_pred = nb.predict(X_test)

In [62]:
print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.6016320474777448
