# Sentiment Analysis CNN
Use an CNN to predict sentiments using tweets.

In [1]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)
%matplotlib inline

In [2]:
# s3 module importing
import sys
sys.path.append('..')
from utils.s3_class import S3Functions
from utils.utils_funcs import cleaner, classify

s3_funcs = S3Functions(bucket_name='jdgallegoq-sentiment-analysis')

In [3]:
# load training data
train = pd.read_csv(
    s3_funcs.read_object(key='train_2kmZucJ.csv.xls')
)
print(train.info())
test = pd.read_csv(
    s3_funcs.read_object(key='test_12QyDcx.csv.xls')
)
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7920 entries, 0 to 7919
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      7920 non-null   int64 
 1   label   7920 non-null   int64 
 2   tweet   7920 non-null   object
dtypes: int64(2), object(1)
memory usage: 185.8+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1953 entries, 0 to 1952
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      1953 non-null   int64 
 1   tweet   1953 non-null   object
dtypes: int64(1), object(1)
memory usage: 30.6+ KB
None


In [4]:
# explore data
display(train.sample(4))

Unnamed: 0,id,label,tweet
4357,4358,0,Cool girls for you! http://ow.ly/K7W8307ox4a #...
4857,4858,0,Follow @capetownsup on Instagram http://mf.tt/...
1485,1486,0,More than happy <3 / #Samsung #Galaxy #S3 #Gal...
3872,3873,0,Charge your device wirelessly. Available at ou...


In [5]:
# text cleaning
train['tweet_cleaned'] = train['tweet'].apply(cleaner)
test['tweet_cleaned'] = test['tweet'].apply(cleaner)

In [6]:
# explore again
display(train.sample(4))

Unnamed: 0,id,label,tweet,tweet_cleaned
560,561,1,Oh thanks apple you wiped my phone said that e...,oh thanks apple you wiped my phone said that e...
5736,5737,0,Photo: #my #sony #live #watch #finally #on #me...,photo my sony live watch finally on me hands i...
7809,7810,0,I buy all the newest gadgets and $&@*# from ap...,i buy all the newest gadgets and from apple i ...
6128,6129,0,#Aruba #aruba #travel #travelblogger #blogger ...,aruba aruba travel travelblogger blogger vacat...


In [7]:
# target distribution
train.label.value_counts()

0    5894
1    2026
Name: label, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    train.tweet_cleaned.values,
    train.label.values,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

In [9]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [10]:
# text representation
x_tokenizer = Tokenizer()
x_tokenizer.fit_on_texts(X_train)

# unique tokens in corpus
len(x_tokenizer.word_index)

19428

In [11]:
# filter rare tokens
threshold = 1
c = 0
for key, value in x_tokenizer.word_counts.items():
    if value > threshold:
        c+=1
print(c)

6003


In [12]:
# define tokenizer knowing the number of words to keep
x_tokenizer = Tokenizer(
    num_words=c,
    oov_token='unk'
)
x_tokenizer.fit_on_texts(X_train)

In [13]:
# explore some stats on tweet lenght to fix a max lenght for
# padding sequences
pd.Series([len(t.split()) for t in X_train]).describe()

count    6336.000000
mean       20.146149
std         6.459303
min         1.000000
25%        17.000000
50%        19.000000
75%        22.000000
max        59.000000
dtype: float64

In [14]:
# pad sequences
max_len = 30
X_train_seq = x_tokenizer.texts_to_sequences(X_train)
X_train_seq = pad_sequences(X_train_seq, padding='post', maxlen=max_len)
X_val_seq = x_tokenizer.texts_to_sequences(X_val)
X_val_seq = pad_sequences(X_val_seq, padding='post', maxlen=max_len)

In [15]:
# model architecture
from keras.models import Sequential
from keras.layers import (
    Dense,
    Embedding,
    Conv1D,
    Dropout,
    GlobalMaxPooling1D
)
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping

In [16]:
# get vocab size to fix the embedding layer size
x_voc_size = x_tokenizer.num_words + 1

In [17]:
model = Sequential()
model.add(Embedding(
    x_voc_size,
    50,
    input_shape=(max_len,),
    mask_zero=True
))
model.add(Conv1D(64, 3, padding='same'))
model.add(Dropout(0.1))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 30, 50)            300200    
                                                                 
 conv1d (Conv1D)             (None, 30, 64)            9664      
                                                                 
 dropout (Dropout)           (None, 30, 64)            0         
                                                                 
 global_max_pooling1d (Glob  (None, 64)                0         
 alMaxPooling1D)                                                 
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                        

In [19]:
# define optimizer
adam = Adam(learning_rate=1e-3, clipvalue=1)

# define callbacks
early_stopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0.01,
    patience=15,
    verbose=1,
    mode='min'
)
model_checkpoint = ModelCheckpoint(
    filepath='best_weights_cnn.best.hdf5',
    monitor='val_loss',
    save_best_only=True,
    mode='min',
    verbose=1
)
# compile
model.compile(
    optimizer=adam,
    loss='binary_crossentropy'
)

callbacks = [early_stopping, model_checkpoint]



In [20]:
# train
model_history = model.fit(
    X_train_seq,
    y_train,
    batch_size=128,
    epochs=150,
    verbose=1,
    validation_data=(X_val_seq, y_val),
    callbacks=callbacks
)

Epoch 1/150
Epoch 1: val_loss improved from inf to 0.40241, saving model to best_weights_cnn.best.hdf5
Epoch 2/150

  saving_api.save_model(



Epoch 2: val_loss improved from 0.40241 to 0.28345, saving model to best_weights_cnn.best.hdf5
Epoch 3/150
Epoch 3: val_loss improved from 0.28345 to 0.25187, saving model to best_weights_cnn.best.hdf5
Epoch 4/150
Epoch 4: val_loss did not improve from 0.25187
Epoch 5/150
Epoch 5: val_loss did not improve from 0.25187
Epoch 6/150
Epoch 6: val_loss did not improve from 0.25187
Epoch 7/150
Epoch 7: val_loss did not improve from 0.25187
Epoch 8/150
Epoch 8: val_loss did not improve from 0.25187
Epoch 9/150
Epoch 9: val_loss did not improve from 0.25187
Epoch 10/150
Epoch 10: val_loss did not improve from 0.25187
Epoch 11/150
Epoch 11: val_loss did not improve from 0.25187
Epoch 12/150
Epoch 12: val_loss did not improve from 0.25187
Epoch 13/150
Epoch 13: val_loss did not improve from 0.25187
Epoch 14/150
Epoch 14: val_loss did not improve from 0.25187
Epoch 15/150
Epoch 15: val_loss did not improve from 0.25187
Epoch 16/150
Epoch 16: val_loss did not improve from 0.25187
Epoch 17/150
Epo

In [21]:
# model eval
model.load_weights("best_weights_cnn.best.hdf5")

# predict
pred_prob = model.predict(X_val_seq)
pred_prob[:5]



array([[0.00105089],
       [0.3672048 ],
       [0.00553458],
       [0.6922485 ],
       [0.02518514]], dtype=float32)

In [22]:
from sklearn import metrics

In [23]:
# get best threshold value
thres = np.arange(0, 0.5, 0.01)

score = []
y_true = np.array(y_val).ravel()
for t in thres:
    y_pred_seq = classify(pred_prob, t)
    y_pred = np.array(y_pred_seq).ravel()
    score.append(metrics.f1_score(y_true, y_pred))

# find optimum
opt = thres[score.index(max(score))]
opt

0.25

In [24]:
# get metrics based on optimum
y_pred_seq = classify(pred_prob, opt)
y_pred = np.array(y_pred_seq).ravel()
print(metrics.classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.87      0.92      1152
           1       0.72      0.94      0.82       432

    accuracy                           0.89      1584
   macro avg       0.85      0.90      0.87      1584
weighted avg       0.91      0.89      0.89      1584

