# Sentiment Analysis GRU
Use an GRU RNN to predict sentiments using tweets.

In [1]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)
%matplotlib inline

In [2]:
# s3 module importing
import sys
sys.path.append('..')
from utils.s3_class import S3Functions
from utils.utils_funcs import cleaner, classify

s3_funcs = S3Functions(bucket_name='jdgallegoq-sentiment-analysis')

In [3]:
# load training data
train = pd.read_csv(
    s3_funcs.read_object(key='train_2kmZucJ.csv.xls')
)
print(train.info())
test = pd.read_csv(
    s3_funcs.read_object(key='test_12QyDcx.csv.xls')
)
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7920 entries, 0 to 7919
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      7920 non-null   int64 
 1   label   7920 non-null   int64 
 2   tweet   7920 non-null   object
dtypes: int64(2), object(1)
memory usage: 185.8+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1953 entries, 0 to 1952
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      1953 non-null   int64 
 1   tweet   1953 non-null   object
dtypes: int64(1), object(1)
memory usage: 30.6+ KB
None


In [4]:
# explore data
display(train.sample(4))

Unnamed: 0,id,label,tweet
2558,2559,0,Came home to a pink case in the post #pink #pl...
4496,4497,0,My dude @FusRohDave. #comic #bff #bestfriend #...
6073,6074,0,After reinstalling iTunes 7 times on my comput...
3844,3845,0,I my #iPhone. It's the purfect way to ignor me...


In [5]:
# text cleaning
train['tweet_cleaned'] = train['tweet'].apply(cleaner)
test['tweet_cleaned'] = test['tweet'].apply(cleaner)

In [6]:
# explore again
display(train.sample(4))

Unnamed: 0,id,label,tweet,tweet_cleaned
5857,5858,0,This song makes me happy. #Big #Booty #Bitches...,this song makes me happy big booty bitches i g...
6904,6905,0,Gain Followers RT This MUST FOLLOW ME I FOLLOW...,gain followers rt this must follow me i follow...
6900,6901,0,Jaylin making some sandart. #kids #family #bw ...,jaylin making some sandart kids family bw bwlo...
1323,1324,0,RT @FollowBacg: #i #justinbieber #apple #ipad ...,rt followbacg i justinbieber apple ipad iphone...


In [7]:
# target distribution
train.label.value_counts()

0    5894
1    2026
Name: label, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    train.tweet_cleaned.values,
    train.label.values,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

In [8]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [10]:
# text representation
x_tokenizer = Tokenizer()
x_tokenizer.fit_on_texts(X_train)

# unique tokens in corpus
len(x_tokenizer.word_index)

19428

In [13]:
# filter rare tokens
threshold = 1
c = 0
for key, value in x_tokenizer.word_counts.items():
    if value > threshold:
        c+=1
print(c)

6003


In [14]:
# define tokenizer knowing the number of words to keep
x_tokenizer = Tokenizer(
    num_words=c,
    oov_token='unk'
)
x_tokenizer.fit_on_texts(X_train)

In [15]:
# explore some stats on tweet lenght to fix a max lenght for
# padding sequences
pd.Series([len(t.split()) for t in X_train]).describe()

count    6336.000000
mean       20.146149
std         6.459303
min         1.000000
25%        17.000000
50%        19.000000
75%        22.000000
max        59.000000
dtype: float64

In [16]:
# pad sequences
max_len = 30
X_train_seq = x_tokenizer.texts_to_sequences(X_train)
X_train_seq = pad_sequences(X_train_seq, padding='post', maxlen=max_len)
X_val_seq = x_tokenizer.texts_to_sequences(X_val)
X_val_seq = pad_sequences(X_val_seq, padding='post', maxlen=max_len)

In [22]:
# model architecture
from keras.models import Sequential
from keras.layers import (
    Dense,
    Embedding,
    GRU
)
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping

In [18]:
# get vocab size to fix the embedding layer size
x_voc_size = x_tokenizer.num_words + 1

In [20]:
model = Sequential()
model.add(Embedding(
    x_voc_size,
    50,
    input_shape=(max_len,),
    mask_zero=True
))
model.add(GRU(128,))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 30, 50)            300200    
                                                                 
 gru (GRU)                   (None, 128)               69120     
                                                                 
 dense (Dense)               (None, 128)               16512     
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 385961 (1.47 MB)
Trainable params: 385961 (1.47 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [23]:
# define optimizer
adam = Adam(learning_rate=1e-3, clipvalue=1)

# define callbacks
early_stopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0.01,
    patience=15,
    verbose=1,
    mode='min'
)
model_checkpoint = ModelCheckpoint(
    filepath='best_weights.best.hdf5',
    monitor='val_loss',
    save_best_only=True,
    mode='min',
    verbose=1
)
# compile
model.compile(
    optimizer=adam,
    loss='binary_crossentropy'
)

callbacks = [early_stopping, model_checkpoint]



In [24]:
# train
model_history = model.fit(
    X_train_seq,
    y_train,
    batch_size=128,
    epochs=150,
    verbose=1,
    validation_data=(X_val_seq, y_val),
    callbacks=callbacks
)

Epoch 1/150
Epoch 1: val_loss improved from inf to 0.28849, saving model to best_weights.best.hdf5
Epoch 2/150
 3/50 [>.............................] - ETA: 1s - loss: 0.2736

  saving_api.save_model(


Epoch 2: val_loss improved from 0.28849 to 0.25670, saving model to best_weights.best.hdf5
Epoch 3/150
Epoch 3: val_loss did not improve from 0.25670
Epoch 4/150
Epoch 4: val_loss did not improve from 0.25670
Epoch 5/150
Epoch 5: val_loss did not improve from 0.25670
Epoch 6/150
Epoch 6: val_loss did not improve from 0.25670
Epoch 7/150
Epoch 7: val_loss did not improve from 0.25670
Epoch 8/150
Epoch 8: val_loss did not improve from 0.25670
Epoch 9/150
Epoch 9: val_loss did not improve from 0.25670
Epoch 10/150
Epoch 10: val_loss did not improve from 0.25670
Epoch 11/150
Epoch 11: val_loss did not improve from 0.25670
Epoch 12/150
Epoch 12: val_loss did not improve from 0.25670
Epoch 13/150
Epoch 13: val_loss did not improve from 0.25670
Epoch 14/150
Epoch 14: val_loss did not improve from 0.25670
Epoch 15/150
Epoch 15: val_loss did not improve from 0.25670
Epoch 16/150
Epoch 16: val_loss did not improve from 0.25670
Epoch 17/150
Epoch 17: val_loss did not improve from 0.25670
Epoch 17

In [25]:
# model eval
model.load_weights("best_weights.best.hdf5")

# predict
pred_prob = model.predict(X_val_seq)
pred_prob[:5]



array([[2.0231753e-05],
       [6.3735437e-01],
       [3.3466167e-05],
       [5.6402409e-01],
       [2.2533596e-03]], dtype=float32)

In [26]:
from sklearn import metrics

In [27]:
# get best threshold value
thres = np.arange(0, 0.5, 0.01)

score = []
y_true = np.array(y_val).ravel()
for t in thres:
    y_pred_seq = classify(pred_prob, t)
    y_pred = np.array(y_pred_seq).ravel()
    score.append(metrics.f1_score(y_true, y_pred))

# find optimum
opt = thres[score.index(max(score))]
opt

0.44

In [28]:
# get metrics based on optimum
y_pred_seq = classify(pred_prob, opt)
y_pred = np.array(y_pred_seq).ravel()
print(metrics.classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.90      0.93      1152
           1       0.77      0.89      0.83       432

    accuracy                           0.90      1584
   macro avg       0.86      0.90      0.88      1584
weighted avg       0.91      0.90      0.90      1584

