In [1]:
import pandas as pd
import numpy as np
import os
import re
import string
import warnings

In [2]:
import tensorflow
os.environ['KERAS_BACKEND'] = 'tensorflow'

import keras
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, SimpleRNN
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

## Data Loading
### Load cleaned data from data_preprocessing pipeline

In [3]:
# Location of the cleaned training split (output of the data_preprocessing pipeline).
train_data_path = "./clean_train.csv"
# Parse the CSV into a DataFrame.
df_train = pd.read_csv(filepath_or_buffer = train_data_path)
# Preview the first five rows as the cell output.
df_train.head(n = 5)

Unnamed: 0,0,words
0,0,awww bummer shoulda got david carr third day
1,0,upset updat facebook text might cri result sch...
2,0,dive mani time ball manag save rest bound
3,0,whole bodi feel itchi like fire
4,0,behav mad see


In [4]:
# Location of the cleaned test split (output of the data_preprocessing pipeline).
test_data_path = "./clean_test.csv"
# Parse the CSV into a DataFrame.
df_test = pd.read_csv(filepath_or_buffer = test_data_path)
# Preview the first five rows as the cell output.
df_test.head(n = 5)

Unnamed: 0,0,words
0,0,awww bummer shoulda got david carr third day
1,0,upset updat facebook text might cri result sch...
2,0,dive mani time ball manag save rest bound
3,0,whole bodi feel itchi like fire
4,0,behav mad see


In [5]:
# Discard rows whose 'words' field is missing (NaN) in both splits;
# all other columns and the original index labels are kept as they are.
df_train = df_train.dropna(subset = ['words'])
df_test = df_test.dropna(subset = ['words'])

## Tokenize for model input

In [6]:
# Fit a whitespace-splitting tokenizer on the training text only;
# num_words = 2000 caps the vocabulary used when converting text to sequences.
tokenizer = Tokenizer(num_words = 2000, split = ' ')
tokenizer.fit_on_texts(df_train['words'].astype(str).values)

# Training split: text -> integer sequences, then pad every sequence to the
# length of the longest training sequence.
train_tweets = tokenizer.texts_to_sequences(df_train['words'].astype(str).values)
max_len = max(map(len, train_tweets))
train_tweets = pad_sequences(train_tweets, maxlen = max_len)

# Test split: reuse the fitted tokenizer and the training-derived max_len so
# both splits share the same input width.
test_tweets = pad_sequences(
    tokenizer.texts_to_sequences(df_test['words'].astype(str).values),
    maxlen = max_len,
)

## Deep Learning methods
### LSTM Model

In [7]:
# LSTM classifier: embedding -> spatial dropout -> LSTM -> 2-way softmax.
# The layers are passed as a list; the stack is the same as adding them one by one.
model = Sequential([
    # 2000-entry vocabulary, 256-dim vectors, input width taken from the padded data.
    Embedding(2000, 256, input_length = train_tweets.shape[1]),
    SpatialDropout1D(0.4),
    LSTM(256, dropout = 0.2),
    # Two output units, matching the two-column one-hot labels.
    Dense(2, activation = 'softmax'),
])
model.compile(
    optimizer = 'adam',
    loss = 'categorical_crossentropy',
    metrics = ['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 27, 256)           512000    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 27, 256)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 256)               525312    
_________________________________________________________________
dense (Dense)                (None, 2)                 514       
Total params: 1,037,826
Trainable params: 1,037,826
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Train the LSTM on the padded training sequences.
# Targets are the label column '0', one-hot encoded with pd.get_dummies.
model.fit(
    x = train_tweets,
    y = pd.get_dummies(df_train['0']).values,
    batch_size = 512,
    epochs = 10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7efe28b4cd30>

In [9]:
# test model
# One-hot encode the test labels against the label set seen in training.
# Encoding the test column on its own can yield a different number or order of
# columns than the model was trained on (e.g. a class absent from the test split
# would give a single-column target for a two-unit softmax).
label_classes = sorted(df_train['0'].dropna().unique())
unseen_labels = set(df_test['0'].dropna().unique()) - set(label_classes)
if unseen_labels:
    # Fail with an explicit message instead of an opaque shape mismatch in Keras.
    raise ValueError("Test labels not present in training data: {}".format(sorted(unseen_labels)))
test_labels = pd.get_dummies(
    pd.Series(pd.Categorical(df_test['0'], categories = label_classes))
).values

# evaluate() returns [loss, accuracy] because the model was compiled with metrics = ['accuracy'].
score, accuracy = model.evaluate(test_tweets, test_labels, batch_size = 512)
print("Test accuracy: {}".format(accuracy))

Test accuracy: 0.8029493093490601


### RNN Model

In [10]:
# Simple RNN classifier: embedding -> spatial dropout -> SimpleRNN -> 2-way softmax.
# The layers are passed as a list; the stack is the same as adding them one by one.
RNNmodel = Sequential([
    # 2000-entry vocabulary, 256-dim vectors, input width taken from the padded data.
    Embedding(2000, 256, input_length = train_tweets.shape[1]),
    SpatialDropout1D(0.4),
    SimpleRNN(256, dropout = 0.2),
    # Two output units, matching the two-column one-hot labels.
    Dense(2, activation = 'softmax'),
])
RNNmodel.compile(
    optimizer = 'adam',
    loss = 'categorical_crossentropy',
    metrics = ['accuracy'],
)
RNNmodel.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 27, 256)           512000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 27, 256)           0         
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 256)               131328    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 514       
Total params: 643,842
Trainable params: 643,842
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Train the simple RNN on the padded training sequences.
# Targets are the label column '0', one-hot encoded with pd.get_dummies.
RNNmodel.fit(
    x = train_tweets,
    y = pd.get_dummies(df_train['0']).values,
    batch_size = 512,
    epochs = 10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7efe1c4a9a90>

In [12]:
# test model
# One-hot encode the test labels against the label set seen in training.
# Encoding the test column on its own can yield a different number or order of
# columns than the model was trained on (e.g. a class absent from the test split
# would give a single-column target for a two-unit softmax).
label_classes = sorted(df_train['0'].dropna().unique())
unseen_labels = set(df_test['0'].dropna().unique()) - set(label_classes)
if unseen_labels:
    # Fail with an explicit message instead of an opaque shape mismatch in Keras.
    raise ValueError("Test labels not present in training data: {}".format(sorted(unseen_labels)))
test_labels = pd.get_dummies(
    pd.Series(pd.Categorical(df_test['0'], categories = label_classes))
).values

# evaluate() returns [loss, accuracy] because the model was compiled with metrics = ['accuracy'].
RNNscore, RNNaccuracy = RNNmodel.evaluate(test_tweets, test_labels, batch_size = 512)
print("Test accuracy: {}".format(RNNaccuracy))

Test accuracy: 0.7740947008132935


Overall, the best-performing model is the LSTM and the worst-performing model is the SVM. The deep learning models performed better than the classical machine learning methods, probably because their recurrent structure lets them process a tweet as an ordered sequence of words. In particular, the LSTM performed better than the simple RNN (test accuracy of about 0.803 versus 0.774), likely because its gated memory cell can retain information from earlier timesteps over longer spans. As discovered in data_preprocessing, some words occur in both positive and negative tweets, so the sentiment depends on context rather than on individual words; models such as the RNN and LSTM, which take timestep information into account, are therefore more useful for predicting sentiment.