In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('mode.chained_assignment', None)
palette=sns.color_palette('magma')
sns.set(palette=palette)

In [2]:
train_data=pd.read_csv('train.csv', encoding='latin-1')
test_data=pd.read_csv('test.csv', encoding='latin-1')

train_data.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [3]:
import string
import re
import nltk
stopwords = nltk.corpus.stopwords.words('english')
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    #remove links starting with http
    text1 = re.sub(r'http\S+', ' ', text)
    #remove digits
    text2 = re.sub(r'\d+',' ', text1)
    tokens = re.split('\W+', text2)
    text = [word for word in tokens if word not in stopwords]
    return text

In [4]:
train_data['OriginalTweetClean'] = train_data['Text'].apply(lambda x: clean_text(x))
train_data['OriginalTweetClean'] = train_data['Text'].apply(lambda x: clean_text(x))

train_data.head()
train_data.to_csv("dataset.csv")

In [5]:
X_train = train_data['OriginalTweetClean']
X_test = train_data['OriginalTweetClean']

y_train = train_data['Sentiment']
y_test = train_data['Sentiment']

In [6]:
y_train = pd.get_dummies(y_train)
y_test = pd.get_dummies(y_test)
y_test.head()

Unnamed: 0,-1,1
0,False,True
1,False,True
2,False,True
3,False,True
4,False,True


In [7]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
#The maximum number of words to be used(most frequent)
MAX_NB_WORDS = 10000
#Max number of words in each Tweet
MAX_SEQUENCE_LENGTH = 100


# Initialize and fit the tokenizer
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True, split=' ')
tokenizer.fit_on_texts(X_train)

In [8]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_seq[10]

[94, 1811, 307, 274, 93, 777, 21, 18, 113, 9, 158, 1]

In [9]:
#Pad the sequences so each sequence is the same length
X_train_seq_padded = pad_sequences(X_train_seq,44)
X_test_seq_padded = pad_sequences(X_test_seq,44)

X_train_seq_padded[10]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   94,
       1811,  307,  274,   93,  777,   21,   18,  113,    9,  158,    1])

In [10]:
print('X_train_seq_padded:', X_train_seq_padded.shape)
print('X_test_seq_padded:', X_test_seq_padded.shape)

print('y_train:', y_train.shape)
print('y_test:', y_test.shape)

X_train_seq_padded: (5791, 44)
X_test_seq_padded: (5791, 44)
y_train: (5791, 2)
y_test: (5791, 2)


# Model Building

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping

In [12]:
model=Sequential()
model.add(Embedding(input_dim=MAX_NB_WORDS, output_dim=64, input_length=X_train_seq_padded.shape[1]))
#input_dim: Size of the vocabulary.
#output_dim: Dimension of the dense embedding.

model.add(SpatialDropout1D(0.4))
model.add(LSTM(128, activation='relu', dropout=0.2, recurrent_dropout=0.2, return_sequences=True))

model.add(LSTM(128, activation='relu', dropout=0.2, recurrent_dropout=0.2, return_sequences=True))

model.add(LSTM(128, activation='relu', dropout=0.2, recurrent_dropout=0.2))

model.add(Dense(2, activation='softmax'))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 44, 64)            640000    
                                                                 
 spatial_dropout1d (SpatialD  (None, 44, 64)           0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 44, 128)           98816     
                                                                 
 lstm_1 (LSTM)               (None, 44, 128)           131584    
                                                                 
 lstm_2 (LSTM)               (None, 128)               131584    
                                                                 
 dense (Dense)               (None, 2)                 258       
                                                        

In [13]:
#Compile the model
model.compile(optimizer=Adam(),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

#Adding an early stopping
es = EarlyStopping(monitor='val_accuracy', 
                   mode='max', 
                   patience=4, #Stop the model training if the validation accuracy doesnt increase in 4 consecutive Epochs
                   restore_best_weights=True)

In [14]:
#Fit the RNN
history = model.fit(X_train_seq_padded, y_train, 
                    batch_size=32, epochs=8, callbacks =[es],
                    validation_data=(X_test_seq_padded, y_test))

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [15]:
model.save("trainmodel.h5")

In [16]:
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)