In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


#### Loading the data Set

In [4]:
data = pd.read_csv('first-gop-debate-twitter-sentiment/Sentiment.csv')

In [5]:
df = data.copy()
df.head(2)

Unnamed: 0,id,candidate,candidate_confidence,relevant_yn,relevant_yn_confidence,sentiment,sentiment_confidence,subject_matter,subject_matter_confidence,candidate_gold,...,relevant_yn_gold,retweet_count,sentiment_gold,subject_matter_gold,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,1,No candidate mentioned,1.0,yes,1.0,Neutral,0.6578,None of the above,1.0,,...,,5,,,RT @NancyLeeGrahn: How did everyone feel about...,,2015-08-07 09:54:46 -0700,629697200650592256,,Quito
1,2,Scott Walker,1.0,yes,1.0,Positive,0.6333,None of the above,1.0,,...,,26,,,RT @ScottWalker: Didn't catch the full #GOPdeb...,,2015-08-07 09:54:46 -0700,629697199560069120,,


* Only relevant columns are considered for the analysis

In [6]:
df = df[['text','sentiment']]

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13871 entries, 0 to 13870
Data columns (total 2 columns):
text         13871 non-null object
sentiment    13871 non-null object
dtypes: object(2)
memory usage: 216.8+ KB


In [8]:
df = df[df.sentiment != 'Neutral']
# Dropping Neutral sentiment tweets

In [9]:
df['text'] = df['text'].apply(lambda x : x.lower()) # Making the text lower cased
df['text'] = df['text'].apply((lambda x: re.sub('[^a-zA-Z0-9\s]',' ',x))) 
# Except alphabets and digts and spaces between discarding everything from the text column

In [10]:
print(df[ df['sentiment'] == 'Positive'].size)
print(df[ df['sentiment'] == 'Negative'].size)

4472
16986


> There are many Negative comments than the Positive tweets

In [11]:
# removing the 'rt' from gthe tweets
for idx,row in df.iterrows():
    row[0] = row[0].replace('rt',' ')

* Using this Tokenizer for tokenizing the data into features
* > from keras.preprocessing.text import Tokenizer

In [12]:
max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(df['text'].values)
X = pad_sequences(X)

> Creating the LSTM model

In [13]:
embed_dim = 128
lstm_out = 196
voc=max(np.array([embed_dim,lstm_out]))+1

In [21]:
voc

197

> * LSTM model with Embedding layer
* embed_dim, lstm_out, batch_size, droupout_x are just hyperparameters play with them to get better results..

In [24]:
model = Sequential()

model.add(Embedding(max_features, embed_dim,input_length = X.shape[1]))
# Embedding : Turns positive integers (indexes) into dense vectors of fixed size.
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out,dropout=0.2,recurrent_dropout=0.2)) # First layer of LSTM
model.add(Dense(units=2,activation='softmax')) # Output has to be only 2 outcomes Dense makes the output
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 29, 128)           256000    
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, 29, 128)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


> Declaration of Train and Test

In [17]:
Y = pd.get_dummies(df['sentiment']).values

In [18]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42)

In [None]:
x_

In [19]:
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(7188, 29) (7188, 2)
(3541, 29) (3541, 2)


In [25]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 12, batch_size=batch_size, verbose = 2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/12
 - 29s - loss: 0.4342 - accuracy: 0.8179
Epoch 2/12
 - 27s - loss: 0.3239 - accuracy: 0.8657
Epoch 3/12
 - 26s - loss: 0.2918 - accuracy: 0.8790
Epoch 4/12
 - 26s - loss: 0.2629 - accuracy: 0.8930
Epoch 5/12
 - 26s - loss: 0.2441 - accuracy: 0.8982
Epoch 6/12
 - 25s - loss: 0.2186 - accuracy: 0.9126
Epoch 7/12
 - 24s - loss: 0.1974 - accuracy: 0.9225
Epoch 8/12
 - 24s - loss: 0.1838 - accuracy: 0.9265
Epoch 9/12
 - 24s - loss: 0.1746 - accuracy: 0.9316
Epoch 10/12
 - 25s - loss: 0.1622 - accuracy: 0.9341
Epoch 11/12
 - 29s - loss: 0.1444 - accuracy: 0.9423
Epoch 12/12
 - 29s - loss: 0.1443 - accuracy: 0.9431


<keras.callbacks.callbacks.History at 0x1fcf0223470>

In [None]:
# Validation
validation_size = 1500

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))