In [20]:
# Sentiment Analysis on Twitter data - predict positive/negative sentiment of tweet
# using Twitter data from: 
# "The Twitter Sentiment Analysis Dataset contains 1,578,627 classified tweets, 
#  each row is marked as 1 for positive sentiment and 0 for negative sentiment"
# linked in
# http://thinknook.com/twitter-sentiment-analysis-training-corpus-dataset-2012-09-22/
# http://thinknook.com/wp-content/uploads/2012/09/Sentiment-Analysis-Dataset.zip

In [17]:
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# Seed NumPy's global random number generator so that anything drawing from
# it gives the same values on every fresh run of the notebook.
# NOTE(review): this may not seed the Keras backend's own RNG (weight
# initialisation, batch shuffling) - confirm if exact training
# reproducibility is required.
numpy.random.seed(1)

In [18]:
# read in data
import pandas as pd
import zipfile

# Location of the downloaded archive and the name of the CSV member inside it.
# NOTE: this is an absolute, machine-specific path - adjust it to your setup.
ZIP_PATH = 'C:/twitter/Sentiment-Analysis-Dataset.zip'
CSV_NAME = 'Sentiment Analysis Dataset.csv'

# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0; its
# replacement is `on_bad_lines`. Pick whichever keyword the installed pandas
# understands. Both variants warn about malformed rows and skip them.
pandas_major_minor = tuple(int(part) for part in pd.__version__.split('.')[:2])
if pandas_major_minor >= (1, 3):
    bad_line_options = {'on_bad_lines': 'warn'}
else:
    bad_line_options = {'error_bad_lines': False}

# The context managers close both the archive and the member stream once the
# CSV has been parsed (the handles were previously left open).
with zipfile.ZipFile(ZIP_PATH) as zf, zf.open(CSV_NAME) as csv_file:
    df = pd.read_csv(csv_file, quotechar='"', sep=',', **bad_line_options)

print('Loaded lines:')
print(len(df))

# Two malformed CSV lines cannot be parsed and are skipped (see the warnings in the output below).
# Example of a skipped CSV row: 8835,1,Kaggle,""" Brokeback Mountain "" is a great short story and explains more, oddly enough, than the movie does, even though both cover the same chronological ground."


b'Skipping line 8836: expected 4 fields, saw 5\n'
b'Skipping line 535882: expected 4 fields, saw 7\n'


Loaded lines:
1578612


In [19]:
# how many tweets to use; a large amount may run out of memory.
# Observed memory use:
#   200.000 sample                  : around 8 GB
#   100.000 sample, 1000 top_words  : around 8 GB
#   200.000 sample, 3000 top_words  : around 16 GB
sample_size = 100000
# how many most common words to use
top_words = 1000

# Draw the working sample with an explicit seed, so that re-running this cell
# on its own always selects the same tweets, regardless of how much of
# NumPy's global random state earlier cells have consumed.
sample = df.sample(sample_size, random_state=1)

# The tokenizer is fed a plain Python list of tweet texts.
mydataX = sample['SentimentText'].tolist()
# Keep the 0/1 labels as a NumPy array rather than a Python list: arrays are
# accepted as training targets by every Keras version, whereas a list of
# ints is rejected by some newer ones.
y = sample['Sentiment'].values

# Encode every tweet as a fixed-width vector over the most frequent words
from keras.preprocessing.text import Tokenizer

# Tokenizer settings: keep only the `top_words` most common words, lower-case
# the text, strip the listed punctuation/whitespace characters and split on
# single spaces (word level, not character level).
tokenizer_options = dict(
    num_words=top_words,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
    char_level=False,
)
t = Tokenizer(**tokenizer_options)

# learn the vocabulary from the sampled tweets ...
t.fit_on_texts(mydataX)
# ... and turn each tweet into one row of the document/word matrix
encoded = t.texts_to_matrix(mydataX)

# length: number of docs * number of words used
print('Number of inputs:')
print(len(encoded))
print('Number of words used:')
# width of one row; same as print(t.num_words)
print(len(encoded[1]))

# Hold out 10% of the sampled tweets as a test set; the fixed random_state
# makes the partition repeatable.
from sklearn.model_selection import train_test_split
split_options = {'test_size': 0.1, 'random_state': 1}
X_train, X_test, y_train, y_test = train_test_split(encoded, y, **split_options)

# report the size of both partitions
print('X_train length:')
print(len(X_train))
print('X_test length:')
print(len(X_test))

# The rows produced by texts_to_matrix are already fixed-width vectors (one
# column per word index), not variable-length sequences of word indices, so
# there is nothing to truncate or pad. Running them through
# sequence.pad_sequences with maxlen equal to their existing width merely
# copied the data and cast it to int32. Instead, cast explicitly to float32
# (a compact floating-point dtype for the network input) and assert the
# expected width.
max_review_length = top_words
X_train = numpy.asarray(X_train, dtype='float32')
X_test = numpy.asarray(X_test, dtype='float32')
assert X_train.shape[1] == X_test.shape[1] == max_review_length, \
    'feature width must equal the number of words used'

# confirm length of each item is max_review_length
print('Length of features in each item')
print(len(X_train[0]))

Number of inputs:
100000
Number of words used:
1000
X_train length:
90000
X_test length:
10000
Length of features in each item
1000


In [20]:
# create the model: a small fully connected network on the word vectors
model = Sequential()
# hidden layer: top_words input features -> 100 tanh units
model.add(Dense(input_dim=top_words, units=100, activation='tanh'))
# output layer: one sigmoid unit for the binary label
# (1 = positive sentiment, 0 = negative sentiment)
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the layer table itself and returns None, so it must not be
# wrapped in print() - that only appended a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 100)               100100    
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 101       
Total params: 100,201
Trainable params: 100,201
Non-trainable params: 0
_________________________________________________________________
None


In [21]:
# Train for 10 epochs in mini-batches of 100 tweets
model.fit(X_train, y_train, batch_size=100, epochs=10)
# Final evaluation of the model on the held-out test set.
# The model was compiled with metrics=['accuracy'], so scores[1] is the accuracy.
scores = model.evaluate(X_test, y_test, verbose=0)
test_accuracy = scores[1]
print("Accuracy: %.2f%%" % (test_accuracy*100))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 76.95%


In [None]:
# Train for 5 more epochs. fit() is called on the existing `model` object, so
# these epochs come on top of whatever training it has already received.
extra_epochs = 5
model.fit(X_train, y_train, epochs=extra_epochs, batch_size=100)
# Final evaluation of the model on the test set; index 1 holds the accuracy
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

In [None]:
# accuracy
# 100.000 sample, 1000 top words: 76.7 % accuracy after 10 epochs
# 150.000 sample, 2000 top words: 77.8 % after 10 epochs, 20 epochs overfits
# 200.000 sample, 3000 top words: 79.1 % after 10 epochs, 15 epochs overfits