## Twitter Sentiment Analysis with Gensim Word2Vec and Keras Convolutional Network
Giuseppe Bonaccorso (https://www.bonaccorso.eu)

In [1]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib


In [2]:
import multiprocessing
import os

import dill
import keras.backend as K
import tensorflow as tf

from gensim.models.word2vec import Word2Vec

from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.convolutional import Conv1D
from keras.optimizers import Adam

from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import RegexpTokenizer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Fix the state of NumPy's global random generator so that the random
# draws made through np.random in this notebook are repeatable.
random_seed = 1000
np.random.seed(random_seed)

In [4]:
# Choose whether Keras/TensorFlow is allowed to use a GPU.
# See: https://stackoverflow.com/questions/40690598/can-keras-with-tensorflow-backend-be-forced-to-use-cpu-or-gpu-at-will
use_gpu = True

# One thread per CPU reported by multiprocessing, used for both the
# intra-op and the inter-op thread pools.
n_threads = multiprocessing.cpu_count()

# Device limits handed to TensorFlow: the GPU count drops to 0 when use_gpu is False.
device_limits = {'CPU': 1, 'GPU': 1 if use_gpu else 0}

config = tf.ConfigProto(
    intra_op_parallelism_threads=n_threads,
    inter_op_parallelism_threads=n_threads,
    allow_soft_placement=True,
    device_count=device_limits,
)

# Create the session from this configuration and register it with Keras.
session = tf.Session(config=config)
K.set_session(session)

Download the dataset from: http://thinknook.com/wp-content/uploads/2012/09/Sentiment-Analysis-Dataset.zip

In [5]:
# Location of the downloaded CSV dataset.
dataset_location = './data.csv'

# Directory (with trailing separator) that receives the tokenized corpus
# and the Word2Vec model; file names are appended to it by plain string
# concatenation further down.
model_location = './model/'

# Make sure the output directory exists, otherwise the first
# open(model_location + ..., 'wb') fails on a fresh checkout.
os.makedirs(model_location, exist_ok=True)

### Parse tweet corpus and sentiments

In [6]:

# Parallel containers: corpus[i] is a raw text sample and labels[i] its class.
corpus, labels = [], []

In [7]:
# NOTE: the original CSV parser for `dataset_location` was disabled in this
# notebook; the corpus is assembled from four plain-text files instead, each
# holding one sample per line and each mapped to a single class label.


def read_labeled_lines(filepath, label):
    """Read every line of a text file and pair it with a fixed class label.

    Parameters
    ----------
    filepath : str
        Path of a text file with one sample per line. The file is opened
        with the platform's default text encoding.
    label : int
        Label attached to every line of the file (1 or 0 in this notebook).

    Returns
    -------
    tuple (list of str, list of int)
        The lines exactly as yielded by the file iterator (trailing newline
        characters are kept) and a list of labels of the same length.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    with open(filepath) as fp:
        lines = list(fp)
    return lines, [label] * len(lines)


# (file, label) pairs; l1 counts the lines of the first group and diff the
# lines of the second group.
first_group = [('first.txt', 1), ('second.txt', 0)]
second_group = [('first1.txt', 1), ('second1.txt', 0)]

# Start from empty lists so that re-running this cell does not append the
# same files a second time (which would duplicate samples and inflate l1).
corpus = []
labels = []

for filepath, label in first_group:
    file_lines, file_labels = read_labeled_lines(filepath, label)
    corpus.extend(file_lines)
    labels.extend(file_labels)

l1 = len(corpus)

for filepath, label in second_group:
    file_lines, file_labels = read_labeled_lines(filepath, label)
    corpus.extend(file_lines)
    labels.extend(file_labels)

l2 = len(corpus)
diff = l2 - l1

print(l1, diff)



35297 4482


### Tokenize, drop @mentions and stem

In [8]:
# Lancaster stemmer applied to every token that is kept.
stemmer = LancasterStemmer()

# A token is a run of ASCII letters, digits and '@'; '@' is part of the
# pattern so that mentions stay in one piece and can be recognised by
# their leading '@'.
tkr = RegexpTokenizer('[a-zA-Z0-9@]+')

In [9]:
# For every tweet: tokenize, drop tokens that start with '@' (mentions),
# and stem whatever remains. The result keeps the order of `corpus`.
tokenized_corpus = [
    [stemmer.stem(token) for token in tkr.tokenize(tweet) if not token.startswith('@')]
    for tweet in corpus
]

#### Save tokenized corpus

In [10]:
# Serialize the tokenized corpus with dill so later sessions can skip tokenization.
tokenized_corpus_path = model_location + 'tokenized_corpus.dill'
with open(tokenized_corpus_path, 'wb') as dump_file:
    dill.dump(tokenized_corpus, dump_file)

#### Load tokenized corpus

In [11]:
# Restore the tokenized corpus written by the save cell.
# Unpickling can execute arbitrary code: only load files produced by this notebook.
tokenized_corpus_path = model_location + 'tokenized_corpus.dill'
with open(tokenized_corpus_path, 'rb') as dump_file:
    tokenized_corpus = dill.load(dump_file)

### Gensim Word2Vec model

In [12]:
# Context window passed to Word2Vec.
window_size = 10

# Dimensionality of each word vector; it is also the last axis of the
# X_train / X_test tensors built later.
vector_size = 512

In [13]:
# Train the Word2Vec embeddings on the tokenized tweets.
w2v_options = dict(
    size=vector_size,
    window=window_size,
    negative=20,
    iter=50,
    seed=1000,
    workers=multiprocessing.cpu_count(),  # one worker per reported CPU
)
word2vec = Word2Vec(sentences=tokenized_corpus, **w2v_options)

#### Save Word2Vec model

In [14]:
# Persist the trained model inside the model directory.
word2vec_path = model_location + 'word2vec.model'
word2vec.save(word2vec_path)

#### Load Word2Vec model

In [15]:
# Reload the model from the path used by the save cell.
word2vec_path = model_location + 'word2vec.model'
word2vec = Word2Vec.load(word2vec_path)

### Copy word vectors and delete the Word2Vec model and the original corpus to save memory

In [16]:
# Only the word vectors are needed from here on.
X_vecs = word2vec.wv

# Drop the names of the full model and of the raw text corpus;
# tokenized_corpus and labels stay available.
del word2vec, corpus

#### Train subset size (0 < size < len(tokenized_corpus))

In [17]:
# l1 is the number of lines read from first.txt and second.txt.
train_size = l1

#### Test subset size (0 < size < len(tokenized_corpus) - train_size)

In [18]:
# diff is the number of lines read from first1.txt and second1.txt.
test_size = diff

#### Compute average and max tweet length

In [19]:
# Token count of every tokenized tweet.
tweet_lengths = [len(tweet) for tweet in tokenized_corpus]

# avg_length holds the *total* token count as a float; the division by the
# corpus size only happens when the value is printed.
avg_length = float(sum(tweet_lengths))
max_length = max(tweet_lengths, default=0)

print('Average tweet length: {}'.format(avg_length / float(len(tokenized_corpus))))
print('Max tweet length: {}'.format(max_length))

Average tweet length: 16.50685034817366
Max tweet length: 45


#### Tweet max length (number of tokens)

In [20]:
# Number of token positions per sample in X_train / X_test: tokens beyond
# this position are ignored when the tensors are filled, and positions
# that are never written remain zero.
max_tweet_length = 15

### Create train and test sets

In [21]:
# Draw train_size + test_size distinct positions from the whole corpus.
# NOTE(review): because the draw covers the whole corpus, samples from all
# four input files can end up in either split - confirm this is intended.
indexes = np.random.choice(len(tokenized_corpus), train_size + test_size, replace=False)

# Inputs: (samples, token position, embedding dimension).
# Targets: one-hot over two classes.
embedding_dtype = K.floatx()
X_train = np.zeros((train_size, max_tweet_length, vector_size), dtype=embedding_dtype)
Y_train = np.zeros((train_size, 2), dtype=np.int32)
X_test = np.zeros((test_size, max_tweet_length, vector_size), dtype=embedding_dtype)
Y_test = np.zeros((test_size, 2), dtype=np.int32)

for position, index in enumerate(indexes):
    # The first train_size draws fill the training arrays, the rest the test arrays.
    if position < train_size:
        X_target, Y_target, row = X_train, Y_train, position
    else:
        X_target, Y_target, row = X_test, Y_test, position - train_size

    # Only the first max_tweet_length tokens are used; a token without a
    # vector leaves its position at zero.
    for t, token in enumerate(tokenized_corpus[index][:max_tweet_length]):
        if token in X_vecs:
            X_target[row, t, :] = X_vecs[token]

    # Label 0 -> [1, 0]; any other label -> [0, 1].
    Y_target[row, :] = [1.0, 0.0] if labels[index] == 0 else [0.0, 1.0]
        
   

### Keras Convolutional model

In [22]:
# NOTE: nb_epochs is not passed to model.fit in this notebook; the fit
# call states its own epoch count.
nb_epochs = 100

# Mini-batch size handed to model.fit.
batch_size = 32

In [23]:
# Each sample is a (token position, embedding dimension) matrix.
input_shape = (max_tweet_length, vector_size)

# Options shared by every convolutional layer.
conv_options = {'activation': 'elu', 'padding': 'same'}

model = Sequential()

# Stage 1: four convolutions with kernel size 3, then dropout.
model.add(Conv1D(32, kernel_size=3, input_shape=input_shape, **conv_options))
for layer_no in range(3):
    model.add(Conv1D(32, kernel_size=3, **conv_options))
model.add(Dropout(0.25))

# Stage 2: four convolutions with kernel size 2, then dropout.
for layer_no in range(4):
    model.add(Conv1D(32, kernel_size=2, **conv_options))
model.add(Dropout(0.25))

model.add(Flatten())

# Classifier head: two tanh layers, dropout, and a 2-way softmax.
for layer_no in range(2):
    model.add(Dense(256, activation='tanh'))
model.add(Dropout(0.5))

model.add(Dense(2, activation='softmax'))
print(input_shape)

(15, 512)


In [24]:
# Configure training: Adam optimizer, categorical cross-entropy over the
# two softmax outputs, accuracy reported as the metric.
optimizer = Adam(lr=0.0001, decay=1e-6)
model.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [27]:
# Train the network, validating on the test tensors after every epoch.
# The epoch count is given here directly; nb_epochs is not used by this call.
model.fit(
    x=X_train,
    y=Y_train,
    validation_data=(X_test, Y_test),
    epochs=10,
    batch_size=batch_size,
    shuffle=True,
)

Train on 35297 samples, validate on 4482 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x22aca957ba8>