# Sentiment analysis on Twitter using word2vec and keras

In [3]:
#import libraries
import sys
import pandas as pd # provide data manipulation tools.
import numpy as np # high dimensional vector computing library.
from string import punctuation
from random import shuffle
from tqdm import tqdm    #Shows progress bar
tqdm.pandas(desc="progress-bar")

from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
tokenizer = TweetTokenizer()

## Preprocess Data

In [4]:
def preprocess():
    data = pd.read_csv("tweets.csv",index_col=False,names=["Sentiment","SentimentText"])
    data = data[data.Sentiment.isnull() == False]
    data['Sentiment'] = data['Sentiment'].map(int)
    data = data[data['SentimentText'].isnull() == False]
    data.reset_index(inplace=True)
    data.drop('index', axis=1, inplace=True)
    print 'dataset loaded with shape', data.shape    
    return data

data=preprocess()
data.head(3)

dataset loaded with shape (1231862, 2)


Unnamed: 0,Sentiment,SentimentText
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww\n0"""
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire


In [5]:
def tokenize(tweet):
    try:
        tweet = unicode(tweet.decode('utf-8').lower())
        tokens = tokenizer.tokenize(tweet)
        tokens = filter(lambda t: not t.startswith('@'), tokens)
        tokens = filter(lambda t: not t.startswith('#'), tokens)
        tokens = filter(lambda t: not t.startswith('http'), tokens)
        return tokens
    except:
        return 'NC'
    
def postprocess(data, n=100000):
    data = data.head(n)
    data['tokens'] = data['SentimentText'].progress_map(tokenize)  ## progress_map is a variant of the map function plus a progress bar.
    data = data[data.tokens != 'NC']
    data.reset_index(inplace=True)
    del data['index']
    return data

data = postprocess(data)

progress-bar: 100%|██████████| 100000/100000 [00:08<00:00, 11553.71it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [6]:
data.head(4)

Unnamed: 0,Sentiment,SentimentText,tokens
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww\n0""","[-, awww, 0, ""]"
1,0,@Kenichan I dived many times for the ball. Man...,"[i, dived, many, times, for, the, ball, ., man..."
2,0,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its..."
3,0,"@nationwideclass no \n0""","[no, 0, ""]"


In [7]:
print data.shape

(100000, 3)


## Lets start with Word2vec :

In [8]:
import gensim
from gensim.models.word2vec import Word2Vec # the word2vec model gensim class
TaggedDocument = gensim.models.doc2vec.TaggedDocument 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
#Split the data into testing data and traing data
n=100000
x_train, x_test, y_train, y_test = train_test_split(np.array(data.head(n).tokens),
                                                    np.array(data.head(n).Sentiment), test_size=0.2)

In [10]:
def labelizeTweets(tweets, label_type):
    labelized = []
    for i,v in tqdm(enumerate(tweets)):
        label = '%s_%s'%(label_type,i)
        labelized.append(TaggedDocument(v, [label]))
    return labelized

x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST')

80000it [00:00, 181281.34it/s]
20000it [00:00, 295691.08it/s]


In [11]:
x_train[:3]

[TaggedDocument(words=[u'is', u'sickies', u'will', u'soilder', u'on', u'for', u'after', u'work', u'drinks', u'though', u'!'], tags=['TRAIN_0']),
 TaggedDocument(words=[u'kid', u'vomit', u'...', u'dog', u'vomit', u'...', u'happy', u'mothers', u'day'], tags=['TRAIN_1']),
 TaggedDocument(words=[u'does', u'your', u'sack', u'weep', u'when', u'it', u'rains', u'?', u'mine', u'does', u'..', u'and', u'boy', u'does', u'it', u'let', u'me', u'know', u'about', u'it'], tags=['TRAIN_2'])]

In [12]:
#Fit data into Word2Vec model

n_dim=50  #no. of features

tweet_w2v = Word2Vec(size=n_dim, min_count=10)
tweet_w2v.build_vocab([x.words for x in tqdm(x_train)])
tweet_w2v.train([x.words for x in tqdm(x_train)], total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.epochs)


100%|██████████| 80000/80000 [00:00<00:00, 1210944.77it/s]
100%|██████████| 80000/80000 [00:00<00:00, 1273113.42it/s]


(3344801, 5046105)

### Now that we have a vector representation of every world , lets a method named most_similar provided by Word2Vec gensim. 

### Given a word, this method returns the top n similar ones. 

In [13]:
print tweet_w2v.wv.most_similar('facebook')

[(u'myspace', 0.8617261052131653), (u'comment', 0.8304721713066101), (u'youtube', 0.8236387372016907), (u'twitter', 0.8197606801986694), (u'fb', 0.8139217495918274), (u'itunes', 0.8088420033454895), (u'iphone', 0.8043081164360046), (u'tweetdeck', 0.7998829483985901), (u'site', 0.7863008379936218), (u'website', 0.7754982113838196)]


In [14]:
print tweet_w2v.wv.most_similar('happy')

[(u"mother's", 0.6230425834655762), (u'nice', 0.6164500117301941), (u'busy', 0.6096088290214539), (u'mothers', 0.6003849506378174), (u'excited', 0.5783932209014893), (u'bummed', 0.5708513259887695), (u'boring', 0.5586500763893127), (u'sad', 0.5523676872253418), (u'quiet', 0.5413385629653931), (u'beautiful', 0.53265380859375)]


## Visualizing the Dataset 

* step1: Convert n-dimensional vector to 2-D vector using TSNE
* step2: vizualize these vector on a plane using Bokeh

In [15]:
from sklearn.manifold import TSNE

tsne_model = TSNE(n_components=2, verbose=1, random_state=0)

In [16]:
# getting a list of word vectors. limit to 10000. each is of 200 dimensions
word_vectors = [tweet_w2v[w] for w in tweet_w2v.wv.vocab.keys()[:4000]]


  from ipykernel import kernelapp as app


In [17]:
tsne_w2v = tsne_model.fit_transform(word_vectors)

# putting everything in a dataframe
tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
tsne_df['words'] = tweet_w2v.wv.vocab.keys()[:4000]

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 4000 samples in 0.008s...
[t-SNE] Computed neighbors for 4000 samples in 1.774s...
[t-SNE] Computed conditional probabilities for sample 1000 / 4000
[t-SNE] Computed conditional probabilities for sample 2000 / 4000
[t-SNE] Computed conditional probabilities for sample 3000 / 4000
[t-SNE] Computed conditional probabilities for sample 4000 / 4000
[t-SNE] Mean sigma: 0.169372
[t-SNE] KL divergence after 250 iterations with early exaggeration: 82.398155
[t-SNE] Error after 1000 iterations: 2.441172


In [18]:
print tsne_df.head(5)

           x          y     words
0  10.164096  -9.821815     stock
1   9.867121 -45.394291   raining
2  13.771583 -11.498062    todays
3 -31.686312  10.150613  showered
4 -32.879318  29.907715    whoops


In [19]:
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook                #output_file

# defining Chart
output_notebook()
plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="A map of 10000 word vectors",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

plot_tfidf.scatter(x='x', y='y', source=tsne_df)
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"word": "@words"}
show(plot_tfidf)

## Creating a single vector representing a whole tweet by using TF-idf

In [20]:
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
vectorizer.fit_transform([x.words for x in x_train])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))

def buildWordVector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += tweet_w2v[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError: # handling the case where the token is not
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [21]:
from sklearn.preprocessing import scale
train_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_train))])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_test))])
test_vecs_w2v = scale(test_vecs_w2v)

100%|██████████| 80000/80000 [00:19<00:00, 4079.98it/s]
100%|██████████| 20000/20000 [00:04<00:00, 4090.35it/s]


## Build a Sentiment Classifier using Keras simple Neural Network

In [22]:
from keras.models import Sequential
from keras.layers import Dense, Activation


model = Sequential()
model.add(Dense(32, activation='relu', input_dim=n_dim))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])


model.fit(train_vecs_w2v, y_train, epochs=5, batch_size=32, verbose=2)
score = model.evaluate(test_vecs_w2v, y_test, batch_size=128, verbose=2)
print "Score:"+str(score[1])

Using TensorFlow backend.


Epoch 1/5
 - 4s - loss: 0.0238 - acc: 0.9945
Epoch 2/5
 - 3s - loss: 2.1940e-04 - acc: 1.0000
Epoch 3/5
 - 3s - loss: 4.1026e-05 - acc: 1.0000
Epoch 4/5
 - 3s - loss: 9.5823e-06 - acc: 1.0000
Epoch 5/5
 - 3s - loss: 2.4225e-06 - acc: 1.0000
Score:1.0
