# Imports

In [1]:
import pandas as pd # provide sql-like data manipulation tools. very handy.
pd.options.mode.chained_assignment = None
import numpy as np # high dimensional vector computing library.
from copy import deepcopy
from string import punctuation
from random import shuffle
import pickle
import h5py
import json
import matplotlib.pyplot as plt 

import gensim
from gensim.models.word2vec import Word2Vec # the word2vec model gensim class

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
from nltk import word_tokenize

from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.constraints import maxnorm
from keras.optimizers import SGD
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.utils import np_utils
from keras.callbacks import Callback
from keras.models import model_from_json


# importing bokeh library for interactive dataviz
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

  from pandas import Panel


# Loading Data

In [2]:
def ingest():
    data = pd.read_csv('data/training.1600000.processed.noemoticon.csv', encoding='latin-1') # Enter your file location
    data.columns=["Sentiment","ItemID","Date","Blank","SentimentSource","SentimentText"]
    data.drop(['ItemID', 'SentimentSource'], axis=1, inplace=True)
    data = data[data.Sentiment.isnull() == False]
    data['Sentiment'] = data['Sentiment'].map( {4:1, 0:0}) #Converting 4 to 1
    data = data[data['SentimentText'].isnull() == False]
    data.reset_index(inplace=True)
    data.drop('index', axis=1, inplace=True)
    print ('dataset loaded with shape', data.shape  )  
    return data

data = ingest()

dataset loaded with shape (1599999, 4)


# Processing Data

In [3]:
tokenizer = TweetTokenizer()
def tokenize(tweet):
    try:
        tweet = tweet.lower()
        tokens = tokenizer.tokenize(tweet)
        tokens = list(filter(lambda t: not t.startswith('@'), tokens))
        tokens = list(filter(lambda t: not t.startswith('#'), tokens))
        tokens = list(filter(lambda t: not t.startswith('http'), tokens))
        return tokens
    except:
        return 'NC'

In [4]:
def postprocess(data):
#     , n=1000000
#     data = data.head(n)
    data['tokens'] = data['SentimentText'].progress_map(tokenize)  ## progress_map is a variant of the map function plus a progress bar. Handy to monitor DataFrame creations.
    data = data[data.tokens != 'NC']
    data.reset_index(inplace=True)
    data.drop('index', inplace=True, axis=1)
    return data

data = postprocess(data)

progress-bar: 100%|██████████| 1599999/1599999 [02:42<00:00, 9854.12it/s] 


# Labelise Data

In [5]:
LabeledSentence = gensim.models.doc2vec.LabeledSentence # we'll talk about this down below

def labelizeTweets(tweets, label_type):
    labelized = []
    for i,v in tqdm(enumerate(tweets)):
        label = '%s_%s'%(label_type,i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized

In [6]:
#Splitting for training and testing
x_train, x_test, y_train, y_test = train_test_split(np.array(data.head(100000).tokens),
                                                    np.array(data.head(100000).Sentiment), test_size=0.2)


In [7]:
x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST') 

  labelized.append(LabeledSentence(v, [label]))
80000it [00:00, 162220.17it/s]
20000it [00:00, 218910.54it/s]


In [8]:
data_labellised= labelizeTweets(np.array(data.tokens), 'data')

  labelized.append(LabeledSentence(v, [label]))
1599999it [00:14, 108060.75it/s]


# Builidng word2vec vocabulary and training

In [9]:
n=100000
n_dim = 200
tweet_w2v = Word2Vec(size=n_dim, min_count=10)
tweet_w2v.build_vocab([x.words for x in tqdm(data_labellised)])

100%|██████████| 1599999/1599999 [00:00<00:00, 1827885.45it/s]


In [10]:
tweet_w2v.train([x.words for x in tqdm(data_labellised)],total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter) 

100%|██████████| 1599999/1599999 [00:00<00:00, 2049111.82it/s]
  tweet_w2v.train([x.words for x in tqdm(data_labellised)],total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)


(85662381, 117742340)

In [11]:
#Save the w2v model
#tweet_w2v.save('w2vmodel')
#Load the w2v model
#new_w2vmodel = gensim.models.Word2Vec.load('w2vmodel')

In [12]:
#convert any word(present in vocabulary) to vector.
tweet_w2v['bye']

  tweet_w2v['bye']


array([-0.30251047, -1.6405699 , -0.03402726,  0.06171053,  2.072614  ,
       -0.80636126, -0.8875673 ,  0.6213735 , -0.8549891 , -0.5925667 ,
       -0.2054568 ,  1.9618661 ,  0.2551082 ,  0.71145564,  0.55248886,
        1.1340448 , -1.3346654 ,  0.49977866,  0.6335914 ,  0.60645866,
        0.8246588 ,  0.19022721, -0.4694658 ,  0.08777889, -0.5309277 ,
        0.28843787,  1.4564227 , -0.14684612,  0.76662695,  1.4012446 ,
        0.3229195 ,  0.04194264, -0.39449295, -0.2928253 ,  0.42130405,
       -0.1199276 , -1.6242939 , -1.3620796 ,  0.49121478,  0.6751612 ,
       -0.54766166,  0.02130029, -0.82963926,  0.7152768 , -0.29126403,
        0.82934654,  0.7953976 ,  1.9167562 ,  0.7727084 , -1.499936  ,
       -0.9232202 , -1.6494724 , -0.3109122 , -0.49838433,  0.8297867 ,
       -1.6871886 ,  0.13586158, -0.72929054, -0.58872765, -0.14110719,
        0.83960795,  0.90627295,  1.7784247 , -0.19003321, -0.9883626 ,
       -1.9902493 ,  1.2869697 , -0.34673524,  0.9200596 , -0.09

In [13]:
#Find similar words
tweet_w2v.most_similar('fever')

  tweet_w2v.most_similar('fever')


[('migraine', 0.7162464261054993),
 ('cough', 0.6996488571166992),
 ('headache', 0.6861199140548706),
 ('bronchitis', 0.6843030452728271),
 ('toothache', 0.6728112697601318),
 ('flu', 0.6262699365615845),
 ('hayfever', 0.6159099340438843),
 ('chills', 0.6056267023086548),
 ('virus', 0.6038968563079834),
 ('migrane', 0.5986521244049072)]

# TF-IDF matrix of data

In [14]:
print ('building tf-idf matrix ...')
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x.words for x in data_labellised])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print ('vocab size :', len(tfidf))

building tf-idf matrix ...
vocab size : 34988


In [15]:
#Save the tfidf 
# with open("tfidfdict.txt", "wb") as myFile:
#     pickle.dump(tfidf, myFile)
# with open("tfidfdict.txt", "rb") as myFile:
#     tfidf = pickle.load(myFile)


# Build tweet vector to give input to FFNN

In [16]:
def buildWordVector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += tweet_w2v[word].reshape((1, size)) * tfidf[word] #combining w2v vectors with tfidf value of words in the tweet.
            count += 1.
        except KeyError: # handling the case where the token is not
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [17]:
train_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_train))])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_test))])
test_vecs_w2v = scale(test_vecs_w2v)

  vec += tweet_w2v[word].reshape((1, size)) * tfidf[word] #combining w2v vectors with tfidf value of words in the tweet.
80000it [00:16, 4941.87it/s]
  vec += tweet_w2v[word].reshape((1, size)) * tfidf[word] #combining w2v vectors with tfidf value of words in the tweet.
20000it [00:03, 5308.20it/s]


# Training 3 layered FFNN

In [18]:
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=200))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(train_vecs_w2v, y_train, epochs=100, batch_size=10000, verbose=2)

Epoch 1/100
8/8 - 1s - loss: 0.7017 - accuracy: 0.5681
Epoch 2/100
8/8 - 0s - loss: 0.4200 - accuracy: 0.9515
Epoch 3/100
8/8 - 0s - loss: 0.2719 - accuracy: 0.9952
Epoch 4/100
8/8 - 0s - loss: 0.1727 - accuracy: 0.9996
Epoch 5/100
8/8 - 0s - loss: 0.1072 - accuracy: 0.9999
Epoch 6/100
8/8 - 0s - loss: 0.0662 - accuracy: 1.0000
Epoch 7/100
8/8 - 0s - loss: 0.0412 - accuracy: 1.0000
Epoch 8/100
8/8 - 0s - loss: 0.0258 - accuracy: 1.0000
Epoch 9/100
8/8 - 0s - loss: 0.0163 - accuracy: 1.0000
Epoch 10/100
8/8 - 0s - loss: 0.0103 - accuracy: 1.0000
Epoch 11/100
8/8 - 0s - loss: 0.0066 - accuracy: 1.0000
Epoch 12/100
8/8 - 0s - loss: 0.0042 - accuracy: 1.0000
Epoch 13/100
8/8 - 0s - loss: 0.0027 - accuracy: 1.0000
Epoch 14/100
8/8 - 0s - loss: 0.0017 - accuracy: 1.0000
Epoch 15/100
8/8 - 0s - loss: 0.0011 - accuracy: 1.0000
Epoch 16/100
8/8 - 0s - loss: 7.2141e-04 - accuracy: 1.0000
Epoch 17/100
8/8 - 0s - loss: 4.6616e-04 - accuracy: 1.0000
Epoch 18/100
8/8 - 0s - loss: 3.0138e-04 - accura

<tensorflow.python.keras.callbacks.History at 0x7fa1c880db80>

In [19]:
# Evaluating accuracy score

score = model.evaluate(test_vecs_w2v, y_test, batch_size=128, verbose=2)
print(model.metrics_names[0],": ",score[0],"\n",model.metrics_names[1],": ",score[1])

157/157 - 0s - loss: 2.6638e-09 - accuracy: 1.0000
loss :  2.663778975176001e-09 
 accuracy :  1.0


# Saving model

In [20]:
#Saving the model
model_json = model.to_json() # serialize model to JSON
with open("model.json", "w") as json_file:
    json_file.write(model_json)

model.save_weights("smodel.h5") # serialize weights to HDF5
print("Saved model to disk")

#Loading the model
# newmodel = model_from_json(open('model.json').read())
# newmodel.load_weights('smodel.h5')

Saved model to disk


# Predicting for test file (Validation)

In [21]:
def ingesttest():
    testdata = pd.read_csv('data/testdata.manual.2009.06.14.csv', encoding='latin-1')
    testdata.columns=["Sentiment","ItemID","Date","Blank","SentimentSource","SentimentText"]
    testdata.drop(['ItemID', 'SentimentSource'], axis=1, inplace=True)
    testdata = testdata[testdata.Sentiment.isnull() == False]
    testdata['Sentiment'] = testdata['Sentiment'].map( {4:1, 0:0, 2:1})
    testdata = testdata[testdata['SentimentText'].isnull() == False]
    testdata.reset_index(inplace=True)
    testdata.drop('index', axis=1, inplace=True)
    print ('dataset loaded with shape', testdata.shape  )  
    return testdata

testdata = ingesttest()

dataset loaded with shape (497, 4)


In [22]:
testdata = postprocess(testdata)
testdata.head(5)

progress-bar: 100%|██████████| 497/497 [00:00<00:00, 10932.12it/s]


Unnamed: 0,Sentiment,Date,Blank,SentimentText,tokens
0,1,Mon May 11 03:18:03 UTC 2009,kindle2,Reading my kindle2... Love it... Lee childs i...,"[reading, my, kindle, 2, ..., love, it, ..., l..."
1,1,Mon May 11 03:18:54 UTC 2009,kindle2,"Ok, first assesment of the #kindle2 ...it fuck...","[ok, ,, first, assesment, of, the, ..., it, fu..."
2,1,Mon May 11 03:19:04 UTC 2009,kindle2,@kenburbary You'll love your Kindle2. I've had...,"[you'll, love, your, kindle, 2, ., i've, had, ..."
3,1,Mon May 11 03:21:41 UTC 2009,kindle2,@mikefish Fair enough. But i have the Kindle2...,"[fair, enough, ., but, i, have, the, kindle, 2..."
4,1,Mon May 11 03:22:00 UTC 2009,kindle2,@richardebaker no. it is too big. I'm quite ha...,"[no, ., it, is, too, big, ., i'm, quite, happy..."


In [23]:
test_X=np.array(testdata.tokens)
test_y=np.array(testdata.Sentiment)

In [24]:
test_w2v_vecs = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x:x, test_X))])

  vec += tweet_w2v[word].reshape((1, size)) * tfidf[word] #combining w2v vectors with tfidf value of words in the tweet.
497it [00:00, 5528.25it/s]


In [25]:
test_w2v_vecs.shape

(497, 200)

In [26]:
# model.predict_classes(test_w2v_vecs)
score = model.evaluate(test_w2v_vecs,test_y, batch_size=128, verbose=2)
print(model.metrics_names[0],": ",score[0],"\n",model.metrics_names[1],": ",score[1])


4/4 - 0s - loss: 48.5454 - accuracy: 0.3561
loss :  48.545372009277344 
 accuracy :  0.35613682866096497
