### In this excercise we are goin to see how we can use word to vector model for sentiment classification

#### Before starting let's download the google's word to vector "GoogleNews-vectors-negative300"

In [1]:
# load the google word2vec model this takes time and memory and hence it's wise to do it first and them move to other
from gensim.models import KeyedVectors
filename = 'Word2vec\\GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(filename, binary=True)



In [2]:
import csv
import glob
import pandas as pd
import os
from collections import Counter
from sklearn.svm import SVC, NuSVC, LinearSVC
import numpy as np 
import scipy as sp 
import matplotlib as mpl 
import matplotlib.cm as cm 
import matplotlib.pyplot as plt 
import pandas as pd 
import nltk
import re
import csv
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from time import time



### We have the text files in two seperate folders.
    * Neg (negatative)
    * Pos (positive)
    
Below function will go to each folder and read the files individually and assign the label based on the folder name. so that all files in Neg(negative) folder would be assigned with Negative label and sinmilary for Pos(positive) folder all the text files will be labeled as Positive

In [3]:
senti_data = pd.DataFrame(columns = ("Text", "Class")) 
for cla in glob.glob("C:/Users/inkpathak/Desktop/Anuj Gupta/Word2vec/txt_sentoken/*"): # Here the folder name is data set in which there are two sub-folder "Spam" & "Ham"
    clas = cla.split(os.sep)[1]    # We are splitting the folder names as class using OS-Seperator and taking the 2nd item in the list
    for file in glob.glob(cla + "/*.txt"): # Here we are deep diving in each of the folder and reading the text files one by one
        text = open(file, "r", encoding = "ISO-8859-1").read() # Reading the file , for Windows generally we need to mention the encoding 
        text = " ".join(text.split("\n")) # Splitting the text files and rejoining into a single text
        senti_data = senti_data.append(pd.Series([text, clas], index = ["Text", "Class"]), ignore_index = True) # continious append to the data frame

### Let's view the data sample

In [4]:
senti_data.head()

Unnamed: 0,Text,Class
0,"plot : two teen couples go to a church party ,...",neg
1,the happy bastard's quick movie review damn t...,neg
2,it is movies like these that make a jaded movi...,neg
3,""" quest for camelot "" is warner bros . ' firs...",neg
4,synopsis : a mentally unstable man undergoing ...,neg


### converting label to a numerical variable is on of the good practice for binary classification.


In [5]:
# convert label to a numerical variable
senti_data['Class'] = senti_data.Class.map({'neg':0, 'pos':1})

In [6]:
senti_data.head()

Unnamed: 0,Text,Class
0,"plot : two teen couples go to a church party ,...",0
1,the happy bastard's quick movie review damn t...,0
2,it is movies like these that make a jaded movi...,0
3,""" quest for camelot "" is warner bros . ' firs...",0
4,synopsis : a mentally unstable man undergoing ...,0


### Let's quickly do standard data cleaning:
####    Following cleaning is applied on the current dataset-
        * Removal of stopwords
        * removal of alpha neumeric character 
        * removal of numbers
        * removal of special symbols
        

In [7]:
stop = set(stopwords.words('english'))

In [8]:
def clean(doc):
    doc = " ".join([i.replace('*', '') for i in doc.lower().split()])
    doc = " ".join([i.replace(':', ' ') for i in doc.split()])
    doc = " ".join([i.replace('.', ' ') for i in doc.split()])
    doc = " ".join([i.replace('=', '') for i in doc.split()])
    doc = " ".join([i.replace('/', ' ') for i in doc.split()])
    doc = " ".join([i.replace(')', ' ') for i in doc.split()])
    doc = " ".join([i.replace('(', ' ') for i in doc.split()])
    doc = " ".join([i.replace('"', ' ') for i in doc.split()])
    doc = " ".join([i.replace('-', ' ') for i in doc.split()])
    doc = " ".join([i.replace('_', ' ') for i in doc.split()])
    doc = " ".join([i for i in doc.split() if not i.isdigit()])
    doc = " ".join([i for i in doc.split() if i.isalpha()])
    doc = " ".join([i for i in doc.split() if i not in stop])
    return doc

In [12]:
review_clear = [clean(doc) for doc in senti_data['Text']]
senti_data['clean_text']=review_clear
senti_data.head()

Unnamed: 0,Text,Class,clean_text
0,"plot : two teen couples go to a church party ,...",0,plot two teen couples go church party drink dr...
1,the happy bastard's quick movie review damn t...,0,happy quick movie review damn bug got head sta...
2,it is movies like these that make a jaded movi...,0,movies like make jaded movie viewer thankful i...
3,""" quest for camelot "" is warner bros . ' firs...",0,quest camelot warner bros first feature length...
4,synopsis : a mentally unstable man undergoing ...,0,synopsis mentally unstable man undergoing psyc...


### Build sentense vector for training data-set by using the overall sum value of all word vectors in the Clean text column.

This is just one of the way , you might take average as well or build other strategy. 

In [13]:
# Below is a custom function to build sentense vector for training set by using the total value of all word vectors in the Clean text column

def buildSentenceVector(text):
    sent_vec = np.zeros(300).reshape((1, 300))
    count = 0.
    for word in text:
        try:
            sent_vec += model[word].reshape((1, 300))
            count += 1.
        except KeyError:
            continue
    #if count != 0:
    #    sent_vec /= count
    return sent_vec

In [14]:
review_vec = [buildSentenceVector(doc) for doc in senti_data['clean_text']]

In [15]:
senti_data['sentense_vector']=review_vec

In [16]:
senti_data.head()

Unnamed: 0,Text,Class,clean_text,sentense_vector
0,"plot : two teen couples go to a church party ,...",0,plot two teen couples go church party drink dr...,"[[-293.95513916015625, 192.2789306640625, -4.9..."
1,the happy bastard's quick movie review damn t...,0,happy quick movie review damn bug got head sta...,"[[-112.78167724609375, 65.20263671875, 0.33660..."
2,it is movies like these that make a jaded movi...,0,movies like make jaded movie viewer thankful i...,"[[-242.486083984375, 163.73614501953125, 0.711..."
3,""" quest for camelot "" is warner bros . ' firs...",0,quest camelot warner bros first feature length...,"[[-250.0531005859375, 168.8553466796875, -4.26..."
4,synopsis : a mentally unstable man undergoing ...,0,synopsis mentally unstable man undergoing psyc...,"[[-378.0009765625, 267.97918701171875, 1.84680..."


### Now we are going to create a data frame with the sum of the word vectors of length 300 representing the sentence along with the binary dependent variable

In [17]:
review_vec1 = [i[0] for i in review_vec] # changing from list of list to single list
review_vec2 = np.array(review_vec1) # Changing from single list to array

In [18]:
review_vec2.shape

(2000, 300)

In [19]:
review_df = pd.DataFrame(review_vec2) # changing the array to a dataframe

In [20]:
review_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-293.955139,192.278931,-4.972412,246.479492,-76.376129,52.847504,-149.930115,-68.62085,-81.728027,26.597717,...,116.883545,-33.414062,-166.789688,156.87439,-55.962769,-263.559921,-162.363647,-40.535278,-190.335449,264.554199
1,-112.781677,65.202637,0.336609,92.041504,-27.3685,18.075989,-55.470612,-36.053345,-30.21344,6.324829,...,41.426025,2.624512,-56.628387,61.097534,-24.956787,-102.484375,-68.589966,-16.991699,-71.407837,101.894714
2,-242.486084,163.736145,0.711853,198.322754,-68.430664,43.944977,-130.381378,-61.356079,-73.543579,25.141663,...,109.481201,-21.97876,-149.334961,124.054443,-40.955688,-225.843277,-138.615723,-27.549072,-161.180542,220.218445
3,-250.053101,168.855347,-4.266174,195.756836,-72.587067,44.229919,-127.833618,-64.55603,-76.520386,25.055115,...,100.104736,-21.081909,-139.455933,126.58252,-42.903931,-221.10289,-135.423828,-37.146851,-160.887817,230.279663
4,-378.000977,267.979187,1.846802,302.576904,-102.295349,59.879395,-188.510895,-105.857422,-103.271912,29.568604,...,149.544434,-43.59375,-232.855728,211.853516,-71.249023,-353.665085,-206.748779,-61.833618,-227.511963,374.038635


In [22]:
review_df["sentiment"] = senti_data["Class"]

In [23]:
review_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,sentiment
0,-293.955139,192.278931,-4.972412,246.479492,-76.376129,52.847504,-149.930115,-68.62085,-81.728027,26.597717,...,-33.414062,-166.789688,156.87439,-55.962769,-263.559921,-162.363647,-40.535278,-190.335449,264.554199,0
1,-112.781677,65.202637,0.336609,92.041504,-27.3685,18.075989,-55.470612,-36.053345,-30.21344,6.324829,...,2.624512,-56.628387,61.097534,-24.956787,-102.484375,-68.589966,-16.991699,-71.407837,101.894714,0
2,-242.486084,163.736145,0.711853,198.322754,-68.430664,43.944977,-130.381378,-61.356079,-73.543579,25.141663,...,-21.97876,-149.334961,124.054443,-40.955688,-225.843277,-138.615723,-27.549072,-161.180542,220.218445,0
3,-250.053101,168.855347,-4.266174,195.756836,-72.587067,44.229919,-127.833618,-64.55603,-76.520386,25.055115,...,-21.081909,-139.455933,126.58252,-42.903931,-221.10289,-135.423828,-37.146851,-160.887817,230.279663,0
4,-378.000977,267.979187,1.846802,302.576904,-102.295349,59.879395,-188.510895,-105.857422,-103.271912,29.568604,...,-43.59375,-232.855728,211.853516,-71.249023,-353.665085,-206.748779,-61.833618,-227.511963,374.038635,0


In [24]:
X = review_df.iloc[:,0:300]
y = review_df.sentiment
print(X.shape)
print(y.shape)

(2000, 300)
(2000,)


Now let's devide the data into training and testing:
* Training=75%
* Testing =25%

In [25]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1500, 300)
(500, 300)
(1500,)
(500,)


In [26]:
# 1. import
from sklearn.linear_model import LogisticRegression

# 2. instantiate a logistic regression model
logreg = LogisticRegression()

In [27]:
%time logreg.fit(X_train, y_train)

Wall time: 622 ms


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Now the model is built and it's time to test the classification accuracy:


In [28]:
y_pred_class = logreg.predict(X_test)

In [29]:
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.588

In [30]:
y_pred_prob = logreg.predict_proba(X_test)

### Let's look at the confusion matrix:

In [32]:
metrics.confusion_matrix(y_test, y_pred_class)

array([[165,  90],
       [116, 129]])

### Checking how many words are there in google Word to vec and out of the words we have in our corpus what is the percentage common. 

If the % common is low between the google's word vector and the corpus we have, then the classification model would not work properly.

In [34]:

word2vec_vocab = model.vocab.keys()
word2vec_vocab_lower = [item.lower() for item in word2vec_vocab]

print(len(word2vec_vocab))

3000000


In [46]:
senti_data["clean_text"][4].split()

['synopsis',
 'mentally',
 'unstable',
 'man',
 'undergoing',
 'psychotherapy',
 'saves',
 'boy',
 'potentially',
 'fatal',
 'accident',
 'falls',
 'love',
 'mother',
 'fledgling',
 'restauranteur',
 'unsuccessfully',
 'attempting',
 'gain',
 'favor',
 'takes',
 'pictures',
 'kills',
 'number',
 'people',
 'way',
 'comments',
 'stalked',
 'yet',
 'another',
 'seemingly',
 'endless',
 'string',
 'spurned',
 'psychos',
 'getting',
 'revenge',
 'type',
 'movies',
 'stable',
 'category',
 'film',
 'industry',
 'theatrical',
 'direct',
 'video',
 'proliferation',
 'may',
 'due',
 'part',
 'fact',
 'typically',
 'inexpensive',
 'produce',
 'special',
 'effects',
 'big',
 'name',
 'stars',
 'serve',
 'vehicles',
 'flash',
 'nudity',
 'allowing',
 'frequent',
 'late',
 'night',
 'cable',
 'television',
 'stalked',
 'wavers',
 'slightly',
 'norm',
 'one',
 'respect',
 'psycho',
 'never',
 'actually',
 'affair',
 'contrary',
 'rejected',
 'rather',
 'quickly',
 'psycho',
 'typically',
 'ex',
 'l

In [40]:
senti_data.shape[0]

2000

In [48]:
word_list = [item for sublist in word_list for item in sublist] unlisting

In [49]:
len(word_list)

689259

In [50]:


unique_words = list(set(word_list))  #this will give unique list of words

In [51]:
print(len(unique_words))

38333


In [54]:
kk=0
for word in unique_words:
    try:
        
        kp= model[word]
        kk +=1
    except KeyError:
        continue
print(kk) 


31474


In [55]:
print(kk/len(unique_words))

0.8210680092870373


#### We can see we have close to 82% words has vector representation in google word vector. This is a good representation