## Load word2vec model

In [1]:
from gensim.models import word2vec
import gensim
import logging
# Route INFO-level (and more severe) log records to the console, each prefixed
# with its timestamp and severity.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [2]:
# Restore the previously trained word2vec model from disk; its word vectors are
# used further down to measure how close two words are to each other.
# NOTE(review): gensim's load() unpickles the file, so only load model files
# that come from a trusted source.
model = word2vec.Word2Vec.load('bible_word2vec_gensim')

2017-12-03 18:44:03,550 : INFO : loading Word2Vec object from bible_word2vec_gensim
2017-12-03 18:44:05,825 : INFO : loading wv recursively from bible_word2vec_gensim.wv.* with mmap=None
2017-12-03 18:44:05,834 : INFO : loading syn0 from bible_word2vec_gensim.wv.syn0.npy with mmap=None
2017-12-03 18:44:10,491 : INFO : setting ignored attribute syn0norm to None
2017-12-03 18:44:10,498 : INFO : loading syn1neg from bible_word2vec_gensim.syn1neg.npy with mmap=None
2017-12-03 18:44:18,587 : INFO : setting ignored attribute cum_table to None
2017-12-03 18:44:18,598 : INFO : loaded bible_word2vec_gensim


In [3]:
# import all the required libraries
import nltk, re, csv
import numpy as np
import pandas as pd
import seaborn as sns
import scipy
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk import PorterStemmer
from nltk.stem import WordNetLemmatizer
from random import shuffle
from nltk.tokenize import RegexpTokenizer

## Read input data 

In [4]:
# Load the sentence-level sentiment CSV into a DataFrame.
inp_df = pd.read_csv('output_sent_update.csv', encoding="Latin1")
# print() with a single argument behaves identically on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(inp_df.head())
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole
# session. This only hides the warning: chained assignments elsewhere in the
# notebook will no longer be reported.
pd.options.mode.chained_assignment = None

   Unnamed: 0                                            Reviews  \
0           0                        Crave those crazy squares!!   
1           1        Not breakfast cereal, White Castle burgers!   
2           2  \\nIf you didn't grow up with this chain, you ...   
3           3  Yeah, the meat is a super-thin mystery but it'...   
4           4  Summers in Ohio with my cousins would always e...   

                   userId             Business_Id               Review_Id  \
0  bLbSNkLggFnqwNNzzq-Ijw  dKdApYVFDSNYsNOso6NYlA  LwszgYoywAhMaIdt3zPgug   
1  bLbSNkLggFnqwNNzzq-Ijw  dKdApYVFDSNYsNOso6NYlA  LwszgYoywAhMaIdt3zPgug   
2  bLbSNkLggFnqwNNzzq-Ijw  dKdApYVFDSNYsNOso6NYlA  LwszgYoywAhMaIdt3zPgug   
3  bLbSNkLggFnqwNNzzq-Ijw  dKdApYVFDSNYsNOso6NYlA  LwszgYoywAhMaIdt3zPgug   
4  bLbSNkLggFnqwNNzzq-Ijw  dKdApYVFDSNYsNOso6NYlA  LwszgYoywAhMaIdt3zPgug   

   Sentiment  
0         -1  
1          0  
2         -1  
3          0  
4          1  


In [5]:
def remove_stopwords(word_list, stop_words=None):
    """Lower-case every word and drop the ones that are stop words.

    Parameters
    ----------
    word_list : iterable of str
        Words to filter; they do not need to be lower-cased already.
    stop_words : iterable of str, optional
        Lower-case words to discard. When omitted, NLTK's English stop-word
        list (``stopwords.words("english")``) is used.

    Returns
    -------
    list of str
        The lower-cased words that are not stop words, in their original order.
    """
    if stop_words is None:
        stop_words = stopwords.words("english")
    # Build the lookup once: the stop-word list used to be re-fetched and
    # scanned linearly for every single word.
    stop_word_set = set(stop_words)

    lowered_words = (word.lower() for word in word_list)
    return [word for word in lowered_words if word not in stop_word_set]

## Make columns for different categories in data frame

In [6]:
# Add one sentiment column per category; every row starts at 0, i.e. no
# sentiment has been assigned to that category yet.
for category in ("food", "service", "ambience", "cost", "misc"):
    inp_df[category] = 0

inp_df.head()

Unnamed: 0.1,Unnamed: 0,Reviews,userId,Business_Id,Review_Id,Sentiment,food,service,ambience,cost,misc
0,0,Crave those crazy squares!!,bLbSNkLggFnqwNNzzq-Ijw,dKdApYVFDSNYsNOso6NYlA,LwszgYoywAhMaIdt3zPgug,-1,0,0,0,0,0
1,1,"Not breakfast cereal, White Castle burgers!",bLbSNkLggFnqwNNzzq-Ijw,dKdApYVFDSNYsNOso6NYlA,LwszgYoywAhMaIdt3zPgug,0,0,0,0,0,0
2,2,"\\nIf you didn't grow up with this chain, you ...",bLbSNkLggFnqwNNzzq-Ijw,dKdApYVFDSNYsNOso6NYlA,LwszgYoywAhMaIdt3zPgug,-1,0,0,0,0,0
3,3,"Yeah, the meat is a super-thin mystery but it'...",bLbSNkLggFnqwNNzzq-Ijw,dKdApYVFDSNYsNOso6NYlA,LwszgYoywAhMaIdt3zPgug,0,0,0,0,0,0
4,4,Summers in Ohio with my cousins would always e...,bLbSNkLggFnqwNNzzq-Ijw,dKdApYVFDSNYsNOso6NYlA,LwszgYoywAhMaIdt3zPgug,1,0,0,0,0,0


## Sentiment analysis for categories

In [7]:
# Word vectors of the four category words. Every noun of a sentence is compared
# against these and votes for the category it is closest to.
feature_1 = model.wv["food"]
feature_2 = model.wv["service"]
feature_3 = model.wv["ambience"]
feature_4 = model.wv["cost"]

# Same order in both lists, so a position in one identifies the entry in the other.
category_columns = ["food", "service", "ambience", "cost"]
category_vectors = [feature_1, feature_2, feature_3, feature_4]

# POS tags that are treated as nouns (NNPS is not part of this list).
noun_pos_tags = ("NN", "NNS", "NNP")

# Created once instead of once per row; it only splits text on runs of \w characters.
tokenizer = RegexpTokenizer(r'\w+')

# Fill the food / service / ambience / cost / misc columns row by row.
# Rows are addressed by the labels 0..n-1, which assumes the default integer
# index that read_csv produces when no index column is specified.
for i in range(0, inp_df.shape[0]):
    sentiment = inp_df.at[i, 'Sentiment']
    # Only positive or negative sentences are distributed to a category.
    if sentiment == 0:
        continue

    # Tokenise the sentence and attach a part-of-speech tag to every token.
    tagged_tokens = nltk.pos_tag(tokenizer.tokenize(inp_df.at[i, 'Reviews']))

    # Keep nouns longer than one character, then drop stop words
    # (remove_stopwords also lower-cases the words).
    nouns = remove_stopwords(
        [word for word, tag in tagged_tokens
         if len(word) > 1 and tag in noun_pos_tags]
    )

    # One vote per in-vocabulary noun for the category at the smallest cosine
    # distance (smallest distance == highest cosine similarity).
    votes = [0] * len(category_columns)
    for noun in nouns:
        noun = noun.lower()
        if noun not in model.wv.vocab:
            continue
        noun_vec = model.wv[noun]
        distances = [scipy.spatial.distance.cosine(category_vec, noun_vec)
                     for category_vec in category_vectors]
        votes[distances.index(min(distances))] += 1

    top_votes = max(votes)
    if top_votes == 0:
        # No noun at all, or none that the model knows: there is nothing to
        # decide a category with, so the sentiment goes to 'misc'. (Previously,
        # out-of-vocabulary-only sentences tied every category at zero votes
        # and the sentiment was written to all four columns.)
        inp_df.at[i, 'misc'] = sentiment
        continue

    # Every category that shares the highest vote count receives the sentiment.
    # .at writes straight into inp_df instead of relying on chained indexing.
    for column, vote_count in zip(category_columns, votes):
        if vote_count == top_votes:
            inp_df.at[i, column] = sentiment

In [12]:
# Spot-check the first five rows after the category columns have been filled.
inp_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Reviews,userId,Business_Id,Review_Id,Sentiment,food,service,ambience,cost,misc
0,0,Crave those crazy squares!!,bLbSNkLggFnqwNNzzq-Ijw,dKdApYVFDSNYsNOso6NYlA,LwszgYoywAhMaIdt3zPgug,-1,0,0,0,-1,0
1,1,"Not breakfast cereal, White Castle burgers!",bLbSNkLggFnqwNNzzq-Ijw,dKdApYVFDSNYsNOso6NYlA,LwszgYoywAhMaIdt3zPgug,0,0,0,0,0,0
2,2,"\\nIf you didn't grow up with this chain, you ...",bLbSNkLggFnqwNNzzq-Ijw,dKdApYVFDSNYsNOso6NYlA,LwszgYoywAhMaIdt3zPgug,-1,-1,0,0,0,0
3,3,"Yeah, the meat is a super-thin mystery but it'...",bLbSNkLggFnqwNNzzq-Ijw,dKdApYVFDSNYsNOso6NYlA,LwszgYoywAhMaIdt3zPgug,0,0,0,0,0,0
4,4,Summers in Ohio with my cousins would always e...,bLbSNkLggFnqwNNzzq-Ijw,dKdApYVFDSNYsNOso6NYlA,LwszgYoywAhMaIdt3zPgug,1,1,0,0,0,0


In [28]:
# Keep only the review text, its overall sentiment and the per-category
# sentiment columns.
n_df = inp_df.loc[
    :,
    ['Reviews', 'Sentiment', 'food', 'service', 'ambience', 'cost', 'misc'],
]

In [32]:
# Show the rows at positions 21 through 28 with all of their columns.
n_df.iloc[21:29]

Unnamed: 0,Reviews,Sentiment,food,service,ambience,cost,misc
21,What a disappointment!,-1,0,0,0,-1,0
22,I was really looking forward to eating here......,1,1,0,0,0,0
23,Those little hamburger / cheeseburgers were de...,1,1,0,0,0,0
24,Even as an adult now I would pick up a box fro...,-1,-1,0,0,0,0
25,Sadly this was not the case.,-1,0,0,0,-1,0
26,I'm not really sure how to describe it... the ...,-1,0,-1,0,0,0
27,I'm not sure where it was coming from.,-1,0,0,0,0,-1
28,"Either the casino or the actual ""restaurant"" (...",-1,0,0,-1,0,0


## Save results

In [9]:
# Write the full DataFrame, including the new category columns, to a UTF-8
# encoded CSV file. The row index is written as well (to_csv's default).
inp_df.to_csv(path_or_buf='category_new_update.csv', encoding='utf-8')