# The below code shows the extraction of more features in the YELP dataset

In [197]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series, DataFrame

%matplotlib inline
pd.options.display.mpl_style = 'default'

pd.set_option('display.max_columns', 36)
print pd.__version__

0.18.0


We will use NLTK to process and tokenize the data

In [39]:
import nltk
import string
from nltk import word_tokenize
from nltk.util import bigrams, trigrams
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from collections import Counter



In [198]:
# I have downloaded the required files for NLTK and we are doing preprocessing of data to perform a sentiment analysis later
POSITIVE_WORDS = set([line.strip() for line in open('positive-words.txt', 'r')])
NEGATIVE_WORDS = set([line.strip() for line in open('negative-words.txt', 'r')])
NLTK_STOPWORDS = set(stopwords.words('english'))
MORE_STOPWORDS = set([line.strip() for line in open('more_stopwords.txt', 'r')])

def remove_numbers_in_string(s):
    return s.translate(None, string.digits)

def lowercase_remove_punctuation(s):
    s = s.lower()
    s = s.translate(None, string.punctuation)
    return s
    
def remove_stopwords(s):
    token_list = nltk.word_tokenize(s)
    exclude_stopwords = lambda token : token not in NLTK_STOPWORDS
    return ' '.join(filter(exclude_stopwords, token_list))

def filter_out_more_stopwords(token_list):
    return filter(lambda tok : tok not in MORE_STOPWORDS, token_list)

def stem_token_list(token_list):
    STEMMER = PorterStemmer()
    return [STEMMER.stem(tok.decode('utf-8')) for tok in token_list]

def restring_tokens(token_list):
    return ' '.join(token_list)

def lowercase_remove_punctuation_and_numbers_and_tokenize_and_filter_more_stopwords_and_stem_and_restring(s):
    s = remove_numbers_in_string(s)
    s = lowercase_remove_punctuation(s)
    s = remove_stopwords(s)
    token_list = nltk.word_tokenize(s)
    token_list = filter_out_more_stopwords(token_list)
    token_list = stem_token_list(token_list)
    return restring_tokens(token_list)

In [199]:
turbo_csv_filename = os.path.join("./", 'TURBO_p2.csv')
TURBO_DF = pd.read_csv(turbo_csv_filename)


In [200]:
for idx in range(5):
    print TURBO_DF.review_text[idx]
    print

I think Zinc Brasserie will get better and better. I visited the restaurant only 6 days after they opened. 

The good:  The food is amazing.  I had a sandwich and dessert.  I can't remember what the sandwich was called but it was a pork sandwich. It was cooked to perfection.  Dessert was great too!  (Creme brûlée). 

The atmosphere is lovely!  Much more comfortable and elegant than other airport restaurants.  

The bad:  I noticed some issues with cleanliness (staff practices, dirty dish ware, dirty tables).  The staff also was somewhat unfriendly and borderline rude.   


Love love love Matt Carter concepts. The Mission, Zinc and House Brasserie are among some of my favorite restaurants.  I can't wait o try this location again and see how staff (and cleanliness) is settling in and improving.

I've always heard this is one of the special places in Phoenix . . . but, I wasn't prepared for how much I would love it! Lots of seating, but still it feels cozy and comfortable. The ki

In [202]:
initial_features = ['business_id', 'business_name', 'review_stars', 'review_text','business_review_count']
df_with_initial_features_and_preprocessed_review_text = TURBO_DF[initial_features]
%time df_with_initial_features_and_preprocessed_review_text['review_text'] = df_with_initial_features_and_preprocessed_review_text['review_text'].apply(lowercase_remove_punctuation_and_numbers_and_tokenize_and_filter_more_stopwords_and_stem_and_restring)

for idx in range(5):
    print df_with_initial_features_and_preprocessed_review_text.review_text[idx]
    print

Wall time: 28.4 s
zinc brasseri visit restaur day open good food amaz sandwich dessert rememb sandwich call pork sandwich cook perfect dessert great creme brûlée atmospher love comfort eleg airport restaur bad notic issu cleanli staff practic dirti dish ware dirti tabl staff unfriendli borderlin rude love love love matt carter concept mission zinc hous brasseri favorit restaur wait locat staff cleanli settl improv

ive heard special place phoenix wasnt prepar love lot seat feel cozi comfort kind place sit stay awhil food ah flare back kitchen talent thing food talent folk call chef tast bump back cook categori fez chef recommend sit bar enjoy meet sarah bartend custom servic smile red hair pzazz fez pzazz

sunday june az republ list fez top spot yum bun jump tractor beemer tricycl yo truck ho silver mode transport rush central avenu juici halfpound angu beef burger add cinnamonsp pear feta chees crisp onion wonder bbq sauc ahhhh mouth water good top dont top

back time feel light lunch



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [46]:
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import *
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, roc_auc_score

In [205]:
import json
from collections import OrderedDict
 
def convert_json_dict_string_to_python_dict(json_string_dict):
    return json.loads(json_string_dict.replace('\'', '"').replace('u"', '"'))

def get_number_of_review_votes_that_are(cool_or_funny_or_useful):
    assert cool_or_funny_or_useful in ['cool', 'funny', 'useful']
    return lambda review_votes_dict : review_votes_dict[cool_or_funny_or_useful]

def get_total_review_votes(review_votes_dict):
    return review_votes_dict['cool'] + review_votes_dict['funny'] + review_votes_dict['useful']

def get_number_of_compliments_that_are(cool_or_cute_or_funny_or_hot_or_list_or_more_or_note_or_photos_or_plain_or_profile_or_writer):
    key = cool_or_cute_or_funny_or_hot_or_list_or_more_or_note_or_photos_or_plain_or_profile_or_writer
    assert cool_or_cute_or_funny_or_hot_or_list_or_more_or_note_or_photos_or_plain_or_profile_or_writer in [u'cool',
                                                                                                            u'cute',
                                                                                                            u'funny',
                                                                                                            u'hot',
                                                                                                            u'list',
                                                                                                            u'more',
                                                                                                            u'note',
                                                                                                            u'photos',
                                                                                                            u'plain',
                                                                                                            u'profile',
                                                                                                            u'writer']
    return lambda compliments_dict : compliments_dict[key] if key in compliments_dict else 0

def count_number_of_positive_words(document):
    return len(filter(lambda tok : tok in POSITIVE_WORDS, document.split()))

def count_number_of_negative_words(document):
    return len(filter(lambda tok : tok in NEGATIVE_WORDS, document.split()))

def check_that_userid_does_not_appear_in_its_friendid_list(df):
    for i, x in df.iterrows():
        d = OrderedDict(x)
        assert d['business_id'] not in d['friends_id_list']

def get_average_friends_average_stars(df):
    
    def apply_func(friend_user_id_list):
        friend_ids_that_exist_in_df = set(df['user_id']).intersection(set(friend_user_id_list))
        indices_of_records_with_friend_ids = df['user_id'].isin(friend_ids_that_exist_in_df)
        userid_and_averagestars_df = df[['user_id', 'user_average_stars']]
        friends_average_stars_indexed_by_userid_df = userid_and_averagestars_df.loc[indices_of_records_with_friend_ids].groupby('user_id').mean()
        return np.mean(friends_average_stars_indexed_by_userid_df['user_average_stars'])
    
    return apply_func

In [206]:
#The below code generates most of the subset of features mentioned in the report to be used later for prediction.

df_with_refeature_engineered = df_with_initial_features_and_preprocessed_review_text
df_with_refeature_engineered['review_votes'] = TURBO_DF.review_votes.apply(convert_json_dict_string_to_python_dict)
df_with_refeature_engineered['review_cool'] = df_with_refeature_engineered.review_votes.apply(get_number_of_review_votes_that_are('cool'))
df_with_refeature_engineered['review_funny'] = df_with_refeature_engineered.review_votes.apply(get_number_of_review_votes_that_are('funny'))
df_with_refeature_engineered['review_useful'] = df_with_refeature_engineered.review_votes.apply(get_number_of_review_votes_that_are('useful'))
df_with_refeature_engineered['review_votes_count'] = df_with_refeature_engineered.review_votes.apply(get_total_review_votes)
df_with_refeature_engineered['review_length'] = df_with_refeature_engineered.review_text.apply(len)

df_with_refeature_engineered['user_id'] = TURBO_DF.user_id
df_with_refeature_engineered['user_votes'] = TURBO_DF.user_votes.apply(convert_json_dict_string_to_python_dict)
df_with_refeature_engineered['user_cool'] = df_with_refeature_engineered.user_votes.apply(get_number_of_review_votes_that_are('cool'))
df_with_refeature_engineered['user_funny'] = df_with_refeature_engineered.user_votes.apply(get_number_of_review_votes_that_are('funny'))
df_with_refeature_engineered['user_useful'] = df_with_refeature_engineered.user_votes.apply(get_number_of_review_votes_that_are('useful'))
df_with_refeature_engineered['user_votes_count'] = df_with_refeature_engineered.user_votes.apply(get_total_review_votes)
df_with_refeature_engineered['user_review_count'] = TURBO_DF.user_review_count
df_with_refeature_engineered['user_fans'] = TURBO_DF.fans
df_with_refeature_engineered['user_friends_count'] = TURBO_DF.friends.apply(len)
df_with_refeature_engineered['user_years_elite_count'] = TURBO_DF.elite.apply(len)
df_with_refeature_engineered['user_compliments'] = TURBO_DF.compliments.apply(convert_json_dict_string_to_python_dict)
df_with_refeature_engineered['user_compliments_cool'] = df_with_refeature_engineered.user_compliments.apply(get_number_of_compliments_that_are('cool'))
df_with_refeature_engineered['user_compliments_cute'] = df_with_refeature_engineered.user_compliments.apply(get_number_of_compliments_that_are('cute'))
df_with_refeature_engineered['user_compliments_funny'] = df_with_refeature_engineered.user_compliments.apply(get_number_of_compliments_that_are('funny'))
df_with_refeature_engineered['user_compliments_hot'] = df_with_refeature_engineered.user_compliments.apply(get_number_of_compliments_that_are('hot'))
df_with_refeature_engineered['user_compliments_list'] = df_with_refeature_engineered.user_compliments.apply(get_number_of_compliments_that_are('list'))
df_with_refeature_engineered['user_compliments_more'] = df_with_refeature_engineered.user_compliments.apply(get_number_of_compliments_that_are('more'))
df_with_refeature_engineered['user_compliments_note'] = df_with_refeature_engineered.user_compliments.apply(get_number_of_compliments_that_are('note'))
df_with_refeature_engineered['user_compliments_photos'] = df_with_refeature_engineered.user_compliments.apply(get_number_of_compliments_that_are('photos'))
df_with_refeature_engineered['user_compliments_plain'] = df_with_refeature_engineered.user_compliments.apply(get_number_of_compliments_that_are('plain'))
df_with_refeature_engineered['user_compliments_profile'] = df_with_refeature_engineered.user_compliments.apply(get_number_of_compliments_that_are('profile'))
df_with_refeature_engineered['user_compliments_writer'] = df_with_refeature_engineered.user_compliments.apply(get_number_of_compliments_that_are('writer'))

df_with_refeature_engineered['user_average_stars'] = TURBO_DF.average_stars
df_with_refeature_engineered['business_stars'] = TURBO_DF.business_stars

df_with_refeature_engineered['positive_words_count'] = TURBO_DF.review_text.apply(count_number_of_positive_words)
df_with_refeature_engineered['negative_words_count'] = TURBO_DF.review_text.apply(count_number_of_negative_words)
df_with_refeature_engineered['neutral_words_count'] = df_with_refeature_engineered.review_length - (df_with_refeature_engineered.positive_words_count + df_with_refeature_engineered.negative_words_count)
df_with_refeature_engineered['positive_words_count_over_review_length'] = df_with_refeature_engineered.positive_words_count / (1.0*df_with_refeature_engineered.review_length)
df_with_refeature_engineered['negative_words_count_over_review_length'] = df_with_refeature_engineered.negative_words_count / (1.0*df_with_refeature_engineered.review_length)
df_with_refeature_engineered['neutral_words_count_over_review_length'] = df_with_refeature_engineered.neutral_words_count / (1.0*df_with_refeature_engineered.review_length)
df_with_refeature_engineered['positive_to_negative_words_count_ratio'] = df_with_refeature_engineered.positive_words_count / (1.0*df_with_refeature_engineered.negative_words_count)
df_with_refeature_engineered['negative_to_positive_words_count_ratio'] = df_with_refeature_engineered.negative_words_count / (1.0*df_with_refeature_engineered.positive_words_count)

df_with_refeature_engineered['friends_id_list'] = TURBO_DF.friends.apply(lambda s : s.replace('[', '').replace(']','').split(', '))
check_that_userid_does_not_appear_in_its_friendid_list(df_with_refeature_engineered)
df_with_refeature_engineered['friends_average_stars'] = df_with_refeature_engineered.friends_id_list.apply(get_average_friends_average_stars(df_with_refeature_engineered))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A 

In [207]:
df_with_refeature_engineered.to_csv("turbo_with_generated_features.csv", encoding='utf-8')

In [208]:
# I have taken the above file and generated more features like number of businesses within 1km in R and 
#also feautures with respect to reviews in excel and R and generated another document called turbo_predict.csv
#containing the unique business ids with all their feature vectors which will be use later for prediction.

In [111]:
turbo_csv_filename = os.path.join("./", 'turbo_predict.csv')
df_with_refeature_engineered = pd.read_csv(turbo_csv_filename)

In [112]:
corr_features = ['review_cool', 'review_funny', 'review_useful', 'review_votes_count', 'review_length',
                'Average_number_of_stars','Max_Number_of_stars','Min_Number_of_stars','business_review_count',
 'user_cool', 'user_funny', 'user_useful', 'user_votes_count', 'user_review_count', 'user_fans',
                 'user_friends_count', 'user_years_elite_count', 'user_compliments_cool', 'user_compliments_cool',
                 'user_compliments_cute', 'user_compliments_funny', 'user_compliments_hot', 'user_compliments_list',
                 'user_compliments_more', 'user_compliments_note', 'user_compliments_photos', 'user_compliments_plain',
                 'user_compliments_profile', 'user_compliments_writer',
                 'positive_words_count', 'negative_words_count', 'positive_to_negative_words_count_ratio',
                 'negative_to_positive_words_count_ratio', 'neutral_words_count', 'neutral_words_count_over_review_length',
                 'positive_words_count_over_review_length', 'negative_words_count_over_review_length', 'friends_average_stars',              
                 'user_average_stars', 'business_stars', 'review_stars','business_review_count']
Corr_df = df_with_refeature_engineered[corr_features].corr()
Corr_df

Unnamed: 0,review_cool,review_funny,review_useful,review_votes_count,review_length,Average_number_of_stars,Max_Number_of_stars,Min_Number_of_stars,business_review_count,user_cool,user_funny,user_useful,user_votes_count,user_review_count,user_fans,user_friends_count,user_years_elite_count,user_compliments_cool,...,user_compliments_note,user_compliments_photos,user_compliments_plain,user_compliments_profile,user_compliments_writer,positive_words_count,negative_words_count,positive_to_negative_words_count_ratio,negative_to_positive_words_count_ratio,neutral_words_count,neutral_words_count_over_review_length,positive_words_count_over_review_length,negative_words_count_over_review_length,friends_average_stars,user_average_stars,business_stars,review_stars,business_review_count.1
review_cool,1.0,0.846001,0.884791,0.961694,0.375141,0.07137,0.123804,-0.108937,0.2603,0.457695,0.438596,0.453992,0.453661,0.32748,0.462639,0.456759,0.441711,0.344579,...,0.460281,0.218095,0.412474,0.253006,0.286627,0.369199,0.223213,0.192742,0.000161,0.37463,0.055881,-0.044928,-0.028273,-0.152963,0.070254,0.131946,0.097869,0.2603
review_funny,0.846001,1.0,0.799753,0.916352,0.323274,-0.008216,0.040158,-0.132546,0.200262,0.369084,0.375682,0.365082,0.370778,0.253968,0.35233,0.391286,0.372813,0.284697,...,0.369216,0.173325,0.333361,0.209818,0.239103,0.299277,0.250177,0.080743,0.07179,0.322778,0.058426,-0.078578,0.029718,-0.13084,0.024403,0.062032,-0.012682,0.200262
review_useful,0.884791,0.799753,1.0,0.958301,0.400863,0.044185,0.107005,-0.129908,0.242205,0.429709,0.408833,0.431812,0.427573,0.314926,0.450322,0.453445,0.435386,0.308505,...,0.434563,0.198768,0.39029,0.225169,0.255488,0.379657,0.253977,0.190784,0.036146,0.40047,0.079265,-0.075135,-0.01871,-0.148305,0.073928,0.118063,0.039947,0.242205
review_votes_count,0.961694,0.916352,0.958301,1.0,0.391924,0.041018,0.099608,-0.130582,0.249739,0.445585,0.43199,0.444077,0.443771,0.319133,0.451328,0.461727,0.443594,0.33099,...,0.448585,0.209039,0.40307,0.242673,0.27549,0.373996,0.256256,0.169326,0.035842,0.39144,0.069662,-0.069873,-0.009238,-0.153045,0.062629,0.113349,0.04702,0.249739
review_length,0.375141,0.323274,0.400863,0.391924,1.0,0.043213,0.083062,-0.115795,0.295686,0.322483,0.309742,0.341593,0.329049,0.362562,0.34157,0.34962,0.356094,0.219689,...,0.262135,0.184791,0.233808,0.158937,0.242313,0.867083,0.699071,0.258427,0.067303,0.999958,0.149039,-0.150147,-0.018534,-0.089188,0.024058,0.111711,0.013344,0.295686
Average_number_of_stars,0.07137,-0.008216,0.044185,0.041018,0.043213,1.0,0.821639,0.664573,0.158661,0.010931,0.006924,0.00809,0.008796,0.021664,0.01363,0.019108,0.048362,-0.00434,...,-0.003984,-0.012195,-0.012495,-0.009345,0.001372,0.126813,-0.118574,0.243656,-0.277126,0.042655,-0.070786,0.190651,-0.215034,0.032144,0.367632,0.618731,0.762887,0.158661
Max_Number_of_stars,0.123804,0.040158,0.107005,0.099608,0.083062,0.821639,1.0,0.249202,0.25747,0.070379,0.065034,0.074952,0.071356,0.11306,0.077755,0.13998,0.155646,0.037519,...,0.062054,0.018515,0.040836,0.025322,0.036279,0.162385,-0.072308,0.242063,-0.237588,0.082366,-0.055428,0.159011,-0.186619,0.006968,0.353233,0.528571,0.657287,0.25747
Min_Number_of_stars,-0.108937,-0.132546,-0.129908,-0.130582,-0.115795,0.664573,0.249202,1.0,-0.309002,-0.126258,-0.123508,-0.138773,-0.131516,-0.177036,-0.130015,-0.196867,-0.181759,-0.082008,...,-0.120209,-0.062848,-0.103192,-0.065966,-0.07572,-0.055569,-0.17526,0.087939,-0.181357,-0.115941,-0.07472,0.156609,-0.143262,0.052142,0.214197,0.304832,0.489759,-0.309002
business_review_count,0.2603,0.200262,0.242205,0.249739,0.295686,0.158661,0.25747,-0.309002,1.0,0.261979,0.241647,0.275296,0.263934,0.306554,0.28926,0.257973,0.309313,0.171153,...,0.222637,0.122695,0.192502,0.123641,0.176549,0.289852,0.183558,0.136236,-0.030898,0.295253,0.041386,-0.019918,-0.045991,-0.066273,0.063584,0.262762,0.119913,1.0
user_cool,0.457695,0.369084,0.429709,0.445585,0.322483,0.010931,0.070379,-0.126258,0.261979,1.0,0.983902,0.993545,0.998148,0.784634,0.890826,0.378545,0.471614,0.850486,...,0.907203,0.74754,0.904779,0.759746,0.839574,0.300945,0.187617,0.14737,0.010155,0.322351,0.060428,-0.053121,-0.022063,-0.185319,0.0226,0.044427,0.015167,0.261979


In [213]:
import plotly.plotly as py
import plotly.tools as tls
from plotly.graph_objs import *

def triangularize_symmetric_matrix(x, y, z):
    x_new = x[0:-1]
    y_new = y[-1:0:-1]
    tmp = np.empty(z.shape)
    tmp.fill(np.nan)
    for i in range(len(z)):
        tmp[i, 0 :i] = z[i, 0:i]
    tmp = np.flipud(tmp)
    z_new = tmp
    return (x_new, y_new, z_new)

In [214]:
py.sign_in('kevin11h', 'ilrcfqwrng')

(x, y, z) = triangularize_symmetric_matrix(corr_features,
                                           corr_features,
                                           Corr_df.as_matrix())

trace = Heatmap(z=z,
                x=x,
                y=y,
                zauto=False,
                zmax = 1,
                zmin = -1,
                colorscale=[[-1,'#3333CC'], [0, '#CC00CC'], [1, '#CC0000']],
                reversescale=True,
                colorbar=ColorBar(titleside='bottom',
                                  title='   %f'%-1))

data = Data([trace])

title = 'Feature Correlations of Yelp Reviews'


layout = Layout(title=title,
                autosize=False,
                height=500,
                width=600,
                annotations=Annotations([Annotation(text=annotation_text,
                                         xref='paper',
                                         yref='paper',
                                         xanchor='right',
                                         yanchor='bottom',
                                         x=.75,
                                         y=1)])
                )
axis_style = {'autotick':False,'showgrid':False,'showline':False}
layout.update(xaxis=XAxis(axis_style, tickangle=90))
layout.update(yaxis=YAxis(axis_style))

fig = Figure(data=data, layout=layout)

url = py.plot(fig, filename='yelp_feature_correlations', auto_open=False)
print url
py.iplot(fig, filename='yelp_feature_correlations')

https://plot.ly/~kevin11h/25
