In [142]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
plt.style.use('ggplot')
import re

In [143]:
#read in indian yelp reviews
df = pd.read_csv('/Users/jralston/git_proj/Scrapy_project_yelp/yelp/yelp_indian.csv')

#restrict to Dhaba
df = df[df.restaurant == 'Dhaba Indian Cuisine']

In [148]:
sample_menu = ['mulligatawny', 'onion pakora', 'samosa', 'paneer pakora',
              'lasuni gobi', 'aloo tikki chaat', 'murgh', 
              'tikka', 'chicken tikka masala', 'rogan josh',
              'naan', 'garlic naan', 'poori', 'korma', 'chutney']

## Dish Reviews Based on Yelp Review

In [164]:
#just grab rating and review text
df = df[['rating', 'text']]

#switch to lower case
df.text = df.text.apply(lambda x: x.lower())

In [165]:
# this returns just those reviews that have the word
# in the text of the review
def subset_reviews(word, df):
    return df[df.text.str.contains(word)]

# return avg rating of revies that contain dish
def avg_review_of_dish(item, df):
    return subset_reviews(item, df).mean()

# return nuber of times dishes reviewed
def dish_count(item, df):
    return subset_reviews(item, df).shape[0]

In [166]:
# create the dish review dataframe
dish_ratings = [avg_review_of_dish(item, df)[0] for item in sample_menu]
dish_counts = [dish_count(item, df) for item in sample_menu]
data = {'dish' : sample_menu,
       'rating' : dish_ratings,
       'times reviewed': dish_counts}
dish_df = pd.DataFrame(data)

In [167]:
# dish review dataframe
dish_df.sort_values(by = 'rating', ascending=False)

Unnamed: 0,dish,rating,times reviewed
1,onion pakora,4.5,2
12,poori,4.272727,11
0,mulligatawny,4.222222,9
13,korma,4.214286,56
6,murgh,4.205882,34
3,paneer pakora,4.111111,9
7,tikka,4.007143,280
4,lasuni gobi,4.0,1
8,chicken tikka masala,3.978261,184
11,garlic naan,3.962617,107


## Dish Review Based on Sentiment of Yelp Review

In [162]:
# create sentiment df which is text, polarity of text

from textblob import TextBlob
df_sentiment = df[['text']]
df_sentiment.text = df_sentiment.text.apply(lambda x: x.lower())
df_sentiment['polarity'] = df_sentiment.text.apply(lambda x: TextBlob(x).sentiment.polarity)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [173]:
# this returns just those reviews that have the word
# in the text of the review
def subset_reviews(word, df):
    return df[df.text.str.contains(word)]

# return avg rating of revies that contain dish
def avg_review_of_dish(item, df):
    return subset_reviews(item, df).mean()

# return nuber of times dishes reviewed
def dish_count(item, df):
    return subset_reviews(item, df).shape[0]

In [200]:
# create the dish review dataframe
dish_ratings = [avg_review_of_dish(item, df_sentiment)[0] for item in sample_menu]
dish_counts = [dish_count(item, df_sentiment) for item in sample_menu]
data = {'dish' : sample_menu,
       'rating' : dish_ratings,
       'times reviewed': dish_counts}
dish_sentiment_df = pd.DataFrame(data)


dish_sentiment_df.sort_values(by = 'rating', ascending=False)

Unnamed: 0,dish,rating,times reviewed
0,mulligatawny,0.401777,9
4,lasuni gobi,0.389722,1
1,onion pakora,0.358581,2
12,poori,0.253794,11
6,murgh,0.225904,34
13,korma,0.216392,56
14,chutney,0.204518,55
11,garlic naan,0.19831,107
10,naan,0.197663,457
2,samosa,0.196692,75


# Dish Review Based on Sentiment of Yelp Sentence

### First need to split reviews into sentences

In [208]:
# split the reviews into sentecnes
reviews_sentences = ''.join(list(df.text)).split('.')

# create a dataframe with these sentences
data = {'text' : reviews_sentences}
sentences_df = pd.DataFrame(data)
sentences_df.text = sentences_df.text.apply(lambda x: x.lower())

# create column which is polarity of text
from textblob import TextBlob
sentences_df['polarity'] = sentences_df.text.apply(lambda x: TextBlob(x).sentiment.polarity)

In [209]:
# this returns just those reviews that have the word
# in the text of the review
def subset_reviews(word, df):
    return df[df.text.str.contains(word)]

# return avg rating of revies that contain dish
def avg_review_of_dish(item, df):
    return subset_reviews(item, df).mean()

# return nuber of times dishes reviewed
def dish_count(item, df):
    return subset_reviews(item, df).shape[0]

In [210]:
# create the dish review dataframe
dish_ratings = [avg_review_of_dish(item, sentences_df)[0] for item in sample_menu]
dish_counts = [dish_count(item, sentences_df) for item in sample_menu]
data = {'dish' : sample_menu,
       'rating' : dish_ratings,
       'times mentioned': dish_counts}
sentences_df = pd.DataFrame(data)


sentences_df.sort_values(by = 'rating', ascending=False)

Unnamed: 0,dish,rating,times mentioned
0,mulligatawny,0.67925,10
1,onion pakora,0.5,2
14,chutney,0.253841,67
11,garlic naan,0.170734,116
10,naan,0.15893,565
2,samosa,0.120317,88
12,poori,0.063136,11
13,korma,0.03915,63
6,murgh,0.012909,37
3,paneer pakora,0.00754,9


In [204]:
sentences_df.head()

Unnamed: 0,text,polarity
0,"we love love love dhaba, the gobi and chicken ...",0.458333
1,"the food is so good, all my friends go there now",0.7
2,the manager is wonderful and very accomodating,0.6
3,have been going there since the start!! thank...,-0.2
4,maybe this place really does have good punja...,0.378571


In [234]:
sentences_df.head()

Unnamed: 0,dish,rating,times mentioned
0,mulligatawny,0.67925,10
1,onion pakora,0.5,2
2,samosa,0.120317,88
3,paneer pakora,0.00754,9
4,lasuni gobi,0.0,1


# Model sentiment from dish and pick best dish

### First need to vectorize the sentences... i.e. for each sentence make a vector of 0s and 1s that say whether a dish is mentioned in the sentence or not

In [None]:
# split the reviews into sentecnes
reviews_sentences = ''.join(list(df.text)).split('.')

# create a dataframe with these sentences
data = {'text' : reviews_sentences}
sentences_df = pd.DataFrame(data)
sentences_df.text = sentences_df.text.apply(lambda x: x.lower())

# create column which is polarity of text
from textblob import TextBlob
sentences_df['polarity'] = sentences_df.text.apply(lambda x: TextBlob(x).sentiment.polarity)

In [211]:
example = 'example sentence with samosa and tikka'

In [275]:
def sentence_to_vector(sentence, menu):
    return [1*(dish in sentence) for dish in menu]

In [276]:
sentence_to_vector(example, sample_menu)

[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]

In [277]:
# split the reviews into sentecnes
reviews_sentences = ''.join(list(df.text)).split('.')
len(reviews_sentences)

10958

In [278]:
variable_sentences = [sentence_to_vector(sentence, sample_menu) for sentence in reviews_sentences]

In [279]:
len(variable_sentences)

10958

In [280]:
len(sample_menu)

15

In [281]:
X = np.matrix(variable_sentences)
X.shape

(10958, 15)

In [267]:
y = [TextBlob(x).sentiment.polarity for x in reviews_sentences]
len(y)

10958

In [268]:
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [282]:
regr = linear_model.LinearRegression()

In [283]:
regr.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [284]:
print('Coefficients: \n', regr.coef_)

Coefficients: 
 [ 0.53151667  0.34746002 -0.02014995 -0.11686944 -0.15253998 -0.13289955
 -0.09340149 -0.11969432 -0.21737174 -0.12665306  0.02733276  0.05726216
 -0.07126847 -0.03857038  0.10538325]


In [288]:
regr.intercept_

0.1525399818909162

In [285]:
print(sample_menu)

['mulligatawny', 'onion pakora', 'samosa', 'paneer pakora', 'lasuni gobi', 'aloo tikki chaat', 'murgh', 'tikka', 'chicken tikka masala', 'rogan josh', 'naan', 'garlic naan', 'poori', 'korma', 'chutney']


In [286]:
data = {'coef' : regr.coef_,
       'dish' : sample_menu}
a_df = pd.DataFrame(data)
a_df

Unnamed: 0,coef,dish
0,0.531517,mulligatawny
1,0.34746,onion pakora
2,-0.02015,samosa
3,-0.116869,paneer pakora
4,-0.15254,lasuni gobi
5,-0.1329,aloo tikki chaat
6,-0.093401,murgh
7,-0.119694,tikka
8,-0.217372,chicken tikka masala
9,-0.126653,rogan josh


In [287]:
a_df.sort_values('coef')

Unnamed: 0,coef,dish
8,-0.217372,chicken tikka masala
4,-0.15254,lasuni gobi
5,-0.1329,aloo tikki chaat
9,-0.126653,rogan josh
7,-0.119694,tikka
3,-0.116869,paneer pakora
6,-0.093401,murgh
12,-0.071268,poori
13,-0.03857,korma
2,-0.02015,samosa


In [303]:
reg = linear_model.Lasso(alpha = 0.00001)
reg.fit(X,y)

Lasso(alpha=1e-05, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [304]:
reg.coef_

array([ 0.5204674 ,  0.29269704, -0.01859296, -0.10579765, -0.04293298,
       -0.12163523, -0.09049454, -0.12019969, -0.21645996, -0.12134035,
        0.02716198,  0.05621224, -0.06131842, -0.03717856,  0.10330106])

In [305]:
data = {'coef' : reg.coef_,
       'dish' : sample_menu}
a_df = pd.DataFrame(data)
a_df.sort_values('coef')

Unnamed: 0,coef,dish
8,-0.21646,chicken tikka masala
5,-0.121635,aloo tikki chaat
9,-0.12134,rogan josh
7,-0.1202,tikka
3,-0.105798,paneer pakora
6,-0.090495,murgh
12,-0.061318,poori
4,-0.042933,lasuni gobi
13,-0.037179,korma
2,-0.018593,samosa
