In [356]:
import numpy as np
import pandas as pd
import json
import time

### How do we predict what rating a user is going to give?
1) Business side:
    - location
    - type
    - hours
    - parking availability
    - ambience
    - takeout?
2) User side:
    - how long on yelp
    - elite status
    - review count
    - useful/funny/cool
    - home

# TODO
    - Matthew
        - isolate restaurants in business_df
        - preprocessing: marking reviews as good or bad
    - feature extraction
        - hometown
        - number of tips by user

    - other preprocessing
        - yelpin since: timestamp today - yelping since --- COMPLETE
        - elite status: how many years have they been elite
        - only consider business with certain number of reviews
        - only consider "active users"

# Next
    - what happens if we remove "inactive" users

        

In [357]:
business_df = pd.read_csv("sample_restaurant.csv")
checkin_df = pd.read_csv("sample_checkin.csv")
review_df = pd.read_csv("sample_review.csv")
tip_df = pd.read_csv("sample_tip.csv")
user_df = pd.read_csv("sample_user.csv")

In [358]:
business_df.drop(columns = 'Unnamed: 0', inplace = True)
business_loc = business_df.iloc[:, [0,3, 4, 5, 6, 7]]

In [359]:
user_useful = user_df.iloc[:, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]]
user_useful.set_index('user_id', inplace = True)

In [360]:
review_df.drop(columns = 'Unnamed: 0', inplace=True)

In [361]:
rJoinU = review_df.join(user_useful, on = 'user_id', lsuffix='_Rev', rsuffix = '_User')

In [362]:
rJoinU.dropna(inplace = True)

In [363]:
business_loc.set_index('business_id', inplace= True)

In [364]:
rJUJBloc = rJoinU.join(business_loc, on='business_id', lsuffix='_rJU', rsuffix='_bloc')

In [365]:
rJUJBloc.dropna(inplace= True)

In [366]:
hometown = rJUJBloc.groupby(['user_id'])['city'].agg(lambda x: pd.Series.mode(x)[0]).to_frame()

In [367]:
hometown

Unnamed: 0_level_0,city
user_id,Unnamed: 1_level_1
--_r6E98SNIrGU7weyNxbw,West Chester
--rpFxc_x14BWF708pfR6Q,Philadelphia
-0U1fhFq9zl4AyKafrKHFw,Nashville
-0aInSHjCWLfiNqfgmWnow,Philadelphia
-0aZWYi2YicFaLxTru96nA,Philadelphia
...,...
zwXmvn1op5LuFF2Kveqaug,Indianapolis
zxNtaKTMzZzum8ek2v4UXg,Reno
zxuxd6Hz2tKcpgZ71dYEcw,Edmonton
zyNrXvJyYdC34tS6BcCykA,Philadelphia


In [368]:
data = rJUJBloc.merge(hometown, how = 'left', on="user_id", suffixes=('_x', '_y'))

In [369]:
data['is_home'] = (data['city_x'] == data['city_y']).astype(int)

## Yelping Since

In [370]:
data['time_since'] = (pd.Timestamp(2022, 11, 29) - pd.to_datetime(data.date)).dt.days

In [371]:
data.columns

Index(['review_id', 'user_id', 'business_id', 'stars', 'useful_Rev',
       'funny_Rev', 'cool_Rev', 'text', 'date', 'name', 'review_count',
       'yelping_since', 'useful_User', 'funny_User', 'cool_User', 'elite',
       'friends', 'fans', 'average_stars', 'city_x', 'state', 'postal_code',
       'latitude', 'longitude', 'city_y', 'is_home', 'time_since'],
      dtype='object')

## Matthew's Portion: marking "good" reviews

In [372]:
merged = business_df.merge(review_df, on='business_id')

In [373]:
merged.dropna(inplace=True)

In [374]:
merged['goodReview'] = (merged['stars_x'] < merged['stars_y']).astype(int)

In [375]:
goodRev = merged.loc[:, ['review_id','business_id', 'stars_x', 'goodReview']]

In [376]:
goodRev.rename(columns={'stars_x': 'stars_bus'}, inplace=True)

In [377]:
dataFull = data.merge(goodRev, on='review_id')

# Try a model

In [379]:
# Logistic
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

#### Using features: 
    - star rating of review
    - review count of user
    - useful of user
    - funny of user
    - cool of user
    - # fans of user
    - average star rating OF USER
    - is hometown
    - time since joining yelp

## TODO:
    - try a 70-30 train test split and evaluate performance
    - extract elite
    - extract # of friends

# Elite

In [380]:
elite = data.loc[:, ['user_id', 'elite']]

In [381]:
elite['glitch'] = elite['elite'].str.contains('20,20').astype(int)

In [382]:
elite['elite'] = elite['elite'].str.split(',')

In [383]:
elite['count'] = elite['elite'].apply(lambda x: len(x))

In [384]:
elite['trueCount'] = elite['count'] - elite['glitch']

In [405]:
elite.drop(columns=['elite', 'glitch', 'count'], inplace = True)

In [408]:
elite = elite.rename(columns={'trueCount': 'eliteCount'})

In [410]:
elite = elite.drop_duplicates(subset = 'user_id')

In [413]:
dataFull = dataFull.merge(elite, how = 'left', on = 'user_id')

In [415]:
friends = data.loc[:, ['user_id', 'friends']]

In [416]:
friends['numFriends'] = friends.friends.str.split(',').apply(lambda x: len(x))

In [417]:
friends.drop(columns = 'friends', inplace = True)

In [418]:
friends = friends.drop_duplicates(subset='user_id')

In [419]:
dataFull = dataFull.merge(friends, how = 'left', on = 'user_id')

In [439]:
dataFull = dataFull.drop_duplicates(subset='review_id')

In [447]:
dataFull

Unnamed: 0,review_id,user_id,business_id_x,stars,useful_Rev,funny_Rev,cool_Rev,text,date,name,...,latitude,longitude,city_y,is_home,time_since,business_id_y,stars_bus,goodReview,eliteCount,numFriends
0,NILjwITiFhpdhiE3SiHR8Q,WUgxsRUFjATha8L4qWYDww,fbQkVymvZ0dig8umltKhZQ,3,2,1,1,They give you LOTS of food and you will not go...,2020-09-23 21:49:15,Glenn,...,27.948237,-82.527587,Tampa,1,796,fbQkVymvZ0dig8umltKhZQ,4.0,0,10,168
1,hOwUw1fz3aB4ZhgqCetGTw,fr1Hz2acAb3OaL3l6DyKNg,RewAum_fQ0lXdFoUX18J0g,5,17,4,13,It is 4.5 stars. Not yet 5 stars. Mel has a go...,2020-08-18 19:09:46,Boon,...,28.031316,-82.451534,Tampa,1,832,RewAum_fQ0lXdFoUX18J0g,5.0,0,8,825
15,bbVN5xloXk4Bzf6AF0wwMw,8fkeTZoaojIL3GIHvvTCyA,LdECsE8lJS7v5GTFTcjPSg,4,0,0,0,"Before reading on, make sure id you eat here y...",2018-12-16 00:07:13,Amy,...,27.725209,-82.742302,St. Pete Beach,1,1443,LdECsE8lJS7v5GTFTcjPSg,4.0,0,3,34
16,OQL_x9smctWsVq_qQTOlcg,_zC91aGLwBjnMJji-rpG9A,cGX-1IUwXOjkUqZbkKYcjw,5,8,0,3,"Fogo De Chao\r\n1337 Chestnut Street, Philadel...",2015-11-03 03:01:00,John,...,39.950917,-75.162971,Philadelphia,1,2582,cGX-1IUwXOjkUqZbkKYcjw,4.0,1,2,68
17,P-tNl7Z4lBq-nPvzU0S2Iw,pUNvLJwdJIOemOgU98mp1w,2HxkdqHmbYGj_BH1bLaiSw,4,5,2,3,Nektar is a beautiful wine bar right in the ce...,2020-04-10 12:53:26,Amy,...,40.362115,-74.950957,Lansdale,0,962,2HxkdqHmbYGj_BH1bLaiSw,4.0,0,10,335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18127,B1WIpl0eXoHEQw6ZkvPf3Q,nHXo42Jq950NrEbCyvF4UA,7-5eljD7oqIdLYU5_sqpqA,5,2,0,0,We haven't had Noble Roman's Pizza for a very ...,2020-06-11 02:49:34,Karen,...,39.978454,-86.154787,McCordsville,0,900,7-5eljD7oqIdLYU5_sqpqA,3.5,1,4,38
18131,bZGNw6bx4cM5BwMe-CcT7A,gWZETLPu_ihAtLOYtrm0mw,Obk11u4CHhyqwkiJ517kaw,5,16,2,13,After asking friends for recommendations in Ne...,2022-01-10 16:10:13,Gabriella,...,29.953541,-90.074752,New Orleans,1,322,Obk11u4CHhyqwkiJ517kaw,4.0,1,2,2048
18133,cc8VettDuW-6nGLosYIIMA,qJMlmtF60eJnH2MWNps6Og,UM6XgOtTX4DWWcGDTWtjEg,5,0,0,0,We came to handlebar for geeks who drink trivi...,2019-11-15 17:52:32,Maggie,...,38.627945,-90.251275,St Louis,1,1109,UM6XgOtTX4DWWcGDTWtjEg,4.0,1,3,246
18134,s80hq-ipoqi9wgAQsfnR7g,6z_Kd-MrM66rzWQX9zZRjA,5gIfmupQTP3Lk4wIgNfoEQ,3,0,0,0,Had lunch there and the menu was very simple. ...,2016-08-10 21:37:23,Nancy,...,39.928215,-86.024509,Fishers,1,2301,5gIfmupQTP3Lk4wIgNfoEQ,4.0,0,5,109


# Model

In [441]:
y = dataFull['goodReview']
X = dataFull.loc[:, ['review_count', 'useful_User', 'funny_User', 'cool_User', 'fans', 'average_stars', 'is_home', 'time_since', 'eliteCount', 'numFriends']]

In [442]:
scaler = StandardScaler()
X_ss = scaler.fit_transform(X)

In [443]:
clf = LogisticRegression(penalty='none').fit(X_ss, y)

In [444]:
clf.score(X_ss, y)

0.6022417449257801

In [425]:
params = clf.get_params()

In [426]:
params

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [448]:
print(clf.coef_)

[[-0.02703119 -0.15632987  0.23585574 -0.05737717 -0.08529643  0.51548612
  -0.0277329  -0.00225487 -0.08839213  0.08566784]]
