In [143]:
# Set width of Jupyter cell
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:97% !important; }</style>"))

In [144]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt 
import glob
import os
import json
import geopy.distance
from scipy.spatial.distance import pdist, squareform
import seaborn as sns
from datetime import date
from dateutil.relativedelta import relativedelta
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import foodie_features
import yelp_data_cleaning

In [145]:
pd.set_option('display.max_row', 200)
pd.set_option('display.max_columns', 25)

In [146]:
def pull_raw_business_data():
    raw_businesses_df = pd.read_json('yelp_reviews/business.json',lines=True)
    return raw_businesses_df

In [147]:
NOV_14_2018 = date(2018, 11, 14)

def calculate_additional_features(businesses_df, reviews_df):
    
    open_restaurants_yelp_api_data_df = pd.read_pickle('./open_restaurants_yelp_api_data_df.pkl').rename(columns={'price':'cost'})
    open_restaurants_yelp_api_data_df['cost'] = open_restaurants_yelp_api_data_df['cost'].map(lambda x: len(x))
    businesses_df['latitude'].update(open_restaurants_yelp_api_data_df['latitude'])
    businesses_df['longitude'].update(open_restaurants_yelp_api_data_df['longitude'])
    
    # is restaurant claimed
    businesses_df['is_claimed'] = [False for i in range(businesses_df.shape[0])]
    businesses_df['is_claimed'].update(open_restaurants_yelp_api_data_df['is_claimed'])
    businesses_df['is_claimed'] = businesses_df['is_claimed'].apply(lambda x: 1 if x is True else 0)
    
    # compute actual review counts since the dataset is wrong
    businesses_df['actual_review_count'] = reviews_df['business_id'].value_counts()
    
    # compute actual star rating based on actual reviews since the dataset might be wrong and yelp doesn't compute precise ratings
    businesses_df['actual_stars'] = reviews_df.groupby('business_id')['stars'].mean()
    
    # is restaurant a chain (find if business name is not unique)
    chains = businesses_df[businesses_df.name.duplicated(keep=False)].sort_values(by='name').name.unique()
    businesses_df['is_chain'] = businesses_df.name.isin(chains)
    businesses_df['is_chain'] = businesses_df['is_chain'].apply(lambda x: 1 if x is True else 0)
    
    # check if location has multiple closures (TODO fix the fact that the first duplicate location shouldn't matter)
    duplicate_lat_filter = businesses_df.latitude.duplicated(keep=False)
    duplicate_long_filter = businesses_df.longitude.duplicated(keep=False)
    duplicate_locations_df = businesses_df[ (duplicate_lat_filter) & (duplicate_long_filter) ].sort_values(by='latitude')[['latitude','longitude']]
    duplicate_locations = businesses_df.isin(duplicate_locations_df)
    
    businesses_df['duplicate_location'] = duplicate_locations.latitude & duplicate_locations.longitude
    businesses_df['duplicate_location'] = businesses_df['duplicate_location'].apply(lambda x: 1 if x is True else 0)
        
    # add if business has parking
    parking_attrbs = [(key, value[u'BusinessParking']) \
                      if value is not None and 'BusinessParking' in value else (key, 'False') \
                      for key,value in yelp_businesses_df.attributes.iteritems()]
    has_parking_tuples = [(attrbs[0], 'True' in attrbs[1]) for attrbs in parking_attrbs]
    business_ids, has_parking = zip(*has_parking_tuples)
    has_parking_series = pd.Series(has_parking, business_ids)

    businesses_df['has_parking'] = has_parking_series
    businesses_df['has_parking'] = businesses_df['has_parking'].apply(lambda x: 1 if x is True else 0)
    
    # add in cost of restaurant (TODO fix the fact that some restaurants had incomplete data so have 0 under price)
    price_attrbs = [(key, int(value[u'RestaurantsPriceRange2'])) \
                    if value is not None and 'RestaurantsPriceRange2' in value and value[u'RestaurantsPriceRange2'] != u'None' else (key, 1) \
                    for key, value in yelp_businesses_df.attributes.iteritems() ]
    business_ids, cost_rating = zip(*price_attrbs)
    cost = pd.Series(cost_rating, business_ids)
    
    businesses_df['cost'] = cost
    businesses_df['cost'].update(open_restaurants_yelp_api_data_df['cost'])
    
    # proxy for when restaurant open/closed, make sure to include updated dates
    bus_date_df = reviews_df[reviews_df.business_id.duplicated(keep=False)].sort_values(by=['business_id','date'])
    open_dates = bus_date_df.drop_duplicates(subset=['business_id'],keep='first').set_index('business_id')
    closed_dates = bus_date_df.drop_duplicates(subset=['business_id'],keep='last').set_index('business_id')

    open_closed_dates_df = pd.DataFrame(data={'open_dates' : open_dates.date, 'closed_dates' : closed_dates.date, 'is_open' : businesses_df.is_open}, index=open_dates.index)
    open_closed_dates_df['open_dates'] = open_closed_dates_df['open_dates'].apply(lambda x: x.date())
    open_closed_dates_df['closed_dates'] = open_closed_dates_df['closed_dates'].apply(lambda x: x.date())

    businesses_df['open_dates'] = open_closed_dates_df.open_dates
    businesses_df['closed_dates'] = open_closed_dates_df.closed_dates
    
    businesses_df.loc[businesses_df['is_open'] == 1, 'days_since_closed'] = 0
    businesses_df.loc[businesses_df['is_open'] == 1, 'closed_dates'] = NOV_14_2018 # 2018-11-14 is the last day in this dataset
    
    closed_ages = NOV_14_2018 - pd.to_datetime(closed_dates.date, format='%Y%m%d').dt.date
    open_ages = businesses_df['closed_dates'] - pd.to_datetime(open_dates.date, format='%Y%m%d').dt.date
    
    businesses_df['age (in days)'] = open_ages.dt.days
    businesses_df['days_since_closed'] = closed_ages.dt.days
    
    # relative review count, rating, price

    return chains, duplicate_locations_df

In [148]:
reload(foodie_features)
reload(yelp_data_cleaning)

<module 'yelp_data_cleaning' from 'yelp_data_cleaning.pyc'>

In [149]:
VEGAS_VISITORS_BY_YEAR = {'2018' : 42.12, '2017': 39.01, '2016': 42.94, '2015' : 42.31, '2014' : 41.13, '2013' : 39.67, '2012' : 39.73, '2011' : 38.93, '2010' : 37.34, '2009' : 36.35}
VEGAS_VISITORS_2018_BY_MONTH = {'1' : 3393900, '2' : 3130400, '3' : 3749800, '4'  : 3548000, '5'  : 3630400, '6'  : 3565400,\
                                '7' : 3659600, '8' : 3555200, '9' : 3457500, '10' : 3680600, '11' : 3478500, '12' : 3267600 }
VEGAS_POPULATION_BY_YEAR = {'2018' : 42.12, '2017': 39.01, '2016': 42.94, '2015' : 42.31, '2014' : 41.13, '2013' : 39.67, '2012' : 39.73, '2011' : 38.93, '2010' : 37.34, '2009' : 36.35}
def build_X_and_y(businesses_df, reviews_df, date, load_NLP=False, do_distance=False, features=None):
    """
    businesses_df: dataframe of businesses 
    date: needs to be in a date object from the datetime library
    """
    before_date_filter = reviews_df.date <= pd.Timestamp(date)
    after_date_filter = reviews_df.date > pd.Timestamp(date)
    after_six_months_after_date_filter = reviews_df.date > pd.Timestamp(date + relativedelta(months=6))
        
    reviews_before_date_df = reviews_df[before_date_filter] # all reviews before date (i.e. they were open before date)
    reviews_after_date_df = reviews_df[after_date_filter] # all reviews after date (i.e. they were open after date)
    
    # restaurants with reviews both before and after date
    restaurant_ids = list( set(reviews_before_date_df.business_id.values) & set(reviews_after_date_df.business_id.values) )
    reviews_for_open_businesses = reviews_df[reviews_df.business_id.isin(restaurant_ids)]  
    reviews_for_open_businesses_before_date = reviews_for_open_businesses[before_date_filter]
    
    # find number of other restaurants with same yelp categories -- TODO think about whether it should be at the time of date 
    business_category_counts = foodie_features.calculate_category_counts(businesses_df)
    
    # NLP score -- grab 3 most recent reviews since that's what allowed by yelp
    sentiment, avg_review_length = foodie_features.calculate_review_sentiment_and_length(reviews_for_open_businesses_before_date, load_NLP)
    
    # compute review count and rating before date
    review_count_on_date = reviews_for_open_businesses_before_date['business_id'].value_counts()
    rating_on_date = reviews_for_open_businesses_before_date.groupby('business_id')['stars'].mean()
    
    # compute smart rating (using if the reviews were useful)
    smart_rating_on_date = foodie_features.calculate_smart_ratings(reviews_for_open_businesses_before_date)
    
    # get number of businesses within fixed distance
    count_of_businesses_within_a_tenth_mile = pd.Series()
    count_of_businesses_within_a_quarter_mile = pd.Series()
    count_of_businesses_within_a_half_mile = pd.Series()
    count_of_businesses_within_1_mile = pd.Series()
    count_of_businesses_within_5_miles = pd.Series()
    count_of_businesses_within_10_miles = pd.Series()
    
    for _, city_df in businesses_df.reindex(restaurant_ids).groupby('city'):
        if do_distance == True:        
            coords = city_df[['latitude','longitude']].values
            geopy_distance_func = lambda x,y: geopy.distance.distance(x,y).miles
            trimmed_distances = pdist(coords, metric=geopy_distance_func)
            np.savetxt('distances_all_%s.csv'%city_df.city.unique()[0], trimmed_distances, delimiter=',')
        else:
            trimmed_distances = np.genfromtxt('distances_all_%s.csv'%city_df.city.unique()[0], delimiter=',')
        trimmed_dist_matrix = squareform(trimmed_distances)
        count_of_businesses_within_a_tenth_mile = count_of_businesses_within_a_tenth_mile.append( pd.Series( np.sum((trimmed_dist_matrix < 0.1),axis=1) - 1, index = city_df.index ) )
        count_of_businesses_within_a_quarter_mile = count_of_businesses_within_a_quarter_mile.append ( pd.Series( np.sum((trimmed_dist_matrix < 0.25),axis=1) - 1, index = city_df.index ) )
        count_of_businesses_within_a_half_mile = count_of_businesses_within_a_half_mile.append ( pd.Series( np.sum((trimmed_dist_matrix < 0.5),axis=1) - 1, index = city_df.index ) )
        count_of_businesses_within_1_mile = count_of_businesses_within_1_mile.append ( pd.Series( np.sum((trimmed_dist_matrix < 1),axis=1) - 1, index = city_df.index ) )
        count_of_businesses_within_5_miles = count_of_businesses_within_5_miles.append ( pd.Series( np.sum((trimmed_dist_matrix < 5),axis=1) - 1, index = city_df.index ) )
        count_of_businesses_within_10_miles = count_of_businesses_within_10_miles.append ( pd.Series( np.sum((trimmed_dist_matrix < 10),axis=1) - 1, index = city_df.index ) )
    
    #### TARGET VARIABLE -- Is a restaurant open 6 months after the input date ####
    is_closed_six_months_after_date_series = foodie_features.calculate_future_restaurant_closure(businesses_df, restaurant_ids, \
                                                                                                 reviews_for_open_businesses, after_six_months_after_date_filter)
    ############################################################################################################################################
    data = businesses_df.copy()
    data = data.reindex(restaurant_ids)
    
    data['review_count_before_date'] = review_count_on_date
    data['rating_before_date'] = rating_on_date
    data['smart_rating_before_date'] = smart_rating_on_date
    data['age_at_date'] = (date - data['open_dates']).dt.days  
    data['sentiment'] = sentiment
    data['avg_review_length'] = avg_review_length
    data['business_category_counts'] = business_category_counts
    data['num_within_a_tenth_mile_at_date'] = count_of_businesses_within_a_tenth_mile
    data['num_within_a_quarter_mile_at_date'] = count_of_businesses_within_a_quarter_mile
    data['num_within_a_half_mile_at_date'] = count_of_businesses_within_a_half_mile
    data['num_within_1_mile_at_date'] = count_of_businesses_within_1_mile
    data['num_within_5_miles_at_date'] = count_of_businesses_within_5_miles
    data['num_within_10_miles_at_date'] = count_of_businesses_within_10_miles
    data = data.join(pd.get_dummies(data['cost'],prefix = 'cost')).drop(['cost'], axis = 1 )
    data = data.join(pd.get_dummies(data['city'],prefix = 'city')).drop(['city'], axis = 1 )
    #data.drop(['attributes','address','categories','city','latitude','longitude','name','days_since_closed'],axis=1, inplace=True)
    #data.drop(['open_dates','closed_dates'], axis=1, inplace=True)
        
    X = data
    
    if features is not None:
        X = X[features]
        
    y = is_closed_six_months_after_date_series
    
    return X, y

In [150]:
raw_businesses_df = pull_raw_business_data()

In [151]:
cities = ['Las Vegas', 'Toronto', 'Phoenix']
yelp_businesses_df, categories = yelp_data_cleaning.clean_business_data(raw_businesses_df, type_of_business_list=['Restaurant'], city_filter_list=cities, \
                                                     remove_hours=True, required_num_of_closed_thresh_in_city=1000)
#del raw_businesses_df
yelp_businesses_df

Unnamed: 0_level_0,address,attributes,categories,city,is_open,latitude,longitude,name,review_count,stars
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1Dfx3zM-rW4n-31KeC8sJg,2450 E Indian School Rd,"{u'BusinessParking': u'{'garage': False, 'stre...","Restaurants, Breakfast & Brunch, Mexican, Taco...",Phoenix,1,33.495194,-112.028588,Taco Bell,18,3.0
PZ-LZzSlhSe9utkQYU8pFg,"1775 E Tropicana Ave, Ste 29","{u'BusinessParking': u'{'garage': False, 'stre...","Restaurants, Italian",Las Vegas,0,36.100016,-115.128529,Carluccio's Tivoli Gardens,40,4.0
tstimHoMcYbkSC4eBA1wEg,6055 E Lake Mead Blvd,"{u'GoodForMeal': u'{'dessert': False, 'latenig...","Mexican, Restaurants, Patisserie/Cake Shop, Fo...",Las Vegas,1,36.195615,-115.040529,Maria's Mexican Restaurant & Bakery,184,4.5
NDuUMJfrWk52RA-H-OtrpA,1170 Queen Street W,"{u'BusinessParking': u'{'garage': False, 'stre...","Juice Bars & Smoothies, Food, Restaurants, Fas...",Toronto,1,43.642889,-79.425429,Bolt Fresh Bar,57,3.0
SP_YXIEwkFPPl_9anCYmpQ,1051 Bloor Street W,"{u'BusinessParking': u'{'garage': False, 'stre...","Restaurants, Nightlife, Breakfast & Brunch, Ve...",Toronto,0,43.660494,-79.432099,The Steady Cafe & Bar,29,3.5
kANF0dbeoW34s2vwh6Umfw,6125 Spring Mountain Rd,"{u'BusinessParking': u'{'garage': False, 'stre...","Fast Food, Food, Restaurants, Ice Cream & Froz...",Las Vegas,0,36.125031,-115.225620,Dairy Queen,33,2.0
44YFU284Z3KDEy25QyVoUw,"13843 N Tatum Blvd, Ste 15","{u'GoodForMeal': u'{'dessert': False, 'latenig...","Chinese, Restaurants",Phoenix,1,33.613020,-111.977036,Nee House Chinese Restaurant,269,3.5
RFbMVekR8lU9tPJ8sWrwHg,619 Mount Pleasant Road,"{u'GoodForMeal': u'{'dessert': False, 'latenig...","Italian, Restaurants",Toronto,1,43.704229,-79.388230,Tavolino,18,4.0
NPHZkn1e-tSJAbo8Zm9rYw,1614 Queen Street E,"{u'BusinessParking': u'{'garage': False, 'stre...","Tex-Mex, Mexican, Restaurants",Toronto,1,43.666181,-79.316468,Burrito Bandidos,43,3.5
jtlAfuINMCYVFpjwMgy2jw,5241 Dundas Street W,"{u'BusinessParking': u'{'garage': False, 'stre...","Grocery, Restaurants, Steakhouses, Butcher, Food",Toronto,0,43.638879,-79.537632,Medium Rare,22,4.0


In [152]:
business_ids = yelp_businesses_df.index.values
reviews_df = yelp_data_cleaning.clean_reviews_data(business_ids)
reviews_df

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,HojKatUfX4UHWaPqtl63NQ,5,2017-07-17 10:32:50,3,ywpTVqkG1a-SpsIN-PImqA,4,Opened in October 2015 as a rebrand of culinar...,6,U4INQZOPSUaj8hMjLlZ3KA
1,pHJu8tj3sI8eC5aIHLFEfQ,0,2016-01-29 04:44:56,0,hBXTJJWYLRye5KnGjN8Dcw,5,I had a very yummy pizza at Nora's tonight. T...,0,YgGp54AKCFqCIKcQhGHwRw
2,fGw9HRq8fdP9fMtZclTt8g,0,2015-02-01 04:32:42,0,KfLbiqrNCAsmIqQqibFf3g,3,Drive thru sucks! Don't go here unless your ab...,0,o2FbUvgG9MJIxYo8SIH7dg
3,1Fpk8ibHhZYnCw8fnGny8w,0,2017-11-17 21:02:19,0,wttOlUAks_3ACI5DMF8bLg,4,My friend and I came here several times on wee...,0,BW3OZfIlON-F_aMXfnAOjg
4,WX8cx6tfFcEn0cK4cIbWag,0,2018-10-28 04:05:42,0,_hX5S_Jo0SO6jklB3Lu7wA,1,"First, we made a reservation & they weren't pr...",0,OS5p_5bdHXKCcN7ZEK2TXQ
5,Sl42oSRXpwbjWd4V-Fxhaw,0,2018-10-02 23:54:46,1,qEV4RHssoNqvtO1Cf3m4rg,1,So we go into Denny's tonight for dinner and i...,0,-ThnWtxc9MAtkqnUanO46g
6,3h5jLa-kbHg0cL7v9En0Lg,0,2012-12-30 20:09:04,0,ek-kaQnj6PLEwZGz_LSrdw,4,Low Prices - Great Food - Very nice Setting\nW...,4,d2WVEYfpHMtTjDwiLh_r6A
7,rdZ6xVABSoOVeAIiCaEj4w,0,2010-09-27 07:08:59,0,aPWQLR2RL8N3u03QWsXHYw,4,This is my first time here and my first time h...,0,7weuSPSSqYLUFga6IYP4pg
8,vx1gUYqYQE2O5rfOrUi-Cw,2,2014-10-26 06:47:21,1,bWYPgbBseNPVSGZpvNCmTw,1,How far this company has fallen. From a techno...,3,41cK3lqGlQkgml7fEltugw
9,G4hjhtA_wQ-tSOGpgGlDjw,0,2017-09-03 02:47:37,0,C0lqD5lXMB2J8FudJV4uDw,5,Really really good! Get the pulpo taco! Very c...,0,uGcpEaM2ecZ2RRHsN-e1LA


In [153]:
chains, duplicate_locations_df = calculate_additional_features(yelp_businesses_df, reviews_df)

In [154]:
yelp_businesses_df

Unnamed: 0_level_0,address,attributes,categories,city,is_open,latitude,longitude,name,review_count,stars,is_claimed,actual_review_count,actual_stars,is_chain,duplicate_location,has_parking,cost,open_dates,closed_dates,days_since_closed,age (in days)
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1Dfx3zM-rW4n-31KeC8sJg,2450 E Indian School Rd,"{u'BusinessParking': u'{'garage': False, 'stre...","Restaurants, Breakfast & Brunch, Mexican, Taco...",Phoenix,1,33.495194,-112.028588,Taco Bell,18,3.0,0,19,2.894737,1,0,0,1,2008-09-22,2018-11-14,7,3705
PZ-LZzSlhSe9utkQYU8pFg,"1775 E Tropicana Ave, Ste 29","{u'BusinessParking': u'{'garage': False, 'stre...","Restaurants, Italian",Las Vegas,0,36.100016,-115.128529,Carluccio's Tivoli Gardens,40,4.0,0,41,4.097561,0,0,1,2,2007-08-23,2016-08-09,827,3274
tstimHoMcYbkSC4eBA1wEg,6055 E Lake Mead Blvd,"{u'GoodForMeal': u'{'dessert': False, 'latenig...","Mexican, Restaurants, Patisserie/Cake Shop, Fo...",Las Vegas,1,36.195598,-115.040622,Maria's Mexican Restaurant & Bakery,184,4.5,1,189,4.301587,0,0,1,2,2010-03-23,2018-11-14,0,3158
NDuUMJfrWk52RA-H-OtrpA,1170 Queen Street W,"{u'BusinessParking': u'{'garage': False, 'stre...","Juice Bars & Smoothies, Food, Restaurants, Fas...",Toronto,1,43.642889,-79.425429,Bolt Fresh Bar,57,3.0,0,60,3.116667,0,0,1,2,2013-11-20,2018-11-14,29,1820
SP_YXIEwkFPPl_9anCYmpQ,1051 Bloor Street W,"{u'BusinessParking': u'{'garage': False, 'stre...","Restaurants, Nightlife, Breakfast & Brunch, Ve...",Toronto,0,43.660494,-79.432099,The Steady Cafe & Bar,29,3.5,0,29,3.379310,0,1,1,2,2013-07-20,2017-10-15,395,1548
kANF0dbeoW34s2vwh6Umfw,6125 Spring Mountain Rd,"{u'BusinessParking': u'{'garage': False, 'stre...","Fast Food, Food, Restaurants, Ice Cream & Froz...",Las Vegas,0,36.125031,-115.225620,Dairy Queen,33,2.0,0,33,2.242424,1,0,1,1,2008-11-02,2017-01-05,678,2986
44YFU284Z3KDEy25QyVoUw,"13843 N Tatum Blvd, Ste 15","{u'GoodForMeal': u'{'dessert': False, 'latenig...","Chinese, Restaurants",Phoenix,1,33.613020,-111.977036,Nee House Chinese Restaurant,269,3.5,0,285,3.400000,0,0,1,2,2007-07-28,2018-11-14,31,4127
RFbMVekR8lU9tPJ8sWrwHg,619 Mount Pleasant Road,"{u'GoodForMeal': u'{'dessert': False, 'latenig...","Italian, Restaurants",Toronto,1,43.704229,-79.388230,Tavolino,18,4.0,0,19,4.157895,1,0,1,2,2016-01-18,2018-11-14,267,1031
NPHZkn1e-tSJAbo8Zm9rYw,1614 Queen Street E,"{u'BusinessParking': u'{'garage': False, 'stre...","Tex-Mex, Mexican, Restaurants",Toronto,1,43.666181,-79.316468,Burrito Bandidos,43,3.5,0,43,3.418605,1,0,1,2,2012-10-13,2018-11-14,23,2223
jtlAfuINMCYVFpjwMgy2jw,5241 Dundas Street W,"{u'BusinessParking': u'{'garage': False, 'stre...","Grocery, Restaurants, Steakhouses, Butcher, Food",Toronto,0,43.638879,-79.537632,Medium Rare,22,4.0,0,22,3.818182,0,0,1,3,2009-12-03,2015-12-18,1062,2206


In [155]:
features = ['is_chain','duplicate_location','cost_1','cost_2','cost_3','cost_4', 'is_claimed', 'sentiment', 'avg_review_length', \
            'num_within_10_miles_at_date', 'review_count_before_date', 'rating_before_date' ] + ['city_%s'%city for city in cities]
X, y = build_X_and_y(yelp_businesses_df, reviews_df, date(2018, 3, 21), load_NLP=False, do_distance=False, features=features)

  category_counts.append( df[df.categories.str.contains(category)].shape[0] )
  category_counts.append( df[df.categories.str.contains(category)].shape[0] )
  category_counts.append( df[df.categories.str.contains(category)].shape[0] )


Checkpoint : 0
Checkpoint : 10000
Checkpoint : 20000


  reviews_after_six_months_after_date_df = reviews_for_open_businesses[after_six_months_after_date_filter]


In [156]:
X

Unnamed: 0_level_0,is_chain,duplicate_location,cost_1,cost_2,cost_3,cost_4,is_claimed,sentiment,avg_review_length,num_within_10_miles_at_date,review_count_before_date,rating_before_date,city_Las Vegas,city_Toronto,city_Phoenix
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
rnvsL0oFZpzpO61GXqBF6g,0,0,0,1,0,0,0,0.823433,284.666667,1576,332,4.328313,0,0,1
XYIPXJ9parr9FtvvcGI1SA,0,0,0,1,0,0,0,0.644500,260.333333,3335,327,4.039755,0,1,0
xsdRrNJuNumvrwoQ2Tt8tQ,1,0,1,0,0,0,0,-0.643667,589.666667,595,115,3.173913,0,0,1
LpwmR1unntc_8KdVfwNH9g,0,1,1,0,0,0,0,0.824300,615.000000,3306,11,4.545455,0,1,0
EBHzEtuOJz474NwRQFqJbg,1,0,0,0,1,0,0,-0.809733,308.666667,3288,13,1.769231,0,1,0
By4Suw6S-whbg51y_967BQ,0,0,1,0,0,0,0,0.952833,484.000000,3483,15,4.066667,0,1,0
-grWXL6_ngCGcXp8IbQU0g,0,0,1,0,0,0,0,0.925767,334.666667,3490,6,4.666667,0,1,0
O-04UXVtgezKPiQJOll3hg,1,1,1,0,0,0,1,0.747167,431.000000,2965,98,4.102041,1,0,0
bVqNtwcwIz2gAacHWYF2lw,0,0,1,0,0,0,0,0.701033,403.000000,1477,134,3.589552,0,0,1
Nv1PVoMJjKSutxCnGRYbnA,1,0,1,0,0,0,0,-0.283333,270.333333,1807,30,2.866667,0,0,1


In [None]:
# EDA of features
num_of_close = X[ (X.index.isin(y[y == 1].index)) ].shape[0]
num_of_open = X[ (X.index.isin(y[y == 0].index)) ].shape[0]
print num_of_open, num_of_close

num_cost_1_closed = float(X[ (X.index.isin(y[y == 1].index)) & (X.cost_1 == 1) ].shape[0])
num_cost_1_open =  float( X[ (X.index.isin(y[y == 0].index)) & (X.cost_1 == 1)].shape[0])
num_cost_2_closed = float( X[ (X.index.isin(y[y == 1].index)) & (X.cost_2 == 1) ].shape[0])
num_cost_2_open = float( X[ (X.index.isin(y[y == 0].index)) & (X.cost_2 == 1)].shape[0])
num_cost_3_closed =  float( X[ (X.index.isin(y[y == 1].index)) & (X.cost_3 == 1) ].shape[0])
num_cost_3_open = float( X[ (X.index.isin(y[y == 0].index)) & (X.cost_3 == 1)].shape[0])
num_cost_4_closed = float( X[ (X.index.isin(y[y == 1].index)) & (X.cost_4 == 1) ].shape[0])
num_cost_4_open = float( X[ (X.index.isin(y[y == 0].index)) & (X.cost_4 == 1)].shape[0])

avg_cost_closed = (1 * num_cost_1_closed + 2 * num_cost_2_closed + 3 * num_cost_3_closed + 4 * num_cost_4_closed) / num_of_close
avg_cost_open = (1 * num_cost_1_open + 2 * num_cost_2_open + 3 * num_cost_3_open + 4 * num_cost_4_open) / num_of_open

print "Avg cost for closed restaurants: ", avg_cost_closed
print "Avg cost for open resturants: ", avg_cost_open

print "Sentiment for closed:", X[X.index.isin(y[y == 1].index)].sentiment.mean(), "Sentiment for open:", X[X.index.isin(y[y == 0].index)].sentiment.mean()

print "Avg review length for closed:", X[X.index.isin(y[y == 1].index)].avg_review_length.mean(), "Avg review length for open:", X[X.index.isin(y[y == 0].index)].avg_review_length.mean()

print "Avg rating for closed:", X[X.index.isin(y[y == 1].index)].rating_before_date.mean(), "Avg rating for open:", X[X.index.isin(y[y == 0].index)].rating_before_date.mean()

print "Avg review count for closed:", X[X.index.isin(y[y == 1].index)].review_count_before_date.mean(), "Avg review count for open:", X[X.index.isin(y[y == 0].index)].review_count_before_date.mean()

print "Is chain for closed:", X[ (X.index.isin(y[y == 1].index)) & (X.is_chain == 1) ].shape[0], "Is not chain for closed:", X[ (X.index.isin(y[y == 1].index)) & (X.is_chain == 0)].shape[0] 
print "Is chain for open:", X[ (X.index.isin(y[y == 0].index)) & (X.is_chain == 1) ].shape[0], "Is not chain for open:", X[ (X.index.isin(y[y == 0].index)) & (X.is_chain == 0) ].shape[0]

print "Dup loc for closed:", X[ (X.index.isin(y[y == 1].index)) & (X.duplicate_location == 1) ].shape[0], "Not dup loc for closed:", X[ (X.index.isin(y[y == 1].index)) & (X.duplicate_location == 0)].shape[0] 
print "Dup loc for open:", X[ (X.index.isin(y[y == 0].index)) & (X.duplicate_location == 1) ].shape[0], "Not dup loc for open:", X[ (X.index.isin(y[y == 0].index)) & (X.duplicate_location == 0) ].shape[0]

print "Is claimed for closed:", X[ (X.index.isin(y[y == 1].index)) & (X.is_claimed == 1) ].shape[0], "Is not claimed for closed:", X[ (X.index.isin(y[y == 1].index)) & (X.is_claimed == 0)].shape[0] 
print "Is claimed for open:", X[ (X.index.isin(y[y == 0].index)) & (X.is_claimed == 1) ].shape[0], "Is not claimed for open:", X[ (X.index.isin(y[y == 0].index)) & (X.is_claimed == 0) ].shape[0]

In [None]:
### Train/test split
dropped_columns = []#['business_category_counts','num_rest_within_a_tenth_mile_at_date', 'num_rest_within_a_quarter_mile_at_date', 'num_rest_within_a_half_mile_at_date', \
                   #'num_rest_within_1_mile_at_date','num_rest_within_5_miles_at_date', 'num_rest_within_10_miles_at_date']
X_train, X_test, y_train, y_test = train_test_split(X.drop(dropped_columns,axis=1).values, y.values, test_size=0.2)
X_train_no_val, X_train_val, y_train_no_val, y_train_val = train_test_split(X_train, y_train, test_size=0.2)

In [None]:
#(y == 1).sum(), y.shape

In [None]:
#(y_train == 1).sum(), y_train.shape

In [None]:
#(y_train_no_val == 1).sum(), y_train_no_val.shape

In [None]:
sns.heatmap(X.corr())
#X.drop(['age_at_date','num_rest_within_a_half_mile_at_date'],axis=1).corr()
#X.corr()

In [None]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = [0, 1]
    plt.xticks(tick_marks, ['Open','Closed'], rotation=45)
    plt.yticks(tick_marks, ['Open','Closed'])
    for (j,i),label in np.ndenumerate(cm):
        plt.text(i,j,label,ha='center',va='center')
        plt.text(i,j,label,ha='center',va='center')
    #plt.tick_params( axis='both', which='both', bottom='off', top='off', labelbottom='off', right='off', left='off', labelleft='off')
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
param_grid_logistic = {
    'logistic__C': np.logspace(-4, 4, 4),
    'logistic__solver' : [ 'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'logistic__max_iter' : [500, 1000, 2000]
}
param_grid_rf = {
    'rf__max_depth' : [4, 6, 8],
    'rf__n_estimators' : [500, 1000, 2000]
}

#pipe = Pipeline([ ( 'scaler', StandardScaler() ), ( 'logistic', LogisticRegression(penalty='l2', class_weight='balanced') ) ])
#grid_search = GridSearchCV(pipe, param_grid_logistic, cv=5, scoring='roc_auc', n_jobs=-1) #roc_auc

pipe = Pipeline([ ( 'rf', RandomForestClassifier(class_weight='balanced') ) ])
grid_search = GridSearchCV(pipe, param_grid_rf, cv=5, scoring='roc_auc', n_jobs=-1) #roc_auc

grid_search.fit(X_train, y_train)
print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)
print(grid_search.best_params_)
print(grid_search.best_estimator_)
y_pred_train = grid_search.best_estimator_.predict(X_train)
print "F1 Score:", f1_score(y_train, y_pred_train)
print "Precision Score:", precision_score(y_train, y_pred_train)
print "Recall Score:", recall_score(y_train, y_pred_train)
print "Accuracy Score:", accuracy_score(y_train, y_pred_train)
confusion_matrix(y_true=y_train, y_pred=y_pred_train)

In [None]:
y_pred_test = grid_search.best_estimator_.predict(X_test)
print "F1 Score:", f1_score(y_test, y_pred_test)
print "Precision Score:", precision_score(y_test, y_pred_test)
print "Recall Score:", recall_score(y_test, y_pred_test)
print "Accuracy Score:", accuracy_score(y_test, y_pred_test)
print "Confusion matrix:", confusion_matrix(y_true=y_test, y_pred=y_pred_test)
plot_confusion_matrix(confusion_matrix(y_true=y_test, y_pred=y_pred_test))

In [None]:
print X.columns
print grid_search.best_estimator_.named_steps['scaler'].mean_
print grid_search.best_estimator_.named_steps['logistic'].coef_

In [None]:
model = Pipeline([ ( 'scaler', StandardScaler() ), ( 'lr', LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42, class_weight='balanced') ) ])
#model = Pipeline([ ( 'scaler', StandardScaler() ), ( 'lr', RandomForestClassifier(max_depth=5, n_estimators=1000, class_weight='balanced') ) ])
model.fit(X_train_no_val, y_train_no_val)
y_pred_val = model.predict(X_train_val)
print "F1 score:", f1_score(y_train_val, y_pred_val)
print "Accuracy score:", accuracy_score(y_train_val, y_pred_val)
plot_confusion_matrix(confusion_matrix(y_true=y_train_val, y_pred=y_pred_val))

In [None]:
#model.steps[1][1].coef_

In [None]:
clf = RandomForestClassifier(max_depth=3, class_weight='balanced')
clf.fit(X_train_no_val, y_train_no_val)
y_pred_val = clf.predict(X_train_val)
print "F1 score:", f1_score(y_train_val, y_pred_val)
#print "Precision Score:", precision_score(y_train_val, y_pred_val)
#print "Recall Score:", recall_score(y_train_val, y_pred_val)
print "Accuracy score:", accuracy_score(y_train_val, y_pred_val)
plot_confusion_matrix(confusion_matrix(y_true=y_train_val, y_pred=y_pred_val))

In [None]:
clf.feature_importances_

In [None]:
### Define simplest model
clf = LogisticRegression(solver='lbfgs',max_iter=1000,random_state=42,class_weight='balanced')
clf.fit(X_train_no_val, y_train_no_val)
y_pred_val = clf.predict(X_train_val)
print "F1 score:", f1_score(y_train_val, y_pred_val)
#print "Precision Score:", precision_score(y_train_val, y_pred_val)
#print "Recall Score:", recall_score(y_train_val, y_pred_val)
print "Accuracy score:", accuracy_score(y_train_val, y_pred_val)
plot_confusion_matrix(confusion_matrix(y_true=y_train_val, y_pred=y_pred_val)) #(clf.predict_proba(X_train_val) >= 0.5).astype(int).sum(axis=1)) #y_pred_val

In [None]:
# DO AUC/ROC, CHECK YEARS, GET MORE TIME BY MOVING BACK IN TIME, ALSO CHECK DIFFERENT FORECAST LENGTHS (1 MONTH, 3 MONTHS, 6 MONTHS), COMPARE ACROSS CITIES

In [None]:
balanced_class_ratio = float((y_train_no_val==0).sum())/(y_train_no_val==1).sum()
clf = xgb.XGBClassifier(scale_pos_weight=balanced_class_ratio, learning_rate=0.05)
clf.fit(X_train_no_val, y_train_no_val)
y_pred_val = clf.predict(X_train_val)
print "F1 Score:", f1_score(y_train_val, y_pred_val)
#print "Precision Score:", precision_score(y_train_val, y_pred_val)
#print "Recall Score:", recall_score(y_train_val, y_pred_val)
print "Accuracy Score:", accuracy_score(y_train_val, y_pred_val)
#print "Confusion matrix:", confusion_matrix(y_true=y_train_val, y_pred=y_pred_val)
plot_confusion_matrix(confusion_matrix(y_true=y_train_val,y_pred=y_pred_val))

In [None]:
clf.feature_importances_

In [None]:
#balanced_class_ratio = float( (y_train==0).sum() ) / (y_train==1).sum()
#clf = xgb.XGBClassifier(scale_pos_weight=balanced_class_ratio)
#clf.fit(X_train, y_train)
#y_pred = clf.predict(X_test)
#print "F1 Score:", f1_score(y_test, y_pred)
#print "Precision Score:", precision_score(y_test, y_pred)
#print "Recall Score:", recall_score(y_test, y_pred)
#print "Accuracy Score:", accuracy_score(y_test, y_pred)
#print "Confusion matrix:", confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
#### Save important files to be used for model in web app
business_names_str = np.array([ str(name.encode('utf-8')) for name in yelp_businesses_df.name.values ]).astype(str)
np.savetxt('VivaLasFoodieRestaurantNames.csv', np.vectorize(lambda x: x.decode('UTF-8'))(business_names_str), delimiter=',', fmt='%s')

np.savetxt('chains.csv', np.vectorize(lambda x: x.decode('UTF-8'))(np.array([str(name.encode('utf-8')) for name in chains]).astype(str)),delimiter=',', fmt='%s')

## name_to_id_dict.json
with open('name_to_id_dict.json', 'w') as fp:
    json.dump(name_to_id_dict, fp)

## id_to_features_dict.json
with open('id_to_features_dict.json', 'w') as fp:
    json.dump(id_to_features_dict, fp)

In [None]:
def dist_of_open_businesses_in_city(businesses_df, categories):
    open_businesses = []
    closed_businesses = []
    valid_categories = []
    
    for category in categories:
        category_df = businesses_df[businesses_df['categories'].str.contains(category, na=False)]
        num_open = category_df[category_df.is_open == 1].shape[0]
        num_closed = category_df[category_df.is_open == 0].shape[0]
        if num_open + num_closed > 100 and num_closed > 50:
            open_businesses.append(category_df[category_df.is_open == 1].shape[0])
            closed_businesses.append(category_df[category_df.is_open == 0].shape[0]) 
            valid_categories.append(category)
        
    city_business_distribution = pd.DataFrame(data={'Open' : open_businesses, 'Closed' : closed_businesses}, index=valid_categories)
    
    return city_business_distribution

city_business_distribution = dist_of_open_businesses_in_city(yelp_businesses_df, categories)
city_business_distribution

In [None]:
#############
# FOCUS ON SCOPE AND TARGET AUDIENCE -- WHO WOULD USE THE PRODUCT AND WHY -- Just pick Las Vegas
# WHAT ARE YOU ACTUALLY PREDICTING...

# Signals: 
#      Age of business? Proxy can be time of first review -- Done
#      Density of other businesses in area? -- Done
#      Rating count? -- Done
#      Star rating? -- Done
#      Location? -- Done
#      Semantic analysis of text? -- Done
#      Hours of business? -- Ignore
#      Accept credit card? -- Ignore
#      Offers parking? -- Done
#      Is the business a chain? Are there others in the area? -- Done
#      Price relative to competitors? Review count relative to competitors? Star rating relative to competitors?
#      Look at geography in a given city. Are there locations in the city where businesses don't succeed -- Ignore for now
#      Useful, cool, funny reviews? -- Ignore/Done
#      Are some users more impactful?
#      Get city population/neighborhood data -- Ignore
#      Average length of review text -- Done
#      Investigate other cities
#      Change the date and forecast period
#      Number of restaurants with same category in city

#      Do AWS deployment
#      Flask/web app, think about how the input ---> model ---> output will work
#      Website output needs to show information about the restaurant
#      Change focus to food tourism where people are specifically picking restaurants, the project is not just about Las Vegas (Las Vegas is just a city in the food tourism space)
#      Numbers/stats can all be about food tourism now
#      Look at cost rating and see about fine dining (or not one $ restaurants) since those are ones that people actually care about

In [None]:
# find out distribution of types of businesses in dataset
# find out closed business distribution by city, state
# perhaps look into time
# semantic analysis of text
# do the business hours matter in when stores open or close? are they related to type of business?
# does length of business name matter?
# does star number count?
# does business accept credit card matter?
# does parking matter?
# HuggingFace library BERT based on pytorch
# do certain user reviews matter?
# who is the user?
# how is it going to be used?