In [64]:
import psycopg2
import numpy as np
import json
import pandas as pd
from sqlalchemy import create_engine, inspect
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import Imputer, StandardScaler, LabelBinarizer, FunctionTransformer
from fuzzywuzzy import fuzz
from imblearn.over_sampling import SMOTE
from collections import Counter

PSQL = 'postgres://{}@localhost:5432/rpred'

* Filter out closed restaurants.
* Scale numeric values.
* Use imputer. 
* Extract data from hours.
* Find if restaurant is a large chain (20+)
* Dummy out attributes.
* Match features to target.
* Write features to CSV.
* Write target to CSV.

Connect to Postgres database.

In [65]:
def attributes(attr):
    sql = ""
    clause = " attributes ->> '{}' AS {},"
    for a in attr:
        sql += clause.format(a[1], a[0])
    return sql

def extract_category(X, category):
    X[category[0]] = np.where(X['categories'].str.contains(category[1]), 1, 0)

In [66]:
cred = ""
with open("../credentials/localhost/jessica.txt") as credfile:
    cred = credfile.read().strip("\n")
    
cnx = create_engine(PSQL.format(cred), isolation_level='AUTOCOMMIT')

In [69]:
# Construct query, filtering out closed restaurants
COLUMNS = ['name', 'full_address', 'stars', 'review_count']

QUERY = '''SELECT {} categories, hours, {} FROM yelp_8 WHERE open = True ORDER BY name;'''

# Extract attributes
ATTRS = [
    ['takeout', 'Take-out', 'varchar(10)'],
    ['alcohol', 'Alcohol', 'varchar(15)'],
    ['good_for_groups', 'Good For Groups', 'varchar(10)'],
    ['good_for_kids', 'Good for Kids', 'varchar(10)'],
    ['credit_card', 'Accepts Credit Cards', 'varchar(10)'],
    ['noise_level', 'Noise Level', 'varchar(15)'],
    ['price', 'Price Range', 'int']
]

# Price range--Extract when it exists, otherwise use Imputer. 
# 10630/11304 = 94% rows contain range price information

# Read from table.
QUERY = QUERY.format(attributes(ATTRS), ",".join(COLUMNS))
print(QUERY)
X = pd.read_sql_query(QUERY, cnx)

SELECT  attributes ->> 'Take-out' AS takeout, attributes ->> 'Alcohol' AS alcohol, attributes ->> 'Good For Groups' AS good_for_groups, attributes ->> 'Good for Kids' AS good_for_kids, attributes ->> 'Accepts Credit Cards' AS credit_card, attributes ->> 'Noise Level' AS noise_level, attributes ->> 'Price Range' AS price, categories, hours, name,full_address,stars,review_count FROM yelp_8 WHERE open = True ORDER BY name;


### Hours

In [70]:
def sum_hours(js):
    total = 0
    for day, hours in js.items():
        close_time = int(hours['close'].split(":")[0])
        open_time = int(hours['open'].split(":")[0])
        if (close_time == 0):
            close_time = 24
        total += (close_time - open_time)
    if total > 0:
        return total
    return None
        
def extract_hours(hours_str):
    hours = ('Monday' in hours_str, 'Tuesday' in hours_str,
            'Wednesday' in hours_str, 'Thursday' in hours_str,
            'Friday' in hours_str, 'Saturday' in hours_str,
            'Sunday' in hours_str)
    # If we don't have any hours data, assume business is open on the most common days:
    # Thursday, Friday, Wednesday
    if sum(hours) == 0:
        return (False, False, True, True, True, False, False)
    return hours

# extract_hours("{}")
# extract_hours("{'Tuesday': {'close': '21:30'  'open': '06:00'}  'Friday': {'close': '22:30'  'open': '07:00'}  'Wednesday': {'close': '21:30'  'open': '06:00'}  'Thursday': {'close': '22:30'  'open': '06:00'}  'Sunday': {'close': '21:30'  'open': '06:00'}  'Saturday': {'close': '21:00'  'open': '07:00'}}")

In [71]:
X['open_mon'], X['open_tu'], X['open_wed'], X['open_thu'], X['open_fri'], X['open_sat'], X['open_sun'] = zip(*X['hours'].map(extract_hours))
X['total_hours_wk'] = X['hours'].apply(sum_hours)
#X[['hours', 'total_hours_wk']].head().values
X.head()

Unnamed: 0,takeout,alcohol,good_for_groups,good_for_kids,credit_card,noise_level,price,categories,hours,name,...,stars,review_count,open_mon,open_tu,open_wed,open_thu,open_fri,open_sat,open_sun,total_hours_wk
0,True,beer_and_wine,True,True,True,average,1.0,"{Italian,Pizza,Restaurants}","{'Monday': {'close': '21:00', 'open': '11:00'}...",1000 Degrees Neapolitan Pizzeria,...,4.5,59,True,True,True,True,True,True,True,72.0
1,True,full_bar,True,True,,average,2.0,"{Cafes,Sandwiches,Restaurants}","{'Monday': {'close': '23:30', 'open': '11:00'}...",1000 Grammes,...,3.5,19,True,True,True,True,True,True,True,86.0
2,True,full_bar,True,True,True,quiet,2.0,"{Persian/Iranian,Restaurants}","{'Monday': {'close': '21:00', 'open': '11:30'}...",1001 Nights Restaurant,...,3.5,29,True,True,True,True,True,True,True,30.0
3,True,none,True,True,,average,2.0,"{""Middle Eastern"",Restaurants}","{'Monday': {'close': '00:00', 'open': '11:00'}...",1001 Nights Shawarma,...,5.0,9,True,True,True,True,True,True,True,85.0
4,,,,,,,,"{""Fast Food"",Restaurants}",{},108 Chinese Take Away,...,5.0,3,False,False,True,True,True,False,False,


### Add columns for categories

In [72]:
CATEGORIES = [
    ['fast_food', 'Fast Food'],
    ['mexican', 'Mexican'],
    ['chinese', 'Chinese'],
    ['bar', 'Bars'],
    ['american', 'American'],
    ['fusion', 'Fusion'],
    ['pizza', 'Pizza']
]
    
for c in CATEGORIES:
    extract_category(X, c)
    
# Strip newline from full address.
X['full_address'] = X['full_address'].apply(lambda s: s.replace("\n", " "))

### Merge features and target DF.

In [73]:
y = pd.read_sql_query('''SELECT * from target ORDER BY name''', cnx)
y['key'] = y['name'] + " " + y['full_address']

X['key'] = X['name'] + " " + X['full_address']

all_columns = []
all_columns.extend(COLUMNS)
all_columns.extend([a[0] for a in ATTRS])
all_columns.extend([c[0] for c in CATEGORIES])
df = pd.merge(X, y[['is_open', 'key']], on='key', how='inner')

#df.rename(columns={'is_open' : 'stay_open'}, inplace=True)
df.drop(columns='key', inplace=True)

print("open:", len(df[df['is_open'] == 1])/len(df))
print("closed:", len(df[df['is_open'] == 0])/len(df))


open: 0.9158228804594227
closed: 0.0841771195405773


### Add column for chain/franchise size.

In [74]:
from collections import defaultdict, Counter
# Find large chains
count = Counter(df['name'].values)

chains_dict = defaultdict(lambda: "N/A")
for name in count:
    num = count[name]
    if num > 40:
        chains_dict[name] = "large"
    elif num > 10:
        chains_dict[name] = "medium"
    elif num > 2:
        chains_dict[name] = "small"
        
chain_col = df['name'].map(chains_dict)
chain_dummy = pd.get_dummies(chain_col, prefix='chain', drop_first=True)
chain_dummy.head()

df = pd.concat([df, chain_dummy], axis=1)

### Create mapper for imputing and scaling. 

In [75]:
def fix_bool(val):
    return val == 'true' or val == 'True'

def noise_level(val):
    if val == 'quiet':
        return 1
    elif val == 'loud':
        return 3
    elif val == 'very_loud':
        return 4
    else:
        return 2  # average

df['alcohol'] = df['alcohol'].apply(lambda x: x == 'full_bar' or x == 'beer_and_wine')
df['takeout'] = df['takeout'].apply(fix_bool)
df['good_for_groups'] = df['good_for_groups'].apply(fix_bool)
df['good_for_kids'] = df['good_for_groups'].apply(fix_bool)
df['credit_card'] = df['credit_card'].apply(fix_bool)
df['noise_level'] = df['noise_level'].apply(noise_level)
df.head()

Unnamed: 0,takeout,alcohol,good_for_groups,good_for_kids,credit_card,noise_level,price,categories,hours,name,...,mexican,chinese,bar,american,fusion,pizza,is_open,chain_large,chain_medium,chain_small
0,True,True,True,False,True,2,1.0,"{Italian,Pizza,Restaurants}","{'Monday': {'close': '21:00', 'open': '11:00'}...",1000 Degrees Neapolitan Pizzeria,...,0,0,0,0,0,1,1,0,0,0
1,True,True,True,False,False,2,2.0,"{Cafes,Sandwiches,Restaurants}","{'Monday': {'close': '23:30', 'open': '11:00'}...",1000 Grammes,...,0,0,0,0,0,0,1,0,0,0
2,False,False,False,False,False,2,,"{""Fast Food"",Restaurants}",{},108 Chinese Take Away,...,0,0,0,0,0,0,1,0,0,0
3,True,False,True,False,False,2,1.0,"{Food,Desserts,""Coffee & Tea"",Indian,Restaurants}","{'Monday': {'close': '22:00', 'open': '10:00'}...",10-to-10 In Delhi,...,0,0,0,0,0,0,1,0,0,0
4,False,True,True,False,True,2,2.0,"{Steakhouses,""American (New)"",Restaurants}","{'Monday': {'close': '21:00', 'open': '11:00'}...",1130 The Restaurant,...,0,0,0,1,0,0,1,0,0,0


In [78]:
mapper = DataFrameMapper([
    #(['review_count'], [Imputer(), StandardScaler()]),
    (['stars'], [Imputer()]),
    (['noise_level'], [Imputer()]),
    (['price'], [Imputer(strategy='median')]),
    (['total_hours_wk'], [Imputer(strategy='mean'), StandardScaler()]),
    ('bar', None),
    ('fast_food', None),
    ('mexican', None),
    ('chinese', None),
    ('american', None),
    ('fusion', None),
    ('chain_large', None),
    ('chain_medium', None),
    ('chain_small', None),
    (['open_mon'], LabelBinarizer()),
    (['open_tu'], LabelBinarizer()),
    (['open_wed'], LabelBinarizer()),
    (['open_thu'], LabelBinarizer()),
    (['open_fri'], LabelBinarizer()),
    (['open_sat'], LabelBinarizer()),
    (['open_sun'], LabelBinarizer()),
    (['takeout'], LabelBinarizer()),
    (['alcohol'], LabelBinarizer()),
    (['good_for_groups'], LabelBinarizer()),
    (['credit_card'], LabelBinarizer()),
    ('full_address', None),
    ('name', None),
    ('is_open', None)
], df_out=True)

df2 = mapper.fit_transform(df)
df2.head()

Unnamed: 0,stars,noise_level,price,total_hours_wk,bar,fast_food,mexican,chinese,american,fusion,...,open_fri,open_sat,open_sun,takeout,alcohol,good_for_groups,credit_card,full_address,name,is_open
0,4.5,2,1,-0.0633274,0,0,0,0,0,0,...,1,1,1,1,1,1,1,"7000 E Mayo Blvd Phoenix, AZ 85054",1000 Degrees Neapolitan Pizzeria,1
1,3.5,2,2,0.489648,0,0,0,0,0,0,...,1,1,1,1,1,1,0,1495 Rue Sainte-Catherine East Ville-Marie Mon...,1000 Grammes,1
2,5.0,2,2,0.0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,108 Portobello High Street Edinburgh EH15 1AL,108 Chinese Take Away,1
3,4.5,2,1,0.410651,0,0,0,0,0,0,...,1,1,1,1,0,1,0,67 Nicolson Street Newington Edinburgh EH8 9BZ,10-to-10 In Delhi,1
4,3.0,2,2,0.0946654,0,0,0,0,1,0,...,1,1,1,0,1,1,1,"455 N 3rd St Ste 1130 Phoenix, AZ 85004",1130 The Restaurant,1


In [108]:
# Invert --> 1 = close
invert = True
if invert:
    df2['target'] = np.where(df2['is_open'] == 1, 0, 1)
else:
    df2['target'] = np.where(df2['is_open'] == 1, 1, 0)

### Resample if needed.

In [109]:
#print("open:", len(df2[df2['close'] == 0])/len(df2))
#print("closed:", len(df2[df2['close'] == 1])/len(df2))

y = df2['target']
X = df2.drop(columns=['is_open', 'target', 'full_address', 'name'])

feature_columns = X.columns
sm = SMOTE()
X_res, y_res = sm.fit_sample(X, y)
print("Resampled...")
print(Counter(y_res))
#print(Counter(y))

Resampled...
Counter({0: 18180, 1: 18180})


In [110]:
y_res = y
X_res = X
Counter(y_res)

Counter({0: 18180, 1: 1671})

### Write to csv

In [112]:
from datetime import datetime
import logging
import os

def save(X, y):
    now = datetime.now().strftime("%Y%m%d%H%M%S%f")
    now = "20180810_unbal_invert"
    path = "../data/model_input/"+now
    
    logging.info("Writing results to ", path)
    os.mkdir(path)
    
    X.to_csv(path+"/features")
    y.to_csv(path+"/target")

    return path

save(pd.DataFrame(X_res, columns=feature_columns), pd.DataFrame(y_res, columns=["target"]))

'../data/model_input/20180810_unbal_invert'

# Scratch

In [238]:
import re

# Make categories table
df_c = pd.read_sql_query('''SELECT categories FROM yelp_dataset_8 LIMIT 200''', cnx)

def category_list(df):
    p = re.compile(r"'([\w\W]+)'")
    #df = pd.DataFrame(df_X.categories)
    return [p.findall(x) for x in df.categories]

category_list(df_c)

[["Fast Food'  'Restaurants"],
 ["Food'  'Grocery"],
 ["Cafes'  'Restaurants"],
 ["American (Traditional)'  'Comfort Food'  'Caterers'  'Event Planning & Services'  'Restaurants"],
 ["Burgers'  'Fast Food'  'Restaurants"],
 ["Food'  'Coffee & Tea"],
 ["Burgers'  'Fast Food'  'Restaurants"],
 ["Food'  'Ice Cream & Frozen Yogurt'  'Pizza'  'Restaurants"],
 ["Pubs'  'Bars'  'American (New)'  'Nightlife'  'Restaurants"],
 ["Burgers'  'Fast Food'  'Restaurants"],
 ["Food'  'American (Traditional)'  'Breweries'  'Restaurants"],
 ["Food'  'Grocery"],
 ["Food'  'Grocery"],
 ["Food'  'Grocery"],
 ["Burgers'  'Fast Food'  'Restaurants"],
 ["Bakeries'  'Food'  'Desserts"],
 ["Food'  'Grocery"],
 ["Food'  'Specialty Food'  'Meat Shops"],
 ["Food'  'Convenience Stores"],
 ["Food'  'Convenience Stores"],
 ["Burgers'  'Fast Food'  'Restaurants"],
 ["Burgers'  'Fast Food'  'Restaurants"],
 ["Bakeries'  'Food"],
 ["Food'  'Grocery"],
 ["Food'  'Ice Cream & Frozen Yogurt"],
 ["Food'  'Beer  Wine & Spiri

In [208]:
category_keywords = ['sandwiches', 'fast_food', 'nightlife', 'pizza', 'bars',
                     'mexican', 'food', 'american_traditional', 'burgers', 'chinese',
                     'italian', 'american_new', 'breakfast_brunch', 'thai', 'indian',
                     'sushi', 'korean', 'mediterranean', 'japanese', 'seafood',
                     'middle_eastern', 'pakistani', 'barbeque', 'vietnamese',
                     'asian_fusion', 'diners', 'greek', 'vegetarian']
