In [1]:
from sqlalchemy import create_engine, inspect
from sklearn_pandas import DataFrameMapper
import pandas as pd
from sklearn.preprocessing import Imputer, StandardScaler, LabelBinarizer, FunctionTransformer
import psycopg2
import numpy as np
import json
from fuzzywuzzy import fuzz

PSQL = 'postgres://{}@localhost:5432/rpred'

* Filter out closed restaurants.
* Scale numeric values.
* Use imputer. 
* Match features to target.
* Write features to CSV.
* Write target to CSV.


# TODO:
* Extract data from hours.
* Find if restaurant is a large chain (20+)
* Dummy out attributes.


Connect to Postgres database.

In [2]:
def attributes(attr):
    sql = ""
    clause = " attributes ->> '{}' AS {},"
    for a in attr:
        sql += clause.format(a[1], a[0])
    return sql

def extract_category(X, category):
    X[category[0]] = np.where(X['categories'].str.contains(category[1]), 1, 0)

In [4]:
cred = ""
with open("../credentials/localhost/jessica.txt") as credfile:
    cred = credfile.read().strip("\n")
    
cnx = create_engine(PSQL.format(cred), isolation_level='AUTOCOMMIT')

In [5]:
# Construct query, filtering out closed restaurants
COLUMNS = ['name', 'full_address', 'stars', 'review_count']

QUERY = '''SELECT {} categories, {} FROM yelp_8 WHERE open = True ORDER BY name;'''

# Extract attributes
ATTRS = [
    ['takeout', 'Take-out', 'varchar(10)'],
    ['alcohol', 'Alcohol', 'varchar(15)'],
    ['good_for_groups', 'Good For Groups', 'varchar(10)'],
    ['good_for_kids', 'Good for Kids', 'varchar(10)'],
    ['credit_card', 'Accepts Credit Cards', 'varchar(10)'],
    ['noise_level', 'Noise Level', 'varchar(15)'],
    ['price', 'Price Range', 'int']
]

# Price range--Extract when it exists, otherwise use Imputer. 
# 10630/11304 = 94% rows contain range price information

# Read from table.
QUERY = QUERY.format(attributes(ATTRS), ",".join(COLUMNS))
print(QUERY)
X = pd.read_sql_query(QUERY, cnx)
#X.head()

SELECT  attributes ->> 'Take-out' AS takeout, attributes ->> 'Alcohol' AS alcohol, attributes ->> 'Good For Groups' AS good_for_groups, attributes ->> 'Good for Kids' AS good_for_kids, attributes ->> 'Accepts Credit Cards' AS credit_card, attributes ->> 'Noise Level' AS noise_level, attributes ->> 'Price Range' AS price, categories, name,full_address,stars,review_count FROM yelp_8 WHERE open = True ORDER BY name;


### Add columns for categories

In [6]:
CATEGORIES = [
    ['fast_food', 'Fast Food'],
    ['mexican', 'Mexican'],
    ['chinese', 'Chinese'],
    ['bar', 'Bars'],
    ['american', 'American'],
    ['fusion', 'Fusion']
]
    
for c in CATEGORIES:
    extract_category(X, c)
#X.head()

In [7]:
# Strip newline from full address.
X['full_address'] = X['full_address'].apply(lambda s: s.replace("\n", " "))
#X.head()

### Merge features and target DF.

In [8]:
y = pd.read_sql_query('''SELECT * from target ORDER BY name''', cnx)
y['key'] = y['name'] + " " + y['full_address']

X['key'] = X['name'] + " " + X['full_address']

all_columns = []
all_columns.extend(COLUMNS)
all_columns.extend([a[0] for a in ATTRS])
all_columns.extend([c[0] for c in CATEGORIES])
df = pd.merge(X, y[['is_open', 'key']], on='key', how='inner')

df.rename(columns={'is_open' : 'stay_open'}, inplace=True)
df.drop(columns='key', inplace=True)

print("open:", len(df[df['stay_open'] == 1])/len(df))
print("closed:", len(df[df['stay_open'] == 0])/len(df))

#df.head()


open: 0.9158228804594227
closed: 0.0841771195405773


### Create mapper for imputing and scaling. 

In [12]:
def fix_bool(val):
    return val == 'true' or val == 'True'

def noise_level(val):
    if val == 'quiet':
        return 1
    elif val == 'loud':
        return 3
    elif val == 'very_loud':
        return 4
    else:
        return 2  # average

df['alcohol'] = df['alcohol'].apply(lambda x: x == 'full_bar' or x == 'beer_and_wine')
df['takeout'] = df['takeout'].apply(fix_bool)
df['good_for_groups'] = df['good_for_groups'].apply(fix_bool)
df['good_for_kids'] = df['good_for_groups'].apply(fix_bool)
df['credit_card'] = df['credit_card'].apply(fix_bool)
df['noise_level'] = df['noise_level'].apply(noise_level)
df.head()

Unnamed: 0,takeout,alcohol,good_for_groups,good_for_kids,credit_card,noise_level,price,categories,name,full_address,stars,review_count,fast_food,mexican,chinese,bar,american,fusion,stay_open
0,False,False,False,False,False,2,1.0,"{Italian,Pizza,Restaurants}",1000 Degrees Neapolitan Pizzeria,"7000 E Mayo Blvd Phoenix, AZ 85054",4.5,59,0,0,0,0,0,0,1
1,False,False,False,False,False,2,2.0,"{Cafes,Sandwiches,Restaurants}",1000 Grammes,1495 Rue Sainte-Catherine East Ville-Marie Mon...,3.5,19,0,0,0,0,0,0,1
2,False,False,False,False,False,2,,"{""Fast Food"",Restaurants}",108 Chinese Take Away,108 Portobello High Street Edinburgh EH15 1AL,5.0,3,1,0,0,0,0,0,1
3,False,False,False,False,False,2,1.0,"{Food,Desserts,""Coffee & Tea"",Indian,Restaurants}",10-to-10 In Delhi,67 Nicolson Street Newington Edinburgh EH8 9BZ,4.5,50,0,0,0,0,0,0,1
4,False,False,False,False,False,2,2.0,"{Steakhouses,""American (New)"",Restaurants}",1130 The Restaurant,"455 N 3rd St Ste 1130 Phoenix, AZ 85004",3.0,158,0,0,0,0,1,0,1


In [13]:
def binarize(val):
    if val.lower() == "none":
        return 0
    return 1

def noise_level(df):
    noise = {
        'quiet' : 1,
        'loud' : 3,
        'very_loud': 4,
        'average' : 2,
        'none' : 2,
        'None' : 2,
        None: 2
    }
    df = map(lambda n: noise[n], df)

attribute_imputer = FunctionTransformer(binarize)

mapper = DataFrameMapper([
    (['review_count'], [Imputer(), StandardScaler()]),
    (['stars'], [Imputer(), StandardScaler()]),
    (['noise_level'], [Imputer(), StandardScaler()]),
    (['price'], [Imputer(strategy='median'), StandardScaler()]),
    ('bar', None),
    ('fast_food', None),
    ('mexican', None),
    ('chinese', None),
    ('american', None),
    ('fusion', None),
    (['takeout'], LabelBinarizer()),
    (['alcohol'], LabelBinarizer()),
    (['good_for_groups'], LabelBinarizer()),
    (['good_for_kids'], LabelBinarizer()),
    (['credit_card'], LabelBinarizer()),
    ('full_address', None),
    ('name', None),
    ('stay_open', None)
], df_out=True)

df2 = mapper.fit_transform(df)
df2.head()

Unnamed: 0,review_count,stars,noise_level,price,bar,fast_food,mexican,chinese,american,fusion,takeout,alcohol,good_for_groups,good_for_kids,credit_card,full_address,name,stay_open
0,-0.0431253,1.21536,0,-0.962015,0,0,0,0,0,0,0,0,0,0,0,"7000 E Mayo Blvd Phoenix, AZ 85054",1000 Degrees Neapolitan Pizzeria,1
1,-0.325464,-0.0461031,0,0.684559,0,0,0,0,0,0,0,0,0,0,0,1495 Rue Sainte-Catherine East Ville-Marie Mon...,1000 Grammes,1
2,-0.4384,1.8461,0,0.684559,0,1,0,0,0,0,0,0,0,0,0,108 Portobello High Street Edinburgh EH15 1AL,108 Chinese Take Away,1
3,-0.106652,1.21536,0,-0.962015,0,0,0,0,0,0,0,0,0,0,0,67 Nicolson Street Newington Edinburgh EH8 9BZ,10-to-10 In Delhi,1
4,0.655664,-0.676836,0,0.684559,0,0,0,0,1,0,0,0,0,0,0,"455 N 3rd St Ste 1130 Phoenix, AZ 85004",1130 The Restaurant,1


### Resample.

In [227]:
from imblearn.over_sampling import SMOTE
from collections import Counter

print("open:", len(df2[df2['stay_open'] == 1])/len(df2))
print("closed:", len(df2[df2['stay_open'] == 0])/len(df2))

y = df2['stay_open']
X = df2.drop(columns=['stay_open', 'full_address', 'name'])
feature_columns = X.columns
sm = SMOTE()
X_res, y_res = sm.fit_sample(X, y)
print("Resampled...")
print(Counter(y_res))
#print("open:", len(y_res['stay_open'] == 1]/len(y_res)))
#print("closed:", len(y_res[y_res['stay_open'] == 0]/len(y_res)))

open: 0.9158228804594227
closed: 0.0841771195405773
Resampled...
Counter({1: 18180, 0: 18180})


  TARGET_KIND, type_of_target(y)))


### Write to csv

In [234]:
from datetime import datetime
import logging
import os

def save(X, y):
    now = datetime.now().strftime("%Y%m%d%H%M%S%f")
    path = "../output/"+now
    
    logging.info("Writing results to ", path)
    os.mkdir(path)
    
    X.to_csv(path+"/features")
    y.to_csv(path+"/target")

    return path

save(pd.DataFrame(X_res, columns=feature_columns), pd.DataFrame(y_res, columns=["stay_open"]))

'../output/20180802215638616549'

# Scratch

In [238]:
import re

# Make categories table
df_c = pd.read_sql_query('''SELECT categories FROM yelp_dataset_8 LIMIT 200''', cnx)

def category_list(df):
    p = re.compile(r"'([\w\W]+)'")
    #df = pd.DataFrame(df_X.categories)
    return [p.findall(x) for x in df.categories]

#df_c.head()

category_list(df_c)

[["Fast Food'  'Restaurants"],
 ["Food'  'Grocery"],
 ["Cafes'  'Restaurants"],
 ["American (Traditional)'  'Comfort Food'  'Caterers'  'Event Planning & Services'  'Restaurants"],
 ["Burgers'  'Fast Food'  'Restaurants"],
 ["Food'  'Coffee & Tea"],
 ["Burgers'  'Fast Food'  'Restaurants"],
 ["Food'  'Ice Cream & Frozen Yogurt'  'Pizza'  'Restaurants"],
 ["Pubs'  'Bars'  'American (New)'  'Nightlife'  'Restaurants"],
 ["Burgers'  'Fast Food'  'Restaurants"],
 ["Food'  'American (Traditional)'  'Breweries'  'Restaurants"],
 ["Food'  'Grocery"],
 ["Food'  'Grocery"],
 ["Food'  'Grocery"],
 ["Burgers'  'Fast Food'  'Restaurants"],
 ["Bakeries'  'Food'  'Desserts"],
 ["Food'  'Grocery"],
 ["Food'  'Specialty Food'  'Meat Shops"],
 ["Food'  'Convenience Stores"],
 ["Food'  'Convenience Stores"],
 ["Burgers'  'Fast Food'  'Restaurants"],
 ["Burgers'  'Fast Food'  'Restaurants"],
 ["Bakeries'  'Food"],
 ["Food'  'Grocery"],
 ["Food'  'Ice Cream & Frozen Yogurt"],
 ["Food'  'Beer  Wine & Spiri

In [208]:
category_keywords = ['sandwiches', 'fast_food', 'nightlife', 'pizza', 'bars',
                     'mexican', 'food', 'american_traditional', 'burgers', 'chinese',
                     'italian', 'american_new', 'breakfast_brunch', 'thai', 'indian',
                     'sushi', 'korean', 'mediterranean', 'japanese', 'seafood',
                     'middle_eastern', 'pakistani', 'barbeque', 'vietnamese',
                     'asian_fusion', 'diners', 'greek', 'vegetarian']
