In [45]:
import re

import numpy as np
import pandas as pd
import psycopg2
from fuzzywuzzy import fuzz
from sklearn.preprocessing import LabelBinarizer
from sklearn_pandas import DataFrameMapper
from sqlalchemy import create_engine, inspect

# SQLAlchemy URL template; `{}` is filled with the credentials string read from file below.
# Uses the "postgresql" dialect name: the legacy "postgres" alias was removed in
# SQLAlchemy 1.4, while "postgresql" is accepted by every version.
PSQL = 'postgresql://{}@localhost:5432/rpred'

# TODO:
* Extract data from hours.
* Find if restaurant is a large chain (20+)

Connect to the local Postgres database (`rpred`) using credentials read from a file.

In [46]:
# Credentials live in a local file so they are never hard-coded in the notebook;
# only surrounding newlines are stripped from the file contents.
with open("credentials/localhost/jessica.txt") as credfile:
    cred = credfile.read().strip("\n")

# Build the engine from the URL template, using the AUTOCOMMIT isolation level.
cnx = create_engine(
    PSQL.format(cred),
    isolation_level='AUTOCOMMIT',
)

Read the business columns needed below from the `yelp_dataset_8` table.

In [160]:
# Load only the columns that the feature-engineering cells below rely on.
yelp_dataset_8 = pd.read_sql_query(
    sql='''SELECT 
name, full_address, city, state, review_count, stars, open, categories, attributes 
FROM yelp_dataset_8''',
    con=cnx,
)

In [213]:
# Remove every business whose category string mentions "Grocery".
grocery_mask = yelp_dataset_8.categories.str.contains("Grocery")
X = yelp_dataset_8.drop(yelp_dataset_8[grocery_mask].index)

# `open` is compared as text ('True' / 'False'), not as a boolean.
total = len(X)
open_count = int((X['open'] == 'True').sum())
closed_count = int((X['open'] == 'False').sum())
print("Open:", open_count, open_count/total,
      "Closed:", closed_count, closed_count/total)

Open: 11304 0.8172353961827646 Closed: 2528 0.1827646038172354


In [214]:
# Keep only businesses that are not marked closed, then discard the `open` column,
# which is not used as a feature afterwards.
X = X.drop(index=X[X.open == 'False'].index).drop(columns=['open'])

In [215]:
# Preview the first rows after the grocery and closed-business filters.
X.head()

Unnamed: 0,name,full_address,city,state,review_count,stars,categories,attributes
0,Mr Hoagie,4734 Lebanon Church Rd\nDravosburg PA 15034,Dravosburg,PA,7,3.5,['Fast Food' 'Restaurants'],{'Take-out': True 'Drive-Thru': False 'Good ...
2,Barb's Country Junction Cafe,202 3rd Ave\nCarnegie\nCarnegie PA 15106,Carnegie,PA,9,4.0,['Cafes' 'Restaurants'],{'Take-out': True 'Good For': {'dessert': Fal...
3,Boston Market,300 Davis Blvd\nEtna\nPittsburgh PA 15275,Pittsburgh,PA,3,2.5,['American (Traditional)' 'Comfort Food' 'Ca...,{'Take-out': True 'Wi-Fi': 'no' 'Alcohol': '...
4,McDonald's,400 Waterfront Dr E\nHomestead\nHomestead PA ...,Homestead,PA,12,2.0,['Burgers' 'Fast Food' 'Restaurants'],{'Take-out': True 'Wi-Fi': 'free' 'Drive-Thr...
5,Starbucks,270 W Bridge St\nHomestead\nHomestead PA 15120,Homestead,PA,25,3.5,['Food' 'Coffee & Tea'],{'Take-out': True 'Wi-Fi': 'free' 'Price Ran...


### Add columns for binary attributes

In [216]:
# Each entry is [feature column to create, key to look for inside the `attributes` string].
attrs = [
    ['takeout', 'Take-out'],
    ['alcohol', 'Alcohol'],
    ['good_for_groups', 'Good For Groups'],
    ['good_for_kids', 'Good for Kids'],
    ['credit_card', 'Accepts Credit Cards']
]

def extract_attribute(X, attr):
    """Add a 0/1 indicator column to ``X`` for one business attribute.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a string column ``attributes``; modified in place.
    attr : sequence of two str
        ``[column_name, attribute_key]``: the column to create and the key
        to search for inside the ``attributes`` string.

    The indicator is 1 when the key's value is ``True`` or a quoted string
    other than ``'none'`` / ``'no'``. The string case matters for keys such
    as ``Alcohol``, whose value is text rather than a boolean, so matching
    only ``True`` left that column at 0 for every row. Rows with a missing
    ``attributes`` value get 0.
    """
    column, key = attr[0], attr[1]
    # re.escape keeps the key literal; the negative look-ahead rejects the
    # "absent" string values while accepting any other quoted value.
    pattern = r"'{}': (?:True|'(?!(?:none|no)')[^']*')".format(re.escape(key))
    flagged = X['attributes'].str.contains(pattern, regex=True, na=False)
    X[column] = np.where(flagged, 1, 0)

# Create one indicator column per [column, key] pair in `attrs`, then preview the frame.
for attr_spec in attrs:
    extract_attribute(X, attr_spec)
X.head()
    

Unnamed: 0,name,full_address,city,state,review_count,stars,categories,attributes,takeout,alcohol,good_for_groups,good_for_kids,credit_card
0,Mr Hoagie,4734 Lebanon Church Rd\nDravosburg PA 15034,Dravosburg,PA,7,3.5,['Fast Food' 'Restaurants'],{'Take-out': True 'Drive-Thru': False 'Good ...,1,0,1,1,1
2,Barb's Country Junction Cafe,202 3rd Ave\nCarnegie\nCarnegie PA 15106,Carnegie,PA,9,4.0,['Cafes' 'Restaurants'],{'Take-out': True 'Good For': {'dessert': Fal...,1,0,1,1,1
3,Boston Market,300 Davis Blvd\nEtna\nPittsburgh PA 15275,Pittsburgh,PA,3,2.5,['American (Traditional)' 'Comfort Food' 'Ca...,{'Take-out': True 'Wi-Fi': 'no' 'Alcohol': '...,1,0,0,1,1
4,McDonald's,400 Waterfront Dr E\nHomestead\nHomestead PA ...,Homestead,PA,12,2.0,['Burgers' 'Fast Food' 'Restaurants'],{'Take-out': True 'Wi-Fi': 'free' 'Drive-Thr...,1,0,1,1,1
5,Starbucks,270 W Bridge St\nHomestead\nHomestead PA 15120,Homestead,PA,25,3.5,['Food' 'Coffee & Tea'],{'Take-out': True 'Wi-Fi': 'free' 'Price Ran...,1,0,0,0,1


### Add columns for numeric/categorical attributes

In [223]:
# Price range -- extract it where present, otherwise impute the column mean.
# 10630/11304 = 94% of rows contain price-range information.
has_price = X['attributes'].str.contains("'Price Range'")
print("Contains price range:", len(X[has_price])/len(X))

# expand=False returns a Series; (\d+) captures the whole number, whereas the
# previous (\d)+ kept only the last digit of a repeated group.
price = pd.to_numeric(
    X['attributes'].str.extract(r"'Price Range': (\d+)", expand=False),
    errors='coerce',
)
# str.extract yields strings, so the values must be numeric before a mean can
# be computed; without the conversion the missing prices were never imputed.
X['price'] = price.fillna(price.mean())
X.head()

Contains price range: 0.9403750884642604


Unnamed: 0,name,full_address,city,state,review_count,stars,categories,attributes,takeout,alcohol,good_for_groups,good_for_kids,credit_card,price,fast_food,mexican,chinese,bar,american,fusion
0,Mr Hoagie,4734 Lebanon Church Rd\nDravosburg PA 15034,Dravosburg,PA,7,3.5,['Fast Food' 'Restaurants'],{'Take-out': True 'Drive-Thru': False 'Good ...,1,0,1,1,1,1,1,0,0,0,0,0
2,Barb's Country Junction Cafe,202 3rd Ave\nCarnegie\nCarnegie PA 15106,Carnegie,PA,9,4.0,['Cafes' 'Restaurants'],{'Take-out': True 'Good For': {'dessert': Fal...,1,0,1,1,1,1,0,0,0,0,0,0
3,Boston Market,300 Davis Blvd\nEtna\nPittsburgh PA 15275,Pittsburgh,PA,3,2.5,['American (Traditional)' 'Comfort Food' 'Ca...,{'Take-out': True 'Wi-Fi': 'no' 'Alcohol': '...,1,0,0,1,1,2,0,0,0,0,1,0
4,McDonald's,400 Waterfront Dr E\nHomestead\nHomestead PA ...,Homestead,PA,12,2.0,['Burgers' 'Fast Food' 'Restaurants'],{'Take-out': True 'Wi-Fi': 'free' 'Drive-Thr...,1,0,1,1,1,1,1,0,0,0,0,0
5,Starbucks,270 W Bridge St\nHomestead\nHomestead PA 15120,Homestead,PA,25,3.5,['Food' 'Coffee & Tea'],{'Take-out': True 'Wi-Fi': 'free' 'Price Ran...,1,0,0,0,1,2,0,0,0,0,0,0


### Add columns for categories

In [224]:
# Each entry is [feature column to create, text to look for inside the `categories` string].
categories = [
    ['fast_food', 'Fast Food'],
    ['mexican', 'Mexican'],
    ['chinese', 'Chinese'],
    ['bar', 'Bars'],
    ['american', 'American'],
    ['fusion', 'Fusion']
]

def extract_category(X, category):
    """Add a 0/1 indicator column to ``X`` for one category label.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a string column ``categories``; modified in place.
    category : sequence of two str
        ``[column_name, label]``: the column to create and the text to
        search for inside the ``categories`` string.

    The label is matched as a literal substring, so one label can match
    several category names that contain it. Matching literally (rather
    than as a regular expression) lets labels containing characters such
    as parentheses work. Rows with a missing ``categories`` value get 0.
    """
    column, label = category[0], category[1]
    matches = X['categories'].str.contains(label, regex=False, na=False)
    X[column] = np.where(matches, 1, 0)
    
# Create one indicator column per [column, label] pair in `categories`, then preview the frame.
for category in categories:
    extract_category(X, category)
X.head()

Unnamed: 0,name,full_address,city,state,review_count,stars,categories,attributes,takeout,alcohol,good_for_groups,good_for_kids,credit_card,price,fast_food,mexican,chinese,bar,american,fusion
0,Mr Hoagie,4734 Lebanon Church Rd\nDravosburg PA 15034,Dravosburg,PA,7,3.5,['Fast Food' 'Restaurants'],{'Take-out': True 'Drive-Thru': False 'Good ...,1,0,1,1,1,1,1,0,0,0,0,0
2,Barb's Country Junction Cafe,202 3rd Ave\nCarnegie\nCarnegie PA 15106,Carnegie,PA,9,4.0,['Cafes' 'Restaurants'],{'Take-out': True 'Good For': {'dessert': Fal...,1,0,1,1,1,1,0,0,0,0,0,0
3,Boston Market,300 Davis Blvd\nEtna\nPittsburgh PA 15275,Pittsburgh,PA,3,2.5,['American (Traditional)' 'Comfort Food' 'Ca...,{'Take-out': True 'Wi-Fi': 'no' 'Alcohol': '...,1,0,0,1,1,2,0,0,0,0,1,0
4,McDonald's,400 Waterfront Dr E\nHomestead\nHomestead PA ...,Homestead,PA,12,2.0,['Burgers' 'Fast Food' 'Restaurants'],{'Take-out': True 'Wi-Fi': 'free' 'Drive-Thr...,1,0,1,1,1,1,1,0,0,0,0,0
5,Starbucks,270 W Bridge St\nHomestead\nHomestead PA 15120,Homestead,PA,25,3.5,['Food' 'Coffee & Tea'],{'Take-out': True 'Wi-Fi': 'free' 'Price Ran...,1,0,0,0,1,2,0,0,0,0,0,0


### Write to features table

In [225]:
# Flatten multi-line addresses onto one line. The vectorised, literal
# .str.replace leaves missing addresses untouched, whereas calling
# str.replace through .apply raised on a non-string value.
X['full_address'] = X['full_address'].str.replace("\n", " ", regex=False)

# NOTE(review): the 'american', 'fusion' and 'price' columns are computed in
# earlier cells but are not included here -- confirm that is intended.
features = ['name', 'full_address', 'review_count', 'stars', 'bar', 'fast_food', 'mexican', 'chinese']
features += [i[0] for i in attrs]
print(features)

# Replace any existing `features` table with the selected columns (no index column).
X[features].to_sql('features', cnx, if_exists='replace', index=False)

['name', 'full_address', 'review_count', 'stars', 'bar', 'fast_food', 'mexican', 'chinese', 'takeout', 'alcohol', 'good_for_groups', 'good_for_kids', 'credit_card']


# Scratch

In [212]:
# Make categories table
# Pull the id and raw category string for the first 200 rows returned.
df_c = pd.read_sql_query(
    sql='''SELECT business_id, categories
FROM yelp_dataset_8 LIMIT 200''',
    con=cnx,
)

def category_list(cat):
    """Return the category names contained in one stringified category list.

    Parameters
    ----------
    cat : str
        Text such as ``"['Fast Food' 'Restaurants']"``.

    Returns
    -------
    list of str
        The quoted names in order of appearance, e.g.
        ``['Fast Food', 'Restaurants']``.

    Each name is matched on its own. A greedy ``'(.+)'``-style pattern spans
    from the first quote to the last and returns the names glued together.
    Double-quoted names (used when a name contains an apostrophe) are
    accepted as well. The function works on its ``cat`` argument only and
    reads no notebook globals.
    """
    pairs = re.findall(r"'([^']*)'|\"([^\"]*)\"", cat)
    # Exactly one of the two groups is filled for every match.
    return [single or double for single, double in pairs]



0      [[Fast Food'  'Restaurants], [Food'  'Grocery]...
1      [[Fast Food'  'Restaurants], [Food'  'Grocery]...
2      [[Fast Food'  'Restaurants], [Food'  'Grocery]...
3      [[Fast Food'  'Restaurants], [Food'  'Grocery]...
4      [[Fast Food'  'Restaurants], [Food'  'Grocery]...
5      [[Fast Food'  'Restaurants], [Food'  'Grocery]...
6      [[Fast Food'  'Restaurants], [Food'  'Grocery]...
7      [[Fast Food'  'Restaurants], [Food'  'Grocery]...
8      [[Fast Food'  'Restaurants], [Food'  'Grocery]...
9      [[Fast Food'  'Restaurants], [Food'  'Grocery]...
10     [[Fast Food'  'Restaurants], [Food'  'Grocery]...
11     [[Fast Food'  'Restaurants], [Food'  'Grocery]...
12     [[Fast Food'  'Restaurants], [Food'  'Grocery]...
13     [[Fast Food'  'Restaurants], [Food'  'Grocery]...
14     [[Fast Food'  'Restaurants], [Food'  'Grocery]...
15     [[Fast Food'  'Restaurants], [Food'  'Grocery]...
16     [[Fast Food'  'Restaurants], [Food'  'Grocery]...
17     [[Fast Food'  'Restauran

In [208]:
# Snake-case category keywords. Not referenced by any other cell visible in
# this notebook -- presumably intended as future feature columns.
category_keywords = ['sandwiches', 'fast_food', 'nightlife', 'pizza', 'bars',
                     'mexican', 'food', 'american_traditional', 'burgers', 'chinese',
                     'italian', 'american_new', 'breakfast_brunch', 'thai', 'indian',
                     'sushi', 'korean', 'mediterranean', 'japanese', 'seafood',
                     'middle_eastern', 'pakistani', 'barbeque', 'vietnamese',
                     'asian_fusion', 'diners', 'greek', 'vegetarian']

# NOTE(review): `category_list` is also defined in an earlier scratch cell;
# this later definition is the one in effect once both cells have run.
def category_list(cat):
    """Return the category names contained in one stringified category list.

    Parameters
    ----------
    cat : str
        Text such as ``"['Fast Food' 'Restaurants']"``.

    Returns
    -------
    list of str
        The quoted names in order of appearance, e.g.
        ``['Fast Food', 'Restaurants']``.

    Each name is matched on its own instead of greedily from the first quote
    to the last. Double-quoted names are accepted as well. The function
    works on its ``cat`` argument only and reads no notebook globals.
    """
    pairs = re.findall(r"'([^']*)'|\"([^\"]*)\"", cat)
    # Exactly one of the two groups is filled for every match.
    return [single or double for single, double in pairs]

