In [45]:
from sqlalchemy import create_engine, inspect
from sklearn_pandas import DataFrameMapper
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
import psycopg2
import numpy as np
from fuzzywuzzy import fuzz

PSQL = 'postgres://{}@localhost:5432/rpred'

# TODO:
* Split categories into columns.
* Split attributes into columns.
* Extract data from hours.
* Find if restaurant is a large chain (20+)

Connect to Postgres database.

In [46]:
cred = ""
with open("credentials/localhost/jessica.txt") as credfile:
    cred = credfile.read().strip("\n")
    
cnx = create_engine(PSQL.format(cred), isolation_level='AUTOCOMMIT')

Read from table.

In [160]:
yelp_dataset_8 = pd.read_sql_query('''SELECT 
name, full_address, city, state, review_count, stars, open, categories, attributes 
FROM yelp_dataset_8''', cnx)

In [161]:
# filter out grocery
X = yelp_dataset_8.drop(yelp_dataset_8[yelp_dataset_8.categories.str.contains("Grocery")].index)

open_count = len(X[X['open'] == 'True'])
closed_count = len(X[X['open'] == 'False'])
print("Open:", open_count, open_count/len(X), 
      "Closed:", closed_count, closed_count/len(X))

Open: 11304 0.8172353961827646 Closed: 2528 0.1827646038172354


In [166]:
# filter out closed stores
X.drop(X[X.open == 'False'].index, inplace=True)
X.drop(columns=['open'], inplace=True)

In [174]:
X.head()

Unnamed: 0,name,full_address,city,state,review_count,stars,categories,attributes,fast_food,mexican,chinese,bar
0,Mr Hoagie,4734 Lebanon Church Rd Dravosburg PA 15034,Dravosburg,PA,7,3.5,['Fast Food' 'Restaurants'],{'Take-out': True 'Drive-Thru': False 'Good ...,1,0,0,0
2,Barb's Country Junction Cafe,202 3rd Ave Carnegie Carnegie PA 15106,Carnegie,PA,9,4.0,['Cafes' 'Restaurants'],{'Take-out': True 'Good For': {'dessert': Fal...,0,0,0,0
3,Boston Market,300 Davis Blvd Etna Pittsburgh PA 15275,Pittsburgh,PA,3,2.5,['American (Traditional)' 'Comfort Food' 'Ca...,{'Take-out': True 'Wi-Fi': 'no' 'Alcohol': '...,0,0,0,0
4,McDonald's,400 Waterfront Dr E Homestead Homestead PA 15120,Homestead,PA,12,2.0,['Burgers' 'Fast Food' 'Restaurants'],{'Take-out': True 'Wi-Fi': 'free' 'Drive-Thr...,1,0,0,0
5,Starbucks,270 W Bridge St Homestead Homestead PA 15120,Homestead,PA,25,3.5,['Food' 'Coffee & Tea'],{'Take-out': True 'Wi-Fi': 'free' 'Price Ran...,0,0,0,0


### Add columns for attributes

In [189]:
attrs = [
    ['takeout', 'Take-out'],
    ['alcohol', 'Alcohol'],
    ['good_for_groups', 'Good For Groups'],
    ['good_for_kids', 'Good for Kids'],
    ['credit_card', 'Accepts Credit Cards']
]

def extract_attribute(X, attr):
    X[attr[0]] = np.where(X['attributes'].str.contains("'{}': True".format(attr[1])), 1, 0)

for attr in attrs:
    extract_attribute(X, attr)
X.head()
    

Unnamed: 0,name,full_address,city,state,review_count,stars,categories,attributes,fast_food,mexican,chinese,bar,takeout,alcohol,good_for_groups,good_for_kids,cash_only,credit_card
0,Mr Hoagie,4734 Lebanon Church Rd Dravosburg PA 15034,Dravosburg,PA,7,3.5,['Fast Food' 'Restaurants'],{'Take-out': True 'Drive-Thru': False 'Good ...,1,0,0,0,1,0,1,1,1,1
2,Barb's Country Junction Cafe,202 3rd Ave Carnegie Carnegie PA 15106,Carnegie,PA,9,4.0,['Cafes' 'Restaurants'],{'Take-out': True 'Good For': {'dessert': Fal...,0,0,0,0,1,0,1,1,1,1
3,Boston Market,300 Davis Blvd Etna Pittsburgh PA 15275,Pittsburgh,PA,3,2.5,['American (Traditional)' 'Comfort Food' 'Ca...,{'Take-out': True 'Wi-Fi': 'no' 'Alcohol': '...,0,0,0,0,1,0,0,1,1,1
4,McDonald's,400 Waterfront Dr E Homestead Homestead PA 15120,Homestead,PA,12,2.0,['Burgers' 'Fast Food' 'Restaurants'],{'Take-out': True 'Wi-Fi': 'free' 'Drive-Thr...,1,0,0,0,1,0,1,1,1,1
5,Starbucks,270 W Bridge St Homestead Homestead PA 15120,Homestead,PA,25,3.5,['Food' 'Coffee & Tea'],{'Take-out': True 'Wi-Fi': 'free' 'Price Ran...,0,0,0,0,1,0,0,0,1,1


### Add fast_food column

In [192]:
X['fast_food'] = np.where(X['categories'].str.contains('Fast Food'), 1, 0)
X.head()

Unnamed: 0,name,full_address,city,state,review_count,stars,categories,attributes,fast_food,mexican,chinese,bar,takeout,alcohol,good_for_groups,good_for_kids,credit_card
0,Mr Hoagie,4734 Lebanon Church Rd Dravosburg PA 15034,Dravosburg,PA,7,3.5,['Fast Food' 'Restaurants'],{'Take-out': True 'Drive-Thru': False 'Good ...,1,0,0,0,1,0,1,1,1
2,Barb's Country Junction Cafe,202 3rd Ave Carnegie Carnegie PA 15106,Carnegie,PA,9,4.0,['Cafes' 'Restaurants'],{'Take-out': True 'Good For': {'dessert': Fal...,0,0,0,0,1,0,1,1,1
3,Boston Market,300 Davis Blvd Etna Pittsburgh PA 15275,Pittsburgh,PA,3,2.5,['American (Traditional)' 'Comfort Food' 'Ca...,{'Take-out': True 'Wi-Fi': 'no' 'Alcohol': '...,0,0,0,0,1,0,0,1,1
4,McDonald's,400 Waterfront Dr E Homestead Homestead PA 15120,Homestead,PA,12,2.0,['Burgers' 'Fast Food' 'Restaurants'],{'Take-out': True 'Wi-Fi': 'free' 'Drive-Thr...,1,0,0,0,1,0,1,1,1
5,Starbucks,270 W Bridge St Homestead Homestead PA 15120,Homestead,PA,25,3.5,['Food' 'Coffee & Tea'],{'Take-out': True 'Wi-Fi': 'free' 'Price Ran...,0,0,0,0,1,0,0,0,1


### Add mexican column

In [168]:
X['mexican'] = np.where(X['categories'].str.contains('Mexican'), 1, 0)
X[X['mexican'] == 1].head()

Unnamed: 0,name,full_address,city,state,review_count,stars,categories,attributes,fast_food,mexican
60,Taco Bell,1603 S Braddock Ave\nPittsburgh PA 15218,Pittsburgh,PA,6,3.5,['Fast Food' 'Mexican' 'Tex-Mex' 'Restauran...,{'Take-out': True 'Wi-Fi': 'no' 'Drive-Thru'...,1,1
146,Taco Bell,825 Freeport Rd.\nPittsburgh PA 15238,Pittsburgh,PA,4,3.5,['Fast Food' 'Mexican' 'Tex-Mex' 'Restauran...,{'Take-out': True 'Wi-Fi': 'free' 'Drive-Thr...,1,1
155,Taco Bell,6805 W Wilkinson Blvd\nBelmont NC 28012,Belmont,NC,5,4.0,['Fast Food' 'Mexican' 'Tex-Mex' 'Restauran...,{'Take-out': True 'Wi-Fi': 'no' 'Good For': ...,1,1
166,Taco Bell,10917 Carolina Place Pkwy\nPineville NC 28134,Pineville,NC,12,3.5,['Fast Food' 'Mexican' 'Tex-Mex' 'Restauran...,{'Take-out': True 'Wi-Fi': 'free' 'Drive-Thr...,1,1
178,Salsarita's Fresh Cantina,101 S Tryon St\nSuite 5\nUptown\nCharlotte NC...,Charlotte,NC,23,4.0,['Fast Food' 'Mexican' 'Restaurants'],{'Take-out': True 'Wi-Fi': 'free' 'Drive-Thr...,1,1


### Add chinese column

In [169]:
X['chinese'] = np.where(X['categories'].str.contains('Chinese'), 1, 0)
X[X['chinese'] == 1].head()

Unnamed: 0,name,full_address,city,state,review_count,stars,categories,attributes,fast_food,mexican,chinese
37,Great Wall Chinese Food,243 Brownsville Rd\nMt. Oliver\nPittsburgh PA...,Pittsburgh,PA,5,4.0,['Chinese' 'Restaurants'],{'Noise Level': 'average' 'Attire': 'casual' ...,0,0,1
395,Panda Express,2000 N Neil St\nChampaign IL 61820,Champaign,IL,6,3.0,['Fast Food' 'Chinese' 'Restaurants'],{'Take-out': True 'Wi-Fi': 'no' 'Drive-Thru'...,1,0,1
538,Panda Express,3923 E Thomas Rd\nPhoenix AZ 85018,Phoenix,AZ,20,2.0,['Fast Food' 'Chinese' 'Restaurants'],{'Take-out': True 'Good For': {'dessert': Fal...,1,0,1
572,Panda Express,7000 16th St\nSte 100\nPhoenix AZ 85020,Phoenix,AZ,16,3.0,['Fast Food' 'Chinese' 'Restaurants'],{'Take-out': True 'Wi-Fi': 'no' 'Good For': ...,1,0,1
598,Panda Express,903 E Bell Rd\nSte 101\nPhoenix AZ 85022,Phoenix,AZ,23,3.5,['Fast Food' 'Chinese' 'Restaurants'],{'Take-out': True 'Good For': {'dessert': Fal...,1,0,1


### Add bar column

In [190]:
X['bar'] = np.where(X['categories'].str.contains('Bars'), 1, 0)
X[X['bar'] == 1].head()

Unnamed: 0,name,full_address,city,state,review_count,stars,categories,attributes,fast_food,mexican,chinese,bar,takeout,alcohol,good_for_groups,good_for_kids,cash_only,credit_card
8,Duke's Upper Deck Cafe,122 W 8th Ave Homestead Homestead PA 15120,Homestead,PA,33,3.5,['Pubs' 'Bars' 'American (New)' 'Nightlife'...,{'Coat Check': False 'Take-out': True 'Wi-Fi...,0,0,0,1,1,0,1,0,1,1
123,Cappy's Cafe,5431 Walnut St Shadyside Pittsburgh PA 15232,Pittsburgh,PA,51,3.0,['Bars' 'American (Traditional)' 'Nightlife'...,{'Alcohol': 'full_bar' 'Noise Level': 'averag...,0,0,0,1,1,0,1,0,1,1
131,Le Mardi Gras,731 Copeland St Shadyside Pittsburgh PA 15232,Pittsburgh,PA,48,4.0,['Bars' 'Food' 'Bagels' 'Dive Bars' 'Night...,{'Coat Check': False 'Take-out': False 'Alco...,0,0,0,1,0,0,1,0,0,0
186,Phil's Deli New York Deli & Tavern,105 E 5th St First Ward Charlotte NC 28202,Charlotte,NC,12,3.0,['Bars' 'Food' 'Beer Wine & Spirits' 'Deli...,{'Take-out': True 'Good For': {'dessert': Fal...,0,0,0,1,1,0,1,0,1,1
195,Smoothie King,301 S College St Ste 265 Uptown Charlotte NC ...,Charlotte,NC,8,4.5,['Health Markets' 'Food' 'Juice Bars & Smoot...,{'Parking': {'garage': True 'street': False ...,0,0,0,1,0,0,0,0,1,1


### Write to features table

In [193]:
X['full_address'] = X['full_address'].apply(lambda s: s.replace("\n", " "))
features = ['name', 'full_address' ,'review_count', 'stars', 'bar', 'fast_food', 'mexican', 'chinese']
features += [i[0] for i in attrs]
print(features)
X[features].to_sql('features', cnx, if_exists='replace', index=False)

['name', 'full_address', 'review_count', 'stars', 'bar', 'fast_food', 'mexican', 'chinese', 'takeout', 'alcohol', 'good_for_groups', 'good_for_kids', 'credit_card']


# Scratch

In [212]:
# Make categories table
df_c = pd.read_sql_query('''SELECT business_id, categories
FROM yelp_dataset_8 LIMIT 200''', cnx)

def category_list(cat):
    p = re.compile(r"'([\w\W]+)'")
    #df = pd.DataFrame(df_X.categories)
    return [p.findall(x) for x in df_X.categories]



0      [[Fast Food'  'Restaurants], [Food'  'Grocery]...
1      [[Fast Food'  'Restaurants], [Food'  'Grocery]...
2      [[Fast Food'  'Restaurants], [Food'  'Grocery]...
3      [[Fast Food'  'Restaurants], [Food'  'Grocery]...
4      [[Fast Food'  'Restaurants], [Food'  'Grocery]...
5      [[Fast Food'  'Restaurants], [Food'  'Grocery]...
6      [[Fast Food'  'Restaurants], [Food'  'Grocery]...
7      [[Fast Food'  'Restaurants], [Food'  'Grocery]...
8      [[Fast Food'  'Restaurants], [Food'  'Grocery]...
9      [[Fast Food'  'Restaurants], [Food'  'Grocery]...
10     [[Fast Food'  'Restaurants], [Food'  'Grocery]...
11     [[Fast Food'  'Restaurants], [Food'  'Grocery]...
12     [[Fast Food'  'Restaurants], [Food'  'Grocery]...
13     [[Fast Food'  'Restaurants], [Food'  'Grocery]...
14     [[Fast Food'  'Restaurants], [Food'  'Grocery]...
15     [[Fast Food'  'Restaurants], [Food'  'Grocery]...
16     [[Fast Food'  'Restaurants], [Food'  'Grocery]...
17     [[Fast Food'  'Restauran

In [208]:
category_keywords = ['sandwiches', 'fast_food', 'nightlife', 'pizza', 'bars',
                     'mexican', 'food', 'american_traditional', 'burgers', 'chinese',
                     'italian', 'american_new', 'breakfast_brunch', 'thai', 'indian',
                     'sushi', 'korean', 'mediterranean', 'japanese', 'seafood',
                     'middle_eastern', 'pakistani', 'barbeque', 'vietnamese',
                     'asian_fusion', 'diners', 'greek', 'vegetarian']

def category_list(cat):
    p = re.compile(r"'([\w\W]+)'")
    #df = pd.DataFrame(df_X.categories)
    return [p.findall(x) for x in df_X.categories]

