In [7]:
from sqlalchemy import create_engine, inspect
from sklearn_pandas import DataFrameMapper
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
import psycopg2
import numpy as np
from collections import Counter
import json


In [None]:
PSQL = 'postgres://{}@localhost:5432/rpred'

cred = ""
with open("../credentials/localhost/jessica.txt") as credfile:
    cred = credfile.read().strip("\n")

cnx = create_engine(PSQL.format(cred), isolation_level='AUTOCOMMIT')

# [Categories](#categories)
# [Attributes](#attributes)
# [Hours](#hours)
# [Example Json](#example)

 <a class="anchor" id="categories"></a>
# Find most popular category keywords:

In [2]:
category_keywords = ['sandwiches', 'fast food', 'nightlife', 'pizza', 'bars',
                     'mexican', 'food', 'american', 'burgers', 'chinese',
                     'italian', 'breakfast & brunch', 'thai', 'indian',
                     'sushi', 'korean', 'mediterranean', 'japanese', 'seafood',
                     'middle eastern', 'pakistani', 'barbeque', 'vietnamese',
                     'asian fusion', 'diners', 'greek', 'vegetarian', 'cafes',
                    'comfort food', 'bakeries']
    
df_c = pd.read_sql_query('''SELECT business_id, categories FROM yelp_dataset_8''', cnx)

df_c.head()

Unnamed: 0,business_id,categories
0,5UmKMjUEUNdYWqANhGckJw,['Fast Food' 'Restaurants']
1,yXuao0pFz1AxB21vJjDf5w,['Food' 'Grocery']
2,zaXDakTd3RXyOa7sMrUE1g,['Cafes' 'Restaurants']
3,8QlnAcjGE6dgfKTVoxrPvQ,['American (Traditional)' 'Comfort Food' 'Ca...
4,PK6aSizckHFWk8i0oxt5DA,['Burgers' 'Fast Food' 'Restaurants']


In [3]:
df_c['categories'] = df_c.categories.str.lower()
df_c.head()

Unnamed: 0,business_id,categories
0,5UmKMjUEUNdYWqANhGckJw,['fast food' 'restaurants']
1,yXuao0pFz1AxB21vJjDf5w,['food' 'grocery']
2,zaXDakTd3RXyOa7sMrUE1g,['cafes' 'restaurants']
3,8QlnAcjGE6dgfKTVoxrPvQ,['american (traditional)' 'comfort food' 'ca...
4,PK6aSizckHFWk8i0oxt5DA,['burgers' 'fast food' 'restaurants']


In [4]:
keyword_count = {}
for keyword in category_keywords:
    count = len(df_c[df_c.categories.str.contains(keyword)])
    keyword_count[keyword] = count
    
sorted(keyword_count.items(), key=lambda x: x[1], reverse=True)

[('food', 13486),
 ('fast food', 3154),
 ('sandwiches', 1220),
 ('bakeries', 1203),
 ('cafes', 1127),
 ('burgers', 963),
 ('bars', 901),
 ('mexican', 837),
 ('american', 808),
 ('breakfast & brunch', 622),
 ('nightlife', 408),
 ('chinese', 270),
 ('pizza', 206),
 ('italian', 148),
 ('seafood', 110),
 ('mediterranean', 103),
 ('comfort food', 101),
 ('barbeque', 72),
 ('vegetarian', 71),
 ('asian fusion', 61),
 ('thai', 59),
 ('diners', 58),
 ('indian', 55),
 ('greek', 55),
 ('japanese', 54),
 ('middle eastern', 54),
 ('vietnamese', 31),
 ('korean', 30),
 ('sushi', 28),
 ('pakistani', 8)]

## Top 10 interesting keywords:
1. ('fast food', 3154),
2. ('burgers', 963),
3. ('bars', 901),
4. ('mexican', 837),
5. ('american', 808),
6. ('chinese', 270),
7. ('pizza', 206),
8. ('italian', 148),
9. ('seafood', 110),
10. ('mediterranean', 103),

# TODO: Create SQL table for categories.

In [37]:
json_str = ""
with open("../data/yelp_dataset_round8/yelp_academic_dataset_business.json") as file:
    json_str = file.read()

In [42]:
att = []
hours = []
for item in json_str.split("\n"):
    if item.find("'Restaurants'") > 0 or item.find("Food") > 0 or item.find("Cafe") > 0 or item.find("Bakeries") > 0:
        item = json.loads(item)
        att.append(item['attributes'])
        hours.append(item['hours'])

 <a class="anchor" id="attributes"></a>
# Attributes
* Good for
* noise level
* price range

 <a class="anchor" id="hours"></a>
# Hours
* Find most popular closed days
* Average hours open
* Total hours open weekly

In [55]:
open_days = Counter([kk for k in hours for kk in k])
sorted(open_days.items(), key=lambda x: x[1])

[('Sunday', 7912),
 ('Monday', 8824),
 ('Saturday', 9232),
 ('Tuesday', 9356),
 ('Wednesday', 9503),
 ('Thursday', 9552),
 ('Friday', 9565)]

#### Most businesses open:
* ('Thursday', 9552)
* ('Friday', 9565)
* ('Wednesday', 9503)

#### Less businesses open:
* ('Sunday', 7912)
* ('Monday', 8824)

 <a class="anchor" id="example"></a>
Example Json:

    {'business_id': '5UmKMjUEUNdYWqANhGckJw',
     'full_address': '4734 Lebanon Church Rd\nDravosburg, PA 15034',
     'hours': {'Friday': {'close': '21:00', 'open': '11:00'},
       'Tuesday': {'close': '21:00', 'open': '11:00'},
       'Thursday': {'close': '21:00', 'open': '11:00'},
       'Wednesday': {'close': '21:00', 'open': '11:00'},
       'Monday': {'close': '21:00', 'open': '11:00'}},
     'open': True,
     'categories': ['Fast Food', 'Restaurants'],
     'city': 'Dravosburg',
     'review_count': 7,
     'name': 'Mr Hoagie',
     'neighborhoods': [],
     'longitude': -79.9007057,
     'state': 'PA',
     'stars': 3.5,
     'latitude': 40.3543266,
     'attributes': {'Take-out': True,
      'Drive-Thru': False,
      'Good For': {'dessert': False,
       'latenight': False,
       'lunch': False,
       'dinner': False,
       'brunch': False,
       'breakfast': False},
     'Caters': False,
    'Noise Level': 'average',
    'Takes Reservations': False,
    'Delivery': False,
    'Ambience': {'romantic': False,
      'intimate': False,
      'classy': False,
      'hipster': False,
      'divey': False,
      'touristy': False,
      'trendy': False,
      'upscale': False,
      'casual': False},
    'Parking': {'garage': False,
      'street': False,
      'validated': False,
      'lot': False,
      'valet': False},
    'Has TV': False,
    'Outdoor Seating': False,
    'Attire': 'casual',
    'Alcohol': 'none',
    'Waiter Service': False,
    'Accepts Credit Cards': True,
    'Good for Kids': True,
    'Good For Groups': True,
    'Price Range': 1},
    'type': 'business'}