### **Formatting business data for frontend map points** ###

Author: Neha Deshpande

In [1]:
# Imports
#basics
import numpy as np
import pandas as pd
import scipy 

#misc
import time

#files & data
import json
from google.colab import files

#viz
import matplotlib.pyplot as plt
import seaborn as sns 

#settings
color = sns.color_palette()


In [2]:
# Install kaggle and create directory to store kaggle.json
!pip install -U -q kaggle
!mkdir -p ~/.kaggle

In [3]:
# Check if .kaggle exists in what is printed after running this cell
!ls ~/ -a

.   .bashrc  .config  .ipython	.kaggle  .local  .profile
..  .cache   .gsutil  .jupyter	.keras	 .npm	 .tmux.conf


In [4]:
#########################################################
#             STEP 1: GET DATASET INTO COLAB            #
#########################################################
# Source: https://medium.com/@opalkabert/downloading-kaggle-datasets-into-google-colab-fb9654c94235
# Get JSON Kaggle API token by going to: 
# 1. Kaggle Profile
# 2. Go to Accounts tab
# 3. Scroll down to API and "Create new API Token"
# 4. JSON file should be automatically downloaded

In [5]:
# Upload kaggle.json you downloaded in the previous step
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"nehadesh","key":"b1ff3516c2e6233953aac308c8a8aab7"}'}

In [6]:
# Store Kaggle API token in colab
!rm -f ~/.kaggle/kaggle.json
!cp kaggle.json ~/.kaggle/

In [7]:
# Kaggle recommends running this to prevent other users of your computer from accessing your credentials
!chmod 600 ~/.kaggle/kaggle.json

In [8]:
# Download Yelp Dataset from Kaggle: 
# Link to dataset: https://www.kaggle.com/yelp-dataset/yelp-dataset
!kaggle datasets download -d yelp-dataset/yelp-dataset
!ls

Downloading yelp-dataset.zip to /content
100% 4.92G/4.92G [01:29<00:00, 68.6MB/s]
100% 4.92G/4.92G [01:29<00:00, 58.8MB/s]
kaggle.json  sample_data  yelp-dataset.zip


In [9]:
# Unzip Yelp Data
!unzip '/content/yelp-dataset.zip' -d '/content/yelp-dataset'

Archive:  /content/yelp-dataset.zip
  inflating: /content/yelp-dataset/Dataset_User_Agreement.pdf  
  inflating: /content/yelp-dataset/yelp_academic_dataset_business.json  
  inflating: /content/yelp-dataset/yelp_academic_dataset_checkin.json  
  inflating: /content/yelp-dataset/yelp_academic_dataset_review.json  
  inflating: /content/yelp-dataset/yelp_academic_dataset_tip.json  
  inflating: /content/yelp-dataset/yelp_academic_dataset_user.json  


In [10]:
# Larger datasets require pyspark
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/45/b0/9d6860891ab14a39d4bddf80ba26ce51c2f9dc4805e5c6978ac0472c120a/pyspark-3.1.1.tar.gz (212.3MB)
[K     |████████████████████████████████| 212.3MB 89kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 57.4MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767604 sha256=5f7824a76ae5527d6a68c7348b35ce3064d9caba9607c9b898a863fe73bf03fc
  Stored in directory: /root/.cache/pip/wheels/0b/90/c0/01de724414ef122bd05f056541fb6a0ecf47c7ca655f8b3c0f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.1


In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [12]:
spark = SparkSession.builder.appName('yelp_dataset').getOrCreate()

In [13]:
# RAW, UNCLEANED DATA (refer to the other notebook for how to clean, etc. Didn't want to remove anything if y'all needed it)
reviews_sk = spark.read.json('/content/yelp-dataset/yelp_academic_dataset_review.json')
business_sk = spark.read.json('/content/yelp-dataset/yelp_academic_dataset_business.json')
users_sk = spark.read.json('/content/yelp-dataset/yelp_academic_dataset_user.json')
# UNCOMMENT BELOW IF YOU NEED THEM
# tips_sk = spark.read.json('/content/yelp-dataset/yelp_academic_dataset_tip.json')
# checkin_sk = spark.read.json('/content/yelp-dataset/yelp_academic_dataset_checkin.json')

In [14]:
# Get dataset for Atlanta
food_business = business_sk.filter(col('categories').contains('Restaurant'))
atl_food_business = food_business.filter(lower(col("city")) == "atlanta")
atl_food_business = atl_food_business.filter(lower(col("state")) == "ga")

In [15]:
reviews_sk = reviews_sk.withColumn("rating", reviews_sk['stars'].cast("double"))
atl_food_reviews = reviews_sk.join(atl_food_business, "business_id", "inner")

In [18]:
final_data = atl_food_business.select("business_id", "name", "stars", "latitude", "longitude", "categories")

In [19]:
cuisine_set = set()
cuisine_set.add('Afghan')
cuisine_set.add('African')
# cuisine_set.add('Senegalese')
# cuisine_set.add('South African')
cuisine_set.add('American (New)')
cuisine_set.add('American (Traditional)')
cuisine_set.add('Arabian')
cuisine_set.add('Argentine')
cuisine_set.add('Armenian')
cuisine_set.add('Asian Fusion')
cuisine_set.add('Australian')
cuisine_set.add('Austrian')
cuisine_set.add('Bangladeshi')
cuisine_set.add('Barbeque')
cuisine_set.add('Basque')
cuisine_set.add('Belgian')
cuisine_set.add('Brasseries')
cuisine_set.add('Brazilian')
cuisine_set.add('Breakfast & Brunch')
# cuisine_set.add('Pancakes')
cuisine_set.add('British')
cuisine_set.add('Buffets')
cuisine_set.add('Bulgarian')
cuisine_set.add('Burgers')
cuisine_set.add('Burmese')
cuisine_set.add('Cafes')
# cuisine_set.add('Themed Cafes')
cuisine_set.add('Cafeteria')
cuisine_set.add('Cajun/Creole')
cuisine_set.add('Cambodian')
cuisine_set.add('Caribbean')
# cuisine_set.add('Dominican')
# cuisine_set.add('Haitian')
# cuisine_set.add('Puerto Rican')
# cuisine_set.add('Trinidadian')
cuisine_set.add('Catalan')
cuisine_set.add('Cheesesteaks')
cuisine_set.add('Chicken Shop')
cuisine_set.add('Chicken Wings')
cuisine_set.add('Chinese')
# cuisine_set.add('Cantonese')
# cuisine_set.add('Dim Sum')
# cuisine_set.add('Hainan')
# cuisine_set.add('Shanghainese')
# cuisine_set.add('Szechuan')
cuisine_set.add('Comfort Food')
cuisine_set.add('Creperies')
cuisine_set.add('Cuban')
cuisine_set.add('Czech')
cuisine_set.add('Delis')
cuisine_set.add('Diners')
cuisine_set.add('Dinner Theater')
cuisine_set.add('Eritrean')
cuisine_set.add('Ethiopian')
cuisine_set.add('Fast Food')
cuisine_set.add('Filipino')
cuisine_set.add('Fish & Chips')
cuisine_set.add('Fondue')
cuisine_set.add('Food Court')
cuisine_set.add('Food Stands')
cuisine_set.add('French')
# cuisine_set.add('Mauritius')
# cuisine_set.add('Reunion')
cuisine_set.add('Game Meat')
cuisine_set.add('Gastropubs')
cuisine_set.add('Georgian')
cuisine_set.add('German')
cuisine_set.add('Gluten-Free')
cuisine_set.add('Greek')
cuisine_set.add('Guamanian')
cuisine_set.add('Halal')
cuisine_set.add('Hawaiian')
cuisine_set.add('Himalayan/Nepalese')
cuisine_set.add('Honduran')
cuisine_set.add('Hong Kong Style Cafe')
cuisine_set.add('Hot Dogs')
cuisine_set.add('Hot Pot')
cuisine_set.add('Hungarian')
cuisine_set.add('Iberian')
cuisine_set.add('Indian')
cuisine_set.add('Indonesian')
cuisine_set.add('Irish')
cuisine_set.add('Italian')
# cuisine_set.add('Calabrian')
# cuisine_set.add('Sardinian')
# cuisine_set.add('Sicilian')
# cuisine_set.add('Tuscan')
cuisine_set.add('Japanese')
# cuisine_set.add('Conveyor Belt Sushi')
# cuisine_set.add('Izakaya')
# cuisine_set.add('Japanese Curry')
# cuisine_set.add('Ramen')
# cuisine_set.add('Teppanyaki')
cuisine_set.add('Kebab')
cuisine_set.add('Korean')
cuisine_set.add('Kosher')
cuisine_set.add('Laotian')
cuisine_set.add('Latin American')
# cuisine_set.add('Colombian')
# cuisine_set.add('Salvadoran')
# cuisine_set.add('Venezuelan')
cuisine_set.add('Live/Raw Food')
cuisine_set.add('Malaysian')
cuisine_set.add('Mediterranean')
# cuisine_set.add('Falafel')
cuisine_set.add('Mexican')
# cuisine_set.add('Tacos')
cuisine_set.add('Middle Eastern')
# cuisine_set.add('Egyptian')
# cuisine_set.add('Lebanese')
cuisine_set.add('Modern European')
cuisine_set.add('Mongolian')
cuisine_set.add('Moroccan')
cuisine_set.add('New Mexican Cuisine')
cuisine_set.add('Nicaraguan')
cuisine_set.add('Noodles')
cuisine_set.add('Pakistani')
cuisine_set.add('Pan Asia')
cuisine_set.add('Persian/Iranian')
cuisine_set.add('Peruvian')
cuisine_set.add('Pizza')
cuisine_set.add('Polish')
cuisine_set.add('Polynesian')
cuisine_set.add('Pop-Up Restaurants')
cuisine_set.add('Portuguese')
cuisine_set.add('Poutineries')
cuisine_set.add('Russian')
cuisine_set.add('Salad')
cuisine_set.add('Sandwiches')
cuisine_set.add('Scandinavian')
cuisine_set.add('Scottish')
cuisine_set.add('Seafood')
cuisine_set.add('Singaporean')
cuisine_set.add('Slovakian')
cuisine_set.add('Somali')
cuisine_set.add('Soul Food')
cuisine_set.add('Soup')
cuisine_set.add('Southern')
cuisine_set.add('Spanish')
cuisine_set.add('Sri Lankan')
cuisine_set.add('Steakhouses')
cuisine_set.add('Supper Clubs')
cuisine_set.add('Sushi Bars')
cuisine_set.add('Syrian')
cuisine_set.add('Taiwanese')
cuisine_set.add('Tapas Bars')
cuisine_set.add('Tapas/Small Plates')
cuisine_set.add('Tex-Mex')
cuisine_set.add('Thai')
cuisine_set.add('Turkish')
cuisine_set.add('Ukrainian')
cuisine_set.add('Uzbek')
cuisine_set.add('Vegan')
cuisine_set.add('Vegetarian')
cuisine_set.add('Vietnamese')
cuisine_set.add('Waffles')
cuisine_set.add('Wraps')

In [20]:
!pip install requests
import requests
import json



In [22]:
def getCuisine(categories_list):
    cuisine=""
    categories = categories_list.split(", ")
    for category in categories:
      if category in cuisine_set:
        return category
    return 'Other'

In [23]:
cuisine_udf = udf(lambda z: getCuisine(z),StringType())
final_data = final_data.withColumn('cuisine', cuisine_udf(col('categories')))

In [24]:
final_data = final_data.select("business_id", "name", "stars", "latitude", "longitude", "cuisine")
final_data.write.csv('/content/final_data', header=True)

In [27]:
# RENAME SAVED FILES TO part_1.csv, part_2.csv

In [28]:
api_key = 'YOUR_API_KEY_HERE'
def get_neighborhood(lati, longi):
  time.sleep(.025)
  geo_url = 'https://maps.googleapis.com/maps/api/geocode/json?latlng={},{}&key={}'.format(str(lati), str(longi), api_key)
  r = requests.get(geo_url)
  address_parts = json.loads(r.text)['results'][0]['address_components']
  for addr in address_parts:
    if 'neighborhood' in addr['types']:
      return addr['long_name']

In [29]:
import csv
for i in range(1, 3):
  with open('/content/final_data/part_{}.csv'.format(i),'r') as csvinput:
      with open('/content/final_data/part_{}_w_neighb.csv'.format(i), 'w') as csvoutput:
          writer = csv.writer(csvoutput, lineterminator='\n')
          reader = csv.reader(csvinput)

          all = []
          row = next(reader)
          row.append('neighborhood')
          all.append(row)

          for row in reader:
              lati = row[3]
              longi = row[4]

              try:
                neighb = get_neighborhood(float(lati), float(longi))
                if neighb == None:
                  continue
                row.append(neighb)
                all.append(row)
              except Exception as e:
                print(e)
          writer.writerows(all)