# Get Data

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import json
import os.path

In [3]:
t_map = pd.read_csv('/content/drive/MyDrive/CSVs/OpenStreetMap_Tourist_Attractions_for_North_America.csv')

  t_map = pd.read_csv('/content/drive/MyDrive/CSVs/OpenStreetMap_Tourist_Attractions_for_North_America.csv')


In [4]:
bcols = ['business_id', 'name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'stars', 'is_open', 'categories']
data = []
filename = '/content/drive/MyDrive/yelp_dataset/yelp_academic_dataset_business.json'

with open(filename) as f:
  for line in f:
    doc = json.loads(line)
    add = []
    for bcol in bcols:
      add.append(doc[bcol])
    data.append(add)

businesses = pd.DataFrame(data = data, columns = bcols)

In [None]:
rcols = ['review_id', 'business_id', 'stars', 'text']
data = []
filename = '/content/drive/MyDrive/yelp_dataset/yelp_academic_dataset_review.json'

with open(filename) as f:
  for line in f:
    doc = json.loads(line)
    add = []
    for rcol in rcols:
      add.append(doc[rcol])
    data.append(add)

reviews = pd.DataFrame(data = data, columns = rcols)

# Business Preprocessing

In [5]:
#Isolate to restaurants only

businesses['categories'] = businesses['categories'].fillna('Nothing')
restaurants = businesses[businesses['categories'].str.startswith('Restaurants')]

In [6]:
#Drop closed businesses
restaurants = restaurants[restaurants['is_open'] == 1]
restaurants = restaurants.drop('is_open', axis = 1)

In [7]:
#Unify addresses
restaurants['address'] = restaurants['address'] + ', ' + restaurants['city'] + ', ' + restaurants['state'] + ' ' + restaurants['postal_code']

In [8]:
#get categories
cats = restaurants['categories'].str.split(',', expand = True)

restaurants['cat_1'] = cats[1].str.strip()
restaurants['cat_2'] = cats[2].str.strip()
restaurants['cat_3'] = cats[3].str.strip()
restaurants = restaurants.drop('categories', axis = 1)

In [9]:
restaurants = restaurants.drop(['city', 'state'], axis = 1)
restaurants = restaurants.reset_index(drop = True)
restaurants

Unnamed: 0,business_id,name,address,postal_code,latitude,longitude,stars,cat_1,cat_2,cat_3
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,"935 Race St, Philadelphia, PA 19107",19107,39.955505,-75.155564,4.0,Food,Bubble Tea,Coffee & Tea
1,9OG5YkX1g2GReZM0AskizA,Romano's Macaroni Grill,"5505 S Virginia St, Reno, NV 89502",89502,39.476117,-119.789339,2.5,Italian,,
2,ljxNT9p0y7YMPx0fcNBGig,Tony's Restaurant & 3rd Street Cafe,"312 Piasa St, Alton, IL 62002",62002,38.896563,-90.186203,3.0,Specialty Food,Steakhouses,Food
3,ABxoFuzZy5mqQ8C5FJJajQ,Core de Roma,"201 Jefferson St, Bala Cynwyd, PA 19004",19004,40.028357,-75.238084,5.0,Italian,,
4,ppFCk9aQkM338Rgwpl2F5A,Wawa,"3604 Chestnut St, Philadelphia, PA 19104",19104,39.954573,-75.194894,3.0,Automotive,Delis,Gas Stations
...,...,...,...,...,...,...,...,...,...,...
9958,wVxXRFf10zTTAs11nr4xeA,PrimoHoagies,"6024 Ridge Ave, Philadelphia, PA 19128",19128,40.032483,-75.214430,3.0,Specialty Food,Food,Sandwiches
9959,sf_oQ62L8UEnOOLf00nNGA,Pizza Hut,"5028 Old Hickory, Hermitage, TN 37076",37076,36.193201,-86.614748,3.0,Pizza,Fast Food,Chicken Wings
9960,l9eLGG9ZKpLJzboZq-9LRQ,Wawa,"19 N Bishop Ave, Clifton Heights, PA 19018",19018,39.925656,-75.310344,3.0,Sandwiches,Convenience Stores,Coffee & Tea
9961,WnT9NIzQgLlILjPT0kEcsQ,Adelita Taqueria & Restaurant,"1108 S 9th St, Philadelphia, PA 19147",19147,39.935982,-75.158665,4.5,Mexican,,


# Review Preprocessing

In [None]:
#Drop reviews not about open restaurants
merged = reviews.merge(restaurants[['business_id']], on = 'business_id', how = 'right')
reviews = merged.dropna(subset = ['business_id'])

In [None]:
reviews = reviews.reset_index()
reviews = reviews.drop('index', axis = 1)
reviews

# Map Preprocessing

In [10]:
#Renaming
renames = {'X': 'longitude', 'Y': 'latitude', 'osm_id2': 'attraction_id', 'tourism': 'type', 'attraction_id': 'drop'}
t_map = t_map.rename(columns = renames)

In [11]:
#Drop stuff
to_keep = ['longitude', 'latitude', 'addr_city', 'addr_country', 'addr_housenumber', 'addr_province', 'addr_postcode', 'addr_state', 'addr_street', 'addr_unit', 'name', 'type', 'attraction_id']
t_map = t_map[to_keep]

In [12]:
#Unite addresses
feats = ['addr_housenumber', 'addr_street', 'addr_unit', 'addr_city', 'addr_state', 'addr_province', 'addr_country', 'addr_postcode']
for feat in feats:
  t_map[feat] = t_map[feat].astype(str)
  t_map[feat] = t_map[feat].replace('nan', '')

t_map['address'] = t_map['addr_housenumber'].str.strip() + ' ' + t_map['addr_street'].str.strip() + ', ' + t_map['addr_city'].str.strip() + ', ' + t_map['addr_state'].str.strip() + t_map['addr_province'].str.strip() + ' ' + t_map['addr_postcode'].str.strip() + ', ' + t_map['addr_country'].str.strip()


In [13]:
#Set unknown if any fields dropped
for index, row in t_map.iterrows():
  addr = row['address'].split(',')
  if any(field.strip() == '' for field in addr):
    t_map.at[index, 'address'] = 'Unknown'

In [14]:
#Second round drops
t_map = t_map.drop(feats, axis = 1)
t_map = t_map.dropna()

In [15]:
t_map = t_map.reset_index(drop = True)
t_map

Unnamed: 0,longitude,latitude,name,type,attraction_id,address
0,-92.318096,14.616769,Brisas del Mar,viewpoint,5327709923,Unknown
1,-92.355947,14.653835,Hotel Playa Linda,hotel,388651468,Unknown
2,-92.240303,14.745767,Rancho El Tesoro,camp_site,7883004685,Unknown
3,-92.433990,14.732903,Misión Surf Mexico,camp_site,7228473785,Unknown
4,-92.284821,14.865885,Villas exotica,motel,4794499945,Unknown
...,...,...,...,...,...,...
126467,-74.716278,40.152368,History,information,11770408378,Unknown
126468,-74.715997,40.152257,History,information,11770408379,Unknown
126469,-74.714336,40.153047,Deleware Canal Trail Map,information,11770408380,Unknown
126470,-94.206435,38.908874,Fangorn Trail,information,11770424833,Unknown


# Distance Stuff

In [16]:
import numpy as np
import geopandas as gpd
from shapely.geometry import Point

In [17]:
rest_info = restaurants[['business_id', 'longitude', 'latitude']]
t_info = t_map[['attraction_id', 'longitude', 'latitude']]

In [18]:
def haversine(lon1, lat1, lon2, lat2):

    #Radius of earth in km
    r = 6378.137

    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2

    c = 2 * np.arcsin(np.sqrt(a))
    dist = r * c
    return dist

In [19]:
close_enough = 25 #kilometers

In [None]:
gdf_rests = gpd.GeoDataFrame(rest_info, geometry=[Point(xy) for xy in zip(rest_info.longitude, rest_info.latitude)])
gdf_tours = gpd.GeoDataFrame(t_info, geometry=[Point(xy) for xy in zip(t_info.longitude, t_info.latitude)])

gdf_rests.crs = gdf_tours.crs = "EPSG:4326"

gdf_rests = gdf_rests.to_crs("EPSG:3857")
gdf_tours = gdf_tours.to_crs("EPSG:3857")

gdf_tours_buffered = gdf_tours.copy()
gdf_tours_buffered.geometry = gdf_tours_buffered.geometry.buffer(close_enough * 1000)

close_rests = gpd.sjoin(gdf_rests, gdf_tours_buffered, op='within')

close_rests_df = pd.DataFrame(close_rests.drop(columns='geometry')).reset_index(drop=True)

In [21]:
close_rests_df = close_rests_df.drop(columns = 'index_right')
close_rests_df['distance'] = haversine(close_rests_df['longitude_left'], close_rests_df['latitude_left'], close_rests_df['longitude_right'], close_rests_df['latitude_right'])
close_rests_df = close_rests_df.drop(columns = ['longitude_left', 'latitude_left', 'longitude_right', 'latitude_right'])
close_rests_df

Unnamed: 0,business_id,attraction_id,distance
0,MTSW4McQd7CbVtyjqoe9mw,1073691908,5.749833
1,ABxoFuzZy5mqQ8C5FJJajQ,1073691908,14.795421
2,ppFCk9aQkM338Rgwpl2F5A,1073691908,5.804820
3,3BJxm-HnvzdwD1zjmSbmyQ,1073691908,6.425897
4,bTve2mwLk5Zc01vRKqc2KQ,1073691908,4.156078
...,...,...,...
1468016,D5n8XvQqC_vzOCi63wNChQ,11099864692,10.745030
1468017,C8KFRgrMtc_naR6I7yWytg,9077244443,21.572610
1468018,C8KFRgrMtc_naR6I7yWytg,2676917087,21.296754
1468019,C8KFRgrMtc_naR6I7yWytg,2673195412,21.301340


# Download Time

In [None]:
reviews.to_csv('reviews.csv', index = False)

In [None]:
restaurants.to_csv('restaurants.csv', index = False)

In [None]:
close_rests_df.to_csv('close_rests.csv', index = False)

In [None]:
t_map.to_csv('t_map.csv', index = False)

In [None]:
reviews.to_csv('reviews.zip', index = False, compression='gzip')