# Get Data

In [1]:
# Mount Google Drive at /content/drive so the dataset paths used below resolve (Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import json
import os.path

In [3]:
# Tourist-attraction table exported from OpenStreetMap.
# low_memory=False makes pandas infer each column's dtype from the whole file instead of chunk by chunk.
# NOTE(review): the recorded run echoed this line in a warning, presumably pandas' mixed-dtype DtypeWarning — confirm.
t_map = pd.read_csv('/content/drive/MyDrive/CSVs/OpenStreetMap_Tourist_Attractions_for_North_America.csv', low_memory = False)

  t_map = pd.read_csv('/content/drive/MyDrive/CSVs/OpenStreetMap_Tourist_Attractions_for_North_America.csv')


In [4]:
# Columns to keep from each business record.
bcols = ['business_id', 'name', 'address', 'review_count', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'stars', 'is_open', 'categories']
filename = '/content/drive/MyDrive/yelp_dataset/yelp_academic_dataset_business.json'

# The file holds one JSON document per line; each becomes one row restricted to bcols.
# The encoding is spelled out so parsing does not depend on the platform default.
with open(filename, encoding = 'utf-8') as f:
  data = [[doc[bcol] for bcol in bcols] for doc in map(json.loads, f)]

businesses = pd.DataFrame(data = data, columns = bcols)

In [5]:
# Columns to keep from each review record.
rcols = ['review_id', 'business_id', 'stars', 'text']
filename = '/content/drive/MyDrive/yelp_dataset/yelp_academic_dataset_review.json'

# The file holds one JSON document per line; each becomes one row restricted to rcols.
# The encoding is spelled out so parsing does not depend on the platform default.
with open(filename, encoding = 'utf-8') as f:
  data = [[doc[rcol] for rcol in rcols] for doc in map(json.loads, f)]

reviews = pd.DataFrame(data = data, columns = rcols)

# Business Preprocessing

In [5]:
#Isolate to restaurants only

# Businesses with no category get the placeholder 'Nothing' so the string test below never sees NaN.
businesses['categories'] = businesses['categories'].fillna('Nothing')

# Keep the rows whose category string begins with 'Restaurants'.
# NOTE(review): rows that list 'Restaurants' later in the string are excluded — confirm this is intended.
leads_with_restaurants = businesses['categories'].str.startswith('Restaurants')
restaurants = businesses.loc[leads_with_restaurants]

In [6]:
#Drop closed businesses
# Keep rows flagged is_open == 1; the flag column is then no longer needed.
still_open = restaurants['is_open'] == 1
restaurants = restaurants.loc[still_open].drop(columns = 'is_open')

In [7]:
#Unify addresses
# Resulting layout: "<address>, <city>, <state> <postal_code>"
city_state_zip = restaurants['city'] + ', ' + restaurants['state'] + ' ' + restaurants['postal_code']
restaurants['address'] = restaurants['address'] + ', ' + city_state_zip

In [8]:
#get categories
# Split the comma-separated category string into one column per position.
cats = restaurants['categories'].str.split(',', expand = True)

# Positions 1-3 become cat_1..cat_3 (whitespace trimmed); position 0 is not kept.
for position in (1, 2, 3):
  restaurants['cat_' + str(position)] = cats[position].str.strip()

restaurants = restaurants.drop(columns = 'categories')

In [None]:
# city and state are dropped as separate columns; rows are renumbered from 0.
restaurants = restaurants.drop(columns = ['city', 'state']).reset_index(drop = True)
restaurants

# Review Preprocessing

In [None]:
#Drop reviews not about open restaurants
# Right-join the reviews onto the ids of the remaining restaurants.
open_ids = restaurants[['business_id']]
merged = reviews.merge(open_ids, on = 'business_id', how = 'right')

# NOTE(review): with how='right' every row takes its business_id from `restaurants`, so this dropna looks
# like a no-op, and a restaurant without reviews would remain as a row with empty review fields — confirm intent.
reviews = merged.dropna(subset = ['business_id'])

In [None]:
# Renumber the rows from 0 and discard the previous index.
reviews = reviews.reset_index(drop = True)
reviews

# Map Preprocessing

In [11]:
#Renaming
renames = {
    'X': 'longitude',
    'Y': 'latitude',
    'osm_id2': 'attraction_id',
    'tourism': 'type',
    # Any column already called 'attraction_id' is renamed to 'drop', freeing the name for osm_id2.
    'attraction_id': 'drop',
}
t_map = t_map.rename(columns = renames)

In [12]:
#Drop every column that is not listed in to_keep
to_keep = ['longitude', 'latitude', 'website', 'addr_city', 'addr_country', 'addr_housenumber', 'addr_province', 'addr_postcode', 'addr_state', 'addr_street', 'addr_unit', 'name', 'type', 'attraction_id']

# .copy() gives t_map its own data, so the column assignments in later cells do not
# write into a selection of the original frame (the source of SettingWithCopyWarning).
t_map = t_map[to_keep].copy()

In [13]:
#Unite addresses
feats = ['addr_housenumber', 'addr_street', 'addr_unit', 'addr_city', 'addr_state', 'addr_province', 'addr_country', 'addr_postcode']

# Turn every address component into text; cells that stringify to 'nan' become empty strings.
t_map[feats] = t_map[feats].astype(str).replace('nan', '')

# Trimmed copy of each component (addr_unit is cleaned above but is not part of the address string).
trimmed = {feat: t_map[feat].str.strip() for feat in feats}

# Layout: "<housenumber> <street>, <city>, <state><province> <postcode>, <country>"
street_part = trimmed['addr_housenumber'] + ' ' + trimmed['addr_street']
region_part = trimmed['addr_state'] + trimmed['addr_province'] + ' ' + trimmed['addr_postcode']
t_map['address'] = street_part + ', ' + trimmed['addr_city'] + ', ' + region_part + ', ' + trimmed['addr_country']



In [14]:
#Set unknown if any fields dropped
# An address counts as incomplete when any of its comma-separated fields is blank after trimming.
# The mask is computed in one pass and applied with a single .loc assignment instead of
# walking the frame with iterrows() and writing cell by cell.
has_blank_field = [
  any(field.strip() == '' for field in address.split(','))
  for address in t_map['address']
]
t_map.loc[has_blank_field, 'address'] = 'Unknown'

In [15]:
#Fill website column
# Assign the filled column back instead of calling fillna(inplace = True) on a column selection:
# the in-place form is a chained assignment and is not guaranteed to update t_map itself.
t_map['website'] = t_map['website'].fillna('Unknown')
t_map['website']

0                                   Unknown
1                                   Unknown
2                                   Unknown
3         https://www.misionsurfmexico.com/
4                                   Unknown
                        ...                
265356                              Unknown
265357                              Unknown
265358                              Unknown
265359                              Unknown
265360                              Unknown
Name: website, Length: 265361, dtype: object

In [16]:
#Second round drops
# Remove the individual addr_* columns listed in feats, then discard rows that still contain a missing value.
t_map = t_map.drop(columns = feats).dropna()

In [17]:
# Renumber the rows from 0 (discarding the old index), then display the frame.
t_map = t_map.reset_index(drop = True)
t_map

Unnamed: 0,longitude,latitude,website,name,type,attraction_id,address
0,-92.318096,14.616769,Unknown,Brisas del Mar,viewpoint,5327709923,Unknown
1,-92.355947,14.653835,Unknown,Hotel Playa Linda,hotel,388651468,Unknown
2,-92.240303,14.745767,Unknown,Rancho El Tesoro,camp_site,7883004685,Unknown
3,-92.433990,14.732903,https://www.misionsurfmexico.com/,Misión Surf Mexico,camp_site,7228473785,Unknown
4,-92.284821,14.865885,Unknown,Villas exotica,motel,4794499945,Unknown
...,...,...,...,...,...,...,...
126467,-74.716278,40.152368,Unknown,History,information,11770408378,Unknown
126468,-74.715997,40.152257,Unknown,History,information,11770408379,Unknown
126469,-74.714336,40.153047,Unknown,Deleware Canal Trail Map,information,11770408380,Unknown
126470,-94.206435,38.908874,Unknown,Fangorn Trail,information,11770424833,Unknown


# Distance Calculation

In [None]:
import numpy as np
import geopandas as gpd
from shapely.geometry import Point

In [None]:
# Keep only the id and coordinate columns of each table for the distance work below.
rest_info = restaurants[['business_id', 'longitude', 'latitude']]
t_info = t_map[['attraction_id', 'longitude', 'latitude']]

In [None]:
def haversine(lon1, lat1, lon2, lat2, radius_km = 6378.137):
    """Haversine (great-circle) distance between two points given in decimal degrees.

    Inputs may be scalars or array-likes (NumPy arrays, pandas Series); array
    inputs are processed element-wise.

    Parameters:
        lon1, lat1: longitude and latitude of the first point(s), in degrees.
        lon2, lat2: longitude and latitude of the second point(s), in degrees.
        radius_km: sphere radius in kilometres. The default, 6378.137, is the
            value this notebook has always used.

    Returns:
        The distance(s) in kilometres, shaped like the inputs.
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2

    # Rounding can push `a` marginally above 1 for near-antipodal points, which would
    # make arcsin return NaN; clamp it to the valid upper bound.
    a = np.minimum(a, 1.0)

    c = 2 * np.arcsin(np.sqrt(a))
    return radius_km * c

In [None]:
# Buffer radius drawn around each attraction when matching nearby restaurants.
close_enough = 25 #kilometers

In [None]:
# Point geometries (x = longitude, y = latitude) for restaurants and attractions, declared as EPSG:4326.
gdf_rests = gpd.GeoDataFrame(rest_info, geometry=[Point(xy) for xy in zip(rest_info.longitude, rest_info.latitude)], crs="EPSG:4326")
gdf_tours = gpd.GeoDataFrame(t_info, geometry=[Point(xy) for xy in zip(t_info.longitude, t_info.latitude)], crs="EPSG:4326")

# Reproject to EPSG:3857 so the buffer radius can be given in metres.
# NOTE(review): distances in this projection are stretched away from the equator, so the
# buffer is only approximately close_enough km on the ground — confirm this is acceptable.
gdf_rests = gdf_rests.to_crs("EPSG:3857")
gdf_tours = gdf_tours.to_crs("EPSG:3857")

# Turn every attraction into a disc of radius close_enough (km -> m).
gdf_tours_buffered = gdf_tours.copy()
gdf_tours_buffered.geometry = gdf_tours_buffered.geometry.buffer(close_enough * 1000)

# Pair each restaurant with every attraction disc that contains it.
# `predicate` replaces the deprecated `op` keyword of sjoin.
close_rests = gpd.sjoin(gdf_rests, gdf_tours_buffered, predicate='within')

# Back to a plain DataFrame without the geometry column.
close_rests_df = pd.DataFrame(close_rests.drop(columns='geometry')).reset_index(drop=True)

In [None]:
# Coordinate columns of the joined frame: *_left is the restaurant, *_right the attraction.
coord_cols = ['longitude_left', 'latitude_left', 'longitude_right', 'latitude_right']

close_rests_df = close_rests_df.drop(columns = 'index_right')

# Distance in km for every restaurant/attraction pair; the coordinates are not kept afterwards.
close_rests_df['distance'] = haversine(*[close_rests_df[col] for col in coord_cols])
close_rests_df = close_rests_df.drop(columns = coord_cols)
close_rests_df

# Download Time

In [None]:
# Write the filtered reviews to reviews.csv in the working directory, without the index column.
reviews.to_csv('reviews.csv', index = False)

In [None]:
# Write the preprocessed restaurants to restaurants.csv, without the index column.
restaurants.to_csv('restaurants.csv', index = False)

In [None]:
# Write the restaurant/attraction pairs and their distances to close_rests.csv, without the index column.
close_rests_df.to_csv('close_rests.csv', index = False)

In [None]:
# Write the preprocessed attraction table to t_map.csv, without the index column.
t_map.to_csv('t_map.csv', index = False)

In [None]:
# Write a real zip archive so the contents match the .zip file name
# (previously gzip data was stored under a .zip name). The CSV inside is called reviews.csv.
reviews.to_csv('reviews.zip', index = False, compression = {'method': 'zip', 'archive_name': 'reviews.csv'})

# NLP Experiment

## Model Installation

In [None]:
# Install transformers with the sentencepiece extra into the kernel's environment.
# NOTE(review): the version is not pinned — consider pinning it for reproducibility.
%pip install transformers[sentencepiece]

In [None]:
#Model credit to Heng Yang:
#https://huggingface.co/yangheng/deberta-v3-base-absa-v1.1

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

# Tokenizer and sequence-classification model are both loaded from the same checkpoint.
absa_checkpoint = "yangheng/deberta-v3-base-absa-v1.1"
absa_tokenizer = AutoTokenizer.from_pretrained(absa_checkpoint)
absa_model = AutoModelForSequenceClassification.from_pretrained(absa_checkpoint)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/372 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

In [None]:
from transformers import AutoModelForSequenceClassification

model_name = "yangheng/deberta-v3-base-absa-v1.1"

# The tokenizer and model for this checkpoint are already loaded as absa_tokenizer / absa_model;
# reuse them instead of loading a second copy of the weights.
tokenizer = absa_tokenizer
model = absa_model

# Text-classification pipeline used by the scoring functions below.
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

## Review Sampling Setup

In [None]:
k = 25       # maximum number of reviews sampled per business
mini = 300   # minimum number of reviews a business needs before it is scored
seed = 42    # fixed seed so the sampled reviews are the same on every run

tester = reviews.groupby('business_id', sort = False)

# Draw up to k reviews from every business. Sampling group by group (rather than through
# groupby.apply) keeps the business_id column in the result regardless of pandas version.
samples = [group.sample(n = min(k, len(group)), random_state = seed) for _, group in tester]
paired = pd.concat(samples, ignore_index = True) if samples else reviews.iloc[0:0]

# One DataFrame of sampled reviews per business, in first-appearance order.
groups = paired.groupby('business_id', sort = False)
partition = [group for _, group in groups]

In [None]:
def select_reasonable(lists, refer, min_reviews = None):
    """Key each sampled review frame by business id, keeping only well-reviewed businesses.

    Parameters:
        lists: iterable of DataFrames, each holding the sampled reviews of one
            business (read from its 'business_id' column).
        refer: groupby object over the full review table, keyed by business id;
            used to look up how many reviews each business has in total.
        min_reviews: minimum total review count required. Defaults to the
            notebook-level `mini` when omitted.

    Returns:
        dict mapping business id to its sampled DataFrame, or to None when the
        business has fewer than the required number of reviews. Empty frames
        carry no business id and are skipped.
    """
    threshold = mini if min_reviews is None else min_reviews
    keyed_lists = {}
    for l in lists:
        # Check emptiness first: reading .iloc[0] from an empty frame raises IndexError.
        if len(l) == 0:
            continue
        business_id = l['business_id'].iloc[0]
        if len(refer.get_group(business_id)) >= threshold:
            keyed_lists[business_id] = l
        else:
            keyed_lists[business_id] = None
    return keyed_lists

In [None]:
# Map each business id to its sampled reviews, or to None when it has fewer than `mini` reviews overall.
usable = select_reasonable(partition, tester)

In [None]:
# Number of businesses keyed in `usable`, whether or not they qualified.
len(usable)

9963

In [None]:
# Number of businesses that kept their sampled reviews (i.e. met the review-count minimum).
sum(1 for revs in usable.values() if revs is not None)

677

## Use Model

In [None]:
from tqdm import tqdm

In [None]:
#Scoring parameters
# Aspects the reviews are scored on.
attrs = ['food', 'drink', 'service', 'value']

# One independent, initially empty score list per aspect.
results = {aspect: list() for aspect in attrs}

In [None]:
#Define scoring metric based on predicted valence and confidence score
def adjust_score(d):
    """Convert one classifier prediction into a signed score.

    `d` is a dict with a 'label' entry and, for 'Positive' / 'Negative'
    labels, a 'score' entry. 'Positive' yields +score, 'Negative' yields
    -score, and any other label yields 0.
    """
    label = d['label']
    if label not in ('Positive', 'Negative'):
        return 0
    confidence = d['score']
    return confidence if label == 'Positive' else -confidence

In [None]:
def score_grouped_small(idx, aspect):
  """Score one restaurant on one aspect and append the result to results[aspect].

  Parameters:
    idx: positional row number of the restaurant in `restaurants`.
    aspect: aspect name; passed to the classifier as `text_pair` and used as
      the key into `results`.

  Appends the sum of the signed scores of the restaurant's sampled reviews, or
  None when the restaurant has no usable reviews. Returns nothing.
  """
  bid = restaurants['business_id'].iloc[idx]
  # .get() also covers a business with no entry in `usable`: it is recorded as None,
  # so the result list keeps one entry per restaurant instead of stopping on a KeyError.
  revs = usable.get(bid)
  if revs is None:
    results[aspect].append(None)
    return
  dicts = classifier(revs['text'].tolist(),  text_pair = aspect)
  results[aspect].append(sum(adjust_score(d) for d in dicts))

In [None]:
#Calculate + store score
#Switch aspect as needed
aspect = 'drink'

# Start from an empty list so that re-running this cell for the same aspect replaces
# the scores instead of appending a second copy after the first.
results[aspect] = []

for i in tqdm(range(len(restaurants))):
    score_grouped_small(i, aspect)

100%|██████████| 9963/9963 [1:23:32<00:00,  1.99it/s]


In [None]:
# Save the 'drink' scores as a one-column CSV in the working directory (no index column).
drink_scores = pd.DataFrame(results['drink'], columns = ['drink'])
drink_scores.to_csv('results_drink.csv', index = False)

In [None]:
# Save the 'value' scores as a one-column CSV in the working directory (no index column).
# results['value'] is only filled once the scoring loop has been run with the aspect set to 'value'.
value_scores = pd.DataFrame(results['value'], columns = ['value'])
value_scores.to_csv('results_value.csv', index = False)

In [None]:
# Save the 'food' scores as a one-column CSV in the working directory (no index column).
# results['food'] is only filled once the scoring loop has been run with the aspect set to 'food'.
food_scores = pd.DataFrame(results['food'], columns = ['food'])
food_scores.to_csv('results_food.csv', index = False)

In [None]:
# Save the 'service' scores as a one-column CSV in the working directory (no index column).
# results['service'] is only filled once the scoring loop has been run with the aspect set to 'service'.
service_scores = pd.DataFrame(results['service'], columns = ['service'])
service_scores.to_csv('results_service.csv', index = False)

# Finishing Up

In [None]:
# Load the four per-aspect score sheets from Drive.
# NOTE(review): the cells above write results_*.csv to the working directory, so these files
# were presumably copied to the Scoresheets folder by hand — confirm.
drinks = pd.read_csv('/content/drive/MyDrive/Scoresheets/results_drink.csv')
values = pd.read_csv('/content/drive/MyDrive/Scoresheets/results_value.csv')
foods = pd.read_csv('/content/drive/MyDrive/Scoresheets/results_food.csv')
services = pd.read_csv('/content/drive/MyDrive/Scoresheets/results_service.csv')

In [None]:
# Place the four score columns side by side (rows are matched by index), then display the result.
total = pd.concat([drinks, values, foods, services], axis = 1)
total

In [None]:
# Write the combined scores to scores.csv in the working directory, without the index column.
total.to_csv('scores.csv', index = False)