### Yelp Datasets Preprocessing

In [1]:
import pickle
import pandas as pd
import numpy as np
import math, itertools
from scipy.spatial.distance import euclidean
import geopy.distance
from sklearn.preprocessing import MinMaxScaler

In [2]:
def compute_similarity_matrix(X):
    """Build a symmetric pairwise similarity matrix from the rows of ``X``.

    The similarity of rows ``i`` and ``j`` is ``exp(-d)``, where ``d`` is the
    Euclidean distance between them, so a row compared with an identical row
    scores 1 and the score decays towards 0 as rows move apart.

    Parameters
    ----------
    X : 2-D array
        One feature vector per row; must expose a two-element ``.shape``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_rows, n_rows)`` with ``M[i, j] == M[j, i]``.
    """
    row_count, _ = X.shape
    similarities = np.full((row_count, row_count), np.nan)

    # Walk the upper triangle (diagonal included) and mirror each value
    # into the lower triangle.
    for a in range(row_count):
        for b in range(a, row_count):
            score = math.exp(-euclidean(X[a], X[b]))
            similarities[a, b] = score
            similarities[b, a] = score

    return similarities

def binarize_feature_matrix(df):
    """One-hot encode the categorical business attributes and drop raw columns.

    Each categorical column is expanded with ``pd.get_dummies`` and the
    resulting indicator columns are placed in front of the original columns.
    The raw categorical columns, the identifier and the location columns are
    then removed so that only model features remain.

    Fixes relative to the previous version: the Wi-Fi indicators used the
    ``price`` prefix (copy-paste of the Price_Range line) and are now named
    ``wifi_*``; the misspelled ``alochol`` prefix is now ``alcohol``.
    Column order and values are unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column listed in ``del_columns`` below.

    Returns
    -------
    pandas.DataFrame
        New frame with indicator columns followed by the remaining original
        columns; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If a required column is missing from ``df``.
    """
    # (source column, indicator prefix), listed in final left-to-right order.
    encodings = [
        ("Wi_Fi", "wifi"),
        ("Price_Range", "price"),
        ("Alcohol", "alcohol"),
        ("Attire", "attire"),
        ("Noise Level", "noise"),
        ("state", "state"),
    ]
    indicator_frames = [
        pd.get_dummies(df[column], prefix=prefix) for column, prefix in encodings
    ]
    df_encoded = pd.concat(indicator_frames + [df], axis=1)

    del_columns = ["city","state","Noise Level","Attire","Alcohol","Price_Range","Wi_Fi","business_id","latitude","longitude"]
    df_processed = df_encoded.drop(del_columns, axis=1)

    return df_processed

def _extract_city_features(df, city):
    """Return ``(city_df, scaled_features)`` for the rows of ``df`` in ``city``."""
    # Non-inplace reset_index yields a fresh frame instead of mutating a
    # boolean-filtered slice in place.
    city_df = df[df['city'] == city].reset_index(drop=True)
    print(city + " dataset size:", city_df.shape)
    # A scaler is fitted per city, so each city's features are scaled to
    # [0, 1] using only that city's own minima and maxima.
    scaled = MinMaxScaler().fit_transform(binarize_feature_matrix(city_df))
    return city_df, scaled


def preprocess_feature_matrix(df):
    """Clean the raw Yelp frame and build per-city feature matrices.

    Steps: keep businesses with more than 5 reviews, drop columns that are
    not used as features, drop every row that still contains a missing value,
    then for Las Vegas and Phoenix separately one-hot encode the categorical
    columns and min-max scale the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw Yelp business table; must contain ``review_count``, ``city`` and
        every column named in ``del_columns`` below.

    Returns
    -------
    tuple
        ``(lasvegas_df, lasvegas_scaled, phoenix_df, phoenix_scaled)`` where
        the ``*_df`` items are the cleaned, re-indexed per-city frames and the
        ``*_scaled`` items are the arrays returned by
        ``MinMaxScaler.fit_transform``, row-aligned with those frames.
    """
    df = df[df['review_count'] > 5]
    del_columns = [
        "full_address", "name", "review_count", "type",
        "Sunday_Open", "Sunday_Close", "Monday_Open", "Monday_Close",
        "Tuesday_Open", "Tuesday_Close", "Wednesday_Open", "Wednesday_Close",
        "Thursday_Open", "Thursday_Close", "Friday_Open", "Friday_Close",
        "Saturday_Open", "Saturday_Close",
        "Music_Background_Music", "Music_Jukebox", "Music_Live", "Music_Video",
        "Music_Karaoke", "Music_DJ", "Coat_Check",
        "Corkage", "BYOB", "Smoking", "Good_for_Dancing", "Happy_Hour",
        "Caters", "Drive-Thru",
    ]

    # Rows with any remaining missing value are discarded outright.
    df_processed_nona = df.drop(del_columns, axis=1).dropna()

    yelp_lasvegas_df, yelp_lasvegas_df_binarized_scaled = _extract_city_features(
        df_processed_nona, 'Las Vegas')
    yelp_phoenix_df, yelp_phoenix_df_binarized_scaled = _extract_city_features(
        df_processed_nona, 'Phoenix')

    return yelp_lasvegas_df, yelp_lasvegas_df_binarized_scaled, yelp_phoenix_df, yelp_phoenix_df_binarized_scaled


In [3]:
#Read yelp rating data
data_path = 'raw_data/yelp.csv'
yelp_df = pd.read_csv(data_path)

(yelp_lasvegas_df, yelp_lasvegas_df_binarized_scaled,
 yelp_phoenix_df, yelp_phoenix_df_binarized_scaled) = preprocess_feature_matrix(yelp_df)

# Business ids are kept in row order so they line up with the matrices below.
lasvegas_business_list = yelp_lasvegas_df["business_id"].tolist()
phoenix_business_list = yelp_phoenix_df["business_id"].tolist()
print("Computed feature matrices for Yelp data for Las Vegas and Phoenix.")

yelp_lasvegas_similarity_matrix = compute_similarity_matrix(yelp_lasvegas_df_binarized_scaled)
yelp_phoenix_similarity_matrix = compute_similarity_matrix(yelp_phoenix_df_binarized_scaled)
print("Computed similarity matrices for Yelp data for Las Vegas and Phoenix.")

#Compute costs based on distance from city center
# City-center coordinates as (latitude, longitude). Only Las Vegas and Phoenix
# are used in this notebook; the other entries are kept for reference.
city_coordinates = {
    'Las Vegas': (36.166635, -115.147590),
    'Phoenix': (33.451618, -112.074267),
    'Charlotte': (35.225896, -80.843871),
    'Pittsburgh': (40.441525, -79.999641),
    'Scottsdale': (33.495454, -111.925564),
}


def distance_to_center_km(df, center):
    """Return, per row of ``df``, the distance in km from (latitude, longitude) to ``center``."""
    return df.apply(
        lambda row: geopy.distance.distance((row['latitude'], row['longitude']), center).km,
        axis=1,
    )


yelp_phoenix_df['cost'] = distance_to_center_km(yelp_phoenix_df, city_coordinates['Phoenix'])
yelp_lasvegas_df['cost'] = distance_to_center_km(yelp_lasvegas_df, city_coordinates['Las Vegas'])

yelp_phoenix_costs = yelp_phoenix_df['cost'].tolist()
yelp_lasvegas_costs = yelp_lasvegas_df['cost'].tolist()
print("Computed costs for Yelp data for Las Vegas and Phoenix.")

Las Vegas dataset size: (3103, 57)
Phoenix dataset size: (1849, 57)
Computed feature matrices for Yelp data for Las Vegas and Phoenix.
Computed similarity matrices for Yelp data for Las Vegas and Phoenix.
Computed costs for Yelp data for Las Vegas and Phoenix.


In [4]:
def compute_geo_distance_matrix(df):
    """Return the symmetric matrix of pairwise distances between rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``latitude`` and ``longitude`` columns.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(df), len(df))``. Entry ``[i, j]`` is the
        ``.km`` value of ``geopy.distance.distance`` between rows ``i`` and
        ``j``; the diagonal is 0.
    """
    points = list(zip(df['latitude'].tolist(), df['longitude'].tolist()))
    size = len(points)
    distances = np.zeros((size, size), dtype=float)

    # Each unordered pair is measured once and written to both triangles.
    for (a, point_a), (b, point_b) in itertools.combinations(enumerate(points), 2):
        km = geopy.distance.distance(point_a, point_b).km
        distances[a, b] = km
        distances[b, a] = km

    return distances

# Pairwise business-to-business distance matrices (km), one per city.
# Every pair of rows is measured individually, so the work grows
# quadratically with the number of businesses in the city.
yelp_lasvegas_graphmat = compute_geo_distance_matrix(yelp_lasvegas_df)
yelp_phoenix_graphmat = compute_geo_distance_matrix(yelp_phoenix_df)
print("Computed graph distance matrices (km) for Yelp data for Las Vegas and Phoenix.")

Computed graph distance matrices (km) for Yelp data for Las Vegas and Phoenix.


In [6]:
#Save Yelp data
save_path = 'pickled_data/yelp/'

# (file name, object to pickle), in the order the files are written.
# NOTE(review): 'yelp_phoneix_ids.pkl' is spelled differently from the other
# 'yelp_phoenix_*' files; it is kept as-is because the code that reads these
# files is not visible here — confirm before renaming.
yelp_artifacts = [
    ('yelp_vegas_sim.pkl', yelp_lasvegas_similarity_matrix),
    ('yelp_vegas_ids.pkl', lasvegas_business_list),
    ('yelp_vegas_costs.pkl', yelp_lasvegas_costs),
    ('yelp_vegas_graphMat.pkl', yelp_lasvegas_graphmat),
    ('yelp_phoenix_sim.pkl', yelp_phoenix_similarity_matrix),
    ('yelp_phoneix_ids.pkl', phoenix_business_list),
    ('yelp_phoenix_costs.pkl', yelp_phoenix_costs),
    ('yelp_phoenix_graphMat.pkl', yelp_phoenix_graphmat),
]

for file_name, artifact in yelp_artifacts:
    with open(save_path + file_name, 'wb') as f:
        pickle.dump(artifact, f)