In [None]:
# all necessary imports
import json
import csv
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# Load the raw venue dataset (JSON) into the working DataFrame `df`.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs as-is on a machine where the file exists at exactly this location.
dataset_path = '/Users/jyotit-kaushal/github/boozeless-analytics/data/singapore_geo_dataset.json'
df = pd.read_json(dataset_path)

In [None]:
# Print a summary of the freshly loaded DataFrame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Rank venues with a Bayesian-style weighted rating: each venue's average
# rating is blended with a prior mean, and the prior's pull weakens as the
# venue's own review count grows.

# Prior mean `r`: mean rating over venues that have at least 100 reviews.
df_temp1 = df.loc[df['review_count'] >= 100]
r = df_temp1['average_rating'].mean()

# Prior weight `w`: one fifth of the mean review count over all venues.
w = df['review_count'].mean() / 5

# (prior mass + observed rating mass) / (prior weight + observed review count)
bayesian_weighted_rating = (
    (r * w) + (df['average_rating'] * df['review_count'])
) / (w + df['review_count'])
df['bayesian_weighted_rating'] = bayesian_weighted_rating



In [None]:
# Impute missing `bayesian_weighted_rating` values: fit a random-forest
# regressor on the venues that have a rating, using label-encoded categorical
# features, then predict the rating for the venues that do not.
categorical_columns= ['region', 'neighborhood', 'price_point_bucket', 'platform_category', 'venue_segment']

# Work on a copy so the integer encoding never leaks into `df`.
df_temp2= df.copy()

# LabelEncoder is refit for every column (fit_transform discards prior state).
encoder = LabelEncoder()
for col in categorical_columns:
    df_temp2[col] = encoder.fit_transform(df_temp2[col])

# One mask drives the train/test split AND the final write-back, so the
# predictions are guaranteed to line up with the rows they were made for.
missing_rating_mask = df_temp2['bayesian_weighted_rating'].isna()
train_data = df_temp2[~missing_rating_mask]
test_data = df_temp2[missing_rating_mask]

X_train = train_data[categorical_columns]
y_train = train_data['bayesian_weighted_rating']
X_test = test_data[categorical_columns]

# Fixed seed so the imputed values are reproducible across runs.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# NOTE: both metrics are computed on the training rows themselves, so they
# describe goodness of fit, not how well the model generalises.
pred_y_train= model.predict(X_train)
r2= r2_score(y_train, pred_y_train)

print("R2 Score:", r2)

mse = mean_squared_error(y_train, pred_y_train)
print("Mean Squared Error:", mse)

# scikit-learn refuses to predict on zero rows, so skip the step entirely
# when every venue already has a rating.
if len(X_test) > 0:
    missing_review_predictions = model.predict(X_test)
    df.loc[missing_rating_mask, 'bayesian_weighted_rating'] = missing_review_predictions
else:
    missing_review_predictions = []


In [None]:

# Flag venues that are busy at night.
#
# For each day in a venue's `busy_times` dict, take the first hourly value
# ("midnight") and the last seven hourly values ("nighttime"), average them
# over all days, and label the venue "Yes" when that average exceeds the
# threshold, "No" otherwise. Venues without busy-time data get None.
# NOTE(review): this presumes each day's hourly values are stored in
# chronological order starting at midnight — confirm against the raw data.
NIGHTTIME_BUSY_THRESHOLD = 40

busy_during_nighttime = []

for busy_time_dict in df['busy_times']:
    # Missing entries may be None, {} or NaN. NaN is truthy, so check the
    # type explicitly instead of relying on truthiness alone.
    if not isinstance(busy_time_dict, dict) or not busy_time_dict:
        busy_during_nighttime.append(None)
        continue

    times = []
    for set_times in busy_time_dict.values():
        hourly_values = list(set_times.values())
        times.extend(hourly_values[:1])    # midnight
        times.extend(hourly_values[-7:])   # last seven hours of the day

    # Divide by the number of values actually collected rather than a
    # hard-coded 8 * 6, so venues with a different number of days (or hours)
    # are averaged correctly. With no values at all the average is taken as 0.
    average_busyness = sum(times) / len(times) if times else 0

    if average_busyness > NIGHTTIME_BUSY_THRESHOLD:
        busy_during_nighttime.append("Yes")
    else:
        busy_during_nighttime.append("No")

df['busy_during_nighttime']=busy_during_nighttime

# Show a compact tally of the labels instead of dumping one entry per venue.
print(df['busy_during_nighttime'].value_counts(dropna=False))

In [None]:
# Display (rows, columns) after adding the `busy_during_nighttime` column.
df.shape

In [None]:
# Snapshot of `df` taken before the amenity columns are added.
df_temp3 = df.copy()

# Flatten the nested `venue_amenities` dict into one DataFrame column per
# amenity category. Each pair is (output column in df, key inside the dict);
# the order below is the order in which the columns are appended to `df`.
amenity_columns = [
    ('atmosphere', 'atmosphere'),
    ('crowd', 'crowd'),
    ('dining_options', 'dining_options'),
    ('happy_hour', 'happy_hour'),
    ('highlights', 'highlights'),
    ('offerings', 'offerings'),
    ('payments', 'payments'),
    ('planning', 'planning'),
    ('types_of_alcohol', 'types_of_alcohol'),
    ('ordering_options', 'ordering_options'),
    # Stored as `amenities_cat` to distinguish it from the parent dict.
    ('amenities_cat', 'amenities'),
    # The lookup key is spelled 'accessability'; presumably that matches the
    # raw data (verify). The output column uses the correct spelling.
    ('accessibility', 'accessability'),
]


def extract_amenity(amenities, key):
    """Return ``amenities[key]``, or None when it is unavailable.

    Args:
        amenities: One venue's ``venue_amenities`` entry. Anything that is
            not a dict (None, NaN, ...) is treated as "no data".
        key: Amenity category to look up.

    Returns:
        The value stored under ``key``, or None if the entry is not a dict
        or does not contain ``key``.
    """
    if isinstance(amenities, dict):
        return amenities.get(key)
    return None


for column, key in amenity_columns:
    df[column] = [extract_amenity(amenities, key) for amenities in df['venue_amenities']]




In [None]:
# Drop the raw columns that the earlier cells replaced with derived features:
# `average_rating` -> `bayesian_weighted_rating`, `busy_times` ->
# `busy_during_nighttime`, `venue_amenities` -> the per-category amenity columns.
# ('venue_amenities' was previously listed twice; once is enough.)
df = df.drop(columns=['average_rating', 'busy_times', 'venue_amenities'])

In [None]:
# Impute missing `price_point_bucket` values: fit a random-forest classifier
# on venues with a known bucket and predict the bucket for the rest.
categorical_columns= ['region', 'neighborhood', 'platform_category', 'venue_segment', 'bayesian_weighted_rating']

# Work on a copy so the integer encoding never leaks into `df`.
df_temp4= df.copy()

# LabelEncoder is refit for every column (fit_transform discards prior state).
encoder = LabelEncoder()
for col in categorical_columns:
    df_temp4[col] = encoder.fit_transform(df_temp4[col])

# Record which venues lack a price bucket BEFORE encoding. The encoder turns
# a missing value into just another integer code, and which code that is
# depends on how many distinct buckets are present, so comparing against a
# hard-coded 4 is fragile.
missing_price_mask = df_temp4['price_point_bucket'].isna()

# Encode the target, then blank out the originally-missing rows again. The
# float dtype lets the column hold NaN without an implicit dtype change.
# The later `change_to_symbols` step assumes codes 0-3 correspond to
# '$', '$$', '$$$', '$$$$' (the encoder's sorted class order).
encoded_price = encoder.fit_transform(df_temp4['price_point_bucket'])
df_temp4['price_point_bucket'] = pd.Series(
    encoded_price, index=df_temp4.index, dtype=float
).where(~missing_price_mask)

train_data = df_temp4[~missing_price_mask]
test_data = df_temp4[missing_price_mask]

X_train = train_data[categorical_columns]
y_train = train_data['price_point_bucket']
X_test = test_data[categorical_columns]

# Fixed seed so the imputed buckets are reproducible across runs.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# NOTE: all metrics below are computed on the training rows themselves.
# R2 and MSE treat the bucket codes as numbers; accuracy is the natural
# metric for a classifier, so it is reported as well.
pred_y_train= model.predict(X_train)
r2= r2_score(y_train, pred_y_train)

print("R2 Score:", r2)

mse = mean_squared_error(y_train, pred_y_train)
print("Mean Squared Error:", mse)

train_accuracy = (pred_y_train == y_train).mean()
print("Training accuracy:", train_accuracy)

# scikit-learn refuses to predict on zero rows, so skip prediction when no
# venue is missing a price bucket.
if len(X_test) > 0:
    missing_pricepoint_predictions = model.predict(X_test)
else:
    missing_pricepoint_predictions = []


def change_to_symbols(lst):
    """Translate encoded price-bucket codes into dollar-sign labels.

    Codes 0, 1, 2 and 3 become '$', '$$', '$$$' and '$$$$' respectively.
    Any value that is not one of those codes is passed through unchanged.

    Args:
        lst: Iterable of codes (ints, or floats comparing equal to them).

    Returns:
        A new list with one entry per input element, in the same order.
    """
    code_to_symbol = {0: '$', 1: '$$', 2: '$$$', 3: '$$$$'}
    return [code_to_symbol.get(code, code) for code in lst]

# Convert the predicted bucket codes to '$' symbols and write them into the
# rows the predictions were made for.
missing_pricepoint_predictions= change_to_symbols(missing_pricepoint_predictions)

# Target `test_data.index` (the exact rows fed to the model) rather than
# recomputing a null mask on `df`: the lengths then always match, and
# re-running this cell does not fail once the gaps have been filled.
# Nothing is assigned when there were no missing rows.
if len(missing_pricepoint_predictions) > 0:
    df.loc[test_data.index, 'price_point_bucket'] = missing_pricepoint_predictions

In [None]:
# Cast `review_sample` to plain strings.
# NOTE(review): astype(str) may also stringify missing values (e.g. to
# 'nan' / 'None') depending on the pandas version — confirm this is intended.
df['review_sample']= df['review_sample'].astype(str)

In [None]:
# Display (rows, columns) of the processed DataFrame.
df.shape

In [None]:
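# Optional export (currently disabled): uncomment the line below to write the
# processed dataset to CSV.
# df.to_csv("/Users/jyotit-kaushal/github/boozeless-analytics/data/singapore_processed_dataset.csv", index= False)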

In [None]:
# Count venues per `busy_during_nighttime` label ('size' = rows per group).
pivot_table_nightbusy = df.pivot_table(index='busy_during_nighttime', aggfunc='size', fill_value=0)

print(pivot_table_nightbusy)

In [None]:
# Count venues per price bucket ('size' = rows per group).
pivot_table_pricepoint = df.pivot_table(index='price_point_bucket', aggfunc='size', fill_value=0)

print(pivot_table_pricepoint)

In [None]:
# Count venues per venue segment ('size' = rows per group).
pivot_table_venueseg = df.pivot_table(index='venue_segment', aggfunc='size', fill_value=0)

print(pivot_table_venueseg)

In [None]:
# Count venues per venue sub-segment ('size' = rows per group).
pivot_table_venuesubseg = df.pivot_table(index='venue_subsegment', aggfunc='size', fill_value=0)

print(pivot_table_venuesubseg)

In [None]:
# Bucket venues by review count, then count venues per bucket.
# NOTE(review): the helper column is named `review_count_10000`, but
# `round(-3)` rounds to the nearest 1,000 — confirm which was intended.
df['review_count_10000'] = df['review_count'].round(-3)

pivot_table_reviewcnt = df.pivot_table(index='review_count_10000', aggfunc='size', fill_value=0)

print(pivot_table_reviewcnt)