In [None]:
# all necessary imports
import json
import csv
import pandas as pd

from transformers import pipeline
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import LongformerTokenizer, LongformerForSequenceClassification, EncoderDecoderModel, LongformerConfig, LongformerTokenizerFast
import numpy as np
import matplotlib.pyplot as plt
from textblob import TextBlob

In [None]:
# Load the dataset from a CSV file into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; the notebook only
# runs elsewhere if this location is adjusted.
df= pd.read_csv('/Users/jyotit-kaushal/github/boozeless-analytics/data/singapore_processed_dataset.csv')

In [None]:
# Summarise the loaded frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Number of rows per venue_segment value.
pivot_table_venueseg = df.pivot_table(
    index='venue_segment',
    aggfunc='size',
    fill_value=0,
)

print(pivot_table_venueseg)

In [None]:
# Restrict df to rows whose venue_segment is one of the listed segments.
# NOTE(review): despite its name, `excluded_segments` is used with `isin`, so
# these segments are the ones RETAINED, not removed — confirm this is intended.
excluded_segments = ["Bar", "Entertainment", "Non Alcoholic", "Restaurant"]
df = df[df["venue_segment"].isin(excluded_segments)]

df.shape

In [None]:
# Keep only rows whose bayesian_weighted_rating is at least 4.
df = df[df['bayesian_weighted_rating'] >= 4]
df.shape

In [None]:
# Number of rows per price_point_bucket value.
pivot_table_pricepoint = df.pivot_table(
    index='price_point_bucket',
    aggfunc='size',
    fill_value=0,
)

print(pivot_table_pricepoint)

In [None]:
# Restrict df to rows whose price_point_bucket is '$$$' or '$$$$'.
# NOTE(review): despite its name, `excluded_price_points` is used with `isin`,
# so these buckets are the ones RETAINED — confirm this is intended.
excluded_price_points= ['$$$', '$$$$']
df= df[df['price_point_bucket'].isin(excluded_price_points)]

df.shape

In [None]:
# df.to_csv("/Users/jyotit-kaushal/github/boozeless-analytics/data/singapore_processed_dataset_1761.csv", index= False)

In [None]:
# Running score accumulator: one entry per row of df, all starting at 0.
boozeless_rating= [0]*len(df)
def normalize_list(lst):
    """Min-max scale the values of ``lst`` into the range [0, 1].

    Args:
        lst: Sequence of numbers.

    Returns:
        A new list holding ``(x - min) / (max - min)`` for every element.
        An empty input yields an empty list, and an input whose values are all
        equal yields a list of ``0.0`` (there is no spread to scale by) instead
        of raising ``ValueError`` / ``ZeroDivisionError``.
    """
    values = list(lst)
    if not values:
        return []

    min_val = min(values)
    span = max(values) - min_val
    if span == 0:
        # Every value is identical: dividing by the zero range would raise.
        return [0.0] * len(values)

    normalized_list = [(x - min_val) / span for x in values]
    return normalized_list


In [None]:
# Score each row by the alcohol categories it lists: one point when
# 'hard_liquor' appears in types_of_alcohol and one point when 'cocktails' does.

toa_index = df.columns.get_loc('types_of_alcohol')

for row_pos, alcohol_types in enumerate(df.iloc[:, toa_index]):
    # Cells that are not plain strings contribute nothing.
    if type(alcohol_types) != str:
        continue
    for category in ('hard_liquor', 'cocktails'):
        if category in alcohol_types:
            boozeless_rating[row_pos] += 1


# Rescale the accumulated points with normalize_list.
boozeless_rating = normalize_list(boozeless_rating)
print(boozeless_rating)

In [None]:
# Collect the review_sample value of every row as a string.
# NOTE(review): non-string cells are converted with str(), so they become
# non-empty strings rather than being skipped — confirm this is intended.
reviews= df['review_sample'].tolist()
X_train= [str(value) for value in reviews]

# Candidate labels handed to the zero-shot classifier.
topics= ['Ambience', 'Menu', 'Venue', 'Cocktails', 'Health', 'Food', 'Special Occasion', 'A topic different from innovative food, menu, cocktail related']
model = 'MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7'

# NOTE(review): this Longformer tokenizer is NOT passed to the pipeline created
# on the next line (that one receives `tokenizer=model`); in this notebook it is
# only used by the later sentiment-analysis pipeline.
tokenizer = LongformerTokenizerFast.from_pretrained("allenai/longformer-base-4096", max_length=512, padding=True, truncation=True)
classifier = pipeline(task='zero-shot-classification', model=model, tokenizer=model)

In [None]:
# Print every review string for inspection (output grows with the dataset).
print(X_train)

In [None]:
# Zero-shot classify every review and store its highest-scoring topic, keyed by
# the review text. Reviews for which anything in the step raises map to NaN.
resultingclassification = {}
count = 0
for review_text in X_train:
    try:
        prediction = classifier(review_text, topics, truncation=True)
        candidate_labels, candidate_scores = prediction['labels'], prediction['scores']
        top_label = candidate_labels[candidate_scores.index(max(candidate_scores))]

        # Progress counter: number of reviews classified successfully so far.
        count += 1
        print(count)
        resultingclassification[review_text] = top_label
    except Exception:
        resultingclassification[review_text] = np.nan

# print(resultingclassification)

In [None]:
# Award one point to every review whose predicted topic is one of the topics
# of interest, then fold that into the running boozeless_rating.
relevant_topics = ('Ambience', 'Menu', 'Venue', 'Cocktails', 'Health', 'Special Occasion')

# Look each review up by its own text instead of walking the dict in order:
# resultingclassification is keyed by review text, so duplicate reviews share a
# single entry and the dict can be shorter than X_train. Building the list from
# X_train keeps every score aligned with its row position.
topic_classification_list = [
    1 if resultingclassification.get(review_text) in relevant_topics else 0
    for review_text in X_train
]

# print(topic_classification_list)
# NOTE: this rebinds boozeless_rating from its previous value, so re-running
# the cell adds the topic score again.
boozeless_rating = [x + y for x, y in zip(boozeless_rating, topic_classification_list)]
boozeless_rating = normalize_list(boozeless_rating)
print(boozeless_rating)

In [None]:
# sentiment analysis of reviews

model1 = AutoModelForSequenceClassification.from_pretrained('adam-chell/tweet-sentiment-analyzer')

# NOTE(review): `tokenizer` is the Longformer tokenizer created in an earlier
# cell, not one loaded from the sentiment checkpoint — confirm they are compatible.
# NOTE: this rebinds `classifier`, replacing the zero-shot pipeline.
classifier = pipeline('sentiment-analysis', model=model1, tokenizer=tokenizer)

# Raw pipeline output per distinct review text; NaN when inference raises
# (deliberate best-effort so one bad review does not stop the run).
resultinglabels1 = {}
# print(len(X_train))
for s in X_train:
    try:
        resultinglabels1[s] = classifier(str(s), truncation=True)
    except Exception:
        resultinglabels1[s] = np.nan

print(resultinglabels1)

# Build labels1/scores1 by walking X_train rather than the dict: the dict is
# keyed by review text, so duplicate reviews collapse into one entry and
# iterating it would yield lists shorter than, and misaligned with, X_train.
labels1 = []
scores1 = []
for s in X_train:
    value = resultinglabels1[s]
    try:
        first_result = value[0]
        label, score = first_result['label'], first_result['score']
    except (TypeError, IndexError, KeyError):
        # NaN placeholder or unexpected output shape: record both as missing so
        # the two lists always stay the same length.
        label, score = np.nan, np.nan
    labels1.append(label)
    scores1.append(score)






In [None]:
# NOTE(review): despite its name, `reviews_index` holds the position of the
# 'types_of_alcohol' column; it is not referenced again in the visible cells.
reviews_index = df.columns.get_loc('types_of_alcohol')
# Keywords whose occurrences are counted in each review.
words= ['Ambience', 'Environment', 'Menu', 'Venue', 'Cocktails', 'Unique', 'Health', 'Lifestyle', 'Food', 'Innovative', 'Special Occasion', 'Interesting']

# One keyword-hit counter per review, starting at 0.
word_relevancy_check= [0]*len(X_train)
print(len(word_relevancy_check))


def count_occurences(sentence, words):
    """Count how often any entry of ``words`` appears in ``sentence``.

    The sentence is split into TextBlob word tokens and compared in lower
    case. Entries made of several words (e.g. 'Special Occasion') are matched
    as a run of consecutive tokens; comparing them against single tokens could
    never produce a hit.

    Args:
        sentence: Text to search.
        words: Iterable of keywords or multi-word phrases.

    Returns:
        Total number of matches, summed over all entries of ``words``.
    """
    tokens = [token.lower() for token in TextBlob(sentence).words]
    total_count = 0
    for word in words:
        target = word.lower().split()
        width = len(target)
        if width == 0:
            # Blank keyword: nothing to look for.
            continue
        # Slide a window of `width` tokens over the text; for a single-word
        # keyword this is an ordinary per-token count.
        total_count += sum(
            tokens[start:start + width] == target
            for start in range(len(tokens) - width + 1)
        )
    return total_count


# Add each non-empty review's keyword hit count to its slot, then rescale the
# counts with normalize_list.
for position, review_text in enumerate(X_train):
    if not review_text:
        continue
    word_relevancy_check[position] += count_occurences(review_text, words)

word_relevancy_check = normalize_list(word_relevancy_check)
print(word_relevancy_check)

In [None]:
# Add the keyword-relevancy score to the running rating element-wise and
# rescale. Re-running this cell adds the keyword score again.
rating_plus_relevancy = (
    rating + relevancy
    for rating, relevancy in zip(boozeless_rating, word_relevancy_check)
)
boozeless_rating = normalize_list(list(rating_plus_relevancy))
print(boozeless_rating)

In [None]:
# Flag rows marked busy at night: 1 when busy_during_nighttime is exactly
# "Yes", otherwise 0.
buzynigghttime_index = df.columns.get_loc('busy_during_nighttime')
buzylist = [
    1 if (busy_flag and busy_flag == "Yes") else 0
    for busy_flag in df.iloc[:, buzynigghttime_index]
]

print(buzylist)


In [None]:
# Add the busy-at-night flag to the running rating element-wise and rescale.
# Re-running this cell adds the flag again.
rating_plus_busy = (
    rating + busy
    for rating, busy in zip(boozeless_rating, buzylist)
)
boozeless_rating = normalize_list(list(rating_plus_busy))
print(boozeless_rating)


In [None]:
# Attach the final score as a new column. `df` was produced by boolean-mask
# filtering, so setting a column on it in place can trigger pandas'
# SettingWithCopyWarning; assign() returns a fresh frame with the column added.
df = df.assign(boozeless_rating=boozeless_rating)

In [None]:
# Order rows from highest to lowest boozeless_rating.
sorted_boozeless= df.sort_values(by= 'boozeless_rating', ascending=False)

In [None]:
# Show the 25 highest-scoring rows.
sorted_boozeless.head(25)

In [None]:
# Write the ranked frame to CSV without the index column.
# NOTE(review): absolute, machine-specific output path.
sorted_boozeless.to_csv("/Users/jyotit-kaushal/github/boozeless-analytics/data/restaurants_sorted_boozeless.csv", index= False)

In [None]:
# Write the same ranked frame to an Excel workbook without the index column.
# NOTE(review): absolute, machine-specific output path.
sorted_boozeless.to_excel("/Users/jyotit-kaushal/github/boozeless-analytics/data/restaurants_sorted_boozeless.xlsx", index= False)