```Packages and given datasets```

In [55]:
import pandas as pd
import json
from datetime import datetime

***Restaurant Dataset***

In [56]:
# Load the restaurant JSON dump into a dataframe (rest_df)

mypath = "./Given_Datasets/"
file_name = "restaurant_data.json"
rest_path = f"{mypath}{file_name}"

# Decode the file explicitly as UTF-8 before handing it to pandas
with open(rest_path, encoding="utf8") as rest_file:
    rest_df = pd.read_json(rest_file)

# Column summary; info() writes its report itself and returns None,
# which print() then shows as a trailing "None" line
print(rest_df.info())

# Preview the first five rows of the dataframe
print(rest_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79 entries, 0 to 78
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   results_found  79 non-null     int64 
 1   results_start  79 non-null     int64 
 2   results_shown  79 non-null     int64 
 3   restaurants    79 non-null     object
dtypes: int64(3), object(1)
memory usage: 2.6+ KB
None
   results_found  results_start  results_shown  \
0          29287              1             20   
1           7625              1             20   
2          21776              1             20   
3          16762              1             20   
4          12026              1             20   

                                         restaurants  
0  [{'restaurant': {'R': {'res_id': 18649486}, 'a...  
1  [{'restaurant': {'R': {'res_id': 18707652}, 'a...  
2  [{'restaurant': {'R': {'res_id': 18392725}, 'a...  
3  [{'restaurant': {'R': {'res_id': 58882}, 'apik...  
4  [{'res

***Country-Code Dataset***

In [57]:
# Load the country-code lookup table (Excel workbook) into a dataframe (country_df)
mypath = "./Given_Datasets/"
file_name = "Country-code.xlsx"
CC_path = f"{mypath}{file_name}"

country_df = pd.read_excel(CC_path)

# Column summary followed by a preview of the first ten rows
print(country_df.info())
print(country_df.head(n=10))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Country Code  15 non-null     int64 
 1   Country       15 non-null     object
dtypes: int64(1), object(1)
memory usage: 372.0+ bytes
None
   Country Code       Country
0             1         India
1            14     Australia
2            30        Brazil
3            37        Canada
4            94     Indonesia
5           148   New Zealand
6           162   Phillipines
7           166         Qatar
8           184     Singapore
9           189  South Africa


***Checking all features of a restaurant***

In [58]:
# Function to retrieve all keys from each restaurant
def get_all_keys(dic):
    """Collect every key found in a nested dictionary.

    Visits ``dic`` and every dictionary nested inside it, whether the nested
    dictionary is stored directly as a value or as an element of a list value.
    Lists nested directly inside other lists are not descended into.

    Args:
        dic: Dictionary to inspect.

    Returns:
        A set with the keys of ``dic`` and of every nested dictionary reached.
    """
    found = set()
    pending = [dic]  # dictionaries still waiting to be inspected
    while pending:
        current = pending.pop()
        found.update(current.keys())
        for child in current.values():
            if isinstance(child, dict):
                pending.append(child)
            elif isinstance(child, list):
                # Only dictionary elements of a list are followed
                pending.extend(entry for entry in child if isinstance(entry, dict))
    return found

# Work out which keys every restaurant shares (common_keys) and which keys
# appear on only some restaurants (add_keys).

# None means "no restaurant seen yet". An empty set must not be used for that,
# because an empty intersection is a legitimate intermediate result and would
# otherwise be mistaken for "not started" and re-seeded from the next restaurant.
common_keys = None
add_keys = set()

for rest_list in rest_df["restaurants"]:
    for rest in rest_list:
        rest_keys = get_all_keys(rest)

        if common_keys is None:
            # First restaurant: start from a copy of its keys
            common_keys = set(rest_keys)
        else:
            # Keep only the keys this restaurant also has
            common_keys &= rest_keys

        # Accumulate every key seen on any restaurant
        add_keys |= rest_keys

# No restaurants at all: report an empty set rather than None
if common_keys is None:
    common_keys = set()

# Additional keys are those seen somewhere but not shared by all restaurants
add_keys -= common_keys

print("Common Keys:")
print(common_keys)
print("\nAdditional Keys:")
print(add_keys)

Common Keys:
{'average_cost_for_two', 'rating_color', 'establishment_types', 'aggregate_rating', 'featured_image', 'photos_url', 'has_online_delivery', 'locality', 'cuisines', 'city_id', 'is_delivering_now', 'switch_to_order_menu', 'locality_verbose', 'id', 'user_rating', 'currency', 'url', 'menu_url', 'thumb', 'apikey', 'restaurant', 'deeplink', 'name', 'price_range', 'res_id', 'offers', 'latitude', 'events_url', 'zipcode', 'has_fake_reviews', 'include_bogo_offers', 'address', 'city', 'longitude', 'is_table_reservation_supported', 'rating_text', 'has_table_booking', 'R', 'location', 'country_id', 'votes'}

Additional Keys:
{'photos', 'custom_rating_text', 'photo_id', 'is_editable', 'uuid', 'status', 'event_id', 'text', 'title', 'type', 'offer_id', 'restaurants', 'book_url', 'medio_provider', 'added_by', 'color', 'order_deeplink', 'start_date', 'description', 'event_category', 'order', 'impressions', 'offer_type', 'zomato_events', 'thumb_url', 'md5sum', 'type_code', 'photo', 'is_end_ti

**Q1.** Extract the following fields and store the data as restaurants.csv. <br>
◦   	Restaurant Id<br>
◦   	Restaurant Name<br>
◦   	Country<br>
◦   	City<br>
◦   	User Rating Votes<br>
◦   	User Aggregate Rating (in float)<br>
◦   	Cuisines<br>


In [59]:
print(f"Number of Restaurants per row: {len((rest_df.iloc[0]['restaurants']))}")
print(f"Restaurant example: {list(rest_df['restaurants'][1][0]['restaurant'].keys())}")

# Column order of the extracted table; "Country_id" is a temporary join key
# that a later cell replaces with the country name.
q1_columns = ['Restaurant Id', "Restaurant Name", "Country_id",
              "City", "User Rating Votes",
              "User Aggregate Rating", "Cuisines"]

# Build one record per restaurant across all rows of rest_df
records = []
for rest_list in rest_df["restaurants"]:
    for rest in rest_list:
        curr = rest['restaurant']
        location = curr["location"]
        user_rating = curr["user_rating"]
        records.append({
            'Restaurant Id': curr['id'],
            "Restaurant Name": curr["name"],
            "Country_id": location["country_id"],
            "City": location["city"],
            "User Rating Votes": user_rating["votes"],
            # Q1 asks for the aggregate rating as a float; convert explicitly
            # (as the Q3 cell does) instead of relying on the raw JSON type.
            "User Aggregate Rating": float(user_rating["aggregate_rating"]),
            "Cuisines": curr["cuisines"],
        })

# Passing columns fixes the column order and keeps the frame well-formed
# even when no records were collected.
result = pd.DataFrame(records, columns=q1_columns)



Number of Restaurants per row: 20
Restaurant example: ['R', 'apikey', 'id', 'name', 'url', 'location', 'switch_to_order_menu', 'cuisines', 'average_cost_for_two', 'price_range', 'currency', 'offers', 'opentable_support', 'is_zomato_book_res', 'mezzo_provider', 'is_book_form_web_view', 'book_form_web_view_url', 'book_again_url', 'thumb', 'user_rating', 'photos_url', 'menu_url', 'featured_image', 'has_online_delivery', 'is_delivering_now', 'has_fake_reviews', 'include_bogo_offers', 'deeplink', 'is_table_reservation_supported', 'has_table_booking', 'events_url', 'establishment_types']


In [60]:
# Look up each restaurant's country name: left-join the extracted restaurant
# table (result) with country_df, matching Country_id against Country Code.
# Selecting the output columns afterwards leaves both join keys behind.
result_df = (
    result
    .merge(country_df, how='left', left_on="Country_id", right_on="Country Code")
    .loc[:, ['Restaurant Id', "Restaurant Name", "Country",
             "City", "User Rating Votes",
             "User Aggregate Rating", "Cuisines"]]
)

# Write the final table without the dataframe index
result_df.to_csv("restaurants.csv", index=False)

**Q2.** Extract the list of restaurants that had a past event in April 2019 and store the data as restaurant_events.csv. <br>
◦   	Event Id <br>
◦   	Restaurant Id <br>
◦   	Restaurant Name <br>
◦   	Photo URL <br>
◦   	Event Title <br>
◦   	Event Start Date <br>
◦   	Event End Date <br>
Note: Populate empty values with "NA". <br>


In [61]:
def _na_if_empty(value):
    """Return "NA" for a missing or blank value, otherwise the value unchanged."""
    if value is None:
        return "NA"
    if isinstance(value, str) and not value.strip():
        return "NA"
    return value


result_2 = []

# First and last day of April 2019; event dates are parsed as date-only
# values, so comparing against midnight of each day is sufficient.
april_start_date = datetime(2019, 4, 1)
april_end_date = datetime(2019, 4, 30)

for rest_list in rest_df["restaurants"]:
    for rest in rest_list:
        curr = rest['restaurant']
        rest_id = curr['id']
        name = curr["name"]
        # NOTE(review): this is the restaurant-level "photos_url" field. If the
        # requirement means the event's own photo, it should come from the
        # event record instead. Confirm against the data schema.
        photo_url = curr["photos_url"]

        # Restaurants without events either lack the key or hold an empty value
        for event in curr.get("zomato_events") or []:
            event_det = event["event"]
            event_id = event_det["event_id"]
            event_title = event_det["title"]
            event_start_date = event_det["start_date"]
            event_end_date = event_det["end_date"]

            # Convert start date and end date to datetime for comparison
            event_start_date_dt = datetime.strptime(event_start_date, "%Y-%m-%d")
            event_end_date_dt = datetime.strptime(event_end_date, "%Y-%m-%d")

            # Keep the event when its date range overlaps April 2019
            if (event_start_date_dt <= april_end_date) and (april_start_date <= event_end_date_dt):
                row = [event_id, rest_id, name, photo_url,
                       event_title, event_start_date, event_end_date]
                # Requirement: populate empty values with "NA"
                result_2.append([_na_if_empty(field) for field in row])

# Column names follow the field names listed in the question
rest_event_df = pd.DataFrame(result_2, columns=["Event Id", 'Restaurant Id', "Restaurant Name", "Photo URL",
                                                "Event Title", "Event Start Date",
                                                "Event End Date"])

# na_rep makes any remaining missing value appear as "NA" in the file as well
rest_event_df.to_csv("restaurant_events.csv", index=False, na_rep="NA")

**Q3.** From the dataset (restaurant_data.json), determine the threshold for each rating text based on the aggregate rating. Return aggregates for the following ratings only: <br>
◦   	Excellent <br>
◦   	Very Good <br>
◦   	Good <br>
◦   	Average <br>
◦   	Poor <br>


In [62]:
# Rating texts the question asks about
selected_rate = ["Excellent", "Very Good", "Good", 'Average', "Poor"]

# Maps each rating text to [lowest, highest] aggregate rating seen so far
rating = {}

for rest_list in rest_df["restaurants"]:
    for rest in rest_list:
        user_rating = rest['restaurant']["user_rating"]
        aggre_rating = float(user_rating['aggregate_rating'])
        rate_text = user_rating["rating_text"]

        bounds = rating.get(rate_text)
        if bounds is None:
            # First occurrence: the single value is both lowest and highest
            rating[rate_text] = [aggre_rating, aggre_rating]
        else:
            # Widen the stored range if this rating falls outside it
            rating[rate_text] = [min(aggre_rating, bounds[0]), max(aggre_rating, bounds[1])]

rating = list(rating.items())

# Keep only the requested rating texts, ordered from highest to lowest maximum
filtered_rating = [entry for entry in rating if entry[0] in selected_rate]
sorted_rating = sorted(filtered_rating, key=lambda entry: entry[1][1], reverse=True)

for word, spread in sorted_rating:
    print(f"The '{word}' rating has a minimum and maximum rating threshold of {spread} respectively")


The 'Excellent' rating has a minimum and maximum rating threshold of [4.5, 4.9] respectively
The 'Very Good' rating has a minimum and maximum rating threshold of [4.0, 4.4] respectively
The 'Good' rating has a minimum and maximum rating threshold of [3.5, 3.9] respectively
The 'Average' rating has a minimum and maximum rating threshold of [2.5, 3.4] respectively
The 'Poor' rating has a minimum and maximum rating threshold of [2.2, 2.2] respectively


Based on the given data, the observed aggregate-rating range for each rating text is: <br>
'Excellent': minimum 4.5, maximum 4.9. <br>
'Very Good': minimum 4.0, maximum 4.4. <br>
'Good': minimum 3.5, maximum 3.9. <br>
'Average': minimum 2.5, maximum 3.4. <br>
'Poor': minimum 2.2, maximum 2.2. <br>

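However, every "Poor" restaurant in the data has the same aggregate rating (2.2), so its observed range is a single point. Since the other bands are contiguous in steps of 0.1 and "Average" starts at 2.5, we can assume that the "Poor" rating has a maximum threshold of 2.4.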

```Simple Test```

In [63]:
### Check that the number of rows written to restaurants.csv tallies with the
### number of restaurants reported in restaurant_data.json (sum of results_shown)

csv_data = pd.read_csv('restaurants.csv')

counts_match = len(csv_data) == rest_df["results_shown"].sum()
print(
    "All restaurant data successfully transferred"
    if counts_match
    else "Oh no. Restaurant counts do not tally"
)

All restaurant data successfully transferred
