In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found under it.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# LOAD THE LIBRARIES

In [None]:
from itertools import combinations
import json

# LOAD THE DATASETS

In [None]:
# Load the categorized inputs plus the one-row-per-restaurant summary table.
atmosphere_df = pd.read_csv("/kaggle/input/categorized-data/categorization_by_atmosphere.csv")
# The food categorization file is currently not loaded (left here for reference).
#food_df = pd.read_csv("/kaggle/input/categorized-data/categorization_by_food.csv")
highlights_df = pd.read_csv("/kaggle/input/categorized-data/restaurant_highlights.csv")
original_df = pd.read_csv("/kaggle/input/summarized-one-row-per-restaurant/merged_df_generated.csv")

# Report shapes only for the frames that were actually loaded: printing
# food_df.shape while its load is commented out raises NameError on a fresh kernel.
print(atmosphere_df.shape)
print(highlights_df.shape)
print(original_df.shape)

In [None]:
# Count how many rows carry each atmosphere characteristic.
characteristic_counts = atmosphere_df['Characteristic'].value_counts()

# Keep only characteristics that occur more than 5 times and turn the result into
# a two-column table. The index and value columns are named explicitly because
# pandas 2.0 changed the default names produced by value_counts(); without this,
# the `Characteristic` column would hold the counts (not the labels) on older
# pandas versions.
high_frequency_characteristics = (
    characteristic_counts[characteristic_counts > 5]
    .rename_axis('Characteristic')
    .reset_index(name='count')
)
high_frequency_characteristics

In [None]:
print(atmosphere_df.shape)
high_frequency_characteristics_list = high_frequency_characteristics.Characteristic.tolist()

# Keep only rows whose characteristic is in the high-frequency list, renumber the
# rows, then normalise a few label spellings. Everything is done in one chain that
# produces a fresh frame, so no column is assigned on a filtered slice (which
# would trigger SettingWithCopyWarning).
# NOTE(review): high_frequency_characteristics_list is captured before the rename,
# so it still contains the original spellings (e.g. "Clean") - confirm that is the
# intended content wherever the list is reused.
atmosphere_df = (
    atmosphere_df[atmosphere_df['Characteristic'].isin(high_frequency_characteristics_list)]
    .reset_index(drop=True)
    .assign(
        Characteristic=lambda frame: frame['Characteristic'].replace({
            "Clean": "Cleanliness",
            "Calm and relaxing": "Calm and relaxing environment",
            "Family-friendly": "Family-friendly environment"
        })
    )
)
print(atmosphere_df.shape)

atmosphere_df.head()

In [None]:
# Persist the filtered, relabelled atmosphere table to the working directory (row index omitted).
atmosphere_df.to_csv(path_or_buf="atmosphere_df_new.csv", index=False)

In [None]:
# Display the list of high-frequency characteristic labels.
high_frequency_characteristics_list

In [None]:
# Wrap the high-frequency labels in a single-column table and write it out as CSV.
atmosphere_categories = pd.DataFrame(
    {"atmosphere_categories": high_frequency_characteristics_list}
)
atmosphere_categories.to_csv("atmosphere_categories.csv", index=False)

# CREATE QA-PAIRS USING AMBIANCE DATA

In [None]:
# Start the QA collection: one question/answer pair per atmosphere characteristic,
# where the answer lists every distinct restaurant tagged with that characteristic.
qa_pairs = []
for trait in atmosphere_df['Characteristic'].unique():
    has_trait = atmosphere_df['Characteristic'] == trait
    trait_restaurants = atmosphere_df.loc[has_trait, 'Restaurant Name'].unique()
    trait_label = trait.lower()
    qa_pairs.append({
        "Question": f"What restaurants are known for {trait_label}?",
        "Answer": f"Some of the restaurants known for {trait_label} include {', '.join(trait_restaurants)}",
    })

# Tabular view of everything collected so far.
qa_df = pd.DataFrame(qa_pairs)

print(qa_df.shape)
qa_df.head()

In [None]:
# Distinct atmosphere characteristics, in order of first appearance.
unique_characteristics = atmosphere_df['Characteristic'].unique()

# Build each characteristic's restaurant set once, instead of re-filtering the
# whole frame twice for every pair inside the loop below.
restaurants_by_characteristic = {
    characteristic: set(
        atmosphere_df.loc[atmosphere_df['Characteristic'] == characteristic, 'Restaurant Name'].unique()
    )
    for characteristic in unique_characteristics
}

overlap_results = []

# For every unordered pair of characteristics, record the restaurants that have both
# and add a QA pair naming (at most three of) them.
for characteristic1, characteristic2 in combinations(unique_characteristics, 2):
    # Sort the intersection: iterating a set of strings has no stable order between
    # interpreter runs, which made both the stored overlap string and the three
    # names kept for the answer non-reproducible.
    overlapping_restaurants = sorted(
        restaurants_by_characteristic[characteristic1]
        & restaurants_by_characteristic[characteristic2]
    )
    # Pairs with no restaurant in common produce neither an overlap row nor a QA pair.
    if not overlapping_restaurants:
        continue

    overlap_results.append({
        "Characteristic 1": characteristic1,
        "Characteristic 2": characteristic2,
        "Overlapping Restaurants": ", ".join(overlapping_restaurants)
    })

    label1 = characteristic1.lower()
    label2 = characteristic2.lower()
    question = f"What restaurants are known for {label1}, and {label2}?"

    # Limit the answer to a maximum of 3 restaurant names.
    answer = ", ".join(overlapping_restaurants[:3])

    qa_pairs.append({"Question": question, "Answer": f"Some of the restaurants known for {label1}, and {label2} include {answer}"})

qa_df = pd.DataFrame(qa_pairs)
# Name the columns explicitly so the frame keeps its schema even when no pair overlaps.
overlap_df = pd.DataFrame(
    overlap_results,
    columns=["Characteristic 1", "Characteristic 2", "Overlapping Restaurants"],
)
print(qa_df.shape)
qa_df.head()

In [None]:
# Label every overlapping pair as "<characteristic 1>, <characteristic 2>".
overlap_df['combined_characteristics'] = (
    overlap_df['Characteristic 1'] + ', ' + overlap_df['Characteristic 2']
)

# Export the distinct pair labels as a single-column CSV.
combined_characteristics = pd.DataFrame(
    {"combined_characteristics": overlap_df['combined_characteristics'].unique().tolist()}
)
combined_characteristics.to_csv("combined_characteristics.csv", index=False)

In [None]:
# for combo in combinations(unique_characteristics, 2):
#     print(combo)

# CREATE QA-PAIRS FROM THE SUMMARIZED/MODIFIED RESTAURANT REVIEW DATASET

In [None]:
# Restrict the summary table to restaurants that also appear in the atmosphere data.
# Take an explicit copy: a later cell assigns to a column of original_df2, and doing
# that on a filtered slice of original_df triggers pandas' SettingWithCopyWarning.
original_df2 = original_df[
    original_df['restaurant_name'].isin(atmosphere_df['Restaurant Name'].unique())
].copy()
original_df2.head(1)

In [None]:
# For every restaurant row, add four factual QA pairs (price, rating, location and
# address), in that order. A district question exists only as a disabled draft and
# is not generated.
for _, row in original_df2.iterrows():
    restaurant = row['restaurant_name']
    rating, address = row['rating'], row['translated_address']
    location, price = row['location'], row['price']

    qa_pairs.extend([
        {
            "Question": f"What is the price normally spent for dining at the restaurant {restaurant}?",
            "Answer": f"The price range for the restaurant {restaurant} is between {price}",
        },
        {
            "Question": f"What is the average rating of {restaurant}?",
            "Answer": f"Average rating for the restaurant {restaurant} is {rating}",
        },
        {
            "Question": f"What is the location of {restaurant} in Japan?",
            "Answer": f"{restaurant} is located in {location} part of Japan.",
        },
        {
            "Question": f"What is the full address of {restaurant}?",
            "Answer": f"Full address of {restaurant} is {address}",
        },
    ])

# Refresh the tabular view and show its size.
qa_df = pd.DataFrame(qa_pairs)
qa_df.shape

In [None]:
# Preview the first rows of the QA table built so far.
qa_df.head()

# CATEGORIZE RESTAURANTS BY FOOD TYPES AND GENERATE QA-PAIRS

In [None]:
# Split the comma-separated `categories` string into a list and give each category
# its own row. The split is done on a derived frame rather than written back into
# original_df2: assigning to a filtered slice raises SettingWithCopyWarning, and
# re-running the cell would otherwise try to split already-split lists.
exploded_df = (
    original_df2
    .assign(categories=original_df2['categories'].str.split(', '))
    .explode('categories')
)

category_counts = exploded_df['categories'].value_counts()

# Keep only food categories that occur more than 3 times.
filtered_categories = category_counts[category_counts > 3].index

filtered_df = exploded_df[exploded_df['categories'].isin(filtered_categories)]

# One row per category holding the set of restaurant names in that category.
grouped = filtered_df.groupby('categories')['restaurant_name'].apply(set).reset_index()

for food_category, restaurants in zip(grouped['categories'], grouped['restaurant_name']):
    category_label = food_category.lower()
    question = f"Recommend restaurants that specializes in {category_label}."
    # Sort the names: a set has no stable iteration order between interpreter runs,
    # so joining it directly makes the generated answer text non-reproducible.
    answer = ", ".join(sorted(restaurants))
    qa_pairs.append({"Question": question, "Answer": f"Some of the restaurants that specialize in {category_label} include {answer}"})

qa_df = pd.DataFrame(qa_pairs)
print(qa_df.shape)
qa_df.head()

In [None]:
#exploded_df.categories.value_counts()

In [None]:
# Distinct food categories that survived the frequency filter, as a plain list.
food_categories = filtered_df['categories'].unique().tolist()
food_categories

In [None]:
# Re-wrap the category list as a single-column table and write it out as CSV.
food_categories = pd.DataFrame({"food_categories": food_categories})
food_categories.to_csv("food_categories.csv", index=False)

In [None]:
# Show the current (rows, columns) size of the QA table.
qa_df.shape

# CREATE QA PAIRS FOR RESTAURANT HIGHLIGHTS

In [None]:
# Keep only highlight rows for restaurants present in the atmosphere data,
# printing the shape before and after the filter.
print(highlights_df.shape)
known_restaurant_names = atmosphere_df['Restaurant Name'].unique().tolist()
has_atmosphere_data = highlights_df['Restaurant Name'].isin(known_restaurant_names)
highlights_df2 = highlights_df[has_atmosphere_data]
print(highlights_df2.shape)

In [None]:
# Add one "tell me something" QA pair per highlight row; the highlight text is the answer.
qa_pairs.extend(
    {
        "Question": f"Tell me something about the restaurant {row['Restaurant Name']}.",
        "Answer": row['Highlights'],
    }
    for _, row in highlights_df2.iterrows()
)

# Refresh the tabular view and show its size.
qa_df = pd.DataFrame(qa_pairs)
qa_df.shape

# SAVE THE QA DATASET

In [None]:
# Write the complete QA table to the working directory (row index omitted).
qa_df.to_csv(path_or_buf="qa_df.csv", index=False)