In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# LOAD THE LIBRARIES

In [None]:
import keras
import keras_nlp
import re

# LOAD THE GEMMA MODEL

In [None]:
%%time
# Instantiate the Gemma causal language model from the 'gemma_instruct_2b_en' preset.
# The resulting `llm` object is the one used for generation further down.
llm = keras_nlp.models.GemmaCausalLM.from_preset(
    'gemma_instruct_2b_en'
)

# READ THE DATA

In [None]:
# Read the review CSV, report its (rows, columns) shape and preview the first rows.
dataset_for_qa = pd.read_csv(
    "/kaggle/input/summarized-one-row-per-restaurant/merged_df_generated.csv"
)
print(dataset_for_qa.shape)
dataset_for_qa.head()

In [None]:
# Keep only the text that follows the last "Summarized review:" marker in each
# cell (the whole cell when the marker is absent) and trim surrounding whitespace.
dataset_for_qa['summarized_review'] = (
    dataset_for_qa['summarized_review']
    .str.split("Summarized review:")
    .str[-1]
    .str.strip()
)

In [None]:
# `df` is the frame handed to the categorization step below. It is an alias of
# the full dataset (not a copy); append .head(2) to try the pipeline on two rows.
df = dataset_for_qa

# CATEGORIZE RESTAURANTS BY CHARACTERISTICS (AMBIANCE, SERVICE, FOOD, PRICE)

In [None]:
# Function to tag each restaurant with the characteristics its reviews support
def categorize_restaurants(df, characteristics=None, max_length=1000, model=None):
    """Ask the language model which characteristics apply to each restaurant.

    Rows are grouped by ``restaurant_name``. For every group one prompt is
    built from the group's ``summarized_review`` values and sent to the model
    as a single-element batch.

    Args:
        df: DataFrame with ``restaurant_name`` and ``summarized_review`` columns.
        characteristics: Optional iterable of characteristic labels to ask
            about. Defaults to the built-in list defined below.
        max_length: Value forwarded to ``model.generate`` as ``max_length``.
        model: Object exposing ``generate(prompts, max_length=...)`` that
            returns one item per prompt. Defaults to the notebook-level ``llm``.

    Returns:
        dict mapping each restaurant name to the first item returned by
        ``model.generate`` for that restaurant's prompt.
    """
    if characteristics is None:
        # Default characteristics to look for
        characteristics = [
            "cheap meals", "luxurious", "comfortable atmosphere", "friendly staff", "calm and relaxing",
            "beautiful ambiance", "clean", "authentic food", "great service", "family-friendly",
            "romantic setting", "quick service", "unique dishes", "chef craftsmanship"
        ]
    if model is None:
        # Fall back to the model loaded at notebook level.
        model = llm

    # The label list is the same for every restaurant, so render it once.
    characteristic_list = ', '.join(characteristics)

    # Dictionary to store the model output for each restaurant
    restaurant_categories = {}

    for restaurant_name, group in df.groupby('restaurant_name'):
        # Drop missing summaries; otherwise they are rendered as literal
        # "nan" entries inside the review list shown to the model.
        reviews = group['summarized_review'].dropna().tolist()

        instruction = (
            f"Analyze the following reviews for '{restaurant_name}' and identify whether the restaurant fits any of the following characteristics: "
            f"{characteristic_list}. Note that a restaurant can belong to more than one category if applicable based on the review content. "
            f"For each characteristic, indicate if it applies or not based on the reviews provided.\n"
            f"Reviews: {reviews}\n\n"
            f"Return the applicable characteristics in a structured format, listing each characteristic that applies."
        )

        # Generate a categorized response with the model.
        # NOTE(review): max_length presumably caps prompt + completion tokens,
        # so very long review lists may leave little room for the answer - confirm.
        prompt = f"Instruction: {instruction}\nResponse:"
        categorized_response = model.generate([prompt], max_length=max_length)[0]

        # Store the categorized information for this restaurant
        restaurant_categories[restaurant_name] = categorized_response

    return restaurant_categories


In [None]:
%%time
# Run the per-restaurant categorization over the whole frame.
# `result` is a dict keyed by restaurant name holding the model output for each.
result = categorize_restaurants(df)

In [None]:
# Turn the {restaurant: model output} dict into a two-column frame:
# the dict keys become 'Restaurant Name', the outputs become 'Characteristics'.
result_df = (
    pd.DataFrame.from_dict(result, orient='index', columns=['Categories'])
    .reset_index()
    .rename(columns={'index': 'Restaurant Name', 'Categories': 'Characteristics'})
)

In [None]:
# Persist the raw model output per restaurant (row index not written).
result_df.to_csv("result_df.csv", index=False)

In [None]:
%%time
# Define a function to extract characteristics that apply to each restaurant
def extract_applicable_characteristics(row):
    """Return ``{characteristic: restaurant name}`` for every "Yes" table entry.

    The text in ``row['Characteristics']`` is scanned for table-like fragments
    of the form ``| <name> | Yes |`` or ``| <name> | No |``. Only the entries
    marked "Yes" are kept.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the
            ``'Characteristics'`` text and the ``'Restaurant Name'`` value.

    Returns:
        dict whose keys are cleaned characteristic names and whose values are
        all ``row['Restaurant Name']``. Empty when nothing matched or when the
        text is not a string.
    """
    response_text = row['Characteristics']
    if not isinstance(response_text, str):
        # e.g. NaN when the text is missing; there is nothing to parse.
        return {}

    characteristics = {}
    # Use regex to find each characteristic and its corresponding "Yes" or "No" value
    matches = re.findall(r'\|\s(.+?)\s\|\s(Yes|No)\s\|', response_text)
    for characteristic, applies in matches:
        if applies != 'Yes':
            continue
        # `\s` also matches a newline, so a match can begin at the closing pipe
        # of the previous line and drag the row's own leading "|" into the
        # captured name. Clean the name here, before it becomes a dict key, so
        # "| Cheap meals" and "Cheap meals" end up in one column instead of two.
        label = characteristic.strip().replace('|', '').strip()
        if label:
            # The characteristic applies: store the restaurant name under it
            characteristics[label] = row['Restaurant Name']
    return characteristics


# Extract the applicable characteristics of every restaurant, one dict per row.
applicable_characteristics = list(
    map(
        extract_applicable_characteristics,
        (record for _, record in result_df.iterrows()),
    )
)

# One column per characteristic; cells with no restaurant become empty strings
# instead of NaN.
categorized_df = pd.DataFrame(applicable_characteristics)
categorized_df = categorized_df.fillna('')

categorized_df.head()

In [None]:
# Strip stray "|" characters and surrounding whitespace from every column name.
categorized_df.columns = list(
    map(lambda name: name.strip().replace('|', '').strip(), categorized_df.columns)
)

# SAVE RESULTS

In [None]:
# Write the wide table (one column per characteristic) to the working directory,
# without the row index.
categorized_df.to_csv("categorized_df.csv", index=False)

In [None]:
# Reshape from wide to long: one (Characteristic, Restaurant Name) pair per row,
# then discard the pairs whose restaurant cell is the empty-string filler.
unpivoted_df = (
    pd.melt(categorized_df, var_name='Characteristic', value_name='Restaurant Name')
    .loc[lambda melted: melted['Restaurant Name'] != '']
)
unpivoted_df.head()

In [None]:
# Write the long-format (Characteristic, Restaurant Name) table, without the row index.
unpivoted_df.to_csv("unpivoted_df.csv", index=False)