In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found, one per line.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# LOAD LIBRARIES

In [None]:
import keras
import keras_nlp

# LOAD GEMMA MODEL

In [None]:
%%time
# Load Language Model via Keras
LLM = keras_nlp.models.GemmaCausalLM.from_preset('gemma_instruct_2b_en')

**Note:** The "restaurant-overview-summarization" dataset was generated using a local notebook. That local notebook processes restaurant and menu data, performs tasks such as data cleaning (including text translation), creates additional features, and combines the review data generated by the "FoodieFinder_Summarization_Base" Kaggle notebook.

GitHub link to the notebook (FoodieFinder_CombineRestaurantData_FeatureEngineering):

# READ THE DATA

In [None]:
# Directory of the attached Kaggle dataset that holds the review CSV files.
REVIEWS_DIR = "/kaggle/input/restaurant-overview-summarization"
# The reviews are split across five numbered files:
# reviews_df_1.csv ... reviews_df_5.csv.
REVIEW_FILES = [f"{REVIEWS_DIR}/reviews_df_{part}.csv" for part in range(1, 6)]

# Read every file and stack the rows into a single frame. ignore_index=True
# gives the combined frame a fresh 0..n-1 index instead of repeating each
# file's own row labels.
df = pd.concat([pd.read_csv(path) for path in REVIEW_FILES], ignore_index=True)

In [None]:
# Show the frame's dimensions, remove rows that are exact duplicates, and show
# the dimensions again so the number of dropped rows is visible.
print(df.shape)
df = df.drop_duplicates()
print(df.shape)
# Preview the first remaining row.
df.head(1)

In [None]:
# Expose the 'summarized_comments' text under the shorter name 'review',
# then list the resulting column names.
column_renames = {'summarized_comments': 'review'}
df = df.rename(columns=column_renames)
df.columns

In [None]:
# Restrict the frame to the nine columns used by the rest of the notebook and
# preview the first row.
kept_columns = [
    'restaurant_name',
    'location',
    'categories',
    'translated_address',
    'district',
    'rating',
    'selected_dishes',
    'price',
    'review',
]
df = df[kept_columns]
df.head(1)

# TEXT SUMMARIZATION

In [None]:
# Function to summarize reviews for a single restaurant
def summarize_reviews(reviews, llm=None, max_length=2000):
    """Summarize all reviews of one restaurant with the language model.

    Args:
        reviews: Iterable of review texts for a single restaurant. Missing
            entries (None or float NaN) are skipped; any other non-string
            value is converted with str().
        llm: Object exposing ``generate(list_of_prompts, max_length=...)`` that
            returns one string per prompt. Defaults to the notebook-level
            ``LLM``.
        max_length: Forwarded unchanged to ``llm.generate``.

    Returns:
        Tuple ``(combined_reviews, summarized_review)``. ``combined_reviews``
        is the review texts joined by single spaces. ``summarized_review`` is
        the model output with the echoed prompt removed and surrounding
        whitespace stripped; it is ``""`` when there is no review text.
    """
    # Fall back to the notebook-level model when none is injected.
    if llm is None:
        llm = LLM

    # Empty CSV cells load as float NaN, and str.join raises TypeError on
    # them, so drop missing entries and stringify anything that is not text.
    review_texts = [
        review if isinstance(review, str) else str(review)
        for review in reviews
        if not (review is None or (isinstance(review, float) and review != review))
    ]
    # Concatenate all the reviews into a single text
    combined_reviews = " ".join(review_texts)

    # Nothing to summarize: do not ask the model to invent a summary.
    if not combined_reviews.strip():
        return combined_reviews, ""

    prompt = (
        "Summarize the following reviews in detail in paragraphs. Include information about menu items, highlight both positive and negative aspects, "
        "and avoid adding any external or irrelevant information. Keep the summary factual "
        "and honest to reflect what customers truly feel about the restaurant:\n\n"
        f"{combined_reviews}\n\n"
        "Summarized review:"
    )

    # Generate the summarized review.
    # NOTE(review): max_length appears to bound prompt + generated tokens, so a
    # very long review set may leave little room for the summary - confirm.
    generated = llm.generate([prompt], max_length=max_length)[0]

    # The causal LM returns the prompt followed by its completion; keep only
    # the completion so the summary does not repeat the instructions and the
    # raw reviews.
    if generated.startswith(prompt):
        generated = generated[len(prompt):]
    summarized_review = generated.strip()

    return combined_reviews, summarized_review

# Group by restaurant name and summarize the reviews
def create_summarized_reviews_df(df):
    """Build one summary row per restaurant.

    Args:
        df: DataFrame with at least the columns 'restaurant_name' and 'review'.

    Returns:
        DataFrame with the columns 'context', 'restaurant_name' and
        'summarized_review', one row per distinct restaurant name in the order
        the groupby yields them. The three columns are present even when ``df``
        has no rows, so a later merge on 'restaurant_name' still works.
    """
    output_columns = ['context', 'restaurant_name', 'summarized_review']
    records = []

    # Iterate the groups directly: each item is (restaurant name, Series of
    # that restaurant's reviews).
    for restaurant_name, restaurant_reviews in df.groupby('restaurant_name')['review']:
        # Summarize all reviews for this restaurant
        context, summarized_review = summarize_reviews(list(restaurant_reviews))

        records.append({
            'context': context,
            'restaurant_name': restaurant_name,
            'summarized_review': summarized_review
        })

    # Passing the column list keeps the schema stable for an empty result;
    # without it an empty record list produces a frame with no columns.
    return pd.DataFrame(records, columns=output_columns)

In [None]:
%%time
# Summarize the reviews of every restaurant in df (one summarize_reviews call
# per restaurant name); %%time reports how long the whole run takes.
summary_df = create_summarized_reviews_df(df)

In [None]:
# Restaurant-level attributes: every column except the per-review text.
# 'price' is kept so that it appears in the merged output.
df_cleaned = df.drop(columns=['review'])

# Keep exactly one attribute row per restaurant name (the first occurrence).
# Deduplicating on all columns would leave several rows for a restaurant whose
# attributes differ between reviews, and the join below would then repeat that
# restaurant's summary once per attribute row.
df_cleaned = df_cleaned.drop_duplicates(subset='restaurant_name')

# Perform the left join: one output row per summary. validate makes pandas
# raise MergeError if either side contains a repeated restaurant name.
merged_df = pd.merge(
    summary_df,
    df_cleaned,
    on='restaurant_name',
    how='left',
    validate='one_to_one',
)
print(merged_df.shape)
merged_df.head()

# SAVE THE OUTPUT

In [None]:
# Save to CSV: write merged_df to "merged_df_generated.csv" (a path relative
# to the current directory), omitting the DataFrame index column.
merged_df.to_csv("merged_df_generated.csv", index=False)

In [None]:
# Also save df, the frame that was passed to create_summarized_reviews_df,
# as "dataset_used.csv", omitting the DataFrame index column.
df.to_csv("dataset_used.csv", index=False)