In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pickle
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import joblib

In [6]:
# 1. Read the raw swiggy.csv file into a DataFrame.
#    NOTE: this is an absolute, machine-specific Windows path.
raw_csv_path = r"A:\THENMOZHI\vscode\swiggy\swiggy.csv"
df = pd.read_csv(raw_csv_path)

In [7]:
# Preview the first ten rows of the freshly loaded frame.
df.head(10)

Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine,lic_no,link,address,menu
0,567335,AB FOODS POINT,Abohar,--,Too Few Ratings,₹ 200,"Beverages,Pizzas",22122652000138,https://www.swiggy.com/restaurants/ab-foods-po...,"AB FOODS POINT, NEAR RISHI NARANG DENTAL CLINI...",Menu/567335.json
1,531342,Janta Sweet House,Abohar,4.4,50+ ratings,₹ 200,"Sweets,Bakery",12117201000112,https://www.swiggy.com/restaurants/janta-sweet...,"Janta Sweet House, Bazar No.9, Circullar Road,...",Menu/531342.json
2,158203,theka coffee desi,Abohar,3.8,100+ ratings,₹ 100,Beverages,22121652000190,https://www.swiggy.com/restaurants/theka-coffe...,"theka coffee desi, sahtiya sadan road city",Menu/158203.json
3,187912,Singh Hut,Abohar,3.7,20+ ratings,₹ 250,"Fast Food,Indian",22119652000167,https://www.swiggy.com/restaurants/singh-hut-n...,"Singh Hut, CIRCULAR ROAD NEAR NEHRU PARK ABOHAR",Menu/187912.json
4,543530,GRILL MASTERS,Abohar,--,Too Few Ratings,₹ 250,"Italian-American,Fast Food",12122201000053,https://www.swiggy.com/restaurants/grill-maste...,"GRILL MASTERS, ADA Heights, Abohar - Hanumanga...",Menu/543530.json
5,158204,Sam Uncle,Abohar,3.6,20+ ratings,₹ 200,Continental,22119652000052,https://www.swiggy.com/restaurants/sam-uncle-c...,"Sam Uncle, hanumangarh road near raja bajaj sh...",Menu/158204.json
6,156588,shere punjab veg,Abohar,4.0,100+ ratings,₹ 150,North Indian,22120652000021,https://www.swiggy.com/restaurants/shere-punja...,"shere punjab veg, major surinder chowk near ve...",Menu/156588.json
7,244866,Shri Balaji Vaishno Dhaba,Abohar,--,Too Few Ratings,₹ 100,North Indian,22119652000389,https://www.swiggy.com/restaurants/shri-balaji...,"Shri Balaji Vaishno Dhaba, St no 13,6th chowk,...",Menu/244866.json
8,156602,Hinglaj Kachori Bhandhar,Abohar,4.2,20+ ratings,₹ 100,"Snacks,Chaat",22119652000042,https://www.swiggy.com/restaurants/hinglaj-kac...,"Hinglaj Kachori Bhandhar, street no 11 circula...",Menu/156602.json
9,158193,yummy hub,Abohar,--,Too Few Ratings,₹ 200,Indian,22119652000045,https://www.swiggy.com/restaurants/yummy-hub-c...,"yummy hub, hanumangarh road near dr naveen set...",Menu/158193.json


In [8]:
# 2-4. Remove exact duplicate rows, then every row that contains a null,
#      then the columns that are not used further down.
columns_to_drop = ['lic_no', 'link', 'address', 'menu']
df = (
    df.drop_duplicates()
      .dropna()
      .drop(columns=columns_to_drop)
)

# 5. 'cost': strip the rupee sign and any commas, then store the column as float.
cost_without_symbols = df['cost'].replace('[₹,]', '', regex=True)
df['cost'] = cost_without_symbols.astype(float)

# 6. Clean 'rating_count' column
# (label fragment, low, high): a label containing the fragment is replaced by
# a random integer drawn from the half-open range [low, high). The order is
# the order in which the fragments are tested.
_RATING_COUNT_BUCKETS = (
    ('20+', 20, 50),
    ('50+', 50, 100),
    ('100+', 100, 500),
    ('500+', 500, 1000),
    ('1K+', 1000, 5000),
    ('5K+', 5000, 10000),
    ('10K+', 10000, 15000),
)


def convert_rating_count(value, rng=None):
    """Convert a textual rating-count bucket into a numeric rating count.

    Parameters
    ----------
    value : str or number or None
        A label such as '20+ ratings' or 'Too Few Ratings'. Null values give
        NaN. Values that are already numeric are returned as ``int(value)``.
    rng : numpy.random.Generator, optional
        Source of randomness for the bucket draws. When omitted, NumPy's
        global random state (``np.random.randint``) is used, so results are
        only reproducible if the caller seeds it beforehand.

    Returns
    -------
    int or float
        * 'Too Few Ratings' -> random integer in [0, 10)
        * a known bucket label -> random integer in that bucket's range
        * any other text -> the integer formed by all digits in the text,
          or NaN when the text contains no digits
    """
    if pd.isnull(value):
        return np.nan

    # Already-numeric counts are passed through instead of failing on .strip().
    if isinstance(value, (int, float, np.integer, np.floating)):
        return int(value)

    value = str(value).strip()
    # Both callables take (low, high) and exclude `high`.
    draw = np.random.randint if rng is None else rng.integers

    if value == 'Too Few Ratings':
        return draw(0, 10)

    for fragment, low, high in _RATING_COUNT_BUCKETS:
        if fragment in value:
            return draw(low, high)

    # Fallback: keep whatever digits the text contains. An empty digit string
    # is the only way the int() conversion could fail, so test for it
    # explicitly rather than hiding every exception behind a bare except.
    digits = ''.join(filter(str.isdigit, value))
    return int(digits) if digits else np.nan

# convert_rating_count draws random integers from NumPy's global random state.
# Seed it first so this cell writes the same 'rating_count' values on every run.
np.random.seed(42)
df['rating_count'] = df['rating_count'].apply(convert_rating_count)

# 7. Clean 'rating' column: remove 'K' if present, convert to float
def clean_rating(val):
    """Parse a rating value into a float.

    Parameters
    ----------
    val : object
        Strings are stripped of surrounding whitespace, every 'K' character is
        removed, and the remainder is parsed with ``float``. Non-string values
        are returned unchanged.

    Returns
    -------
    float or object
        The parsed float; NaN when a string cannot be parsed as a number
        (for example '--'); or the original non-string value.
    """
    if not isinstance(val, str):
        return val
    try:
        return float(val.strip().replace('K', ''))
    except ValueError:
        # Only a genuine parse failure maps to NaN. A bare `except:` would
        # also swallow KeyboardInterrupt and unrelated bugs.
        return np.nan

# Parse every rating with clean_rating, then discard the rows whose rating
# is missing or could not be parsed.
df['rating'] = df['rating'].map(clean_rating)
df = df.dropna(subset=['rating'])

# 8. Split 'city' column into 'city' and 'main_city'
def split_city(value):
    """Split a comma-separated location string into two parts.

    The value is converted to ``str`` and split on commas. The first piece
    becomes the city and the second piece becomes the main city; when there
    is no comma the city is used for both (e.g. 'A,B' -> ['A', 'B'],
    'A' -> ['A', 'A']). Pieces after the second are ignored.

    Returns a two-element ``pd.Series``: [city, main_city].
    """
    pieces = [piece.strip() for piece in str(value).split(',')]
    locality = pieces[0]
    if len(pieces) >= 2:
        parent = pieces[1]
    else:
        parent = locality
    return pd.Series([locality, parent])

# Run split_city on every 'city' value and write the two resulting columns
# back as 'city' and 'main_city'.
city_parts = df['city'].apply(split_city)
df[['city', 'main_city']] = city_parts

# Save the cleaned frame; the row index is not written to the file.
df.to_csv("cleaned_swiggy_data.csv", index=False)

In [9]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine,main_city
1,531342,Janta Sweet House,Abohar,4.4,66,200.0,"Sweets,Bakery",Abohar
2,158203,theka coffee desi,Abohar,3.8,478,100.0,Beverages,Abohar
3,187912,Singh Hut,Abohar,3.7,23,250.0,"Fast Food,Indian",Abohar
5,158204,Sam Uncle,Abohar,3.6,43,200.0,Continental,Abohar
6,156588,shere punjab veg,Abohar,4.0,142,150.0,North Indian,Abohar


In [10]:
# (rows, columns) of the cleaned frame.
df.shape

(61343, 8)

In [16]:
# Start from the cleaned dataset written by the cleaning step.
df = pd.read_csv("cleaned_swiggy_data.csv")

# --- 1. Integer-label the restaurant names and persist the fitted encoder ---
label_encoder = LabelEncoder().fit(df['name'])
df['name_encoded'] = label_encoder.transform(df['name'])

with open('encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

# --- 2. One-hot encode the categorical columns ---
onehot_cols = ['city', 'cuisine', 'main_city']

# Columns not listed in onehot_cols are passed through unchanged; the column
# labelling below relies on them coming after the one-hot block.
ct = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(drop='first', sparse_output=False), onehot_cols),
    ],
    remainder='passthrough',
)

# The raw 'name' text is represented by 'name_encoded', so it is left out.
df_to_encode = df.drop(columns=['name'])
encoded_array = ct.fit_transform(df_to_encode)

# Column labels: the generated one-hot names first, then the passthrough
# columns in their original order.
onehot_feature_names = ct.named_transformers_['onehot'].get_feature_names_out(onehot_cols)
remainder_cols = df_to_encode.columns.drop(onehot_cols).tolist()
all_feature_names = [*onehot_feature_names, *remainder_cols]

# Wrap the encoded matrix in a DataFrame that shares df's row index.
encoded_df = pd.DataFrame(encoded_array, columns=all_feature_names, index=df.index)

# --- 3. Write the encoded data to disk ---
encoded_df.to_csv("encoded_data.csv", index=False)

print("✅ Preprocessing complete. Files saved:\n- encoder.pkl\n- encoded_data.csv")


✅ Preprocessing complete. Files saved:
- encoder.pkl
- encoded_data.csv


In [17]:
# (rows, columns) of the one-hot encoded frame.
encoded_df.shape

(61343, 2950)

In [21]:
# 1. Load cleaned data
cleaned_df = pd.read_csv("cleaned_swiggy_data.csv")

# 2. Load the label encoder that was fitted on 'name'. Nothing below in this
#    cell uses it; it is only made available as `name_encoder`.
with open('encoder.pkl', 'rb') as f:
    name_encoder = pickle.load(f)

# 3. Define columns to one-hot encode
onehot_cols = ['city', 'cuisine', 'main_city']

# 4. Build the ColumnTransformer, fit it on the cleaned data and save it.
#    handle_unknown='ignore' makes the saved encoder emit all zeros for a
#    city/cuisine it did not see during fit instead of raising. That matters
#    because the transformer is later applied to free-form user input. With
#    drop='first', an unknown value is encoded the same way as the dropped
#    first category.
ct = ColumnTransformer(
    transformers=[
        ('onehot',
         OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'),
         onehot_cols)
    ],
    remainder='passthrough'
)

# Feature frame: every column except the raw 'name' text (the name is not
# encoded in this cell at all).
# NOTE(review): the restaurant 'id' is passed through as a numeric feature and
# therefore takes part in scaling and clustering - confirm this is intended.
df_to_encode = cleaned_df.drop(columns=['name'])

# Fit ColumnTransformer on the feature frame
encoded_array = ct.fit_transform(df_to_encode)

# Save ColumnTransformer
with open('column_transformer.pkl', 'wb') as f:
    pickle.dump(ct, f)

# 5. Standardise the encoded features and save the fitted scaler
scaler = StandardScaler()
scaled_data = scaler.fit_transform(encoded_array)

with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# 6. Train a 10-cluster KMeans model (fixed random_state) and save it
kmeans = KMeans(n_clusters=10, random_state=42)
clusters = kmeans.fit_predict(scaled_data)

with open('kmeans.pkl', 'wb') as f:
    pickle.dump(kmeans, f)

# Attach each restaurant's cluster label to the cleaned frame
cleaned_df['cluster'] = clusters

# 7. Save cleaned_df with cluster info for mapping results
cleaned_df.to_csv("cleaned_with_clusters.csv", index=False)

# -----------------------------------------
# Now the recommendation function using saved objects
# -----------------------------------------

# Reload the objects that were just written, so the code below runs against
# exactly what is stored on disk.
with open('column_transformer.pkl', 'rb') as f:
    ct = pickle.load(f)

with open('scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

with open('kmeans.pkl', 'rb') as f:
    kmeans = pickle.load(f)

# Load cleaned data with clusters
cleaned_df = pd.read_csv("cleaned_with_clusters.csv")


def recommend_restaurants(user_input, top_n=5):
    """
    Recommend the best-rated restaurants from the cluster matching a user's preferences.

    Uses the module-level objects `ct` (fitted ColumnTransformer), `scaler`,
    `kmeans` and `cleaned_df` (restaurant table with a 'cluster' column).

    Parameters
    ----------
    user_input : dict
        Required keys: 'city', 'cuisine', 'rating', 'cost'.
        Optional key: 'main_city' (defaults to the value of 'city').
        Example:
            {
                'city': 'Abohar',
                'cuisine': 'North Indian',
                'rating': 4.0,
                'cost': 150.0,
                'main_city': 'Abohar'
            }
    top_n : int, default 5
        Maximum number of restaurants to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'name', 'city', 'rating', 'cost', 'cuisine' for up to `top_n`
        restaurants of the predicted cluster, ordered by 'rating' and then
        'rating_count', both descending.

    Raises
    ------
    ValueError
        If a required key is missing from `user_input`.

    Notes
    -----
    Depending on how the saved encoder was configured, a city/cuisine that was
    not present when `ct` was fitted may raise inside `ct.transform`.
    """
    required_keys = ('city', 'cuisine', 'rating', 'cost')
    missing = [key for key in required_keys if key not in user_input]
    if missing:
        raise ValueError(f"user_input is missing required keys: {missing}")

    # Placeholder values for the features a user cannot supply.
    user_dict = {
        'id': 0,
        'rating_count': 100,   # Default value for missing rating_count
        'name_encoded': 0,     # Dummy value; kept for compatibility with the original input layout
        **user_input
    }
    user_dict.setdefault('main_city', user_dict['city'])

    # Lay the single user row out like the catalogue's feature columns
    # (catalogue order first, any extra keys last).
    base_cols = [col for col in cleaned_df.columns if col not in ('cluster', 'name')]
    extra_cols = [key for key in user_dict if key not in base_cols]
    user_df = pd.DataFrame([user_dict], columns=base_cols + extra_cols)

    # `ct` and `scaler` are already fitted, so only the user row has to be
    # transformed. Re-encoding the whole catalogue on every call just to read
    # its last row gave the same result at a far higher cost.
    encoded_user = ct.transform(user_df)
    scaled_user = scaler.transform(encoded_user)

    # Predict cluster for the user row
    user_cluster = kmeans.predict(scaled_user)[0]

    # Restaurants in that cluster, best rated (then most rated) first
    recommended = cleaned_df[cleaned_df['cluster'] == user_cluster]
    recommended = recommended.sort_values(by=['rating', 'rating_count'], ascending=False)

    # Return top N restaurant details
    return recommended[['name', 'city', 'rating', 'cost', 'cuisine']].head(top_n)


# --------------------
# Example usage: ask for up to five recommendations for one sample preference set.

user_example = dict(
    city='Abohar',
    cuisine='North Indian',
    rating=4.0,
    cost=150.0,
    main_city='Abohar',
)

top_restaurants = recommend_restaurants(user_example, top_n=5)
print(top_restaurants)

                         name                 city  rating   cost  \
35048             Tony Bakery          Manasarovar     5.0  300.0   
36744  SHRI KRISHNA FOOD ZONE                Katni     5.0  200.0   
5198            HUNGER TREATS              Arekere     5.0  300.0   
31606                Bento It           Gachibowli     5.0  500.0   
23123            Gelato Vinto  Greater Kailash New     5.0  400.0   

                      cuisine  
35048       Biryani,Fast Food  
36744  North Indian,Beverages  
5198           Burgers,Snacks  
31606            Asian,Korean  
23123               Ice Cream  
