Importing needed libraries

In [9]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

Data Preprocessing and Encoding for Swiggy Restaurant Dataset

In [15]:
#Loading the raw dataset
file_path = r"C:/Users/laksh/Downloads/swiggy.csv"  
raw_data = pd.read_csv(file_path)

#Handling missing values
required_columns = ['city', 'cuisine', 'rating', 'cost', 'name']
for col in required_columns:
    print(f"Unique values in {col}:")
    print(raw_data[col].dropna().unique())

#dropping rows with missing values in critical columns
swiggy_data = raw_data.dropna(subset=['city', 'cuisine', 'name']).copy()  #copy() to avoid SettingWithCopyWarning

#handling missing or invalid numeric columns
swiggy_data['rating'] = pd.to_numeric(swiggy_data['rating'], errors='coerce')

#cleaning the 'cost' column by removing the '₹' symbol and converting to numeric
swiggy_data['cost'] = swiggy_data['cost'].str.replace('₹', '').str.replace(',', '').str.strip()  #Clean extra spaces and comma
swiggy_data['cost'] = pd.to_numeric(swiggy_data['cost'], errors='coerce')

#dropping rows with invalid ratings or cost values
swiggy_data = swiggy_data.dropna(subset=['rating', 'cost'])

#standardizing text columns
swiggy_data['city'] = swiggy_data['city'].apply(lambda x: x.strip().title() if isinstance(x, str) else x)
swiggy_data['cuisine'] = swiggy_data['cuisine'].apply(lambda x: x.strip().title() if isinstance(x, str) else x)

#One-Hot Encoding for categorical columns
categorical_columns = ['city', 'cuisine']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

#fitting and transforming the categorical columns
encoded_features = encoder.fit_transform(swiggy_data[categorical_columns])

#converting encoded features into a DataFrame
encoded_features_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_columns)
)

#sqaving the encoder to a pickle file
with open('encoder.pkl', 'wb') as f:
    pickle.dump(encoder, f)

#Concatenating encoded features with the original data
swiggy_data = pd.concat([swiggy_data, encoded_features_df], axis=1)

#Saving the cleaned data
swiggy_data.to_csv("cleaned_data.csv", index=False)
print("Cleaned data saved as 'cleaned_data.csv'.")

#Saving the encoded features
encoded_features_df.to_csv("encoded_data.csv", index=False)
print("Encoded data saved as 'encoded_data.csv'.")

Unique values in city:
['Abohar' 'Adilabad' 'Adityapur' 'Adoni' 'Agartala' 'Agra'
 'Vastrapur,Ahmedabad' 'GOTA,Ahmedabad' 'Paldi & Ambawadi,Ahmedabad'
 'Ghatlodia,Ahmedabad' 'Bopal,Ahmedabad' 'Gandhinagar,Ahmedabad'
 'LalDarwaja,Ahmedabad' 'Naranpura,Ahmedabad' 'Navrangpura,Ahmedabad'
 'Science City,Ahmedabad' 'Maninagar,Ahmedabad' 'Chandkheda,Ahmedabad'
 'Ahmednagar' 'Aizawl' 'Ajmer' 'Akola' 'Alappuzha' 'Aligarh' 'Alipurduar'
 'Allahabad' 'Alwar' 'Ambala' 'Ambikapur' 'Ambur' 'Amravati' 'Amreli'
 'Amritsar' 'Anand' 'Anantapur' 'Ankleshwar' 'Arakkonam' 'Arambagh'
 'Arrah' 'Aruppukottai' 'Asansol' 'Aurangabad' 'Aurangabad_bihar'
 'Azamgarh' 'Baddi' 'Bagalkot' 'Bagdogra' 'Bahadurgarh' 'Bahraich'
 'Balaghat' 'Balangir' 'Balasore' 'Ballari' 'Balrampur' 'Balurghat'
 'Banda' 'Yeshwanthpur,Bangalore' 'Geddalahalli,Bangalore'
 'Koramangala,Bangalore' 'JP Nagar,Bangalore' 'Mahadevpura,Bangalore'
 'HSR,Bangalore' 'Arekere,Bangalore' 'Indiranagar,Bangalore'
 'Banashankari,Bangalore' 'Whitefield,Ba

City Name Cleaning and Standardization for Swiggy Dataset

In [6]:
#loading the dataset
file_path = 'C:/Users/laksh/Downloads/swiggy.csv'  # Replace with your actual file path
df = pd.read_csv(file_path)

#extracting the city column
cities = df['city']

#def function to clean and standardize city names
def clean_city_name(city):
    #Splitting the city on ',' and retain only the main city name
    city = city.split(',')[0].strip().lower()

    #removing additional information like &,phase,sector...
    unwanted_terms = ['&', 'phase', 'sector', 'campus', 'vihar', 'west', 'east', 'nagar', 'colony']
    for term in unwanted_terms:
        if term in city:
            city = city.split(term)[0].strip()

    #returning city name after cleaning
    return city

#applying the cleaning function to the cities
cleaned_cities = cities.apply(clean_city_name)

#removing empty city names from the list
cleaned_cities = cleaned_cities[cleaned_cities != '']

#getting unique city names
unique_cities = cleaned_cities.unique()

#sorting the unique cities for better readability
unique_cities_sorted = sorted(unique_cities)

#printing the results
print("Unique cities in the dataset:", unique_cities_sorted)

Unique cities in the dataset: ['abids', 'abohar', 'adajan', 'adilabad', 'adityapur', 'adoni', 'adyar', 'agartala', 'agra', 'ahmed', 'airoli', 'aizawl', 'ajmer', 'akola', 'akota', 'alambagh', 'alappuzha', 'aliganj', 'aligarh', 'alipore', 'alipurduar', 'alkapuri', 'allahabad', 'aluva', 'alwal', 'alwar', 'alwarpet', 'ambala', 'ambattur', 'ambikapur', 'ambur', 'ameerpet', 'aminabad', 'amravati', 'amreli', 'amritsar', 'anand', 'anantapur', 'ankleshwar', 'anna', 'arakkonam', 'arambagh', 'arekere', 'arrah', 'arumbakkam', 'aruppukottai', 'asansol', 'ashiyana', 'ashok', 'athwa', 'attapur', 'aundh', 'aurangabad', 'aurangabad_bihar', 'avadi', 'azamgarh', 'baddi', 'bagalkot', 'bagdogra', 'bahadurgarh', 'bahraich', 'bajaj', 'bala', 'balaghat', 'balangir', 'balasore', 'ballari', 'bally', 'balrampur', 'balurghat', 'banashankari', 'banda', 'bandra', 'baner', 'banjara hills', 'bantwal', 'bapatlachirala', 'baramati', 'baran', 'bardhaman', 'bardoli', 'bareilly', 'barmer', 'barnala', 'barshi', 'barwani', 

Restaurant Recommendation System Using PCA and Cosine Similarity

In [None]:
#loaing the cleaned and encoded data
encoded_data = pd.read_csv("encoded_data.csv")  #loading the encoded dataset
cleaned_data = pd.read_csv("cleaned_data.csv")  #loading the cleaned dataset

#checking for missing values in the encoded data
print("Checking for missing values in encoded data:")
print(encoded_data.isnull().sum())

#Standardizing the data (important before PCA)
scaler = StandardScaler()
encoded_data_scaled = scaler.fit_transform(encoded_data)

#applying PCA for dimensionality reduction
pca = PCA(n_components=10)  #adjusting n_components based on your needs
encoded_features_pca = pca.fit_transform(encoded_data_scaled)

#checking the explained variance ratio of the PCA components
print("Explained Variance Ratio of PCA components:", pca.explained_variance_ratio_)

#defining the restaurant recommendation function
def recommend_restaurant(input_cuisine, input_city, encoded_data, cleaned_data, top_n=10, similarity_threshold=0.1):
    #finding the index of the input cuisine and city
    input_data = cleaned_data[(cleaned_data['cuisine'] == input_cuisine) & (cleaned_data['city'] == input_city)]
    
    if input_data.empty:
        return "No matching input data found."
    
    #getting the input restaurant's encoded vector
    input_index = input_data.index[0]
    input_vector = encoded_data.iloc[input_index].values.reshape(1, -1)
    
    #standardizning the input vector using the same scaler
    input_vector_scaled = scaler.transform(input_vector)
    
    #tranforming the input vector using PCA
    input_vector_pca = pca.transform(input_vector_scaled)
    
    #calculating cosine similarity between the input restaurant and all others
    similarity_scores = cosine_similarity(input_vector_pca, encoded_features_pca)
    
    #getting the indices of the top N similar restaurants (above the similarity threshold)
    similar_restaurants = np.argsort(similarity_scores[0])[::-1]  # Sort indices by similarity (descending)
    
    #filtering restaurants that meet the similarity threshold
    recommended_restaurants = []
    for idx in similar_restaurants:
        if similarity_scores[0][idx] >= similarity_threshold:
            recommended_restaurants.append(cleaned_data.iloc[idx])
        if len(recommended_restaurants) >= top_n:
            break
    
    #returning the recommended restaurants or a message if none meet the threshold
    if not recommended_restaurants:
        return "No restaurants meet the similarity threshold."
    
    return pd.DataFrame(recommended_restaurants), similarity_scores  # Return both recommended restaurants and similarity scores

#example usage of the function
input_cuisine = "Indian"
input_city = "Abohar"
recommended_restaurants, similarity_scores = recommend_restaurant(input_cuisine, input_city, encoded_data, cleaned_data, top_n=10, similarity_threshold=0.1)

#displaying the recommended restaurants
print("Recommended Restaurants:")
print(recommended_restaurants)

#displaying similarity scores and PCA features
print("Similarity Scores:", similarity_scores)
print("Transformed PCA Features (Sample):")
print(encoded_features_pca[:5])  #displaying first 5 transformed features

  cleaned_data = pd.read_csv("cleaned_data.csv")  # Load the cleaned dataset


Checking for missing values in encoded data:
city_Abids & Koti,Hyderabad    0
city_Abohar                    0
city_Adajan,Surat              0
city_Adilabad                  0
city_Adityapur                 0
                              ..
cuisine_Waffle,Bakery          0
cuisine_Waffle,Beverages       0
cuisine_Waffle,Burgers         0
cuisine_Waffle,Desserts        0
cuisine_Waffle,Ice Cream       0
Length: 2416, dtype: int64
Explained Variance Ratio of PCA components: [0.00056471 0.00055108 0.00053958 0.00052409 0.00051863 0.00051334
 0.00051159 0.00050722 0.00050151 0.00050034]




Recommended Restaurants:
          id                     name    city  rating  rating_count   cost  \
7   156587.0        Bharawan Da Dhaba  Abohar     4.4   50+ ratings  300.0   
11  171675.0           Domino's Pizza  Abohar     4.4   20+ ratings  400.0   
1   158203.0        theka coffee desi  Abohar     3.8  100+ ratings  100.0   
12  351387.0          NIKKU VEG THALI  Abohar     4.1  100+ ratings  150.0   
14  161405.0  Bihari da vaishno dhaba  Abohar     4.0   20+ ratings  200.0   
4   156588.0         shere punjab veg  Abohar     4.0  100+ ratings  150.0   
6   156590.0         Sethi Milk Badam  Abohar     4.2   20+ ratings  100.0   
10  420675.0            Royal Chicken  Abohar     4.2   20+ ratings  200.0   
2   187912.0                Singh Hut  Abohar     3.7   20+ ratings  250.0   
9   530909.0               FOODY MOOD  Abohar     4.7   20+ ratings  300.0   

                 cuisine          lic_no  \
7                 Indian         license   
11                Pizzas  12