# Data Cleaning

In [2]:
import sqlite3
import pandas as pd
import numpy as np

# Load the raw restaurant table into a DataFrame. The connection is closed in
# a `finally` clause so it is released even if the query raises.
conn = sqlite3.connect("zomato.db")
try:
    df = pd.read_sql("SELECT * FROM zomato_restaurants;", conn)
finally:
    conn.close()

# Check for missing values
print(df.isnull().sum())

url                                0
address                            0
name                               0
online_order                       0
book_table                         0
rate                            7775
votes                              0
phone                           1208
location                          21
rest_type                        227
dish_liked                     28078
cuisines                          45
approx_cost(for two people)      346
reviews_list                       0
menu_item                          0
listed_in(type)                    0
listed_in(city)                    0
dtype: int64


# Rename columns

In [4]:
# Give the three verbosely named source columns short names.
short_column_names = {
    'approx_cost(for two people)': 'cost',
    'listed_in(type)': 'type',
    'listed_in(city)': 'city',
}
df.rename(columns=short_column_names, inplace=True)

# Drop unnecessary data

In [6]:
# Count fully duplicated rows (not columns) and drop them, keeping the first
# occurrence of each. The count is printed explicitly because a bare
# expression that is not the last line of a cell is silently discarded.
duplicate_row_count = int(df.duplicated().sum())
df.drop_duplicates(inplace=True)
print("Duplicate rows dropped:", duplicate_row_count)

In [7]:
# Rows missing 'location' or 'cuisines' are dropped rather than imputed: the
# initial null counts show only 21 and 45 such rows respectively.
required_columns = ['location', 'cuisines']
df.dropna(subset=required_columns, inplace=True)

In [8]:
# Remove the columns that are not used in the rest of this notebook.
unused_columns = ['url', 'address', 'phone', 'reviews_list', 'menu_item']
df.drop(columns=unused_columns, inplace=True)

In [9]:
# Report the frame's shape after the drops above.
# NOTE(review): the message says rows with excessive text were removed, but the
# visible cells drop duplicate rows, rows with null location/cuisines, and five
# columns — consider rewording the message.
shape_after_drops = df.shape
print("Rows with excessive text removed. New dataset shape:", shape_after_drops)

Rows with excessive text removed. New dataset shape: (51672, 12)


# Restructure data presentation & data types

In [11]:
# Inspect the current column dtypes (nothing is converted in this cell).
column_dtypes = df.dtypes
print(column_dtypes)

name            object
online_order    object
book_table      object
rate            object
votes            int64
location        object
rest_type       object
dish_liked      object
cuisines        object
cost            object
type            object
city            object
dtype: object


# Handle missing values

In [13]:
# Turn the textual stand-ins for "missing" into real NaN values so that the
# null-handling steps that follow can see them.
missing_tokens = ["None", "nan", "NaN"]
df.replace(missing_tokens, np.nan, inplace=True)

In [14]:
# Encode the online-order / book-table flags as integers:
# 1 for "Yes", 0 for "No", 2 for anything else (irrelevant).
def map_yes_no_irrelevant(value):
    """Return 1 for "Yes", 0 for "No", and 2 for any other value."""
    if value == "Yes":
        return 1
    return 0 if value == "No" else 2

# Encode both flag columns with the same mapping and store them as integers.
for flag_column in ('online_order', 'book_table'):
    df[flag_column] = df[flag_column].apply(map_yes_no_irrelevant).astype(int)

In [15]:
# dealing with numerics [-1 for irrelevant]

# Helper: coerce a column to numeric and fill whatever could not be parsed.
def clean_numeric_column(series, fill_with_median=True):
    """Convert a column to numeric values and fill the unparseable entries.

    Every value is stringified, commas are removed, and the result is parsed
    with ``pd.to_numeric(errors='coerce')`` so anything non-numeric becomes NaN.

    Parameters:
        series: the pandas Series to clean.
        fill_with_median: when True, NaNs are filled with the median of the
            parsed values; when False, they are filled with -1.

    Returns:
        A new numeric Series with the fill applied.
    """
    without_commas = series.astype(str).str.replace(',', '', regex=True)
    numeric = pd.to_numeric(without_commas, errors='coerce')

    # The median is computed on the parsed values, so NaNs do not influence it.
    fill_value = numeric.median() if fill_with_median else -1
    return numeric.fillna(fill_value)

# 'rate': keep the first run of digits/dots from each value, then coerce to
# numeric and fill the gaps with the median.
rate_text = df['rate'].astype(str).str.extract(r'([\d.]+)', expand=False)
df['rate'] = clean_numeric_column(rate_text, fill_with_median=True)

# 'cost': strip commas, fill the gaps with the median, store as integers.
cost_numeric = clean_numeric_column(df['cost'], fill_with_median=True)
df['cost'] = cost_numeric.astype(int)

# 'votes': strip commas, mark anything non-numeric with -1, store as integers.
votes_numeric = clean_numeric_column(df['votes'], fill_with_median=False)
df['votes'] = votes_numeric.astype(int)

In [16]:
# Fill the remaining categorical gaps with a per-column placeholder.
categorical_defaults = {'dish_liked': "Not Available", 'rest_type': "Other"}
categorical_columns = list(categorical_defaults)
df[categorical_columns] = df[categorical_columns].fillna(categorical_defaults)

In [17]:
# City names accepted as-is; any other value is replaced below.
valid_cities = [
    "Church Street", "Brigade Road", "MG Road", "Lavelle Road", "Residency Road",
    "Indiranagar", "Old Airport Road", "Whitefield", "Malleshwaram", "Frazer Town",
    "Bellandur", "Sarjapur Road", "Koramangala 4th Block", "Koramangala 5th Block",
    "Brookefield", "Koramangala 6th Block", "Koramangala 7th Block", "Marathahalli",
    "Electronic City", "BTM", "HSR", "Rajajinagar", "Kalyan Nagar", "Kammanahalli",
    "Jayanagar", "JP Nagar", "New BEL Road", "Bannerghatta Road", "Basavanagudi",
    "Banashankari"
]

# Missing cities get the placeholder first ...
city_filled = df['city'].fillna("Not Available")

# ... then every value outside the accepted list gets the same placeholder
# (vectorized membership test instead of a per-row lambda).
is_valid_city = city_filled.isin(valid_cities)
df['city'] = city_filled.where(is_valid_city, "Not Available")

print("Invalid city names have been replaced with 'Not Available'.")

Invalid city names have been replaced with 'Not Available'.


In [18]:
# Confirm that no column has nulls left after the cleaning steps.
remaining_nulls = df.isna().sum()
print(remaining_nulls)

name            0
online_order    0
book_table      0
rate            0
votes           0
location        0
rest_type       0
dish_liked      0
cuisines        0
cost            0
type            0
city            0
dtype: int64


In [19]:
# Show the column dtypes after cleaning.
cleaned_dtypes = df.dtypes
print(cleaned_dtypes)

name             object
online_order      int32
book_table        int32
rate            float64
votes             int32
location         object
rest_type        object
dish_liked       object
cuisines         object
cost              int32
type             object
city             object
dtype: object


# Save cleaned data to SQLite and a local CSV file

In [21]:
# Persist the cleaned frame as table 'zomato_cleaned', replacing any existing
# table of that name. try/finally closes the connection even if the write fails.
conn = sqlite3.connect("zomato.db")
try:
    df.to_sql("zomato_cleaned", conn, if_exists="replace", index=False)
finally:
    conn.close()

print("Data cleaning complete! Cleaned data saved as 'zomato_cleaned'.")

Data cleaning complete! Cleaned data saved as 'zomato_cleaned'.


In [22]:
# Read the cleaned table back from SQLite and export it to the working
# directory as CSV. try/finally closes the connection even if the read fails.
conn = sqlite3.connect("zomato.db")
try:
    df_cleaned = pd.read_sql("SELECT * FROM zomato_cleaned;", conn)
finally:
    conn.close()

# Save to CSV
df_cleaned.to_csv("zomato_cleaned.csv", index=False)
print("Cleaned data saved as 'zomato_cleaned.csv'.")

Cleaned data saved as 'zomato_cleaned.csv'.
