# Zomato Data Cleaning and Preprocessing
This notebook demonstrates how to clean and preprocess the Zomato dataset using pandas and numpy.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np

In [None]:
# Step 1: Read the raw Zomato CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='zomato.csv')

In [None]:
# Step 2: Remove the columns that are not kept in the cleaned dataset
columns_to_drop = [
    'url',
    'phone',
    'rest_type',
    'dish_liked',
    'reviews_list',
    'menu_item',
    'listed_in(city)',
]
df = df.drop(labels=columns_to_drop, axis='columns')

In [None]:
# Step 3: Give three columns shorter, more readable names
df = df.rename(
    {
        'approx_cost(for two people)': 'two_people_cost',
        'listed_in(type)': 'type_of_restaurant',
        'rate': 'rating',
    },
    axis='columns',
)

In [None]:
# Step 4: Keep only the rows that have a 'location' value
df = df.dropna(axis='index', subset=['location'])

In [None]:
# Step 5: Discard rows lacking either 'cuisines' or 'two_people_cost'
df = df.dropna(
    axis='index',
    subset=['cuisines', 'two_people_cost'],
)

In [None]:
# Step 6: Convert 'two_people_cost' to numeric and get per person cost
# The values are cast to text first so that stripping the thousands separator
# also works when the column was loaded with a numeric dtype (the .str
# accessor only exists on string data). regex=False makes the comma a literal
# match regardless of the installed pandas version's default. pd.to_numeric
# still raises on text that is not a number, so malformed costs are not
# silently hidden.
df['two_people_cost'] = pd.to_numeric(
    df['two_people_cost'].astype(str).str.replace(',', '', regex=False)
) / 2
# The column now holds half of the two-person cost, so name it accordingly.
df = df.rename(columns={'two_people_cost': 'cost_per_person'})

In [None]:
# Step 7: Define a function to clean and convert the 'rating' column
def handle_rating(value):
    """Convert one raw rating entry to a float.

    Args:
        value: A raw rating such as ``'4.1/5'``, a placeholder (``'NEW'`` or
            ``'-'``), a number, NaN, or ``None``.

    Returns:
        The part of the entry before the first ``'/'`` as a float, or
        ``np.nan`` when the entry is ``None``, NaN, or a placeholder.

    Raises:
        ValueError: If the entry is neither a placeholder nor parseable as a
            number.
    """
    if value is None:
        return np.nan
    # Strip surrounding whitespace so padded placeholders such as ' NEW' or
    # '- ' are recognised instead of failing in float().
    text = str(value).strip()
    if text in ('NEW', '-'):
        return np.nan
    # A NaN input becomes the text 'nan', which float() turns back into NaN.
    return float(text.split('/')[0])

In [None]:
# Step 8: Parse every rating with handle_rating, then replace the missing
# values with the mean of the parsed ratings
df['rating'] = (
    df['rating']
    .apply(handle_rating)
    .pipe(lambda parsed: parsed.fillna(parsed.mean()))
)

In [None]:
# Step 9: Write the cleaned DataFrame to a new CSV file, leaving out the index
df.to_csv(path_or_buf='zomato_data_analysis.csv', index=False)