# Zomato User Order Analysis - India

**Course:** 602 - Data Analytics using Python

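This project analyzes Zomato food delivery orders across major Indian cities to understand user ordering patterns and tip behavior, and uses machine learning to model tips: linear regression to predict the tip amount and logistic regression to classify high versus low tippers.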

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import warnings
# Silence every Python warning raised after this point.
# NOTE(review): this also hides library deprecation warnings — consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Global plot appearance: matplotlib style sheet plus seaborn's 'husl' colour palette.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

## Task 1: Data Understanding

Loading the dataset and exploring its structure.

In [None]:
# Load the dataset
# Reads the orders CSV from a path relative to the current working directory
# into `df`, then shows (rows, columns) as the cell output.
df = pd.read_csv('../data/zomato_orders.csv')
df.shape

In [None]:
# First 5 rows (pandas' default for head()) of the raw frame
df.head()

In [None]:
# Last 5 rows (pandas' default for tail()) of the raw frame
df.tail()

In [None]:
# Column names and data types (one dtype per column, as inferred by read_csv)
df.dtypes

In [None]:
# Basic info: prints index range, per-column non-null counts, dtypes and memory usage
df.info()

### Data Type Classification

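**Quantitative Data:**
- Discrete: Order_Count_Year, Year
- Continuous: Order_Amount, Tip_Amount, Delivery_Time_Mins

**Qualitative Data:**
- Nominal: City, Restaurant_Name, Cuisine_Type, User_Name, Payment_Method
- Ordinal: Day_of_Week, Month (Month may be stored as a number, but it labels an ordered calendar position rather than a measured quantity, so it is treated as ordinal here)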

## Task 2: Exploratory Data Analysis (EDA)

### Univariate Analysis

In [None]:
# Order Amount Distribution: histogram (left panel) beside a boxplot (right panel)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
hist_ax, box_ax = axes
raw_order_amounts = df['Order_Amount']

hist_ax.hist(raw_order_amounts, bins=30, color='#e74c3c', edgecolor='white')
hist_ax.set(title='Order Amount Distribution', xlabel='Amount (₹)', ylabel='Frequency')

box_ax.boxplot(raw_order_amounts)
box_ax.set(title='Order Amount Boxplot', ylabel='Amount (₹)')

# Save before show() so the figure is written while it is still current.
fig.tight_layout()
fig.savefig('../outputs/order_amount_dist.png', dpi=100)
plt.show()

In [None]:
# Tip Amount Distribution: histogram and boxplot of the tips that are present
tip_data = df['Tip_Amount'].dropna()  # missing tips are excluded from both panels

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
tip_hist_ax, tip_box_ax = axes

tip_hist_ax.hist(tip_data, bins=30, color='#27ae60', edgecolor='white')
tip_hist_ax.set(title='Tip Amount Distribution', xlabel='Tip (₹)')

tip_box_ax.boxplot(tip_data)
tip_box_ax.set(title='Tip Amount Boxplot')

fig.tight_layout()
fig.savefig('../outputs/tip_amount_dist.png', dpi=100)
plt.show()

In [None]:
# Orders by City: one bar per city, tallest first (value_counts sorts descending)
city_counts = df['City'].value_counts()

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(city_counts.index, city_counts.values, color='#3498db', edgecolor='white')
ax.set(title='Orders by City', xlabel='City', ylabel='Number of Orders')
plt.xticks(rotation=45, ha='right')  # acts on the current axes, i.e. the one just created
fig.tight_layout()
fig.savefig('../outputs/orders_by_city.png', dpi=100)
plt.show()

### Bivariate Analysis

In [None]:
# Order Amount vs Tip Amount
# Keep only rows where BOTH values are present. Dropping NaNs from each column
# separately yields arrays of different lengths (or misaligned pairs), which
# makes np.polyfit fail or fit the wrong pairs.
paired = df[['Order_Amount', 'Tip_Amount']].dropna()

plt.figure(figsize=(8, 5))
plt.scatter(paired['Order_Amount'], paired['Tip_Amount'], alpha=0.4, c='#e74c3c')
plt.title('Order Amount vs Tip Amount')
plt.xlabel('Order Amount (₹)')
plt.ylabel('Tip Amount (₹)')

# Trend line: least-squares straight line (degree-1 polynomial).
# A line needs at least two points, so skip it on (near-)empty data.
if len(paired) >= 2:
    z = np.polyfit(paired['Order_Amount'], paired['Tip_Amount'], 1)
    p = np.poly1d(z)
    x_sorted = paired['Order_Amount'].sort_values()
    plt.plot(x_sorted, p(x_sorted), 'k--', linewidth=2)

plt.tight_layout()
plt.savefig('../outputs/order_vs_tip.png', dpi=100)
plt.show()

In [None]:
# Correlation Matrix: pairwise correlations of the numeric columns as an annotated heatmap
numeric_cols = ['Order_Amount', 'Tip_Amount', 'Delivery_Time_Mins', 'Order_Count_Year']
corr = df.loc[:, numeric_cols].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', center=0, square=True, fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix')
fig.tight_layout()
fig.savefig('../outputs/correlation_matrix.png', dpi=100)
plt.show()

### Multivariate Analysis

In [None]:
# Pairplot: scatter matrix of the three numeric columns with KDE curves on the diagonal.
# Rows with any missing value are dropped first.
pairplot_data = df[['Order_Amount', 'Tip_Amount', 'Delivery_Time_Mins']].dropna()
sns.pairplot(pairplot_data, diag_kind='kde')
plt.suptitle('Pair Plot', y=1.02)
# The title sits above the figure area (y > 1), so the bounding box must be
# expanded on save or the title is clipped from the PNG.
plt.savefig('../outputs/pairplot.png', dpi=100, bbox_inches='tight')
plt.show()

In [None]:
# Average Order by City and Cuisine: mean order amount per (city, cuisine) cell
pivot = df.pivot_table(
    index='City',
    columns='Cuisine_Type',
    values='Order_Amount',
    aggfunc='mean',
)

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(pivot, annot=True, fmt='.0f', cmap='YlOrRd', ax=ax)
ax.set_title('Average Order Amount: City vs Cuisine')
fig.tight_layout()
fig.savefig('../outputs/city_cuisine_heatmap.png', dpi=100)
plt.show()

## Task 3: Handling Missing Data and Outliers

In [None]:
# Check missing values: per-column null counts, showing only columns that have any
missing = df.isna().sum()
missing.loc[missing.gt(0)]

In [None]:
# Handle missing values
# Build the cleaned frame in one non-mutating step: missing tips get the median
# tip, missing delivery times get the mean delivery time.
# A dict-based fillna returns a NEW frame. This avoids `col.fillna(..., inplace=True)`
# on a column selection, which is deprecated and has no effect on the parent
# frame under pandas Copy-on-Write.
df_clean = df.fillna({
    'Tip_Amount': df['Tip_Amount'].median(),
    'Delivery_Time_Mins': df['Delivery_Time_Mins'].mean(),
})

# Verify: total count of remaining nulls across all columns
df_clean.isnull().sum().sum()

In [None]:
# Outlier Detection: one boxplot per numeric column of the cleaned data
outlier_cols = ['Order_Amount', 'Tip_Amount', 'Delivery_Time_Mins']
fig, axes = plt.subplots(1, len(outlier_cols), figsize=(12, 4))

for panel, col in zip(axes, outlier_cols):
    panel.boxplot(df_clean[col])
    panel.set_title(f'{col}')

fig.suptitle('Outlier Detection')
fig.tight_layout()
fig.savefig('../outputs/outliers.png', dpi=100)
plt.show()

**Outlier Impact:**
- High order amounts represent premium restaurant orders or group orders
- High tips correlate with larger orders
- Delivery time outliers may indicate traffic or preparation delays

## Task 4: Spread of Data

In [None]:
# Statistical Summary: count, mean, std, min, quartiles and max for the three numeric columns
df_clean[['Order_Amount', 'Tip_Amount', 'Delivery_Time_Mins']].describe()

In [None]:
# Skewness and Kurtosis: print both shape statistics for each numeric column
for col in ('Order_Amount', 'Tip_Amount', 'Delivery_Time_Mins'):
    data = df_clean[col]
    skew_value = data.skew()
    kurtosis_value = data.kurtosis()
    print(f"{col}: Skewness = {skew_value:.2f}, Kurtosis = {kurtosis_value:.2f}")

In [None]:
# Distribution Plots: histogram + KDE per column, with mean and median marked
dist_cols = ['Order_Amount', 'Tip_Amount', 'Delivery_Time_Mins']
colors = ['#e74c3c', '#27ae60', '#3498db']
fig, axes = plt.subplots(1, 3, figsize=(14, 4))

for panel, col, shade in zip(axes, dist_cols, colors):
    series = df_clean[col]
    sns.histplot(series, kde=True, ax=panel, color=shade)
    # Vertical reference lines: the gap between them hints at skew.
    panel.axvline(series.mean(), color='red', linestyle='--', label='Mean')
    panel.axvline(series.median(), color='green', linestyle='--', label='Median')
    panel.set_title(col)
    panel.legend()

fig.tight_layout()
fig.savefig('../outputs/distribution.png', dpi=100)
plt.show()

## Task 5: Automating EDA

In [None]:
# Using describe(): one-call summary statistics for the cleaned frame's numeric columns
df_clean.describe()

In [None]:
# Correlation using corr(): pairwise correlation matrix of the four numeric columns (cleaned data)
df_clean[['Order_Amount', 'Tip_Amount', 'Delivery_Time_Mins', 'Order_Count_Year']].corr()

In [None]:
# Reusable EDA function
def quick_eda(dataframe, column):
    """Summarise one numeric column of ``dataframe``.

    Missing values in ``column`` are dropped before any statistic is computed.

    Returns a ``pd.Series`` whose index is, in order: count, mean, median,
    std, min, max, skewness, kurtosis.
    """
    values = dataframe[column].dropna()
    stat_names = ['count', 'mean', 'median', 'std', 'min', 'max', 'skewness', 'kurtosis']
    stat_values = [
        len(values),
        values.mean(),
        values.median(),
        values.std(),
        values.min(),
        values.max(),
        values.skew(),
        values.kurtosis(),
    ]
    return pd.Series(dict(zip(stat_names, stat_values)))

# Example: summary statistics for the cleaned Order_Amount column, shown as the cell output
quick_eda(df_clean, 'Order_Amount')

## Task 6: Regression Analysis

In [None]:
# Variables:
# Dependent (Y): Tip_Amount
# Independent (X): Order_Amount, Delivery_Time_Mins

# Covariance
# Pairwise covariance matrix of the three columns. Values are in the product of
# the columns' units, so magnitudes are not comparable across pairs.
df_clean[['Order_Amount', 'Tip_Amount', 'Delivery_Time_Mins']].cov()

In [None]:
# Correlation
corr_val = df_clean['Order_Amount'].corr(df_clean['Tip_Amount'])
print(f"Order Amount ↔ Tip Amount correlation: {corr_val:.3f}")

# Derive the wording from the computed coefficient instead of hard-coding it,
# so the printed interpretation can never contradict the number above.
# Thresholds are a common rule of thumb: |r| < 0.1 negligible, < 0.3 weak,
# < 0.7 moderate, otherwise strong.
if np.isnan(corr_val):
    print("Interpretation: Correlation is undefined (too few rows or a constant column)")
elif abs(corr_val) < 0.1:
    print("Interpretation: Negligible correlation - order amount says little about the tip")
else:
    magnitude = abs(corr_val)
    strength = 'Weak' if magnitude < 0.3 else 'Moderate' if magnitude < 0.7 else 'Strong'
    direction = 'positive' if corr_val > 0 else 'negative'
    tip_trend = 'higher' if corr_val > 0 else 'lower'
    print(f"Interpretation: {strength} {direction} correlation - higher orders receive {tip_trend} tips")

## Task 7: Supervised Learning - Regression Models

In [None]:
# Prepare data: modelling frame, single-feature X, target y, and an 80/20 split
# NOTE(review): the target comes from df_clean, so it includes median-filled
# tips — confirm that training on imputed target values is intended.
model_cols = ['Order_Amount', 'Tip_Amount', 'Delivery_Time_Mins']
ml_data = df_clean.loc[:, model_cols].dropna()

X = ml_data[['Order_Amount']]   # 2-D frame, as scikit-learn expects for features
y = ml_data['Tip_Amount']

# Fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"Training: {len(X_train)}, Testing: {len(X_test)}")

In [None]:
# Simple Linear Regression: Tip_Amount explained by Order_Amount alone
lr_simple = LinearRegression().fit(X_train, y_train)  # fit() returns the estimator itself
y_pred_simple = lr_simple.predict(X_test)

slope = lr_simple.coef_[0]
intercept = lr_simple.intercept_
print(f"Coefficient: {slope:.4f}")
print(f"Intercept: {intercept:.4f}")
print(f"Equation: Tip = {intercept:.2f} + {slope:.4f} × Order_Amount")

In [None]:
# Multiple Linear Regression: Tip_Amount explained by order amount and delivery time
X_multi = ml_data[['Order_Amount', 'Delivery_Time_Mins']]

# Same test_size and random_state as the single-feature split.
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(
    X_multi, y, test_size=0.2, random_state=42
)

lr_multi = LinearRegression().fit(X_train_m, y_train_m)
y_pred_multi = lr_multi.predict(X_test_m)

coef_by_feature = {name: weight for name, weight in zip(X_multi.columns, lr_multi.coef_)}
print(f"Coefficients: {coef_by_feature}")

In [None]:
# Actual vs Predicted Plot: one panel per model; the dashed diagonal marks perfect prediction
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

model_panels = [
    (axes[0], y_test, y_pred_simple, '#e74c3c', 'Simple Linear Regression'),
    (axes[1], y_test_m, y_pred_multi, '#3498db', 'Multiple Linear Regression'),
]
for panel, actual, predicted, shade, panel_title in model_panels:
    panel.scatter(actual, predicted, alpha=0.5, c=shade)
    lowest, highest = actual.min(), actual.max()
    panel.plot([lowest, highest], [lowest, highest], 'k--')
    panel.set_xlabel('Actual')
    panel.set_ylabel('Predicted')
    panel.set_title(panel_title)

fig.tight_layout()
fig.savefig('../outputs/regression_comparison.png', dpi=100)
plt.show()

## Task 8-9: Overfitting and Underfitting

In [None]:
# Training vs Testing Error: score the simple model on both splits and tabulate
y_train_pred = lr_simple.predict(X_train)

train_mse, train_r2 = mean_squared_error(y_train, y_train_pred), r2_score(y_train, y_train_pred)
test_mse, test_r2 = mean_squared_error(y_test, y_pred_simple), r2_score(y_test, y_pred_simple)

results = pd.DataFrame({
    'Metric': ['MSE', 'R² Score'],
    'Training': [train_mse, train_r2],
    'Testing': [test_mse, test_r2],
})
results

In [None]:
# Visualization: training vs testing bars for MSE (left) and R² (right)
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
split_names = ['Training', 'Testing']
split_colors = ['#e74c3c', '#3498db']

metric_panels = [
    (axes[0], [train_mse, test_mse], 'MSE Comparison', 'MSE'),
    (axes[1], [train_r2, test_r2], 'R² Score Comparison', 'R²'),
]
for panel, heights, panel_title, y_label in metric_panels:
    panel.bar(split_names, heights, color=split_colors)
    panel.set_title(panel_title)
    panel.set_ylabel(y_label)

fig.tight_layout()
fig.savefig('../outputs/overfitting_analysis.png', dpi=100)
plt.show()

# Interpretation
# Overfitting means the model does WORSE on unseen data than on training data,
# so only a test error that exceeds the training error counts. The gap is
# signed (no abs()), so a test MSE lower than the train MSE is never flagged.
if train_mse > 0:
    relative_gap = (test_mse - train_mse) / train_mse
else:
    # Perfect training fit: avoid dividing by zero. Any positive test error is
    # then an unbounded relative gap.
    relative_gap = float('inf') if test_mse > 0 else 0.0

if relative_gap < 0.2:  # tolerate up to a 20% higher test error
    print("Model generalizes well - no significant overfitting")
else:
    print("Potential overfitting detected")

## Task 10: Classification Task

In [None]:
# Create binary target: High Tipper (above median) vs Low Tipper
# The comparison is strict, so tips exactly equal to the median are labelled 0 (low).
median_tip = df_clean['Tip_Amount'].median()
is_high_tipper = df_clean['Tip_Amount'].gt(median_tip)
df_clean['High_Tipper'] = is_high_tipper.astype(int)

print(f"Median Tip: ₹{median_tip:.2f}")
df_clean['High_Tipper'].value_counts()

In [None]:
# Logistic Regression: predict High_Tipper from order amount and delivery time
class_features = ['Order_Amount', 'Delivery_Time_Mins']
X_class = df_clean.loc[:, class_features].dropna()
# Select labels by the surviving row index so features and target stay aligned.
y_class = df_clean.loc[X_class.index, 'High_Tipper']

X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_class, y_class, test_size=0.2, random_state=42
)

log_reg = LogisticRegression(random_state=42).fit(X_train_c, y_train_c)
y_pred_c = log_reg.predict(X_test_c)

## Task 11-12: Model Evaluation

In [None]:
# Regression Metrics: MSE, MAE and R² on the held-out split for both linear models
evaluated_models = [
    ("Simple Linear Regression:", y_test, y_pred_simple),
    ("\nMultiple Linear Regression:", y_test_m, y_pred_multi),
]
for heading, actual_tips, predicted_tips in evaluated_models:
    print(heading)
    print(f"  MSE: {mean_squared_error(actual_tips, predicted_tips):.4f}")
    print(f"  MAE: {mean_absolute_error(actual_tips, predicted_tips):.4f}")
    print(f"  R²:  {r2_score(actual_tips, predicted_tips):.4f}")

In [None]:
# Classification Metrics: overall accuracy, then per-class precision/recall/F1
test_accuracy = accuracy_score(y_test_c, y_pred_c)
class_report = classification_report(
    y_test_c, y_pred_c, target_names=['Low Tipper', 'High Tipper']
)
print(f"Accuracy: {test_accuracy:.4f}")
print("\nClassification Report:")
print(class_report)

In [None]:
# Confusion Matrix: rows are actual classes, columns are predicted classes
cm = confusion_matrix(y_test_c, y_pred_c)
tipper_names = ['Low Tipper', 'High Tipper']

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=tipper_names,
    yticklabels=tipper_names,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
fig.tight_layout()
fig.savefig('../outputs/confusion_matrix.png', dpi=100)
plt.show()

## Task 13: Data Visualization & Insights

In [None]:
# Dashboard: four summary panels on a single 2x2 figure
fig = plt.figure(figsize=(14, 10))
ax1, ax2, ax3, ax4 = (fig.add_subplot(2, 2, slot) for slot in range(1, 5))

# Top Restaurants: the ten most frequent restaurant names, largest at the top
top_rest = df_clean['Restaurant_Name'].value_counts().head(10)
ax1.barh(top_rest.index, top_rest.values, color='#e74c3c')
ax1.set_title('Top 10 Restaurants')
ax1.invert_yaxis()

# Orders by Day: counts re-ordered Monday..Sunday (a day absent from the data becomes NaN)
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_counts = df_clean['Day_of_Week'].value_counts().reindex(day_order)
ax2.bar(day_counts.index, day_counts.values, color='#3498db')
ax2.set_title('Orders by Day')
plt.setp(ax2.xaxis.get_majorticklabels(), rotation=45, ha='right')

# Payment Methods: share of orders per payment method
payment = df_clean['Payment_Method'].value_counts()
ax3.pie(payment.values, labels=payment.index, autopct='%1.1f%%')
ax3.set_title('Payment Methods')

# Monthly Trend: order count per (Year, Month), labelled 'YYYY-MM'
monthly = df_clean.groupby(['Year', 'Month']).size().reset_index(name='Orders')
year_text = monthly['Year'].astype(str)
month_text = monthly['Month'].astype(str).str.zfill(2)
monthly['Period'] = year_text + '-' + month_text
ax4.plot(monthly['Period'], monthly['Orders'], marker='o', color='#27ae60')
ax4.set_title('Monthly Trend')
# Show only every 4th tick label to keep the axis readable.
for position, tick_label in enumerate(ax4.xaxis.get_ticklabels()):
    if position % 4:
        tick_label.set_visible(False)
plt.setp(ax4.xaxis.get_majorticklabels(), rotation=45, ha='right')

fig.tight_layout()
fig.savefig('../outputs/dashboard.png', dpi=100)
plt.show()

In [None]:
# Top Users Analysis: per-user order count, total spend and total tips, busiest users first
# NOTE(review): grouping is by User_Name; if two distinct users share a name
# their orders are merged — confirm names are unique or group by an ID instead.
top_users = (
    df_clean
    .groupby('User_Name')
    .agg(
        Total_Orders=('Order_ID', 'count'),
        Total_Spent=('Order_Amount', 'sum'),
        Total_Tips=('Tip_Amount', 'sum'),
    )
    .sort_values('Total_Orders', ascending=False)
)

top_users.head(10)

In [None]:
# Largest Orders: the ten highest order amounts with who, where and the tip
order_detail_cols = ['User_Name', 'Restaurant_Name', 'Order_Amount', 'Tip_Amount', 'City']
df_clean.loc[:, order_detail_cols].nlargest(10, 'Order_Amount')

In [None]:
# Project Summary: headline figures for the cleaned dataset, shown as a Series
clean_orders = df_clean['Order_Amount']
clean_tips = df_clean['Tip_Amount']
clean_restaurants = df_clean['Restaurant_Name']

summary = {
    'Total Orders': len(df_clean),
    'Total Users': df_clean['User_ID'].nunique(),
    'Total Restaurants': clean_restaurants.nunique(),
    'Cities Covered': df_clean['City'].nunique(),
    # Money values are pre-formatted as strings with the rupee sign.
    'Total Order Value': f"₹{clean_orders.sum():,.0f}",
    'Total Tips': f"₹{clean_tips.sum():,.0f}",
    'Average Order': f"₹{clean_orders.mean():.0f}",
    'Average Tip': f"₹{clean_tips.mean():.0f}",
    # mode() can return several values on ties; [0] takes the first one.
    'Most Popular Restaurant': clean_restaurants.mode()[0],
    'Most Popular Cuisine': df_clean['Cuisine_Type'].mode()[0],
}

pd.Series(summary)