# Zomato User Order Analysis Across India

**Course:** 602 - Data Analytics using Python

**Project Title:** Exploratory Data Analysis and Order Prediction for Zomato Users in India

---

## Dataset Overview
- **Source:** Synthetic data generated for analysis
- **Records:** 5000 orders from 800 users
- **Domain:** Food Delivery / Business Analytics

In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import warnings
# Silence library warnings so the notebook output stays readable.
# NOTE(review): a blanket 'ignore' also hides deprecation notices — consider narrowing it.
warnings.filterwarnings('ignore')

# Global plotting defaults: matplotlib whitegrid style plus seaborn's 'husl' palette
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')
print('✅ Libraries imported successfully!')

---
## Task 1: Data Understanding

In [None]:
# Read the orders CSV into `df`; the path is relative to the working directory
orders_csv = '../data/zomato_orders.csv'
df = pd.read_csv(orders_csv)
print('Dataset loaded successfully!')
print(f'Shape: {df.shape}')

In [None]:
# Preview the first 5 rows (the trailing expression is rendered by the notebook)
print('\n📊 First 5 Rows:')
df.head()

In [None]:
# Preview the last 5 rows (the trailing expression is rendered by the notebook)
print('\n📊 Last 5 Rows:')
df.tail()

In [None]:
# Report the dataset's dimensions and list its column names
print(f'\n📐 Dataset Shape: {df.shape[0]} rows × {df.shape[1]} columns')
print(f'\n📋 Column Names:\n{df.columns.tolist()}')

In [None]:
# Show the dtype pandas inferred for every column
print('\n🔢 Data Types:')
df.dtypes

In [None]:
# Identify Quantitative and Qualitative Data
# The grouping below is a hand-written summary; nothing here is derived from `df`.
print('\n📊 DATA TYPE CLASSIFICATION')
print('=' * 50)

print('\n🔢 QUANTITATIVE DATA:')
print('  Discrete: Order_Count_Year, Year, Month')
print('  Continuous: Order_Amount, Tip_Amount, Delivery_Time_Mins')

print('\n📝 QUALITATIVE DATA:')
print('  Nominal: City, Restaurant_Name, Cuisine_Type, User_Name, Payment_Method')
print('  Ordinal: Day_of_Week, Month')

---
## Task 2: Exploratory Data Analysis (EDA)

### 2.1 Univariate Analysis

In [None]:
# Univariate Analysis - Order Amount
# Two panels: a 30-bin histogram (left) and a boxplot (right) of Order_Amount.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Histogram
axes[0].hist(df['Order_Amount'], bins=30, color='#FF6B6B', edgecolor='white', alpha=0.8)
axes[0].set_title('Distribution of Order Amount', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Order Amount (₹)')
axes[0].set_ylabel('Frequency')

# Boxplot
axes[1].boxplot(df['Order_Amount'], vert=True)
axes[1].set_title('Boxplot of Order Amount', fontsize=14, fontweight='bold')
axes[1].set_ylabel('Order Amount (₹)')

plt.tight_layout()
# Save before show() so the written file contains the rendered figure
plt.savefig('../outputs/univariate_order_amount.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Univariate Analysis - Tip Amount
# Two panels: a 30-bin histogram (left) and a boxplot (right) of Tip_Amount.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Histogram — missing tips are dropped first so both panels use the same values
tip_data = df['Tip_Amount'].dropna()
axes[0].hist(tip_data, bins=30, color='#4ECDC4', edgecolor='white', alpha=0.8)
axes[0].set_title('Distribution of Tip Amount', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Tip Amount (₹)')
axes[0].set_ylabel('Frequency')

# Boxplot
axes[1].boxplot(tip_data, vert=True)
axes[1].set_title('Boxplot of Tip Amount', fontsize=14, fontweight='bold')
axes[1].set_ylabel('Tip Amount (₹)')

plt.tight_layout()
plt.savefig('../outputs/univariate_tip_amount.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Bar chart of how many orders each city has, one viridis shade per bar
city_counts = df['City'].value_counts()
colors = plt.cm.viridis(np.linspace(0, 1, len(city_counts)))

plt.figure(figsize=(12, 6))
bars = plt.bar(city_counts.index, city_counts.values, color=colors, edgecolor='white')
plt.title('Orders by City', fontsize=14, fontweight='bold')
plt.xlabel('City')
plt.ylabel('Number of Orders')
plt.xticks(rotation=45, ha='right')

# Write each bar's count just above its top edge, centred horizontally
for rect, count in zip(bars, city_counts.values):
    label_x = rect.get_x() + rect.get_width()/2
    label_y = rect.get_height() + 5
    plt.text(label_x, label_y, str(count), ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.savefig('../outputs/orders_by_city.png', dpi=150, bbox_inches='tight')
plt.show()

### 2.2 Bivariate Analysis

In [None]:
# Order Amount vs Tip Amount: scatter plot with a fitted straight trend line
plt.figure(figsize=(10, 6))
plt.scatter(df['Order_Amount'], df['Tip_Amount'], alpha=0.5, c='#FF6B6B', edgecolors='white', s=50)
plt.title('Order Amount vs Tip Amount', fontsize=14, fontweight='bold')
plt.xlabel('Order Amount (₹)')
plt.ylabel('Tip Amount (₹)')

# Add trend line.
# Drop rows where EITHER value is missing so x and y stay aligned row by row;
# dropping NaNs from each column separately gives np.polyfit arrays of
# different lengths and makes it raise.
paired = df[['Order_Amount', 'Tip_Amount']].dropna()
z = np.polyfit(paired['Order_Amount'], paired['Tip_Amount'], 1)
p = np.poly1d(z)
# Evaluate the fitted line over the sorted x values so it is drawn left to right
sorted_amounts = df['Order_Amount'].sort_values()
plt.plot(sorted_amounts, p(sorted_amounts),
         'r--', linewidth=2, label='Trend Line')
plt.legend()
plt.tight_layout()
plt.savefig('../outputs/bivariate_order_vs_tip.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Order Amount vs Delivery Time: plain scatter plot, no trend line
plt.figure(figsize=(10, 6))
plt.scatter(df['Order_Amount'], df['Delivery_Time_Mins'], alpha=0.5, c='#4ECDC4', edgecolors='white', s=50)
plt.title('Order Amount vs Delivery Time', fontsize=14, fontweight='bold')
plt.xlabel('Order Amount (₹)')
plt.ylabel('Delivery Time (Minutes)')
plt.tight_layout()
plt.savefig('../outputs/bivariate_order_vs_delivery.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Annotated heatmap of pairwise correlations between the numeric columns
numeric_cols = ['Order_Amount', 'Tip_Amount', 'Delivery_Time_Mins', 'Order_Count_Year', 'Year', 'Month']
corr_matrix = df[numeric_cols].corr()

# Diverging colours centred on zero; each cell shows its value to 2 decimals
heatmap_opts = {
    'annot': True,
    'cmap': 'RdYlGn',
    'center': 0,
    'square': True,
    'linewidths': 0.5,
    'fmt': '.2f',
}
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, **heatmap_opts)
plt.title('Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('../outputs/correlation_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

### 2.3 Multivariate Analysis

In [None]:
# Pairwise scatter grid (KDE on the diagonal) for the three continuous measures
pairplot_cols = ['Order_Amount', 'Tip_Amount', 'Delivery_Time_Mins']
pair_frame = df[pairplot_cols].dropna()
scatter_opts = {'alpha': 0.5, 'edgecolor': 'white'}
sns.pairplot(pair_frame, diag_kind='kde', plot_kws=scatter_opts)
# y=1.02 places the title just above the grid
plt.suptitle('Pair Plot: Order, Tip, Delivery Time', y=1.02, fontsize=14, fontweight='bold')
plt.savefig('../outputs/multivariate_pairplot.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Average Order Amount by City and Cuisine
# Rows are cities, columns are cuisine types, each cell is the mean Order_Amount.
pivot_data = df.pivot_table(values='Order_Amount', index='City', columns='Cuisine_Type', aggfunc='mean')

plt.figure(figsize=(14, 8))
sns.heatmap(pivot_data, annot=True, fmt='.0f', cmap='YlOrRd', linewidths=0.5)
plt.title('Average Order Amount by City × Cuisine', fontsize=14, fontweight='bold')
plt.xlabel('Cuisine Type')
plt.ylabel('City')
plt.tight_layout()
plt.savefig('../outputs/multivariate_city_cuisine.png', dpi=150, bbox_inches='tight')
plt.show()

---
## Task 3: Handling Missing Data and Outliers

In [None]:
# Check for missing values: count and percentage per column
print('\n🔍 MISSING VALUES ANALYSIS')
print('=' * 50)
missing = df.isnull().sum()
missing_pct = (missing / len(df) * 100).round(2)
missing_df = pd.DataFrame({'Missing Count': missing, 'Percentage (%)': missing_pct})
# Only list columns that actually contain missing values
print(missing_df[missing_df['Missing Count'] > 0])

In [None]:
# Handle missing values using Mean/Median
# Work on a copy so the raw `df` stays untouched for the earlier cells.
df_clean = df.copy()

# Fill Tip_Amount with median (since tips can be skewed).
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form is deprecated and does not update
# df_clean when pandas Copy-on-Write is active.
df_clean['Tip_Amount'] = df_clean['Tip_Amount'].fillna(df_clean['Tip_Amount'].median())

# Fill Delivery_Time_Mins with mean
df_clean['Delivery_Time_Mins'] = df_clean['Delivery_Time_Mins'].fillna(df_clean['Delivery_Time_Mins'].mean())

print('✅ Missing values handled!')
# Counts NaNs across ALL columns, not only the two imputed above
print(f'Missing values remaining: {df_clean.isnull().sum().sum()}')

In [None]:
# Detect Outliers using Boxplots: one panel per continuous column of df_clean
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for i, col in enumerate(['Order_Amount', 'Tip_Amount', 'Delivery_Time_Mins']):
    axes[i].boxplot(df_clean[col].dropna())
    axes[i].set_title(f'Outliers in {col}', fontweight='bold')
    axes[i].set_ylabel(col)

plt.suptitle('Outlier Detection using Boxplots', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('../outputs/outlier_detection.png', dpi=150, bbox_inches='tight')
plt.show()

# Fixed narrative text; it is not computed from the data
print('\n📊 OUTLIER IMPACT:')
print('- Outliers in Order_Amount indicate premium/large orders')
print('- High tips may indicate generous customers or large group orders')
print('- Delivery time outliers may indicate traffic or restaurant delays')

---
## Task 4: Spread of Data

In [None]:
# Statistical measures: centre, spread and shape for each continuous column
print('\n📊 SPREAD OF DATA ANALYSIS')
print('=' * 60)

for col in ['Order_Amount', 'Tip_Amount', 'Delivery_Time_Mins']:
    data = df_clean[col].dropna()
    print(f'\n📈 {col}:')
    print(f'   Mean: {data.mean():.2f}')
    print(f'   Median: {data.median():.2f}')
    print(f'   Std Dev: {data.std():.2f}')
    print(f'   Skewness: {data.skew():.2f}')
    print(f'   Kurtosis: {data.kurtosis():.2f}')

In [None]:
# Histogram + KDE per continuous column, with mean and median marked
dist_columns = ['Order_Amount', 'Tip_Amount', 'Delivery_Time_Mins']
dist_palette = ['#FF6B6B', '#4ECDC4', '#45B7D1']
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for panel, col, shade in zip(axes, dist_columns, dist_palette):
    data = df_clean[col].dropna()
    sns.histplot(data, kde=True, ax=panel, color=shade)
    # Dashed vertical guides: red for the mean, green for the median
    panel.axvline(data.mean(), color='red', linestyle='--', label=f'Mean: {data.mean():.1f}')
    panel.axvline(data.median(), color='green', linestyle='--', label=f'Median: {data.median():.1f}')
    panel.set_title(f'Distribution of {col}', fontweight='bold')
    panel.legend()

plt.suptitle('Data Distribution Analysis', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('../outputs/data_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

---
## Task 5: Automating EDA using Python

In [None]:
# Using built-in Pandas functions
print('\n📊 AUTOMATED EDA')
print('=' * 60)

# describe() — the trailing expression is rendered by the notebook
print('\n1. df.describe() - Statistical Summary:')
df_clean.describe()

In [None]:
# info(): prints its own report to stdout
info_heading = '\n2. df.info() - Data Types and Memory:'
print(info_heading)
df_clean.info()

In [None]:
# isnull(): per-column count of missing values remaining in the cleaned frame
print('\n3. df.isnull().sum() - Missing Values:')
null_counts = df_clean.isnull().sum()
print(null_counts)

In [None]:
# corr(): correlation table for four numeric columns (rendered by the notebook)
print('\n4. df.corr() - Correlation Matrix:')
corr_columns = ['Order_Amount', 'Tip_Amount', 'Delivery_Time_Mins', 'Order_Count_Year']
df_clean[corr_columns].corr()

In [None]:
# Reusable EDA function
def automated_eda(dataframe, column):
    """Print a quick summary of one column and return its ``describe()`` output.

    Missing values are dropped first. Numeric columns get count, mean, median,
    standard deviation, min, max, skewness and kurtosis. Non-numeric columns
    (for example text categories) cannot be averaged or formatted as floats,
    so they get count, number of distinct values and the most frequent value.

    Args:
        dataframe: pandas DataFrame that holds the column.
        column: Label of the column to summarise.

    Returns:
        pandas.Series: ``describe()`` of the column after dropping NaNs.

    Raises:
        KeyError: If ``column`` is not present in ``dataframe``.
    """
    data = dataframe[column].dropna()
    print(f'\n📊 EDA for {column}')
    print('=' * 40)
    print(f'Count: {len(data)}')

    # Categorical/text fallback: mean(), skew() and ':.2f' formatting would raise here
    if not pd.api.types.is_numeric_dtype(data):
        print(f'Unique: {data.nunique()}')
        if len(data) > 0:
            # value_counts() sorts by frequency, so the first label is the mode
            print(f'Most Frequent: {data.value_counts().index[0]}')
        return data.describe()

    print(f'Mean: {data.mean():.2f}')
    print(f'Median: {data.median():.2f}')
    print(f'Std: {data.std():.2f}')
    print(f'Min: {data.min():.2f}')
    print(f'Max: {data.max():.2f}')
    print(f'Skewness: {data.skew():.2f}')
    print(f'Kurtosis: {data.kurtosis():.2f}')
    return data.describe()

# Smoke-test the helper on the cleaned order amounts
automated_eda(dataframe=df_clean, column='Order_Amount')

---
## Task 6: Regression Analysis

In [None]:
# Identify variables: state the regression target and predictors (fixed text)
print('\n📊 REGRESSION ANALYSIS')
print('=' * 50)
print('\n🎯 Dependent Variable (Y): Tip_Amount')
print('📊 Independent Variables (X): Order_Amount, Delivery_Time_Mins')

In [None]:
# Covariance between the target and the two candidate predictors
print('\n📈 COVARIANCE MATRIX:')
# `cov_cols` is reused by the correlation cell that follows
cov_cols = ['Order_Amount', 'Tip_Amount', 'Delivery_Time_Mins']
print(df_clean[cov_cols].cov())

In [None]:
# Correlation table for the same columns, plus a plain-language reading of
# the Order_Amount / Tip_Amount coefficient
print('\n📊 CORRELATION ANALYSIS:')
print(df_clean[cov_cols].corr())

print('\n💡 INTERPRETATION:')
corr_val = df_clean['Order_Amount'].corr(df_clean['Tip_Amount'])
print(f'Order Amount ↔ Tip Amount correlation: {corr_val:.3f}')

# Rule-of-thumb cut-offs. Without the lower bound every r > 0 (even 0.01) was
# reported as "moderate", and a strong negative r was reported as "weak/no".
strong_cutoff = 0.5
moderate_cutoff = 0.3
if corr_val > strong_cutoff:
    print('Strong positive correlation: Higher orders get higher tips!')
elif corr_val > moderate_cutoff:
    print('Moderate positive correlation: Tips increase with order amount')
elif corr_val < -moderate_cutoff:
    print('Negative correlation: Tips decrease as order amount increases')
else:
    # Also reached when corr_val is NaN, since every comparison above is False
    print('Weak/No correlation between order amount and tips')

---
## Task 7: Supervised Learning - Regression Models

In [None]:
# Build the modelling frame and the train/test split for the regressions.
#   Target   -> Tip_Amount
#   Features -> Order_Amount here; Delivery_Time_Mins is added in the multiple-regression cell
model_columns = ['Order_Amount', 'Tip_Amount', 'Delivery_Time_Mins']
ml_data = df_clean[model_columns].dropna()

y = ml_data['Tip_Amount']
X = ml_data[['Order_Amount']]

# 80/20 split with a fixed seed so results are reproducible
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(f'Training set size: {len(X_train)}')
print(f'Testing set size: {len(X_test)}')

In [None]:
# Simple Linear Regression: Tip_Amount explained by Order_Amount alone
print('\n🔷 SIMPLE LINEAR REGRESSION')
print('=' * 50)

lr_simple = LinearRegression()
lr_simple.fit(X_train, y_train)
# Test-set predictions are reused by the plotting and evaluation cells
y_pred_simple = lr_simple.predict(X_test)

print(f'Coefficient: {lr_simple.coef_[0]:.4f}')
print(f'Intercept: {lr_simple.intercept_:.4f}')
print(f'\nEquation: Tip = {lr_simple.intercept_:.2f} + {lr_simple.coef_[0]:.4f} × Order_Amount')

In [None]:
# Multiple Linear Regression: Tip_Amount explained by Order_Amount and Delivery_Time_Mins
print('\n🔶 MULTIPLE LINEAR REGRESSION')
print('=' * 50)

X_multi = ml_data[['Order_Amount', 'Delivery_Time_Mins']]
# Same test_size and seed as the simple model's split
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(X_multi, y, test_size=0.2, random_state=42)

lr_multi = LinearRegression()
lr_multi.fit(X_train_m, y_train_m)
y_pred_multi = lr_multi.predict(X_test_m)

# coef_ follows the column order of X_multi
print(f'Coefficients: Order_Amount={lr_multi.coef_[0]:.4f}, Delivery_Time={lr_multi.coef_[1]:.4f}')
print(f'Intercept: {lr_multi.intercept_:.4f}')

In [None]:
# Actual vs predicted tips for both regression models, one panel each
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (axis, actual values, predictions, marker colour, panel title)
panels = [
    (axes[0], y_test, y_pred_simple, '#FF6B6B', 'Simple Linear Regression'),
    (axes[1], y_test_m, y_pred_multi, '#4ECDC4', 'Multiple Linear Regression'),
]
for panel, actual, predicted, shade, heading in panels:
    panel.scatter(actual, predicted, alpha=0.5, c=shade)
    # Dashed diagonal = perfect prediction, spanning the range of actual values
    low, high = actual.min(), actual.max()
    panel.plot([low, high], [low, high], 'k--', lw=2)
    panel.set_xlabel('Actual Tip Amount')
    panel.set_ylabel('Predicted Tip Amount')
    panel.set_title(heading, fontweight='bold')

plt.suptitle('Actual vs Predicted: Regression Models', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('../outputs/regression_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

---
## Task 8-9: Overfitting and Underfitting Analysis

In [None]:
# Compare Training vs Testing Error for the simple linear model
print('\n📊 OVERFITTING / UNDERFITTING ANALYSIS')
print('=' * 50)

# Predictions on both splits
y_train_pred = lr_simple.predict(X_train)
y_test_pred = lr_simple.predict(X_test)

train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f'\n📈 Simple Linear Regression:')
print(f'   Training MSE: {train_mse:.4f}')
print(f'   Testing MSE:  {test_mse:.4f}')
print(f'   Training R²:  {train_r2:.4f}')
print(f'   Testing R²:   {test_r2:.4f}')

# Heuristic verdict: test error more than 1.5x training error -> overfitting;
# otherwise a training R² below 0.3 -> underfitting; anything else -> good fit.
if test_mse > train_mse * 1.5:
    print('\n⚠️ OVERFITTING: Test error is significantly higher than training error')
elif train_r2 < 0.3:
    print('\n⚠️ UNDERFITTING: Low R² indicates model is too simple')
else:
    print('\n✅ GOOD FIT: Model generalizes well to unseen data')

In [None]:
# Visualization of Overfitting/Underfitting: train vs test bars for MSE and R²
# (The unused `metrics`, `values` and `colors` lists were removed; `colors`
# also overwrote the colour array created in the orders-by-city cell.)
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# MSE Comparison
axes[0].bar(['Training', 'Testing'], [train_mse, test_mse], color=['#FF6B6B', '#4ECDC4'])
axes[0].set_title('MSE: Training vs Testing', fontweight='bold')
axes[0].set_ylabel('Mean Squared Error')

# R² Comparison
axes[1].bar(['Training', 'Testing'], [train_r2, test_r2], color=['#FF6B6B', '#4ECDC4'])
axes[1].set_title('R² Score: Training vs Testing', fontweight='bold')
axes[1].set_ylabel('R² Score')

plt.suptitle('Overfitting/Underfitting Analysis', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('../outputs/overfitting_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

---
## Task 10: Classification Task

In [None]:
# Convert to Classification: High Tipper vs Low Tipper
print('\n📊 CLASSIFICATION TASK')
print('=' * 50)

# Create binary target: High Tipper (tip > median) = 1, Low Tipper = 0
# NOTE(review): the median-imputation cell fills missing tips with this same
# median, so those rows presumably all land in class 0 — confirm that is intended.
median_tip = df_clean['Tip_Amount'].median()
df_clean['High_Tipper'] = (df_clean['Tip_Amount'] > median_tip).astype(int)

print(f'Median Tip Amount: ₹{median_tip:.2f}')
print(f'\nClass Distribution:')
print(df_clean['High_Tipper'].value_counts())
print(f'\n0 = Low Tipper (Tip ≤ ₹{median_tip:.2f})')
print(f'1 = High Tipper (Tip > ₹{median_tip:.2f})')

In [None]:
# Prepare data for classification: two features, binary High_Tipper target
X_class = df_clean[['Order_Amount', 'Delivery_Time_Mins']].dropna()
# Select labels by X_class's index so features and target stay row-aligned
y_class = df_clean.loc[X_class.index, 'High_Tipper']

X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_class, y_class, test_size=0.2, random_state=42
)

# Logistic Regression with default settings; predictions feed the evaluation cells
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train_c, y_train_c)
y_pred_c = log_reg.predict(X_test_c)

print('✅ Logistic Regression model trained!')

---
## Task 11-12: Model Evaluation

In [None]:
# Regression Model Evaluation: test-set MSE, MAE and R² for both linear models
print('\n📊 REGRESSION MODEL EVALUATION')
print('=' * 50)

print('\n🔷 Simple Linear Regression:')
print(f'   MSE:  {mean_squared_error(y_test, y_pred_simple):.4f}')
print(f'   MAE:  {mean_absolute_error(y_test, y_pred_simple):.4f}')
print(f'   R²:   {r2_score(y_test, y_pred_simple):.4f}')

print('\n🔶 Multiple Linear Regression:')
print(f'   MSE:  {mean_squared_error(y_test_m, y_pred_multi):.4f}')
print(f'   MAE:  {mean_absolute_error(y_test_m, y_pred_multi):.4f}')
print(f'   R²:   {r2_score(y_test_m, y_pred_multi):.4f}')

In [None]:
# Classification Model Evaluation: accuracy plus per-class precision/recall/F1
print('\n📊 CLASSIFICATION MODEL EVALUATION')
print('=' * 50)

print('\n🔷 Logistic Regression:')
print(f'   Accuracy: {accuracy_score(y_test_c, y_pred_c):.4f}')

print('\n📋 Classification Report:')
# target_names are applied in label order: 0 -> Low Tipper, 1 -> High Tipper
print(classification_report(y_test_c, y_pred_c, target_names=['Low Tipper', 'High Tipper']))

In [None]:
# Heatmap of the logistic-regression confusion matrix (rows = actual, columns = predicted)
cm = confusion_matrix(y_test_c, y_pred_c)
class_labels = ['Low Tipper', 'High Tipper']

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title('Confusion Matrix: High/Low Tipper Classification', fontsize=14, fontweight='bold')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.savefig('../outputs/confusion_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

---
## Task 13: Data Visualization Summary

In [None]:
# Key Insights Dashboard: a 2x2 grid of summary charts built from df_clean
fig = plt.figure(figsize=(16, 12))

# 1. Top Restaurants
ax1 = fig.add_subplot(2, 2, 1)
top_restaurants = df_clean['Restaurant_Name'].value_counts().head(10)
ax1.barh(top_restaurants.index, top_restaurants.values, color='#FF6B6B')
ax1.set_title('Top 10 Most Ordered Restaurants', fontweight='bold')
ax1.set_xlabel('Number of Orders')
# Flip the axis so the most-ordered restaurant is at the top
ax1.invert_yaxis()

# 2. Orders by Day of Week
ax2 = fig.add_subplot(2, 2, 2)
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# reindex puts the bars in calendar order rather than by frequency
day_counts = df_clean['Day_of_Week'].value_counts().reindex(day_order)
ax2.bar(day_counts.index, day_counts.values, color='#4ECDC4')
ax2.set_title('Orders by Day of Week', fontweight='bold')
ax2.set_xlabel('Day')
ax2.set_ylabel('Orders')
ax2.tick_params(axis='x', rotation=45)

# 3. Payment Method Distribution
ax3 = fig.add_subplot(2, 2, 3)
payment_counts = df_clean['Payment_Method'].value_counts()
colors_pie = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7']
ax3.pie(payment_counts.values, labels=payment_counts.index, autopct='%1.1f%%', colors=colors_pie)
ax3.set_title('Payment Method Distribution', fontweight='bold')

# 4. Monthly Order Trend
ax4 = fig.add_subplot(2, 2, 4)
monthly_orders = df_clean.groupby(['Year', 'Month']).size().reset_index(name='Orders')
# Build a 'YYYY-MM' label; zfill pads single-digit months to two characters
monthly_orders['Period'] = monthly_orders['Year'].astype(str) + '-' + monthly_orders['Month'].astype(str).str.zfill(2)
ax4.plot(monthly_orders['Period'], monthly_orders['Orders'], marker='o', color='#FF6B6B', linewidth=2)
ax4.set_title('Monthly Order Trend', fontweight='bold')
ax4.set_xlabel('Month')
ax4.set_ylabel('Orders')
ax4.tick_params(axis='x', rotation=45)
# Show only every 6th label
for i, label in enumerate(ax4.xaxis.get_ticklabels()):
    if i % 6 != 0:
        label.set_visible(False)

# NOTE(review): the emoji may not render with matplotlib's default font — confirm
plt.suptitle('📊 Zomato Order Analysis Dashboard', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.savefig('../outputs/dashboard.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Top Users Analysis
print('\n👥 TOP USERS ANALYSIS')
print('=' * 60)

# Per-user totals: number of orders, amount spent and tips given, most frequent first.
# NOTE(review): grouping is by User_Name, so distinct users sharing a name
# would be merged — confirm whether User_ID should be the key.
top_orderers = df_clean.groupby('User_Name').agg({
    'Order_ID': 'count',
    'Order_Amount': 'sum',
    'Tip_Amount': 'sum'
}).rename(columns={
    'Order_ID': 'Total_Orders',
    'Order_Amount': 'Total_Spent',
    'Tip_Amount': 'Total_Tips'
}).sort_values('Total_Orders', ascending=False)

print('\n🏆 TOP 10 Most Frequent Orderers:')
print(top_orderers.head(10))

In [None]:
# Largest Orders: the 10 rows with the highest Order_Amount, key columns only
print('\n💰 TOP 10 LARGEST ORDERS:')
largest_orders = df_clean.nlargest(10, 'Order_Amount')[[
    'Order_ID', 'User_Name', 'Restaurant_Name', 'Order_Amount', 'Tip_Amount', 'City'
]]
# index=False hides the DataFrame row labels in the printed table
print(largest_orders.to_string(index=False))

In [None]:
# Final Summary: headline counts, totals, averages and most common values from df_clean
print('\n' + '=' * 60)
print('📊 PROJECT SUMMARY')
print('=' * 60)
print(f'\n📦 Total Orders: {len(df_clean):,}')
print(f'👥 Total Users: {df_clean["User_ID"].nunique():,}')
print(f'🏪 Total Restaurants: {df_clean["Restaurant_Name"].nunique()}')
print(f'🌆 Cities Covered: {df_clean["City"].nunique()}')
# Tip figures are computed on df_clean, i.e. after the missing-value imputation cell
print(f'\n💰 Total Order Value: ₹{df_clean["Order_Amount"].sum():,.2f}')
print(f'💵 Total Tips Given: ₹{df_clean["Tip_Amount"].sum():,.2f}')
print(f'📈 Average Order Amount: ₹{df_clean["Order_Amount"].mean():.2f}')
print(f'📊 Average Tip Amount: ₹{df_clean["Tip_Amount"].mean():.2f}')
# mode()[0] takes the first value when several are tied for most frequent
print(f'\n🏆 Most Popular Restaurant: {df_clean["Restaurant_Name"].mode()[0]}')
print(f'🍕 Most Popular Cuisine: {df_clean["Cuisine_Type"].mode()[0]}')
print(f'🌆 City with Most Orders: {df_clean["City"].mode()[0]}')
print('\n✅ PROJECT COMPLETED SUCCESSFULLY!')