In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Notebook banner: printed with a blank line above and below the title.
report_title = "\nCustomer Data Analysis for India\n"
print(report_title)


Customer Data Analysis for India



In [7]:
# STEP 1: LOAD DATASET
# Read the customer CSV (path is relative to the notebook's working directory)
# into `df`, the frame that the later cells clean, plot and summarise.

dataset_path = "customer_dataset_new (1).csv"

print("== Load Dataset ==\n")
df = pd.read_csv(dataset_path)

# A bare last expression makes Jupyter render the frame as the cell output.
df

== Load Dataset ==



Unnamed: 0,customer_id,gender,age,payment_method,monthly_spending,visits_per_month
0,1001,Female,51,Cash,4354,3
1,1002,Other,60,UPI,3039,7
2,1003,Female,63,Credit Card,886,10
3,1004,Female,18,Wallet,3093,18
4,1005,Female,33,Cash,4707,4
...,...,...,...,...,...,...
495,1496,Female,44,UPI,2323,10
496,1497,Female,36,Cash,4577,1
497,1498,Female,63,Credit Card,285,2
498,1499,Other,56,Debit Card,4876,15


In [8]:
# Record the (rows, columns) of the frame as loaded, before any cleaning step.
original_shape = df.shape
print("Original Dataset Shape:", original_shape)

Original Dataset Shape: (500, 6)


In [9]:
# STEP 2: DATA CLEANING

# Count the null entries in each column before any imputation is applied.
missing_per_column = df.isna().sum()

print("\n== Missing Values Before Cleaning ==\n")
print(missing_per_column)


== Missing Values Before Cleaning ==

customer_id         0
gender              0
age                 0
payment_method      0
monthly_spending    0
visits_per_month    0
dtype: int64


In [None]:
# Numeric columns: replace missing entries with that column's mean.
for numeric_col in ("age", "monthly_spending", "visits_per_month"):
    column_mean = df[numeric_col].mean()
    df[numeric_col] = df[numeric_col].fillna(column_mean)

In [None]:
# Categorical columns: replace missing entries with the most frequent value
# (the first entry of `mode()` when several values tie).
for categorical_col in ("gender", "payment_method"):
    most_frequent = df[categorical_col].mode()[0]
    df[categorical_col] = df[categorical_col].fillna(most_frequent)

In [None]:
# Drop rows that repeat an earlier row in every column; the first occurrence stays.
df = df.drop_duplicates(keep="first")

cleaned_shape = df.shape
print("\nDataset Shape After Cleaning:", cleaned_shape)

In [None]:
# STEP 3: VISUALIZATIONS

# Line chart: number of customers at each age, one line per payment method.

# Count customer_id entries per (age, payment_method) pair and expose the
# result as a flat frame with a "count" column.
age_payment_counts = (
    df.groupby(["age", "payment_method"])["customer_id"]
    .count()
    .reset_index(name="count")
)

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    data=age_payment_counts,
    x="age",
    y="count",
    hue="payment_method",
    marker="o",
    ax=ax,
)
ax.set_title("Payment Method Usage Across Different Ages")
ax.set_xlabel("Age")
ax.set_ylabel("Customer Count")
plt.show()

In [None]:
# Bar chart: number of customers in each gender category.

# After the aggregation the "customer_id" column holds the per-gender count
# of customer_id entries (not an ID); it is plotted on the y axis below.
gender_counts = df.groupby("gender")["customer_id"].count().reset_index()

fig, ax = plt.subplots(figsize=(8, 6))

# seaborn 0.13 deprecates passing `palette` without `hue` (FutureWarning, slated
# for removal in 0.14). Mapping the x variable to `hue` keeps one Set2 colour
# per bar; `dodge=False` keeps the bars full-width and centred on their ticks.
sns.barplot(
    data=gender_counts,
    x="gender",
    y="customer_id",
    hue="gender",
    palette="Set2",
    dodge=False,
    ax=ax,
)

# The hue legend would only repeat the x tick labels, so drop it if one was drawn.
redundant_legend = ax.get_legend()
if redundant_legend is not None:
    redundant_legend.remove()

ax.set_title("Customer Count by Gender")
ax.set_xlabel("Gender")
ax.set_ylabel("Number of Customers")
plt.show()

In [None]:
# Scatter chart: monthly spending against age, one point per row, coloured by gender.

fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df,
    x="age",
    y="monthly_spending",
    hue="gender",
    ax=ax,
)
ax.set_title("Age vs Monthly Spending")
ax.set_xlabel("Age")
ax.set_ylabel("Monthly Spending (INR)")
plt.show()

In [None]:
# Histogram: distribution of customer ages in 20 bins, with a KDE curve overlaid.

fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=df, x="age", bins=20, kde=True, ax=ax)
ax.set_title("Age Distribution of Customers")
ax.set_xlabel("Age")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Heatmap: pairwise correlation between the three numeric columns.

numeric_columns = ["age", "monthly_spending", "visits_per_month"]
correlation_matrix = df[numeric_columns].corr()

fig, ax = plt.subplots(figsize=(8, 6))
# annot=True writes each coefficient inside its cell.
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# STEP 4: FINAL SUMMARY
# Headline statistics computed from `df` as left by the cleaning cells.


def _customer_ids_at_max(frame, column):
    """Return the customer_id of every row whose `column` equals the column maximum.

    `Series.idxmax` reports only the first row when several rows share the
    maximum, which would silently name one customer out of a tied group.
    Comparing against the maximum keeps all of them.

    Parameters:
        frame: DataFrame containing `column` and a "customer_id" column.
        column: name of the column to maximise.

    Returns:
        list of customer_id values, in row order (a single element when the
        maximum is unique).
    """
    peak_value = frame[column].max()
    return frame.loc[frame[column] == peak_value, "customer_id"].tolist()


def _join_ids(customer_ids):
    """Render a list of IDs as comma-separated text ("1004" or "1004, 1320")."""
    return ", ".join(str(customer_id) for customer_id in customer_ids)


print("\n===== FINAL SUMMARY REPORT =====")

print("Total Records:", len(df))

print("Average Age:", round(df["age"].mean(), 2))
print("Average Monthly Spending (₹):", round(df["monthly_spending"].mean(), 2))
print("Average Visits per Month:", round(df["visits_per_month"].mean(), 2))

# mode() can return several values on a tie; [0] takes the first of them.
print("\nMost Common Gender:", df["gender"].mode()[0])
print("Most Used Payment Method:", df["payment_method"].mode()[0])

# Every customer tied for the maximum is listed, not only the first row found.
print("\nHighest Spending Customer ID:",
      _join_ids(_customer_ids_at_max(df, "monthly_spending")))

print("Customer With Most Visits:",
      _join_ids(_customer_ids_at_max(df, "visits_per_month")))

# Group label whose mean monthly_spending is largest.
print("\nGender with Highest Average Spending:",
      df.groupby("gender")["monthly_spending"].mean().idxmax())

print("Payment Method with Highest Average Spending:",
      df.groupby("payment_method")["monthly_spending"].mean().idxmax())