### Business Questions to Answer
1. Who are our most valuable customers? What defines them?
2. Are there distinct customer groups with similar spending behaviors? How can we target
them effectively?
3. What demographic factors (e.g., age, gender, income) influence spending habits?
4. What specific actions can MallCo take to improve retention and boost spending?


In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#### 1. Loading the Dataset


In [None]:
# Read the mall customer CSV (path is relative to the notebook's folder)
data = pd.read_csv(filepath_or_buffer='../data/Mall_Customers.csv')

#### 2. Basic dataset information
- Sample
- Shape
- Info
- Missing values
- Duplicated values
- Basic statistics (numerical variables)
- Distribution (categorical variables)

In [None]:
# Preview 20 randomly drawn rows; the fixed seed keeps the draw identical
# from one run to the next
sample = data.sample(random_state=42, n=20)
print(sample)

In [None]:
# Text overview of the dataset, printed as five sections separated by a rule
section_rule = "-" * 50

# Section 1: dimensions plus the column / dtype / non-null listing
print("1. Basic dataset information:")
print("\nDataset Shape:", data.shape)
print("\nDataset Info:")
data.info()  # info() prints directly, so it is not wrapped in print()

# Section 2: number of missing entries per column
print(section_rule)
print("\n2. Missing Values:")
missing_per_column = data.isna().sum()
print(missing_per_column)

# Section 3: number of fully duplicated rows
print(section_rule)
print("\n3. Number of duplicates:", data.duplicated().sum())

# Section 4: describe() summary of the numerical columns
print(section_rule)
print("\n4. Basic Statistics (Numerical Columns):")
print(data.describe())

# Section 5: frequency table for every object/category column
print(section_rule)
print("\n5. Categorical Columns Distribution:")
categorical_columns = data.select_dtypes(include=['object', 'category']).columns
for column_name in categorical_columns:
    print(f"\n{column_name} Distribution:")
    print(data[column_name].value_counts())


#### 4. Univariate Analysis

##### Numerical variables
- Histogram with kde (distribution)
- Individual variable summary statistics (describe & mode)

In [None]:
# Univariate look at the numerical columns: one histogram + KDE per variable,
# followed by describe() output and the mode of each column.
# Each entry: (column name, short name for titles/printouts, x-axis label)
numeric_specs = [
    ('Age', 'Age', 'Age (years)'),
    ('Annual Income (k$)', 'Annual Income', 'Annual Income (thousands $)'),
    ('Spending Score (1-100)', 'Spending Score', 'Spending Score (1-100)'),
]

fig, axes = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle('Distribution of Numerical Variables', fontsize=16)

# One panel per variable, left to right in the order listed above
for ax, (column, short_name, x_label) in zip(axes, numeric_specs):
    sns.histplot(data=data, x=column, kde=True, ax=ax)
    ax.set_title(f'Distribution of {short_name}', fontsize=14)
    ax.set_xlabel(x_label, fontsize=12)
    ax.set_ylabel('Count', fontsize=12)

plt.tight_layout()
plt.show()

# Summary statistics, rounded to two decimals
print("\nSummary Statistics for Numerical Variables:")
for column, short_name, _ in numeric_specs:
    print(f"\n{short_name} Statistics:")
    print(data[column].describe().round(2))

# mode() can return several values; only the first one is reported
print("\nMode Values:")
for column, short_name, _ in numeric_specs:
    print(f"{short_name} Mode:", data[column].mode().values[0])

##### Categorical variables
- Barplot (percentage distribution)


In [None]:
# Gender split shown as a percentage bar chart
plt.figure(figsize=(6, 4))

# Share of each gender among all rows, in percent with one decimal
gender_counts = data['Gender'].value_counts()
gender_percentages = (gender_counts / len(data) * 100).round(1)

sns.barplot(x=gender_percentages.index, y=gender_percentages.values)
ax = plt.gca()
ax.set_title('Gender Distribution (%)')
ax.set_ylabel('Percentage')

# Write the percentage just above each bar (bars sit at x = 0, 1, ...)
for bar_position, percentage in enumerate(gender_percentages.values):
    ax.text(bar_position, percentage, f'{percentage}%', ha='center', va='bottom')

plt.tight_layout()
plt.show()



#### 5. Multivariate analysis

- Scatter plots (one for each pairwise combination of numerical variables)
- Correlation coefficient (consider including correlation coefficient analysis by relevant categorical variables)
- Boxplot (distribution of numerical variables against categorical variables, per categorical variable) (where relevant)

In [None]:
# Pairwise scatter plots of the numerical variables, coloured by gender, each
# overlaid with a red regression trend line.
# Each entry: (x column, y column, panel title)
pair_specs = [
    ('Age', 'Spending Score (1-100)', 'Age vs Spending Score'),
    ('Annual Income (k$)', 'Spending Score (1-100)', 'Annual Income vs Spending Score'),
    ('Age', 'Annual Income (k$)', 'Age vs Annual Income'),
]

fig, axes = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle('Relationships between Variables')

for ax, (x_col, y_col, panel_title) in zip(axes, pair_specs):
    # Points first, coloured by gender
    sns.scatterplot(data=data, x=x_col, y=y_col, hue='Gender', ax=ax)
    # scatter=False: draw only the fitted line, the points are already there
    sns.regplot(data=data, x=x_col, y=y_col, scatter=False, color='red', ax=ax)
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Correlation matrices of the numerical columns: overall, then per gender
numeric_cols = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']

print("\nCorrelation Coefficients:")
correlations = data[numeric_cols].corr()
print(correlations.round(3))

# Same matrix restricted to each gender, in order of first appearance
print("\nCorrelations by Gender:")
for gender in data['Gender'].unique():
    print(f"\n{gender} Customers:")
    is_gender = data['Gender'] == gender
    gender_corr = data.loc[is_gender, numeric_cols].corr()
    print(gender_corr.round(3))

In [None]:
# Seaborn look-and-feel for the plots that follow
sns.set_style("whitegrid")
sns.set_context("notebook")

# Side-by-side box plots of each numerical variable, split by gender.
# Each entry: (column on the y-axis, panel title)
gender_box_panels = [
    ('Age', 'Age Distribution by Gender'),
    ('Annual Income (k$)', 'Income Distribution by Gender'),
    ('Spending Score (1-100)', 'Spending Score Distribution by Gender'),
]

plt.figure(figsize=(15, 5))
for panel_number, (column, panel_title) in enumerate(gender_box_panels, start=1):
    plt.subplot(1, 3, panel_number)  # subplot positions are 1-based
    sns.boxplot(data=data, x='Gender', y=column)
    plt.title(panel_title)

plt.tight_layout()
plt.show()

#### 6. Group Analysis
- Bin relevant numerical variables into categories to reveal category-level patterns against the target variable
- Barplots
- Boxplots
- Summary statistics by variable groups
- Consider adding a categorical variable as an extra layer of the analysis where relevant

In [None]:
# Bin age and income into ordered brackets for the group analysis.
#
# pd.cut uses right-closed intervals by default, i.e. (0, 20], (20, 30], ...
# so every label below names the inclusive upper bound of its bin. The
# previous '<20' label was wrong for customers aged exactly 20, and '60+'
# actually started at 61.
# The last edge is open-ended (np.inf) so unusually large values land in the
# top bracket instead of silently becoming NaN and dropping out of the plots.

# 1. Create age brackets
data['Age_Group'] = pd.cut(
    data['Age'],
    bins=[0, 20, 30, 40, 50, 60, np.inf],
    labels=['≤20', '21-30', '31-40', '41-50', '51-60', '61+'],
)

# 2. Create income brackets (values are in thousands of dollars)
data['Income_Group'] = pd.cut(
    data['Annual Income (k$)'],
    bins=[0, 30, 60, 90, 120, np.inf],
    labels=['Low (≤30k)', 'Lower-Mid (31-60k)',
            'Upper-Mid (61-90k)', 'High (91-120k)', 'Very High (>120k)'],
)

In [None]:
# 2x2 grid of bar plots showing the average spending score per bracket.
# Top row: age / income brackets on their own.
# Bottom row: the same brackets, additionally split by gender.
fig, axes = plt.subplots(2, 2, figsize=(20, 16))
fig.suptitle('Group Analysis of Spending Patterns', fontsize=16)

# Each entry: (axes, bracket column, hue column or None, x-axis label)
bar_panels = [
    (axes[0, 0], 'Age_Group', None, 'Age Group'),
    (axes[0, 1], 'Income_Group', None, 'Income Group'),
    (axes[1, 0], 'Age_Group', 'Gender', 'Age Group'),
    (axes[1, 1], 'Income_Group', 'Gender', 'Income Group'),
]

for ax, group_col, hue_col, x_label in bar_panels:
    # barplot aggregates with the mean by default, hence "Average" in labels
    sns.barplot(data=data, x=group_col, y='Spending Score (1-100)',
                hue=hue_col, ax=ax)
    title_suffix = ' and Gender' if hue_col else ''
    ax.set_title(f'Average Spending Score by {x_label}{title_suffix}')
    ax.set_xlabel(x_label)
    # Same y-label on every panel; the income panels previously fell back to
    # the raw column name
    ax.set_ylabel('Average Spending Score')
    if group_col == 'Income_Group':
        # Income bracket labels are long; tilt them so they do not overlap
        ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# 2x2 grid of box plots showing the spread of spending score per bracket.
# Top row: age / income brackets on their own.
# Bottom row: the same brackets, additionally split by gender.
fig, axes = plt.subplots(2, 2, figsize=(20, 16))
fig.suptitle('Group Analysis of Spending Patterns', fontsize=16)

# Each entry: (axes, bracket column, hue column or None, x-axis label)
box_panels = [
    (axes[0, 0], 'Age_Group', None, 'Age Group'),
    (axes[0, 1], 'Income_Group', None, 'Income Group'),
    (axes[1, 0], 'Age_Group', 'Gender', 'Age Group'),
    (axes[1, 1], 'Income_Group', 'Gender', 'Income Group'),
]

for ax, group_col, hue_col, x_label in box_panels:
    sns.boxplot(data=data, x=group_col, y='Spending Score (1-100)',
                hue=hue_col, ax=ax)
    # The gender-split panels say so in their title; before, they reused the
    # top-row titles and the two rows were indistinguishable
    title_suffix = ' and Gender' if hue_col else ''
    ax.set_title(f'Distribution of Spending Score by {x_label}{title_suffix}')
    ax.set_xlabel(x_label)
    if group_col == 'Income_Group':
        # Income bracket labels are long; tilt them so they do not overlap
        ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()



In [None]:
# Mean, standard deviation and customer count of the spending score for each
# bracket, rounded to two decimals.
#
# Age_Group and Income_Group are categorical (created by pd.cut). observed=False
# is passed explicitly so every bracket appears in the output even if empty;
# this matches the historical default and avoids the pandas FutureWarning
# about that default changing.
summary_aggregations = ['mean', 'std', 'count']
spending_col = 'Spending Score (1-100)'

print("\nSummary Statistics by Age Group:")
age_group_stats = (
    data.groupby('Age_Group', observed=False)[spending_col]
    .agg(summary_aggregations)
    .round(2)
)
print(age_group_stats)

print("\nSummary Statistics by Income Group:")
income_group_stats = (
    data.groupby('Income_Group', observed=False)[spending_col]
    .agg(summary_aggregations)
    .round(2)
)
print(income_group_stats)

print("\nSummary Statistics by Age Group and Gender:")
age_gender_stats = (
    data.groupby(['Age_Group', 'Gender'], observed=False)[spending_col]
    .agg(summary_aggregations)
    .round(2)
)
print(age_gender_stats)

#### 7. Segmentation Analysis
- For further analysis of obvious clusters observed from multivariate scatterplot analysis (where relevant to EDA objective)

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
sns.set_style("whitegrid")
sns.set_context("notebook")

# K-means segmentation on income and spending score.
# Both features are standardised first so neither dominates the distance.
X_2d = data[['Annual Income (k$)', 'Spending Score (1-100)']].values
scaler_2d = StandardScaler()
X_2d_scaled = scaler_2d.fit_transform(X_2d)

# k=5 based on the clusters visible in the income vs spending scatter plot.
# n_init is pinned because its default differs between scikit-learn releases;
# with random_state this keeps the segmentation reproducible.
n_segments = 5
kmeans_2d = KMeans(n_clusters=n_segments, n_init=10, random_state=42)
data['Segment_2d'] = kmeans_2d.fit_predict(X_2d_scaled)

# Visualize 2D segments
plt.figure(figsize=(15, 10))

scatter = plt.scatter(data['Annual Income (k$)'],
                      data['Spending Score (1-100)'],
                      c=data['Segment_2d'],
                      cmap='viridis')

# Cluster centres live in standardised space; convert back to original units
centers = scaler_2d.inverse_transform(kmeans_2d.cluster_centers_)
plt.scatter(centers[:, 0], centers[:, 1], c='red', marker='x', s=200,
            linewidth=3, label='Cluster Centers')

# Label each centre with its cluster id
for i, center in enumerate(centers):
    plt.annotate(f'Cluster {i}',
                 (center[0], center[1]),
                 xytext=(10, 10), textcoords='offset points',
                 bbox=dict(facecolor='white', edgecolor='black', alpha=0.7))

plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.title('Customer Segments based on Income and Spending Score')
plt.colorbar(scatter, label='Segment')
plt.legend()
plt.show()

# Per-segment summary of age, income, spending and size
print("\n2D Segment Characteristics:")
segment_stats = data.groupby('Segment_2d').agg({
    'Age': ['mean', 'min', 'max'],
    'Annual Income (k$)': ['mean', 'min', 'max'],
    'Spending Score (1-100)': ['mean', 'min', 'max'],
    'CustomerID': 'count'
}).round(2)


def _center_level(z_score, neutral_band=0.5):
    """Classify a standardised centre coordinate as Low, Average or High.

    Coordinates within ``neutral_band`` standard deviations of the overall
    mean count as "Average"; otherwise the sign decides.
    """
    if abs(z_score) < neutral_band:
        return "Average"
    return "High" if z_score > 0 else "Low"


# Derive each description from where the cluster centre actually sits.
# K-means numbers its clusters arbitrarily, so a hand-written id -> label
# mapping can attach the wrong description to a segment.
cluster_descriptions = {
    cluster_id: f"{_center_level(income_z)} Income - {_center_level(spending_z)} Spenders"
    for cluster_id, (income_z, spending_z) in enumerate(kmeans_2d.cluster_centers_)
}

# Print detailed segment characteristics with cluster labels
for cluster in sorted(cluster_descriptions):
    print(f"\nCluster {cluster}: {cluster_descriptions[cluster]}")
    print("-" * 50)
    # Look up by segment id (label), not by row position
    stats = segment_stats.loc[cluster]
    # The row is float-typed after mixing aggregates; show the count as an int
    print(f"Number of customers: {int(stats[('CustomerID', 'count')])}")
    print(f"Age: {stats[('Age', 'mean')]} (range: {stats[('Age', 'min')]}-{stats[('Age', 'max')]})")
    print(f"Income: ${stats[('Annual Income (k$)', 'mean')]}k (range: ${stats[('Annual Income (k$)', 'min')]}k-${stats[('Annual Income (k$)', 'max')]}k)")
    print(f"Spending Score: {stats[('Spending Score (1-100)', 'mean')]} (range: {stats[('Spending Score (1-100)', 'min')]}-{stats[('Spending Score (1-100)', 'max')]})")

# Attach the human-readable profile to every customer row
data['Cluster_Profile'] = data['Segment_2d'].map(cluster_descriptions)

print("\nSegment Distribution with Profiles:")
print(data['Cluster_Profile'].value_counts())



### Key Insights & Takeaways

