## Spending Personality Test

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import json

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
#import hdbscan

from mpl_toolkits.mplot3d import axes3d
import plotly.express as px

In [None]:
# Load the raw, tab-separated customer file into the working frame
df = pd.read_csv("./data/data.csv", sep="\t")

# First look: dtypes / non-null counts, a preview of the rows, then the dimensions
df.info()
display(df.head())
print("Shape:", df.shape)

### Understanding the Data

In [None]:
# Summary statistics of the numeric columns, one row per column, rounded to 2 dp
numeric_describe = df.describe(include="number").T
display(numeric_describe.round(2))


In [None]:
# Quick category peek: ten most frequent raw values (NaN included) per column
for cat_name in ("Education", "Marital_Status"):
    if cat_name not in df.columns:
        continue
    print(f"\nValue counts for {cat_name}:")
    print(df[cat_name].value_counts(dropna=False).head(10))

### Cleaning

In [None]:
## Clean categorical columns: trim leading/trailing whitespace from the labels
for text_col in ('Marital_Status', 'Education'):
    df[text_col] = df[text_col].str.strip()

In [None]:
# Fixing weird Marital_Status entries: fold the odd answers into 'Single'
martial_map = dict.fromkeys(['Alone', 'Absurd', 'YOLO'], 'Single')
df['Marital_Status_norm'] = df['Marital_Status'].replace(martial_map)

# Normalize 'Education' entries into three tiers.
# Values that are not keys of the map are left unchanged by .replace().
edu_map = {
    **dict.fromkeys(['2n Cycle', 'Graduation'], 'Graduate'),
    **dict.fromkeys(['Master', 'PhD'], 'Post-Graduate'),
    'Basic': 'Basic',
}
df['Education_norm'] = df['Education'].replace(edu_map)

In [None]:
# Parse the enrolment date: strings are read day-first, and anything that
# cannot be parsed becomes NaT (errors='coerce') instead of raising.
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'], errors='coerce', dayfirst=True)

In [None]:
# Impute missing Income: median of the (education, marital status) group first,
# then the overall median for anything the group median could not fill.
income = pd.to_numeric(df['Income'], errors='coerce')
df['Income'] = income
group_median = (
    df.groupby(['Education_norm', 'Marital_Status_norm'])['Income']
    .transform('median')
)
income = income.fillna(group_median)
df['Income'] = income.fillna(income.median())

print("Remaining nulls in income:", df['Income'].isnull().sum())

# Raw vs. normalised category labels, side by side
label_cols = ['Education', 'Education_norm', 'Marital_Status', 'Marital_Status_norm']
df[label_cols].head(10)

### Feature Engineering

In [None]:
# Reference date: today at midnight. Age and tenure are measured against it,
# so their values depend on the day the notebook is executed.
REF_DATE = pd.Timestamp.today().normalize()

# Customer age (difference of calendar years)
df['Age'] = REF_DATE.year - df['Year_Birth']

# Number of children at home: kids plus teenagers
df['Children'] = df['Kidhome'].add(df['Teenhome'])

# Customer tenure in days and in years (1 decimal): relationship age / loyalty
tenure = REF_DATE - df['Dt_Customer']
df['Customer_For_Days'] = tenure.dt.days
df['Customer_For_years'] = df['Customer_For_Days'].div(365.25).round(1)

# Total spending: row-wise sum of every column whose name starts with 'Mnt'
all_cols = df.columns
mnt_cols = all_cols[all_cols.str.startswith('Mnt')].tolist()
df['TotalMnt'] = df[mnt_cols].sum(axis=1)

# Total purchases: row-wise sum of every 'Num...Purchases' column
all_cols = df.columns
purchase_cols = all_cols[
    all_cols.str.startswith('Num') & all_cols.str.endswith('Purchases')
].tolist()
df['TotalPurchases'] = df[purchase_cols].sum(axis=1)

In [None]:

# Total purchases, recomputed from the Num*Purchases columns
all_cols = df.columns
purchase_cols = all_cols[
    all_cols.str.startswith('Num') & all_cols.str.endswith('Purchases')
].tolist()
df['TotalPurchases'] = df[purchase_cols].sum(axis=1)

# Average spend per purchase; rows with no purchases get NaN
made_purchase = df['TotalPurchases'] > 0
df["AvgMntPerPurchase"] = (df['TotalMnt'] / df['TotalPurchases']).where(made_purchase)

# A spend cannot exist without a transaction, so zero purchase counts become 1.
# This happens after the average above, which therefore keeps NaN for those rows.
df["TotalPurchases"] = df["TotalPurchases"].replace(0, 1)

In [None]:
# Channel shares: fraction of all purchases made through each channel
has_purchases = df['TotalPurchases'] > 0
for channel in ['Web', 'Catalog', 'Store']:
    col = f'Num{channel}Purchases'
    if col not in df.columns:
        continue
    df[f'{channel}PurchaseShare'] = (df[col] / df['TotalPurchases']).where(has_purchases)

# Deal purchase rate: discounted purchases relative to all purchases
df['DealPurchaseRate'] = (
    df['NumDealsPurchases'] / df['TotalPurchases']
).where(df['TotalPurchases'] > 0)

# Campaign engagement: number of accepted campaigns (AcceptedCmp* columns)
# and a 0/1 flag taken from the Response column
cmp_cols = [name for name in df.columns if name.startswith('AcceptedCmp')]
df['CampaignsAccepted'] = df[cmp_cols].sum(axis=1)
df['Responded'] = df['Response'].eq(1).astype(int)

In [None]:
# Checking the changes: preview the engineered columns
engineered_cols = [
    'Age', 'Children', 'Customer_For_years', 'TotalMnt', 'TotalPurchases',
    'AvgMntPerPurchase', 'WebPurchaseShare', 'CatalogPurchaseShare',
    'StorePurchaseShare', 'DealPurchaseRate', 'CampaignsAccepted', 'Responded',
]
df[engineered_cols].head(10)

## Data Cleaning & Feature Engineering Summary

### 1. Data Cleaning
We started by inspecting the dataset for missing values, incorrect data types, and inconsistent categories.

- **Missing Values:** Only the `Income` column had missing entries (~1%).  
  → Imputed using the **median income** of each `(Education_norm, Marital_Status_norm)` group.  
- **Categorical Normalization:**
  - Mapped unusual `Marital_Status` values like `"YOLO"`, `"Absurd"`, and `"Alone"` to `"Single"`.
  - Simplified `Education` levels by merging similar categories:
    - `"2n Cycle"` and `"Graduation"` → `"Graduate"`.
    - `"Master"` and `"PhD"` → `"Post-Graduate"`.
    - `"Basic"` kept as `"Basic"`.
- **Date Parsing:** Converted `Dt_Customer` to proper datetime format for tenure calculations.

### 2. Feature Engineering
Created meaningful new variables to better represent customer behavior and demographics:

| Feature | Description | Insight |
|----------|--------------|----------|
| **Age** | `current_year - Year_Birth` | Captures life stage and spending maturity. |
| **Children** | `Kidhome + Teenhome` | Indicates family size and dependency load. |
| **Customer_For_years** | Tenure calculated from `Dt_Customer`. | Reflects customer loyalty and relationship length. |
| **TotalMnt** | Sum of all `Mnt*` (spending) columns. | Represents overall spending volume. |
| **TotalPurchases** | Sum of all `Num*Purchases` columns. | Measures overall purchase activity. |
| **AvgMntPerPurchase** | `TotalMnt / TotalPurchases`. | Shows average order value per transaction. |
| **WebPurchaseShare**, **CatalogPurchaseShare**, **StorePurchaseShare** | Ratio of purchases per channel. | Highlights preferred shopping channels. |
| **DealPurchaseRate** | `NumDealsPurchases / TotalPurchases`. | Indicates price sensitivity or bargain-hunting behavior. |
| **CampaignsAccepted** | Sum of all `AcceptedCmp*` columns. | Shows how often the customer accepted previous campaigns. |
| **Responded** | Binary flag from `Response` (1 if accepted last campaign). | Reflects latest engagement behavior. |

### 3. Why These Features Matter
These engineered features summarize customer spending habits and engagement patterns.  
They help us:
- Compare **planned vs. impulsive** shoppers (e.g., high deal rate vs. low deal rate).
- Understand **loyalty** and **tenure** (longer customers may behave differently).
- Detect **channel preferences** (online vs. in-store).
- Prepare for building **spending personality profiles** (Saver, Splurger, Planner, etc.).

---

The dataset is now cleaned, consistent, and enriched with features ready for exploratory analysis.


## Exploratory Data Analysis

In [None]:
# Numeric summary of the main demographic and behavioural features
summary_cols = [
    'Income', 'Age', 'Children', 'TotalMnt', 'TotalPurchases',
    'AvgMntPerPurchase', 'DealPurchaseRate', 'CampaignsAccepted',
    'Customer_For_years',
]
display(df[summary_cols].describe().T.round(2))

In [None]:
# TotalPurchases had its zeros replaced by 1 during feature engineering, so
# testing that column for 0 can never find anything. Rebuild the raw count
# from the Num*Purchases columns to see who really made no purchases.
raw_purchase_cols = [
    name for name in df.columns
    if name.startswith('Num') and name.endswith('Purchases')
]
Check_zero_puchase = df[raw_purchase_cols].sum(axis=1) == 0
print("Number of customers with zero purchases:", Check_zero_puchase.sum())

In [None]:
# Histograms
def plot_hist(series, title, bins=30):
    """Draw a histogram of ``series`` on a new figure and show it.

    Parameters
    ----------
    series : pandas.Series
        Values to plot; NaNs are dropped first. Its ``name`` labels the x axis.
    title : str
        Figure title.
    bins : int, default 30
        Number of histogram bins.
    """
    fig, ax = plt.subplots()
    series.dropna().plot.hist(bins=bins, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(series.name)
    ax.set_ylabel('Frequency')
    plt.show()

# One histogram per key feature (columns absent from df are skipped)
hist_cols = ['Income', 'Age', 'TotalMnt', 'TotalPurchases', 'AvgMntPerPurchase']
for col in (name for name in hist_cols if name in df.columns):
    plot_hist(df[col], f'{col} distribution')

What the charts tell us
1. **Income distribution**
- Very right-skewed: most customers earn under $100K, but there are a few extreme outliers up to $600K+.
- These outliers will heavily influence correlations and averages.

2. **Age distribution**
- Centered around 45–65 years, meaning this customer base is mostly middle-aged adults.
- The one record showing Age >120 is clearly an outlier — likely a data entry error (e.g., wrong birth year). <br/>
→ We might need to cap Age at 90 later.

3. **TotalMnt (Total Spend)**
- Also right-skewed: most people spend under $1,000, but a small fraction go up to $2,500+.
- This indicates a few heavy buyers — possibly our “Splurgers” or “High-value” personalities later.

4. **Total Purchases**
- Ranges roughly 0–40, average ~15 — a wide range in engagement.
- Suggests there are light shoppers and heavy repeat buyers — a strong behavioral differentiator.

5. **AvgMntPerPurchase**
- Extremely right-skewed — most purchases are small, but some individuals spend hundreds per purchase.

In [None]:
# Relationship check: total spend against three candidate drivers.
# Each entry is (x column, figure title, x-axis label).
scatter_specs = [
    ('Income', 'Income vs Total Spend', 'Income'),
    ('Age', 'Age vs Total Spend', 'Age'),
    ('DealPurchaseRate', 'Deal Purchase Rate vs Total Spend', 'Deal Purchase Rate'),
]

for x_col, plot_title, x_label in scatter_specs:
    plt.figure()
    plt.scatter(df[x_col], df['TotalMnt'], alpha=0.5)
    plt.title(plot_title)
    plt.xlabel(x_label)
    plt.ylabel('Total Spend(TotalMnt)')
    plt.show()

### Relationships
**Income vs Total Spend**
- Clear positive relationship: higher income → higher spend, but with a wide vertical spread.<br/>
→ Interpretation: income affects spending capacity, but not alone — other factors (deals, habits, family size) matter.

**Age vs Total Spend**
- No obvious linear trend — spending seems scattered across ages.<br/>
→ Spending behavior is not strongly age-driven.

**Deal Purchase Rate vs Total Spend**
- Negative trend: as deal rate increases, total spend decreases.<br/>
→ Bargain hunters tend to spend less overall — consistent with the “Saver” archetype we’ll define later.

In [None]:
# Correlation matrix over every numeric column of df
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
corr = df[numeric_cols].corr()

# For each spending target, show the 10 strongest positive and the
# 10 strongest negative correlations
target_vars = ['TotalMnt', 'TotalPurchases']
for target in (name for name in target_vars if name in corr.columns):
    print(f"\nTop correlations with {target}:")
    target_corr = corr[target]
    display(target_corr.sort_values(ascending=False).head(10))
    display(target_corr.sort_values(ascending=True).head(10))
            

In [None]:
# Visual Correlation Heatmap
key_cols = [
    'Income', 'Age', 'Children', 'Customer_For_years', 'TotalMnt', 'TotalPurchases',
    'AvgMntPerPurchase', 'WebPurchaseShare', 'CatalogPurchaseShare',
    'StorePurchaseShare', 'DealPurchaseRate', 'CampaignsAccepted', 'Responded'
]

# Report any requested column that does not exist instead of dropping it
# silently; a misspelled name would otherwise vanish from the heatmap unnoticed.
missing_cols = [c for c in key_cols if c not in df.columns]
if missing_cols:
    print("Skipping columns not found in df:", missing_cols)
key_cols = [c for c in key_cols if c in df.columns]

plt.figure(figsize=(10, 8))
plt.imshow(df[key_cols].corr(), cmap='coolwarm', aspect='auto')
plt.colorbar(label='Correlation Coefficient')
plt.xticks(range(len(key_cols)), key_cols, rotation=90)
plt.yticks(range(len(key_cols)), key_cols)
plt.title("Correlation Matric(Key Spending Features)")
plt.tight_layout()
plt.show()

## Correlation Analysis — Key Insights

### Overview
The correlation matrix and feature relationships reveal strong behavioral patterns that connect spending capacity, activity level, and shopping style.


###  1. Income and Total Spending
- **Strong positive correlation (~0.6–0.7)**  
  Higher income customers consistently spend more overall.  
  → *Income* is a key driver of spending capacity.


###  2. Total Purchases and Deal Purchase Rate
- **Moderate negative correlation (~–0.4)**  
  Customers who rely more on discounts make fewer purchases overall.  
  → Reflects *Saver* or *Deal-Seeker* behavior — they wait for promotions and buy selectively.


###  3. Channel Behavior
- **Store Purchase Share:** Negatively correlated with Total Purchases (~–0.3)  
  → In-store shoppers purchase less frequently (more deliberate, planned).

- **Web Purchase Share:** Positively correlated (likely +0.3)  
  → Online shoppers tend to buy more often — possibly more impulsive.


###  4. Family Impact
- **Children / Kidhome:** Negatively correlated with Total Purchases (~–0.25 to –0.48)  
  → More kids at home → tighter budgets and reduced discretionary spending.  
  Indicates a cautious, *Saver-type* pattern.


###  5. Age
- Weak negative correlation with Total Purchases.  
  → Spending doesn’t vary much by age; older customers are not necessarily higher spenders.


### 6.  Marketing Engagement
- **CampaignsAccepted & Responded:** Mild positive correlation with Total Spend (~+0.2).  
  → Engaged customers spend slightly more and may represent loyal *Planner* types.


###  Key Takeaways

| Behavior Dimension | Strong Indicators | Interpretation |
|--------------------|------------------|----------------|
| **Spending Capacity** | `Income`, `TotalMnt` | Financial ability to spend |
| **Spending Activity** | `TotalPurchases`, `AvgMntPerPurchase` | Frequency and basket size |
| **Impulse vs. Planning** | `DealPurchaseRate`, `WebPurchaseShare`, `StorePurchaseShare` | How spontaneous or deliberate the buyer is |
| **Engagement & Responsiveness** | `CampaignsAccepted`, `Responded` | Brand interaction and marketing response |

###  Overall Insight
- **High Income + Low Deal Rate → Splurger / Luxury Buyer**  
- **Low Income + High Deal Rate → Saver / Bargain Hunter**  
- **Moderate Income + Balanced Behavior → Planner / Practical Buyer**

These relationships form the foundation for our upcoming **Spending Personality Scoring Rules**, which will classify customers into interpretable segments for the MVP.


### Defining Personality Type

1. Saver → cautious, deal-seeking, low impulse
2. Splurger → high spenders, low restraint
3. Planner → balanced, consistent, thoughtful spenders
4. Impulse Buyer → spontaneous, high-frequency, high online activity

In [None]:
# Creating flags

def flag_by_percentile(series, low_q=0.33, high_q=0.67):
    """Bucket a numeric series into 'Low' / 'Medium' / 'High' by its own quantiles.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to bucket.
    low_q, high_q : float
        Quantiles used as the Low/Medium and Medium/High cut points.

    Returns
    -------
    pandas.Series
        Categorical series; values up to and including the ``low_q`` quantile
        are 'Low', values above the ``high_q`` quantile are 'High', the rest
        'Medium'.
    """
    lower_cut, upper_cut = series.quantile([low_q, high_q]).tolist()
    edges = [-np.inf, lower_cut, upper_cut, np.inf]
    return pd.cut(series, bins=edges, labels=['Low', 'Medium', 'High'])


# Quantile-based Low / Medium / High flags: new column -> source feature
percentile_flags = {
    'Income_level': 'Income',
    'Spending_level': 'TotalMnt',
    'Deal_level': 'DealPurchaseRate',
    'Online_level': 'WebPurchaseShare',
}
for flag_col, source_col in percentile_flags.items():
    df[flag_col] = flag_by_percentile(df[source_col])

# Most CampaignsAccepted values are zero, so fixed bins are used here instead of
# quantiles (the quantile approach raised an error): 0 -> Low, 1 -> Mid, 2+ -> High
response_bins = [-np.inf, 0, 1, np.inf]
df['Response_level'] = pd.cut(
    df['CampaignsAccepted'],
    bins=response_bins,
    labels=['Low', 'Mid', 'High'],
    include_lowest=True,
)

# Sanity check
level_cols = ['Income_level', 'Spending_level', 'Deal_level', 'Online_level', 'Response_level']
df[level_cols].head(10)

In [None]:
# Applying personality logic

def classify_personality(row):
    """Assign a spending personality to one customer.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'Deal_level', 'Income_level', 'Spending_level' and
        'Online_level', each holding 'Low', 'Medium' or 'High'.

    Returns
    -------
    str
        'Saver', 'Splurger', 'Impulse Buyer' or 'Planner'. Rules are tried in
        that order and the first match wins; 'Planner' is the default.
    """
    deal = row['Deal_level']
    income = row['Income_level']

    # Saver: deal-seeking with modest (i.e. not high) income.
    # The previous rule required *high* income, which contradicted this
    # definition and sent high-income deal users here instead.
    if deal == 'High' and income != 'High':
        return 'Saver'

    # Splurger: high income and high spend, without leaning on deals
    if income == 'High' and row['Spending_level'] == 'High' and deal != 'High':
        return 'Splurger'

    # Impulse Buyer: high online share combined with a low deal rate
    # NOTE(review): the earlier comment said "high deal rate" while the code
    # tested for 'Low'; the code is kept as is -- confirm the intended rule.
    if row['Online_level'] == 'High' and deal == 'Low':
        return 'Impulse Buyer'

    # Planner: everyone else (balanced or campaign-engaged customers included)
    return 'Planner'
    

# Label every customer row, then show how many fall into each personality
personality = df.apply(classify_personality, axis=1)
df['Personality_Type'] = personality
df['Personality_Type'].value_counts()

In [None]:
# Comparing feature profiles by personality type: mean of each feature per type
profile_features = [
    'Income', 'TotalMnt', 'TotalPurchases',
    'DealPurchaseRate', 'WebPurchaseShare',
    'AvgMntPerPurchase', 'CampaignsAccepted',
]
summary = df.groupby('Personality_Type')[profile_features].mean().round(2)

display(summary)

# Visualizing personality types: one bar chart per feature on a 3x3 grid
summary.plot(
    kind='bar',
    subplots=True,
    layout=(3, 3),
    figsize=(12, 10),
    legend=False,
    sharex=True,
)
plt.suptitle('Average Feature Values per Personality Type', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

## Clustering

In [None]:
# Dropping unusable and redundant columns (the *_norm versions replace the raw
# Marital_Status / Education labels)
unused_cols = ["ID", "Z_Revenue", "Z_CostContact", "Marital_Status", "Education"]
df = df.drop(columns=unused_cols)

# Missing values per column, largest first
df.isna().sum().sort_values(ascending=False)


In [None]:
# Viewing the rows whose average spend per purchase is missing
missing_avg_spend = df["AvgMntPerPurchase"].isna()
df.loc[missing_avg_spend]

Dropping the 4 rows with missing values, as they appear to add only noise. They may be data-entry errors: it is unusual to see so many products priced at $1.

In [None]:
# Keep only the rows that have an average spend per purchase
has_avg_spend = df["AvgMntPerPurchase"].notna()
df = df[has_avg_spend]

# Confirming changes: the three columns with the most missing values
df.isna().sum().sort_values(ascending=False).head(3)

In [None]:
# Declaring columns with hierarchical values for ordinal encoding
hierarchical_cols = ["Income_level","Spending_level","Deal_level","Online_level","Response_level","Personality_Type"]

# Explicit category order, one list per column above. Without it the encoder
# sorts the labels alphabetically (High=0, Low=1, Medium=2), so the codes would
# not follow Low < Medium < High.
level_order = ["Low", "Medium", "High"]
category_orders = [
    level_order,                              # Income_level
    level_order,                              # Spending_level
    level_order,                              # Deal_level
    level_order,                              # Online_level
    ["Low", "Mid", "High"],                   # Response_level uses 'Mid' as its middle label
    sorted(df["Personality_Type"].unique()),  # nominal: alphabetical, same as the encoder default
]

# Ordinal encoding the hierarchical features.
# Note: this cell expects the string labels, so it cannot be re-run on the
# already-encoded columns.
oe = OrdinalEncoder(categories=category_orders)
df[hierarchical_cols] = oe.fit_transform(df[hierarchical_cols])

# Viewing transformations
df[hierarchical_cols]

In [None]:
# Declaring categorical columns to be one-hot encoded
cat_cols = ["Marital_Status_norm","Education_norm"]

# One-hot encoding the categorical features
ohe = OneHotEncoder(
    handle_unknown="error"
)

ohe.fit(df[cat_cols])

df_ohe = ohe.transform(df[cat_cols]).toarray()

# Wrap the encoded array in a DataFrame that reuses df's index. Rows were
# dropped from df earlier, so its index has gaps; a default 0..n-1 index here
# would make a later column-wise concat pair encoded rows with the wrong
# customers and create all-NaN rows.
df_ohe = pd.DataFrame(df_ohe, columns=ohe.get_feature_names_out(), index=df.index)

In [None]:
# Combining encoded features with the original dataset, then removing the
# source categorical columns and the raw date column
source_cols = ["Marital_Status_norm", "Education_norm", "Dt_Customer"]
df_ohe = pd.concat([df, df_ohe], axis=1).drop(columns=source_cols)

df_ohe.columns

In [None]:
# Declaring numeric features for scaling: every column that is neither a
# one-hot column nor an ordinal-encoded level. A list comprehension keeps the
# frame's column order; set arithmetic would give an order that can change
# from one kernel session to the next.
encoded_cols = set(ohe.get_feature_names_out()) | set(hierarchical_cols)
numeric = [name for name in df_ohe.columns if name not in encoded_cols]

numeric

In [None]:
# Checking for missing values: total number of missing cells in the frame.
# (.sum().sum() avoids integer-position lookups on a label-indexed Series.)
print(f"Missing values: {df_ohe.isna().sum().sum()}")

# Dropping rows with missing values
df_ohe = df_ohe.dropna()

# Confirming values dropped: counts every column, not only the first one
print(f"Missing values left: {df_ohe.isna().sum().sum()}")

# Scaling numeric features
scaler = StandardScaler()

# Transforming numeric features on a copy so df_ohe keeps the unscaled values
df_scaled = df_ohe.copy()
df_scaled[numeric] = scaler.fit_transform(df_ohe[numeric])

df_scaled[numeric]

In [None]:
# Random seed shared by the stochastic steps
state = 42

# Fit K-Means for 3..8 clusters and record the inertia of each fit.
# `kmeans` is deliberately left bound to the last fitted model.
cluster_range = range(3, 9)
kmeans_params = dict(n_init="auto", max_iter=500, random_state=state)

inertia_scores = []
for n_clusters in cluster_range:
    kmeans = KMeans(n_clusters=n_clusters, **kmeans_params).fit(df_scaled)
    inertia_scores.append(kmeans.inertia_)

# Plotting inertia scores from each iteration
plt.figure(figsize=(6, 9))
plt.plot(cluster_range, inertia_scores, marker='o')
plt.title("Elbow Check")
plt.xlabel("Number of Clusters")
plt.ylabel("Inertia")

plt.show()

Using the inertia scores from each iteration, we would ideally see an "elbow" indicating the ideal number of clusters. Since no elbow has formed even with 8 clusters, we conclude that the dataset as it stands cannot be properly clustered into a reasonable number of clusters.

We will try a different approach and reduce the dimensionality of our dataset to a small number of principal components.

In [None]:
# Labels and centroids of `kmeans`, the model left over from the last
# iteration of the elbow loop
labels = kmeans.labels_
cluster_centers = kmeans.cluster_centers_
kmeans_pred = kmeans.predict(df_scaled)

# Project the scaled data onto 2 principal components and keep the result
# as a DataFrame (columns 0 and 1)
n_components = 2
pca = PCA(n_components=n_components, random_state=state)
df_pca = pd.DataFrame(pca.fit_transform(df_scaled))

In [None]:
# Colour the 2-component projection by cluster. Passing the labels through
# `hue` (as strings, so they are treated as categories) lets seaborn create one
# legend entry per cluster; `c=` produces no legend handles, and handing the
# full per-row label array to plt.legend does not describe the clusters.
cluster_ids = labels.astype(str)

fig, ax = plt.subplots()
sns.scatterplot(
    x=df_pca[0],
    y=df_pca[1],
    hue=cluster_ids,
    hue_order=sorted(set(cluster_ids), key=int),
    s=100,
    alpha=0.3,
    ax=ax,
)

ax.set_title("Principle Component Clustering")
ax.legend(title="Cluster")
plt.show()


In [None]:
n_components = 3
# Declaring column names for the principal-component dataframe
col_names = [f"pc_{i+1}" for i in range(n_components)]

# Instantiating PCA and projecting the scaled data onto 3 components
pca = PCA(n_components=n_components, random_state=state)
df_pca = pca.fit_transform(df_scaled)
df_pca = pd.DataFrame(df_pca, columns=col_names)

# One plot colour per cluster; the number of clusters follows the colour list
colors = ['r','g','b','y']

# Seeded so the cluster assignment is the same on every run
kmeans_pca = KMeans(n_clusters=len(colors), n_init="auto", random_state=state)

pca_pred = kmeans_pca.fit_predict(df_pca)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
cluster_colors = [colors[label] for label in pca_pred]

# Draw each cluster through the axes object with its own label, so the legend
# describes the clusters fitted in this cell.
for cluster_id, color in enumerate(colors):
    in_cluster = pca_pred == cluster_id
    ax.scatter(
        df_pca.loc[in_cluster, "pc_1"],
        df_pca.loc[in_cluster, "pc_2"],
        df_pca.loc[in_cluster, "pc_3"],
        c=color,
        label=f"Cluster {cluster_id}",
    )

ax.set_xlabel("pc_1")
ax.set_ylabel("pc_2")
ax.set_zlabel("pc_3")
ax.legend(title="Cluster")
plt.show()

In [None]:
# Display the cluster assignments stored in `labels` (taken from kmeans.labels_)
labels

In [None]:
# Store the cluster id as text so plotly colours it as a category
df_pca["cluster"] = pca_pred.astype('str')

scatter_args = dict(
    x="pc_1",
    y="pc_2",
    z="pc_3",
    color="cluster",
    title="Interactive 3D Cluster Map",
    opacity=0.3,
)
fig = px.scatter_3d(df_pca, **scatter_args)

# Smaller markers keep the dense cloud readable
fig.update_traces(marker=dict(size=4))
fig.show()

### Initial Principal Component Analysis

In [None]:
# Extracting original column names (one loading row per input feature)
col_names = list(df_scaled.columns)

# One column per fitted component. Deriving the names from the fitted PCA keeps
# the frame in step with however many components were requested; a hard-coded
# list of names fails whenever its length differs from the component count.
pc_names = [f"pc_{i+1}" for i in range(pca.n_components_)]

loadings_df = pd.DataFrame(pca.components_.T, columns=pc_names, index=col_names)

loadings_df

In [None]:
# Share of total variance captured by each fitted component, and their sum
variance_ratio = pca.explained_variance_ratio_
print(f"Explained variance by our PCs: {variance_ratio}")
print(f"Total variance in 3Pcs: {variance_ratio.sum()}")

We're not capturing enough of the dataset's variance with these components. We'll increase the number of components and try again with a larger number of components.