In [47]:
import pandas as pd
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.stats.mstats import trimmed_var
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA

In [11]:
df  = pd.read_csv("../Dataset/SCFP2022.csv")
df.head()

Unnamed: 0,YY1,Y1,WGT,HHSEX,AGE,AGECL,EDUC,EDCL,MARRIED,KIDS,...,NWCAT,INCCAT,ASSETCAT,NINCCAT,NINC2CAT,NWPCTLECAT,INCPCTLECAT,NINCPCTLECAT,INCQRTCAT,NINCQRTCAT
0,1,11,3027.95612,2,70,5,9,3,2,2,...,4,2,4,2,1,8,3,3,2,1
1,1,12,3054.900065,2,70,5,9,3,2,2,...,4,2,5,2,1,8,3,3,2,1
2,1,13,3163.637766,2,70,5,9,3,2,2,...,4,2,4,2,1,8,3,3,1,1
3,1,14,3166.228463,2,70,5,9,3,2,2,...,3,2,4,1,1,6,3,2,1,1
4,1,15,3235.624715,2,70,5,9,3,2,2,...,3,2,4,2,1,8,3,3,1,1


# In this notebook we are going to focus households which are credit fearful and those having Networth less than 2 milllion , because the households having Networth greater than 2 million dollars have very less chances of credit being denied and they act as outliers in the data of credit fearful people.

In [12]:
filter  = (df["TURNFEAR"] == 1) & (df["NETWORTH"] < 2e6)
df = df[filter]
df.shape

(3672, 356)

## We want to cluster this dataset based on multiple features now , but as the dataset has 356 columns , its difficult to gather information about each column, hence we will use those features which have high variance means they have more data and variability

In [13]:
# Calculate variance, get 10 largest features
top_ten_var = df.var().sort_values().tail(10)

In [14]:
top_ten_var

HOMEEQ      1.481548e+10
FIN         1.561213e+10
DEBT        1.988860e+10
NHNFIN      2.512559e+10
KGBUS       3.167079e+10
HOUSES      3.381911e+10
KGTOTAL     3.853314e+10
NETWORTH    6.670754e+10
NFIN        7.102944e+10
ASSET       1.102278e+11
dtype: float64

In [22]:
fig = px.bar(
    
    x=top_ten_var,
    y=top_ten_var.index,
    title= "SCF: High Variance Features"

)
fig.update_layout(xaxis_title="Variance",yaxis_title="Feature", height = 500)

fig.show()

In [23]:
# Create a boxplot of `NHNFIN`
fig = px.box(
    data_frame=df,
    x="NHNFIN",
    title="Distribution of Non-home, Non-Financial Assets"
)
fig.update_layout(xaxis_title="Values [$]")

fig.show()

## Even though we removed the outliers from the original dataset , some features are still affected by outliers mainly which are wealthy households

In [26]:
# Calculate trimmed variance
top_ten_trim_var = df.apply(trimmed_var,limits=(0.1,0.1)).sort_values().tail(10)
top_ten_trim_var

NH_MORT     9.661827e+08
MRTHEL      9.885072e+08
KGTOTAL     9.963638e+08
PLOAN1      1.120744e+09
HOMEEQ      1.494964e+09
DEBT        2.775392e+09
NETWORTH    5.946834e+09
HOUSES      6.387043e+09
NFIN        1.258410e+10
ASSET       1.751559e+10
dtype: float64

In [27]:
# Create horizontal bar chart of `top_ten_trim_var`
fig = px.bar(
    
    x=top_ten_trim_var,
    y=top_ten_trim_var.index,
    title="SCF: High Variance Features"
)
fig.update_layout(xaxis_title="Trimmed Variance", yaxis_title="Feature")

fig.show()

In [28]:
high_var_cols = top_ten_trim_var.tail(5).index.to_list()
high_var_cols

['DEBT', 'NETWORTH', 'HOUSES', 'NFIN', 'ASSET']

In [29]:
X = df[high_var_cols]
print("X shape:", X.shape)
X.head()

X shape: (3672, 5)


Unnamed: 0,DEBT,NETWORTH,HOUSES,NFIN,ASSET
20,11000,4100.0,0,14000.0,15100.0
21,10000,5500.0,0,14000.0,15500.0
22,10000,4500.0,0,13000.0,14500.0
23,10000,5600.0,0,14000.0,15600.0
24,11000,4500.0,0,14000.0,15500.0


In [30]:
X_summary = X.aggregate(["mean","std"])
X_summary

Unnamed: 0,DEBT,NETWORTH,HOUSES,NFIN,ASSET
mean,72319.986383,104823.999891,86763.071895,140926.500163,177143.986275
std,141026.965379,258278.019155,183899.727419,266513.495317,332005.68933


In [33]:
# Instantiate transformer
ss = StandardScaler()

# Transform `X`
X_scaled_data = ss.fit_transform(X)

# Put `X_scaled_data` into DataFrame
X_scaled = pd.DataFrame(X_scaled_data,columns=X.columns)

print("X_scaled shape:", X_scaled.shape)
X_scaled.head()

X_scaled shape: (3672, 5)


Unnamed: 0,DEBT,NETWORTH,HOUSES,NFIN,ASSET
0,-0.43487,-0.390036,-0.47186,-0.476313,-0.488142
1,-0.441961,-0.384615,-0.47186,-0.476313,-0.486937
2,-0.441961,-0.388487,-0.47186,-0.480065,-0.48995
3,-0.441961,-0.384227,-0.47186,-0.476313,-0.486636
4,-0.43487,-0.388487,-0.47186,-0.476313,-0.486937


In [36]:
n_clusters = range(2,13)
inertia_errors = []
silhouette_scores = []

# Add `for` loop to train model and calculate inertia, silhouette score.
for k in n_clusters:
    #building the pipeline for the model
    model  = make_pipeline(StandardScaler(), KMeans(n_clusters=k, random_state=42))
    #training of model
    model.fit(X)
    # calculating inertia for each model and adding to list inertia_errors
    inertia_errors.append(model.named_steps["kmeans"].inertia_)
    # calculating sillhouette score for each model and appending to list sillsilhouette_scores
    silhouette_scores.append(silhouette_score(X,model.named_steps["kmeans"].labels_))

print("Inertia:", inertia_errors[:3])
print()
print("Silhouette Scores:", silhouette_scores[:3])

























Inertia: [8330.713703262289, 5668.186693086036, 4638.125329170422]

Silhouette Scores: [0.7623318800335575, 0.6991628816143209, 0.6816110141663074]


In [37]:
# Create line plot of `inertia_errors` vs `n_clusters`
fig = px.line(
    
    x=n_clusters,
    y=inertia_errors,
    title="K-Means Model: Inertia vs Number of Clusters"
)

fig.update_layout(xaxis_title  = "Number of Clusters(k)", yaxis_title = "Inertia")

fig.show()

In [38]:
# Create a line plot of `silhouette_scores` vs `n_clusters`
fig = px.line(   
    x = n_clusters,
    y = silhouette_scores,
    title = "K-Means Model: Silhouette Score vs Number of Clusters"
)

fig.update_layout(xaxis_title = "Number of Clusters(k)", yaxis_title = "Silhouette Score")

fig.show()

# Building final model with 5 clusters

In [43]:
final_model = make_pipeline(
    
    StandardScaler(),
    KMeans(n_clusters=5, random_state=42)
)

final_model.fit(X)






In [44]:
labels = final_model.named_steps["kmeans"].labels_

In [45]:
xgb = X.groupby(labels).mean()
xgb

Unnamed: 0,DEBT,NETWORTH,HOUSES,NFIN,ASSET
0,390531.5,288001.0,528497.536946,606566.8,678532.4
1,24287.42,8932.813,6511.275416,23020.86,33220.23
2,122451.1,232304.6,199394.56869,294325.7,354755.7
3,1224280.0,774062.5,622500.0,1348630.0,1998342.0
4,164776.6,1198156.0,478177.966102,1024213.0,1362932.0


In [46]:
# Create side-by-side bar chart of `xgb`
fig = px.bar(
    xgb,
    barmode="group",
    title= "Mean Household Finances by Cluster"
)

fig.update_layout(xaxis_title="Cluster", yaxis_title= "Value [$]")

fig.show()

In [48]:
# Instantiate transformer
pca = PCA(n_components=2, random_state=42)

# Transform `X`
X_t = pca.fit_transform(X)

# Put `X_t` into DataFrame
X_pca = pd.DataFrame(X_t, columns=["PC1", "PC2"])

print("X_pca shape:", X_pca.shape)
X_pca.head()

X_pca shape: (3672, 2)


Unnamed: 0,PC1,PC2
0,-251204.231325,25992.79361
1,-250476.854167,27493.459458
2,-252087.985364,26975.015056
3,-250366.202109,27561.475569
4,-250761.623093,26264.858052


In [49]:
# Create scatter plot of `PC2` vs `PC1`

fig  = px.scatter(
    data_frame = X_pca,
    x = "PC1",
    y = "PC2",
    color = labels.astype(str),
    title= "PCA Representation of Clusters"

)

fig.update_layout(xaxis_title = "PC1", yaxis_title = "PC2")

fig.show()