In [2]:
import pandas as pd
import plotly.express as px
from scipy.stats.mstats import trimmed_var
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [3]:
def wrangle(filepath):
    df = pd.read_csv(filepath)
    mask = (df["TURNFEAR"] == 1) & (df["NETWORTH"] < 2e6)
    df = df[mask]
    return df

In [4]:
df = wrangle("/content/6.csv")
print(df.shape)
df.head()

(4418, 351)


Unnamed: 0,YY1,Y1,WGT,HHSEX,AGE,AGECL,EDUC,EDCL,MARRIED,KIDS,...,NWCAT,INCCAT,ASSETCAT,NINCCAT,NINC2CAT,NWPCTLECAT,INCPCTLECAT,NINCPCTLECAT,INCQRTCAT,NINCQRTCAT
5,2,21,3790.476607,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2
6,2,22,3798.868505,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,3,2,2
7,2,23,3799.468393,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2
8,2,24,3788.076005,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2
9,2,25,3793.066589,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2


In [5]:
top_ten_var = df.var().sort_values().tail(10)
top_ten_var

PLOAN1      1.140894e+10
ACTBUS      1.251892e+10
BUS         1.256643e+10
KGTOTAL     1.346475e+10
DEBT        1.848252e+10
NHNFIN      2.254163e+10
HOUSES      2.388459e+10
NETWORTH    4.847029e+10
NFIN        5.713939e+10
ASSET       8.303967e+10
dtype: float64

In [6]:
fig = px.bar(
 x= top_ten_var,
    y= top_ten_var.index,
    title="SCF: High Variance Features",

)
fig.update_layout(xaxis_title="Variance",yaxis_title="Feature")
fig.show()

horizontal boxplot of `"NHNFIN"` to determine if the values are skewed

In [7]:
fig = px.box(
data_frame= df,
    x="NHNFIN",
    title= "Distribution of Non-home, Non-Financial Assets",
)
fig.update_layout(xaxis_title="Value [$]")

fig.show()

In [9]:
top_ten_trim_var = df.apply(trimmed_var).sort_values().tail(10)
top_ten_trim_var

WAGEINC     5.550737e+08
HOMEEQ      7.338377e+08
NH_MORT     1.333125e+09
MRTHEL      1.380468e+09
PLOAN1      1.441968e+09
DEBT        3.089865e+09
NETWORTH    3.099929e+09
HOUSES      4.978660e+09
NFIN        8.456442e+09
ASSET       1.175370e+10
dtype: float64

In [10]:
fig = fig = px.bar(
 x= top_ten_trim_var,
    y= top_ten_trim_var.index,
    title="SCF: High Trimmed Variance Features",

)
fig.update_layout(xaxis_title="Trimmed Variance",yaxis_title="Feature")

fig.show()

## Split

In [11]:
high_var_cols = top_ten_trim_var.tail(5).index.to_list()
high_var_cols

['DEBT', 'NETWORTH', 'HOUSES', 'NFIN', 'ASSET']

In [12]:
X = df[high_var_cols]
print("X shape:", X.shape)
X.head()

X shape: (4418, 5)


Unnamed: 0,DEBT,NETWORTH,HOUSES,NFIN,ASSET
5,12200.0,-6710.0,0.0,3900.0,5490.0
6,12600.0,-4710.0,0.0,6300.0,7890.0
7,15300.0,-8115.0,0.0,5600.0,7185.0
8,14100.0,-2510.0,0.0,10000.0,11590.0
9,15400.0,-5715.0,0.0,8100.0,9685.0


# Build Model

In [13]:
X_summary = X.aggregate(["mean","std"]).astype(int)
X_summary

Unnamed: 0,DEBT,NETWORTH,HOUSES,NFIN,ASSET
mean,72701,76387,74530,117330,149089
std,135950,220159,154546,239038,288166


In [15]:
x = X["DEBT"]
x.head()

5    12200.0
6    12600.0
7    15300.0
8    14100.0
9    15400.0
Name: DEBT, dtype: float64

In [19]:
x_scaled = (x - x.mean()) / x.std()
round(x_scaled.mean())

0

In [20]:
# Instantiate transformer
ss = StandardScaler()

# Transform `X`
X_scaled_data = ss.fit_transform(X)

# Put `X_scaled_data` into DataFrame
X_scaled = pd.DataFrame(X_scaled_data,columns= X.columns)

print("X_scaled shape:", X_scaled.shape)
X_scaled.head()

X_scaled shape: (4418, 5)


Unnamed: 0,DEBT,NETWORTH,HOUSES,NFIN,ASSET
0,-0.445075,-0.377486,-0.48231,-0.474583,-0.498377
1,-0.442132,-0.368401,-0.48231,-0.464541,-0.490047
2,-0.42227,-0.383868,-0.48231,-0.46747,-0.492494
3,-0.431097,-0.358407,-0.48231,-0.449061,-0.477206
4,-0.421534,-0.372966,-0.48231,-0.45701,-0.483818


In [21]:
X_scaled_summary = X_scaled.aggregate(["mean","std"]).astype(int)
X_scaled_summary

Unnamed: 0,DEBT,NETWORTH,HOUSES,NFIN,ASSET
mean,0,0,0,0,0
std,1,1,1,1,1


In [22]:
n_clusters = range(2,13)
inertia_errors = []
silhouette_scores = []

for k in n_clusters:
    model=make_pipeline(
        StandardScaler(), 
        KMeans(n_clusters=k,random_state=42)
                       )
    model.fit(X)
    
    inertia_errors.append(model.named_steps["kmeans"].inertia_)
    silhouette_scores.append(model.named_steps["kmeans"].labels_)

print("Inertia:", inertia_errors[:3])
print()
print("Silhouette Scores:", silhouette_scores[:3])

























Inertia: [11028.058082607175, 7190.526303575358, 5924.997726868039]

Silhouette Scores: [array([1, 1, 1, ..., 1, 1, 1], dtype=int32), array([1, 1, 1, ..., 0, 0, 0], dtype=int32), array([0, 0, 0, ..., 1, 1, 1], dtype=int32)]


In [23]:
# Create line plot of `inertia_errors` vs `n_clusters`
fig = px.line(
    x=n_clusters, 
    y=inertia_errors,
    title="K-Means Model: Inertia vs Number of Clusters"
)
fig.update_layout(xaxis_title= "NO. of Clusters (K)",yaxis_title="Inertia")
fig.show()

In [25]:
final_model = make_pipeline(StandardScaler(),KMeans(n_clusters=4,random_state=42))
final_model.fit(X)





In [26]:
labels = final_model.named_steps["kmeans"].labels_
print(labels[:5])

[0 0 0 0 0]


In [27]:
xgb = X.groupby(labels).mean()
xgb

Unnamed: 0,DEBT,NETWORTH,HOUSES,NFIN,ASSET
0,26551.075439,13676.153182,13745.637777,27226.05,40227.23
1,218112.818182,174713.441558,257403.246753,330588.4,392826.3
2,116160.779817,965764.155963,264339.449541,780061.1,1081925.0
3,732937.575758,760397.575758,826136.363636,1276227.0,1493335.0


In [28]:
fig = px.bar(
xgb,
    barmode="group"
)

fig.show()

In [31]:
pca = PCA(n_components=2,random_state=42)

X_t = pca.fit_transform(X)

X_pca = pd.DataFrame(X_t,columns=["PC1","PC2"])

print("X_pca shape:", X_pca.shape)
X_pca.head()

X_pca shape: (4418, 2)


Unnamed: 0,PC1,PC2
0,-221525.42453,-22052.273003
1,-217775.100722,-22851.358068
2,-219519.642175,-19023.646333
3,-212195.720367,-22957.107039
4,-215540.507551,-20259.749306


In [32]:
fig = px.scatter(
data_frame=X_pca,
x="PC1",
    y="PC2",
    color=labels.astype(str),
    title="PCA Representation of Clusters"
)

fig.show()