In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Load the card-customer dataset and preview its first two rows.
csv_path = "card_cust.csv"
df = pd.read_csv(csv_path)
df.head(n=2)

Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,10001,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0.0,2.0,1000.0,201.802084,139.509787,0.0,12.0
1,10002,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4.0,0.0,7000.0,4103.032597,1072.340217,0.222222,12.0


### 전처리 (Preprocessing)

In [3]:
# Number of missing values in each column.
df.isnull().sum()

CUST_ID                              0
BALANCE                              0
BALANCE_FREQUENCY                    0
PURCHASES                            0
ONEOFF_PURCHASES                     0
INSTALLMENTS_PURCHASES               0
CASH_ADVANCE                         0
PURCHASES_FREQUENCY                  0
ONEOFF_PURCHASES_FREQUENCY           0
PURCHASES_INSTALLMENTS_FREQUENCY     0
CASH_ADVANCE_FREQUENCY               0
CASH_ADVANCE_TRX                     0
PURCHASES_TRX                        0
CREDIT_LIMIT                         0
PAYMENTS                             0
MINIMUM_PAYMENTS                    74
PRC_FULL_PAYMENT                     0
TENURE                               0
dtype: int64

In [4]:
# Fill the missing MINIMUM_PAYMENTS entries with the column mean
# (the only column reporting missing values in the check above).
min_payment_mean = df["MINIMUM_PAYMENTS"].mean()
df["MINIMUM_PAYMENTS"] = df["MINIMUM_PAYMENTS"].fillna(min_payment_mean)

In [6]:
# Total count of missing cells remaining after imputation.
df.isna().to_numpy().sum()

0

In [7]:
# Independent snapshot of the cleaned frame, used as the starting point
# for the question sections below.
df_base = df.copy(deep=True)

In [8]:
# Inspect the dtype of every column in the cleaned base frame.
df_base.dtypes

CUST_ID                               int64
BALANCE                             float64
BALANCE_FREQUENCY                   float64
PURCHASES                           float64
ONEOFF_PURCHASES                    float64
INSTALLMENTS_PURCHASES              float64
CASH_ADVANCE                        float64
PURCHASES_FREQUENCY                 float64
ONEOFF_PURCHASES_FREQUENCY          float64
PURCHASES_INSTALLMENTS_FREQUENCY    float64
CASH_ADVANCE_FREQUENCY              float64
CASH_ADVANCE_TRX                    float64
PURCHASES_TRX                       float64
CREDIT_LIMIT                        float64
PAYMENTS                            float64
MINIMUM_PAYMENTS                    float64
PRC_FULL_PAYMENT                    float64
TENURE                              float64
dtype: object

### Q1. Highest CREDIT_LIMIT–BALANCE correlation across TENURE groups

In [9]:
# Working frame for Q1: just the grouping key and the two correlated columns.
q1_cols = ["TENURE", "CREDIT_LIMIT", "BALANCE"]
df_q1 = df_base.loc[:, q1_cols].copy()
df_q1.head(n=1)

Unnamed: 0,TENURE,CREDIT_LIMIT,BALANCE
0,12.0,1000.0,40.900749


In [10]:
# Distinct TENURE values, in order of first appearance.
pd.unique(df_q1["TENURE"])

array([12.,  8., 11.,  9., 10.,  7.,  6.])

In [15]:
# Worked example for one tenure value: correlation between CREDIT_LIMIT
# and BALANCE among the rows whose TENURE equals t.
t = 6

df_q1_sub = df_q1[df_q1["TENURE"] == t]
corr_matrix = df_q1_sub[["CREDIT_LIMIT", "BALANCE"]].corr()
val_corr = corr_matrix.loc["CREDIT_LIMIT", "BALANCE"]
val_corr

0.8680561878004212

In [16]:
# Correlation between CREDIT_LIMIT and BALANCE inside each TENURE group,
# collected in the order the TENURE values first appear in the data.
ls_corr = []
for t in df_q1["TENURE"].unique():
    df_q1_sub = df_q1.loc[df_q1["TENURE"] == t]
    val_corr = df_q1_sub[["CREDIT_LIMIT", "BALANCE"]].corr().iloc[0, 1]
    # append() grows the list in place instead of rebuilding it on every pass.
    ls_corr.append(val_corr)

ls_corr

[0.4608334883447319,
 0.820696287552146,
 0.38036039135870947,
 0.08547391277361133,
 0.2914823429859828,
 0.9484046188238867,
 0.8680561878004212]

In [19]:
# Largest per-tenure correlation, reported to two decimals.
best_corr = max(ls_corr)
round(best_corr, 2)

0.95

In [24]:
# Same per-TENURE correlation, computed in a single groupby call.
# Built from the Q1 working frame (df_q1) rather than the raw `df`, so the
# whole Q1 section derives from one source. The result is a 2x2 correlation
# matrix per TENURE value, indexed by (TENURE, column name).
df_corr = df_q1.groupby("TENURE")[["CREDIT_LIMIT", "BALANCE"]].corr()
df_corr

In [28]:
# Flatten the (TENURE, column) index, keep the CREDIT_LIMIT rows, and read
# their BALANCE entry: that is the CREDIT_LIMIT/BALANCE correlation per tenure.
df_corr_reset = df_corr.reset_index()
is_credit_row = df_corr_reset["level_1"] == "CREDIT_LIMIT"
df_corr_reset.loc[is_credit_row, "BALANCE"].round(2).max()

0.95

In [32]:
# Alternative lookup: take the BALANCE row of every per-tenure matrix via a
# cross-section on the inner index level, then read its CREDIT_LIMIT column.
balance_rows = df_corr.xs("BALANCE", level=1)
balance_rows["CREDIT_LIMIT"].round(2).max()

0.95

### Q2. K-means clustering on standardized features: largest cluster mean of ONEOFF_PURCHASES

In [34]:
# Working frame for Q2: every column except the customer identifier.
df_q2 = df_base.drop(columns=["CUST_ID"]).copy()

In [36]:
# Standardize every feature before clustering.
# A later cell adds a "cluster" label column to df_q2. Dropping it here
# (a no-op on a first run) keeps this cell safe to re-execute: the labels
# are never fed back into the scaler as if they were a feature.
arr_q2_nor = StandardScaler().fit_transform(
    df_q2.drop(columns="cluster", errors="ignore")
)

In [37]:
# Peek at the first standardized row.
arr_q2_nor[:1]

array([[-0.84876759, -0.41987944, -0.4419358 , -0.3740477 , -0.39530091,
        -0.48235389, -0.87270064, -0.80432139, -0.71961989, -0.68470093,
        -0.45791753, -0.56411617, -1.16166861, -0.55739615, -0.44372465,
        -0.46554357,  0.28242902]])

In [40]:
# Silhouette score of a K-means fit for each k in 2..5, on the standardized
# features. Scores are stored in the same order as k.
ls_sil = []
for k in range(2, 6):
    # n_init is pinned to 10 (the value scikit-learn 1.2 uses implicitly) so the
    # scores do not shift when the library default becomes "auto" in 1.4, and
    # so the FutureWarning about that change is not emitted.
    model = KMeans(n_clusters=k, n_init=10, random_state=1234)
    model.fit(arr_q2_nor)
    val_sil = silhouette_score(arr_q2_nor, labels=model.labels_)
    # append() grows the list in place instead of rebuilding it on every pass.
    ls_sil.append(val_sil)



In [41]:
# Silhouette scores for k = 2, 3, 4, 5 (same order as the loop above).
ls_sil

[0.3075281530456079,
 0.19636128772937608,
 0.207150984946399,
 0.19274056144483248]

In [42]:
# Final model with k = 2, the cluster count with the highest silhouette score.
# n_init is pinned to 10 (the value scikit-learn 1.2 uses implicitly) so the
# labels stay reproducible once the library default becomes "auto" in 1.4.
model = KMeans(n_clusters=2, n_init=10, random_state=1234)
model.fit(arr_q2_nor)



In [44]:
# Attach each row's cluster assignment to the Q2 frame and preview the result.
df_q2 = df_q2.assign(cluster=model.labels_)
df_q2.head(n=2)

Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE,cluster
0,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0.0,2.0,1000.0,201.802084,139.509787,0.0,12.0,0
1,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4.0,0.0,7000.0,4103.032597,1072.340217,0.222222,12.0,0


In [46]:
# Mean ONEOFF_PURCHASES per cluster; report the larger one to two decimals.
cluster_oneoff_mean = df_q2.groupby("cluster")["ONEOFF_PURCHASES"].mean()
cluster_oneoff_mean.round(2).max()

3946.19

In [47]:
# Number of customers assigned to each cluster.
cluster_sizes = df_q2["cluster"].value_counts()
cluster_sizes

0    802
1    198
Name: cluster, dtype: int64

### Q3. Decision-tree regression of ONEOFF_PURCHASES: test-set RMSE

In [48]:
# Reminder of the base frame's columns before building the Q3 split.
df_base.iloc[:1]

Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,10001,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0.0,2.0,1000.0,201.802084,139.509787,0.0,12.0


In [54]:
# Sanity check of the modulo operator used for the split below:
# a multiple of 4 leaves remainder 0, any other value a non-zero remainder.
16 % 4, 13 % 4

(0, 1)

In [57]:
# Customers whose CUST_ID is divisible by 4 form the test set; all others
# form the training set. CUST_ID serves only as the split key, so it is
# removed from both frames.
is_test = (df_base["CUST_ID"] % 4) == 0
df_train = df_base[~is_test].drop(columns="CUST_ID")
df_test = df_base[is_test].drop(columns="CUST_ID")
len(df_train), len(df_test)

(752, 248)

In [70]:
# Preview the first two training rows.
df_train.iloc[:2]

Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0.0,2.0,1000.0,201.802084,139.509787,0.0,12.0
1,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4.0,0.0,7000.0,4103.032597,1072.340217,0.222222,12.0


In [71]:
# Average BALANCE over the training rows.
df_train.loc[:, "BALANCE"].mean()

2418.7994263976066

In [59]:
# Fit a decision-tree regressor that predicts ONEOFF_PURCHASES from every
# other column, then predict on the held-out rows and show the first four.
q3_target = "ONEOFF_PURCHASES"
q3_X_train = df_train.drop(columns=q3_target)
q3_y_train = df_train[q3_target]

model_dt_r = DecisionTreeRegressor(random_state=1234)
model_dt_r.fit(X=q3_X_train, y=q3_y_train)

pred = model_dt_r.predict(df_test.drop(columns=q3_target))
pred[:4]

array([1500.,    0., 1490.,    0.])

In [60]:
from sklearn.metrics import mean_squared_error

In [63]:
# Test-set RMSE: square root of the mean squared error, to one decimal.
mse = mean_squared_error(y_true=df_test["ONEOFF_PURCHASES"], y_pred=pred)
round(mse ** 0.5, 1)

1039.2

In [64]:
# Short aliases for the true and predicted targets, used by the manual RMSE check.
y_t, y_p = df_test["ONEOFF_PURCHASES"], pred

In [68]:
# RMSE recomputed by hand to cross-check the library result:
# residuals -> squared -> averaged -> square root.
(y_t - y_p).pow(2).mean() ** 0.5

1039.193967231063

In [69]:
# Display the scikit-learn version this notebook was executed with.
import sklearn
sklearn.__version__

'1.2.1'