In [16]:
import numpy as np
import pandas as pd
import random
from datetime import datetime, timedelta

from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [17]:
# Seed NumPy's global RNG so every draw below is reproducible.
np.random.seed(42)

n_customers = 1000
repurchase_probs = [0.7, 0.3]  # P(Repurchase == 0), P(Repurchase == 1)

# --- Weighted dataset: OrderAmount is derived from the Repurchase label ---
# The draw order (labels, high pool, low pool) is kept fixed on purpose,
# because each call consumes state from the shared global RNG.
weighted_labels = np.random.choice([0, 1], size=n_customers, p=repurchase_probs)
high_amounts = np.random.randint(500, 1000, size=n_customers)
low_amounts = np.random.randint(10, 500, size=n_customers)

customer_orders_weighted = pd.DataFrame({
    "CustomerID": np.arange(1, n_customers + 1),
    "OrderDate": pd.date_range(start="2023-01-01", periods=n_customers, freq="D"),
    "Repurchase": weighted_labels,
    # Repurchasers get an amount in [500, 1000); everyone else in [10, 500).
    "OrderAmount": np.where(weighted_labels == 1, high_amounts, low_amounts),
})

# --- Random dataset: OrderAmount is drawn independently of the label ---
random_labels = np.random.choice([0, 1], size=n_customers, p=repurchase_probs)
random_amounts = np.random.randint(10, 1000, size=n_customers)

customer_orders_random = pd.DataFrame({
    "CustomerID": np.arange(1, n_customers + 1),
    "OrderDate": pd.date_range(start="2023-01-01", periods=n_customers, freq="D"),
    "Repurchase": random_labels,
    "OrderAmount": random_amounts,
})

In [18]:
# Display the weighted dataset (the notebook renders a cell's last expression).
customer_orders_weighted

Unnamed: 0,CustomerID,OrderDate,Repurchase,OrderAmount
0,1,2023-01-01,0,335
1,2,2023-01-02,1,511
2,3,2023-01-03,1,817
3,4,2023-01-04,0,388
4,5,2023-01-05,0,436
...,...,...,...,...
995,996,2025-09-22,0,140
996,997,2025-09-23,1,770
997,998,2025-09-24,0,227
998,999,2025-09-25,1,931


In [19]:
# Display the random dataset (the notebook renders a cell's last expression).
customer_orders_random

Unnamed: 0,CustomerID,OrderDate,Repurchase,OrderAmount
0,1,2023-01-01,0,137
1,2,2023-01-02,0,831
2,3,2023-01-03,1,486
3,4,2023-01-04,1,769
4,5,2023-01-05,0,413
...,...,...,...,...
995,996,2025-09-22,1,347
996,997,2025-09-23,1,969
997,998,2025-09-24,1,175
998,999,2025-09-25,0,392


In [20]:
# Feature matrix: OrderAmount only, kept 2-D as scikit-learn expects for X.
X = customer_orders_random[['OrderAmount']]
# Target as a 1-D Series of shape (n_samples,), the conventional shape for a
# single-output target; this also matches how `y` is built later in the
# notebook (customer_orders_random['Repurchase']).
y = customer_orders_random['Repurchase']

# Split into train and test sets: 20% of the rows are held out, and the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=111)

In [21]:
# Decision tree classifier, fitted on the training split. The fixed
# random_state keeps the fitted tree reproducible; `fit` returns the
# estimator itself, so construction and fitting are chained.
dt = DecisionTreeClassifier(random_state=111).fit(X_train, y_train)

# Predicted labels for the held-out test rows.
y_pred = dt.predict(X_test)

In [22]:
# Show the fitted estimator's repr.
dt

DecisionTreeClassifier(random_state=111)

In [23]:
# Show the predicted labels for the test rows.
y_pred

array([0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 0])

In [24]:
# Accuracy: the fraction of test rows whose label was predicted correctly.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(acc)

0.555


In [25]:
# Same single-feature pipeline, now on the weighted dataset.
X = customer_orders_weighted[['OrderAmount']]
# 1-D target of shape (n_samples,), the conventional shape for a
# single-output classifier target.
y = customer_orders_weighted['Repurchase']

# Split into train and test sets (20% held out, reproducible split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=111)

# Decision tree classifier
dt = DecisionTreeClassifier(random_state=111)
dt.fit(X_train, y_train)

# Predicted labels for the test rows
y_pred = dt.predict(X_test)

# Accuracy.
# NOTE: in this dataset OrderAmount was generated directly from the label
# (>= 500 exactly when Repurchase == 1), so a single threshold separates the
# classes. A perfect score here reflects how the data was built, not the
# strength of the model.
acc = accuracy_score(y_test, y_pred)
print(acc)

1.0


### 결측값을 만들어서, 해당 결측값을 다양하게 처리해 보고 실제 성능도 확인해 보자!

In [26]:
# Display the random dataset again before missing values are injected into it.
customer_orders_random

Unnamed: 0,CustomerID,OrderDate,Repurchase,OrderAmount
0,1,2023-01-01,0,137
1,2,2023-01-02,0,831
2,3,2023-01-03,1,486
3,4,2023-01-04,1,769
4,5,2023-01-05,0,413
...,...,...,...,...
995,996,2025-09-22,1,347
996,997,2025-09-23,1,969
997,998,2025-09-24,1,175
998,999,2025-09-25,0,392


In [27]:
# NaN cannot be stored in an integer column. Cast OrderAmount to float64
# explicitly rather than relying on pandas silently upcasting during the
# `.loc` assignment below (implicit upcasting on setitem is deprecated in
# recent pandas versions). The resulting dtype is float64 either way.
customer_orders_random['OrderAmount'] = customer_orders_random['OrderAmount'].astype('float64')

# Pick 10% of the row labels, without replacement, to become missing.
missing_indices = np.random.choice(
    customer_orders_random.index,
    size=int(0.1 * len(customer_orders_random)),
    replace=False,
)

# Replace the selected OrderAmount values with NaN.
# NOTE: this mutates customer_orders_random in place, so re-running only this
# cell draws a new sample and blanks additional rows. Re-run from the data
# generation cell to get back to exactly 10% missing.
customer_orders_random.loc[missing_indices, 'OrderAmount'] = np.nan

In [28]:
# Count missing values per column to confirm that NaNs now exist.
customer_orders_random.isna().sum()

CustomerID       0
OrderDate        0
Repurchase       0
OrderAmount    100
dtype: int64

In [30]:
# Build the model inputs from the frame that now contains missing values.
# Nothing is removed here: X is reshaped to a 2-D (n_samples, 1) array that
# still holds the NaNs, and y is the 1-D Repurchase Series.
X= customer_orders_random['OrderAmount'].values.reshape(-1,1)
y= customer_orders_random['Repurchase']

In [31]:
## Strategy: drop the rows whose feature value is missing

# True where OrderAmount is present. The same mask filters both X and y so
# the features and labels stay aligned row-for-row.
observed = ~np.isnan(X[:, 0])
X_no_na = X[observed]
y_no_na = y[observed]

X_train, X_test, y_train, y_test = train_test_split(
    X_no_na, y_no_na, test_size=0.2, random_state=111
)

# Decision tree classifier fitted on the complete rows only
dt = DecisionTreeClassifier(random_state=111).fit(X_train, y_train)

# Predicted labels for the test rows
y_pred = dt.predict(X_test)

# Accuracy
acc = accuracy_score(y_test, y_pred)
print(acc)

0.5555555555555556


## 0으로 대치 방식

In [32]:
# Zero imputation: every missing OrderAmount is replaced with 0.
nan_mask = np.isnan(X)
X_zero_imputed = np.where(nan_mask, 0, X)

# This split covers all rows, whereas the drop-NA cell split only the
# non-missing rows. The two test sets therefore differ, and the accuracies
# are not measured on the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    X_zero_imputed, y, test_size=0.2, random_state=111
)

# Refit the existing tree object `dt` (created in an earlier cell) on the
# imputed training data, then predict the test rows.
y_pred = dt.fit(X_train, y_train).predict(X_test)

acc_zero_imputed = accuracy_score(y_test, y_pred)
print("0으로 대치한 정확도:", acc_zero_imputed)

0으로 대치한 정확도: 0.6


- KNN 방식보다 0.01정도 올라갔다