In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random

In [19]:
def resumetable(df):
    print(f'Dataset shape: {df.shape}')
    summary = pd.DataFrame(df.dtypes, columns=[' Type'])
    summary = summary.reset_index()
    summary = summary.rename(columns={'index': 'Feature'})
    summary['Missing Values Count'] = df.isnull().sum().values
    summary['Missing Values Percentage'] = df.isnull().sum().values / len(df) * 100
    summary['Unique Values Count'] = df.nunique().values
    summary['First Value'] = df.iloc[0].values
    summary['Second Value'] = df.iloc[1].values
    summary['Third Value'] = df.iloc[2].values
    summary['Second to Last Value'] = df.iloc[-2].values
    summary['Last Value'] = df.iloc[-1].values
    return summary

def seed_everything(seed = 9234):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    # torch.manual_seed(seed)
    # torch.cuda.manual_seed(seed)
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = True

seed_everything()

In [20]:
data = pd.read_csv("./Data/Soo_T_PATIENTS_DAILY_WHOLE_0812.csv")

In [21]:
resumetable(data)

Dataset shape: (26010, 59)


Unnamed: 0,Feature,Type,Missing Values Count,Missing Values Percentage,Unique Values Count,First Value,Second Value,Third Value,Second to Last Value,Last Value
0,sido,object,0,0.0,17,경기도,충청북도,대구광역시,경상남도,인천광역시
1,create_date,object,0,0.0,1530,2014-05-01,2014-05-01,2014-05-01,2023-09-30,2023-09-30
2,sido_cd,int64,0,0.0,17,31,33,22,38,23
3,weekend_yn,int64,0,0.0,2,0,0,0,1,1
4,grid_x,int64,0,0.0,15,60,69,89,91,55
5,grid_y,int64,0,0.0,16,120,107,90,77,124
6,sat_x,float64,0,0.0,1,0.0,0.0,0.0,0.0,0.0
7,sat_y,float64,0,0.0,1,0.0,0.0,0.0,0.0,0.0
8,min_ta,float64,0,0.0,253,9.6,10.5,13.5,18.3,18.9
9,max_ta,float64,0,0.0,259,24.2,23.7,25.3,25.5,23.3


In [22]:
lag_cols = [
    "min_tafeel", "max_tafeel", "mean_tafeel", "gap_tafeel",
    "ta_min_six_am1", "ta_max_six_am1", "ta_mean_six_am1", "ta_min_six_am2", "ta_max_six_am2", "ta_mean_six_am2",
    "ta_min_six_pm1", "ta_max_six_pm1", "ta_mean_six_pm1", "ta_min_six_pm2", "ta_max_six_pm2", "ta_mean_six_pm2",
    "min_ws", "max_ws", "mean_ws",
]

lag_data_1 = data[lag_cols][17:-17].reset_index(drop=True)
lag_data_2 = data[lag_cols][:-34].reset_index(drop=True)
lag_data_1.columns =  ["lag1_" + col for col in lag_data_1.columns]
lag_data_2.columns =  ["lag2_" + col for col in lag_data_2.columns]

In [23]:
lag_data_2.shape, lag_data_1.shape

((25976, 19), (25976, 19))

In [24]:
from category_encoders import BinaryEncoder
be = BinaryEncoder()

sido_en = be.fit_transform(data["sido"])

In [25]:
# data = pd.concat()pd.DataFrame([sido_en, data.jenks_cluster])
new_data = pd.concat(
    [
        sido_en, data.min_tafeel, data.max_tafeel, data.mean_tafeel, data.gap_tafeel,
        data.ta_min_six_am1, data.ta_max_six_am1, data.ta_mean_six_am1, data.ta_min_six_am2, data.ta_max_six_am2, data.ta_mean_six_am2,
        data.ta_min_six_pm1, data.ta_max_six_pm1, data.ta_mean_six_pm1, data.ta_min_six_pm2, data.ta_max_six_pm2, data.ta_mean_six_pm2,
        data.min_ws, data.max_ws, data.mean_ws,
        data.jenks_cluster,
    ],
    axis=1,
)
new_data = new_data[34:].reset_index(drop=True)
new_data = pd.concat([new_data, lag_data_1], axis=1)
new_data = pd.concat([new_data, lag_data_2], axis=1)
print(new_data.shape)
new_data

(25976, 63)


Unnamed: 0,sido_0,sido_1,sido_2,sido_3,sido_4,min_tafeel,max_tafeel,mean_tafeel,gap_tafeel,ta_min_six_am1,...,lag2_ta_mean_six_am2,lag2_ta_min_six_pm1,lag2_ta_max_six_pm1,lag2_ta_mean_six_pm1,lag2_ta_min_six_pm2,lag2_ta_max_six_pm2,lag2_ta_mean_six_pm2,lag2_min_ws,lag2_max_ws,lag2_mean_ws
0,0,0,0,0,1,10.17,18.43,14.4,8.3,9.0,...,15,21,24,22,14,21,17,0.3,3.5,1.7
1,0,0,0,1,0,10.26,17.55,14.1,7.3,9.0,...,15,21,23,22,15,22,18,0.0,3.7,1.6
2,0,0,0,1,1,12.07,19.11,15.7,7.0,12.0,...,17,21,25,23,18,24,20,0.1,3.7,1.9
3,0,0,1,0,0,10.75,18.73,14.7,8.0,8.0,...,15,20,23,22,12,21,16,0.1,4.1,1.6
4,0,0,1,0,1,13.11,20.11,17.0,7.0,13.0,...,17,22,25,24,19,24,21,0.4,4.7,2.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25971,0,1,1,0,1,18.54,23.87,21.6,5.3,22.0,...,24,26,29,28,23,26,24,0.7,3.5,2.5
25972,0,1,1,1,0,18.16,24.52,21.6,6.4,15.0,...,22,24,26,25,20,24,21,0.0,1.5,0.7
25973,0,1,1,1,1,17.84,23.65,21.1,5.8,19.0,...,23,25,27,26,20,25,22,1.1,3.8,2.2
25974,1,0,0,0,0,20.86,25.05,23.1,4.2,21.0,...,25,26,30,29,21,26,23,0.0,2.9,1.4


In [26]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from xgboost import XGBClassifier
import optuna
from sklearn.metrics import accuracy_score

In [27]:
X = new_data.drop(columns=["jenks_cluster"])
y = new_data.jenks_cluster

seed_num = 43

X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=seed_num)
X_train, X_valid, y_train, y_valid = train_test_split(X_temp, y_temp, test_size=0.1, stratify=y_temp, random_state=seed_num)

In [28]:
X_train.shape, X_valid.shape, X_test.shape

((18702, 62), (2078, 62), (5196, 62))

In [29]:
y_train.shape, y_valid.shape, y_test.shape

((18702,), (2078,), (5196,))

In [32]:
def objective(trial):
    param = {
        'verbosity': 0,
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        # 'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.1),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'eval_stopping_rounds': 10,
        'tree_method': 'gpu_hist',
    }
    
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed_num)
    accuracy_scores = []
    
    for train_idx, valid_idx in skf.split(X_train, y_train):
        X_train_fold, X_valid_fold = X_train.iloc[train_idx], X_train.iloc[valid_idx]
        y_train_fold, y_valid_fold = y_train.iloc[train_idx], y_train.iloc[valid_idx]
    
        model = XGBClassifier(**param)
        model.fit(
            X_train_fold, y_train_fold,
            eval_set=[(X_valid_fold, y_valid_fold)],
            verbose=False,
        )

        y_pred = model.predict(X_valid_fold)
        accuracy = accuracy_score(y_valid_fold, y_pred)
        accuracy_scores.append(accuracy)
    
    return np.mean(accuracy_scores)

In [33]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print(f"Best trial: {study.best_trial.value}")
print(f"Best parameters: {study.best_params}")


[I 2024-09-24 16:48:32,931] A new study created in memory with name: no-name-dd242377-15f5-476d-bcef-0010419a966c
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.1),
[I 2024-09-24 16:50:13,326] Trial 0 finished with value: 0.8480379434707469 and parameters: {'n_estimators': 866, 'max_depth': 10, 'learning_rate': 0.016981173397840506}. Best is trial 0 with value: 0.8480379434707469.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.1),
[I 2024-09-24 16:51:04,373] Trial 1 finished with value: 0.8486261644703081 and parameters: {'n_estimators': 717, 'max_depth': 8, 'learning_rate': 0.02394662128669831}. Best is trial 1 with value: 0.8486261644703081.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.1),
[I 2024-09-24 16:51:23,346] Trial 2 finished with value: 0.8468613585260597 and parameters: {'n_estimators': 129, 'max_depth': 10, 'learning_rate': 0.03747517884300082}. Best is trial 1 with value: 0.8486261644703081.
  'learning_r

Best trial: 0.8516201021489007
Best parameters: {'n_estimators': 199, 'max_depth': 5, 'learning_rate': 0.03552389404617955}


In [34]:
# 3. 최적의 모델로 최종 테스트 데이터 성능 평가
best_model = XGBClassifier(**study.best_params)
best_model.fit(X_train, y_train)

preds = best_model.predict(X_test)


In [35]:
from sklearn.metrics import confusion_matrix # 혼동행렬
from sklearn.metrics import accuracy_score # 정확도
from sklearn.metrics import precision_score # 정밀도
from sklearn.metrics import recall_score # 재현율
from sklearn.metrics import f1_score # f1 스코어

In [36]:
def cal_class_score(y_test, preds, type_average="macro"):
    mask_class_0 = y_test == 0
    mask_class_1 = y_test == 1
    mask_class_2 = y_test == 2
    
    results = pd.DataFrame(
        {
            "class0": cal_matrix(y_test[mask_class_0], preds[mask_class_0], type_average),
            "class1": cal_matrix(y_test[mask_class_1], preds[mask_class_1], type_average),
            "class2": cal_matrix(y_test[mask_class_2], preds[mask_class_2], type_average),
        }
    )
    results = results.T
    results.columns = ["accuracy", "precision", "recall", "F1"]
    return results

def cal_matrix(y_test, preds, type_average):
    result_class = []
    result_class.append(accuracy_score(y_test, preds))
    result_class.append(precision_score(y_test, preds, average=type_average).tolist())
    result_class.append(recall_score(y_test, preds, average=type_average).tolist())
    result_class.append(f1_score(y_test, preds, average=type_average).tolist())
    return result_class

In [37]:
test = cal_class_score(y_test, preds, type_average="macro")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [38]:
test

Unnamed: 0,accuracy,precision,recall,F1
class0,0.945838,0.5,0.472919,0.486082
class1,0.593863,0.333333,0.197954,0.248396
class2,0.4,0.333333,0.133333,0.190476


In [39]:
from sklearn.metrics import classification_report
print(classification_report(y_test, preds, digits=3))

              precision    recall  f1-score   support

           0      0.895     0.946     0.920      3988
           1      0.706     0.594     0.645      1108
           2      0.816     0.400     0.537       100

    accuracy                          0.860      5196
   macro avg      0.806     0.647     0.701      5196
weighted avg      0.853     0.860     0.854      5196

