In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random

In [6]:
def resumetable(df):
    print(f'Dataset shape: {df.shape}')
    summary = pd.DataFrame(df.dtypes, columns=[' Type'])
    summary = summary.reset_index()
    summary = summary.rename(columns={'index': 'Feature'})
    summary['Missing Values Count'] = df.isnull().sum().values
    summary['Missing Values Percentage'] = df.isnull().sum().values / len(df) * 100
    summary['Unique Values Count'] = df.nunique().values
    summary['First Value'] = df.iloc[0].values
    summary['Second Value'] = df.iloc[1].values
    summary['Third Value'] = df.iloc[2].values
    summary['Second to Last Value'] = df.iloc[-2].values
    summary['Last Value'] = df.iloc[-1].values
    return summary

def seed_everything(seed = 9234):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    # torch.manual_seed(seed)
    # torch.cuda.manual_seed(seed)
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = True

seed_everything()

In [7]:
data = pd.read_csv("../Data/Soo_T_PATIENTS_DAILY_WHOLE_0812.csv")

In [8]:
resumetable(data)

Dataset shape: (26010, 59)


Unnamed: 0,Feature,Type,Missing Values Count,Missing Values Percentage,Unique Values Count,First Value,Second Value,Third Value,Second to Last Value,Last Value
0,sido,object,0,0.0,17,경기도,충청북도,대구광역시,경상남도,인천광역시
1,create_date,object,0,0.0,1530,2014-05-01,2014-05-01,2014-05-01,2023-09-30,2023-09-30
2,sido_cd,int64,0,0.0,17,31,33,22,38,23
3,weekend_yn,int64,0,0.0,2,0,0,0,1,1
4,grid_x,int64,0,0.0,15,60,69,89,91,55
5,grid_y,int64,0,0.0,16,120,107,90,77,124
6,sat_x,float64,0,0.0,1,0.0,0.0,0.0,0.0,0.0
7,sat_y,float64,0,0.0,1,0.0,0.0,0.0,0.0,0.0
8,min_ta,float64,0,0.0,253,9.6,10.5,13.5,18.3,18.9
9,max_ta,float64,0,0.0,259,24.2,23.7,25.3,25.5,23.3


In [13]:
cols = ["max_tafeel", "ta_max_six_pm2", "mean_wbtemp", "popular_man", "max_ta", "popular_woman", "max_wbtemp", "ta_min_six_pm1",
        "ta_mean_six_pm2", "ta_mean_six_pm1", "mean_tafeel", "agriculture_woman", "agriculture_man", "ta_max_six_am2", "min_wbtemp",
        "jenks_cluster"]

In [14]:
new_data = data[cols]
print(new_data.shape)
new_data.head()

(26010, 16)


Unnamed: 0,max_tafeel,ta_max_six_pm2,mean_wbtemp,popular_man,max_ta,popular_woman,max_wbtemp,ta_min_six_pm1,ta_mean_six_pm2,ta_mean_six_pm1,mean_tafeel,agriculture_woman,agriculture_man,ta_max_six_am2,min_wbtemp,jenks_cluster
0,22.04,21,11.6,6219813,24.2,6138017,14.2,21,17,22,17.0,188594,186278,21,8.8,0
1,22.16,22,12.0,796141,23.7,782792,14.9,21,18,22,17.5,95065,92687,21,9.3,0
2,23.66,24,14.2,1241119,25.3,1252145,16.1,21,20,23,19.6,24142,25045,21,12.7,0
3,21.88,21,12.3,931536,23.6,940024,14.6,20,16,22,17.2,128116,122465,20,9.5,0
4,23.95,24,14.4,1356182,25.3,1344612,16.6,22,21,24,19.9,229448,217473,22,12.2,0


In [20]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import optuna
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

In [21]:
X = new_data.drop(columns=["jenks_cluster"])
y = new_data.jenks_cluster

seed_num = 43

In [24]:
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=seed_num) # 20% test data set
# SMOTE
X_resampled, y_resampled = SMOTE(random_state=seed_num).fit_resample(X_temp, y_temp)
X_train, X_valid, y_train, y_valid = train_test_split(X_resampled, y_resampled, test_size=0.125, stratify=y_resampled, random_state=seed_num) # 10% valid data set

In [25]:
X_train.shape, X_valid.shape, X_test.shape

((41944, 15), (5993, 15), (5202, 15))

In [26]:
y_train.shape, y_valid.shape, y_test.shape

((41944,), (5993,), (5202,))

In [29]:
def objective(trial):
    param = {
        'verbosity': 0,
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'eval_stopping_rounds': 10,
        'tree_method': 'gpu_hist',
    }
    
    model = XGBClassifier(**param)
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        verbose=False,
    )
    
    y_pred = model.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    
    return accuracy

In [30]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print(f"Best trial: {study.best_trial.value}")
print(f"Best parameters: {study.best_params}")


[I 2024-09-26 15:58:42,592] A new study created in memory with name: no-name-1db1941d-0fa2-47f3-9dbb-e5b47522f045
[I 2024-09-26 15:59:08,623] Trial 0 finished with value: 0.8788586684465209 and parameters: {'n_estimators': 887, 'max_depth': 10, 'learning_rate': 0.0020211908098971064}. Best is trial 0 with value: 0.8788586684465209.
[I 2024-09-26 15:59:13,753] Trial 1 finished with value: 0.8878691807108293 and parameters: {'n_estimators': 197, 'max_depth': 10, 'learning_rate': 0.019843724607529662}. Best is trial 1 with value: 0.8878691807108293.
[I 2024-09-26 15:59:22,661] Trial 2 finished with value: 0.8738528282996829 and parameters: {'n_estimators': 451, 'max_depth': 9, 'learning_rate': 0.007756552180288447}. Best is trial 1 with value: 0.8878691807108293.
[I 2024-09-26 15:59:29,604] Trial 3 finished with value: 0.9095611546804605 and parameters: {'n_estimators': 767, 'max_depth': 7, 'learning_rate': 0.05903598797414824}. Best is trial 3 with value: 0.9095611546804605.
[I 2024-09-2

Best trial: 0.9252461204738862
Best parameters: {'n_estimators': 931, 'max_depth': 9, 'learning_rate': 0.08839000192227013}


In [31]:
# 3. 최적의 모델로 최종 테스트 데이터 성능 평가
best_model = XGBClassifier(**study.best_params)
best_model.fit(X_train, y_train)

preds = best_model.predict(X_test)


In [32]:
from sklearn.metrics import confusion_matrix # 혼동행렬
from sklearn.metrics import accuracy_score # 정확도
from sklearn.metrics import precision_score # 정밀도
from sklearn.metrics import recall_score # 재현율
from sklearn.metrics import f1_score # f1 스코어

In [33]:
def cal_class_score(y_test, preds, type_average="macro"):
    mask_class_0 = y_test == 0
    mask_class_1 = y_test == 1
    mask_class_2 = y_test == 2
    
    results = pd.DataFrame(
        {
            "class0": cal_matrix(y_test[mask_class_0], preds[mask_class_0], type_average),
            "class1": cal_matrix(y_test[mask_class_1], preds[mask_class_1], type_average),
            "class2": cal_matrix(y_test[mask_class_2], preds[mask_class_2], type_average),
        }
    )
    results = results.T
    results.columns = ["accuracy", "precision", "recall", "F1"]
    return results

def cal_matrix(y_test, preds, type_average):
    result_class = []
    result_class.append(accuracy_score(y_test, preds))
    result_class.append(precision_score(y_test, preds, average=type_average).tolist())
    result_class.append(recall_score(y_test, preds, average=type_average).tolist())
    result_class.append(f1_score(y_test, preds, average=type_average).tolist())
    return result_class

In [34]:
test = cal_class_score(y_test, preds, type_average="macro")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [35]:
test

Unnamed: 0,accuracy,precision,recall,F1
class0,0.884105,0.333333,0.294702,0.312829
class1,0.650722,0.333333,0.216907,0.262803
class2,0.59596,0.333333,0.198653,0.248945


In [36]:
from sklearn.metrics import classification_report
print(classification_report(y_test, preds, digits=3))

              precision    recall  f1-score   support

           0      0.913     0.884     0.898      3995
           1      0.590     0.651     0.619      1108
           2      0.522     0.596     0.557        99

    accuracy                          0.829      5202
   macro avg      0.675     0.710     0.691      5202
weighted avg      0.837     0.829     0.832      5202

