<a href="https://colab.research.google.com/github/hayannn/MeMI_ALFFEL_DATATHON/blob/main/%5Bkeyword2%EC%B0%A8%ED%95%84%ED%84%B0%EB%A7%81%2BOptuna%ED%8A%9C%EB%8B%9D%5D_MeMI_ICU_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 변경점
- 적용 코드2
```python
antibiotics = [
    'GENTAMICIN', 'OXACILLIN', 'ERYTHROMYCIN', 'PENICILLIN',
       'LEVOFLOXACIN', 'NITROFURANTOIN', 'PIPERACILLIN/TAZO', 'MEROPENEM',
       'CEFTAZIDIME', 'CEFAZOLIN', 'CEFEPIME', 'TRIMETHOPRIM/SULFA',
       'TOBRAMYCIN', 'IMIPENEM', 'CEFTRIAXONE', 'CIPROFLOXACIN',
       'VANCOMYCIN', 'CLINDAMYCIN', 'TETRACYCLINE', 'RIFAMPIN',
       'CHLORAMPHENICOL', 'AMPICILLIN', 'LINEZOLID', 'PIPERACILLIN',
       'AMPICILLIN/SULBACTAM', 'CEFUROXIME', 'PENICILLIN G', 'DAPTOMYCIN',
       'AMIKACIN', 'CEFPODOXIME'
]
pattern = '|'.join(antibiotics)
antibiotics_prescriptions = data[data['DRUG'].str.contains(pattern, case=False, na=False)]
```

```python
def data_rename(df, renames):
    renames_df = df[df['DRUG'].str.contains(renames, case=False, na=False)]
    unique_list = renames_df['DRUG'].unique()
    df['DRUG'] = df['DRUG'].replace(unique_list, renames)
    return df
```

```python
for name in antibiotics:
    antibiotics_prescriptions = data_rename(antibiotics_prescriptions, name)
```

---

# 데이터셋 준비

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
import os
import pandas as pd

# CSV 파일들이 위치한 디렉토리 경로
dir_path = '/content/drive/MyDrive/mimic-iii-clinical-database-1.4/'

# 불러올 파일 리스트
target_files = ['ICUSTAYS.csv', 'PRESCRIPTIONS.csv', 'LABEVENTS.csv', 'ADMISSIONS.csv']

# 디렉토리 내 모든 파일 목록을 가져옴
files = os.listdir(dir_path)

# 지정된 파일만 불러오기
for file in files:
    file_path = os.path.join(dir_path, file)

    # 파일 이름이 target_files 목록에 있는지 확인
    if file in target_files and file.endswith('.csv'):
        try:
            # 파일을 읽어서 DataFrame으로 저장
            df = pd.read_csv(file_path, low_memory=False, on_bad_lines='skip')

            # 열 이름을 소문자로 변환
            df.columns = [col.lower() for col in df.columns]

            # 'row_id' 컬럼이 있으면 삭제
            if 'row_id' in df.columns:
                df = df.drop(columns=['row_id'])

            # DataFrame 이름을 파일 이름에 맞게 동적으로 할당
            dataframe_name = file.split('.')[0].lower()
            globals()[dataframe_name] = df

            print(f"Loaded DataFrame: {dataframe_name}, Shape: {df.shape}")

        except Exception as e:
            print(f"Error processing {file}: {e}")
    else:
        print(f"Skipping file: {file}")

Skipping file: CPTEVENTS.csv
Skipping file: D_ICD_DIAGNOSES.csv
Skipping file: D_ITEMS.csv
Loaded DataFrame: admissions, Shape: (58976, 18)
Skipping file: checksum_md5_unzipped.txt
Skipping file: D_ICD_PROCEDURES.csv
Skipping file: CALLOUT.csv
Skipping file: checksum_md5_zipped.txt
Skipping file: D_CPT.csv
Skipping file: CAREGIVERS.csv
Skipping file: DATETIMEEVENTS.csv
Skipping file: D_LABITEMS.csv
Loaded DataFrame: icustays, Shape: (61532, 11)
Skipping file: LICENSE.txt
Loaded DataFrame: labevents, Shape: (27854055, 8)
Skipping file: DIAGNOSES_ICD.csv
Skipping file: INPUTEVENTS_MV.csv
Skipping file: INPUTEVENTS_CV.csv
Skipping file: MICROBIOLOGYEVENTS.csv
Skipping file: DRGCODES.csv
Skipping file: README.md
Skipping file: SERVICES.csv
Skipping file: PROCEDURES_ICD.csv
Skipping file: PROCEDUREEVENTS_MV.csv
Skipping file: NOTEEVENTS.csv
Skipping file: Mimic_Data_Review.ipynb
Loaded DataFrame: prescriptions, Shape: (4156450, 18)
Skipping file: OUTPUTEVENTS.csv
Skipping file: SHA256SUMS.t

In [None]:
icustays.head()

Unnamed: 0,subject_id,hadm_id,icustay_id,dbsource,first_careunit,last_careunit,first_wardid,last_wardid,intime,outtime,los
0,268,110404,280836,carevue,MICU,MICU,52,52,2198-02-14 23:27:38,2198-02-18 05:26:11,3.249
1,269,106296,206613,carevue,MICU,MICU,52,52,2170-11-05 11:05:29,2170-11-08 17:46:57,3.2788
2,270,188028,220345,carevue,CCU,CCU,57,57,2128-06-24 15:05:20,2128-06-27 12:32:29,2.8939
3,271,173727,249196,carevue,MICU,SICU,52,23,2120-08-07 23:12:42,2120-08-10 00:39:04,2.06
4,272,164716,210407,carevue,CCU,CCU,57,57,2186-12-25 21:08:04,2186-12-27 12:01:13,1.6202


## Analyzing antibiotic usage in ICU patients and predicting treatment success

Step 1: Data Loading and Preprocessing

In [None]:
# Merge ICUSTAYS with ADMISSIONS to include 'hospital_expire_flag'
icustays = icustays.merge(admissions[['subject_id', 'hadm_id', 'hospital_expire_flag']],
                          on=['subject_id', 'hadm_id'], how='left')

icustays.head()

Unnamed: 0,subject_id,hadm_id,icustay_id,dbsource,first_careunit,last_careunit,first_wardid,last_wardid,intime,outtime,los,hospital_expire_flag
0,268,110404,280836,carevue,MICU,MICU,52,52,2198-02-14 23:27:38,2198-02-18 05:26:11,3.249,1
1,269,106296,206613,carevue,MICU,MICU,52,52,2170-11-05 11:05:29,2170-11-08 17:46:57,3.2788,0
2,270,188028,220345,carevue,CCU,CCU,57,57,2128-06-24 15:05:20,2128-06-27 12:32:29,2.8939,0
3,271,173727,249196,carevue,MICU,SICU,52,23,2120-08-07 23:12:42,2120-08-10 00:39:04,2.06,0
4,272,164716,210407,carevue,CCU,CCU,57,57,2186-12-25 21:08:04,2186-12-27 12:01:13,1.6202,0


In [None]:
# Filter ICU stays data
icustays = icustays[['subject_id', 'hadm_id', 'icustay_id', 'los', 'hospital_expire_flag']]

Step 2: Filter Antibiotics Data
- Extract antibiotic prescriptions from the PRESCRIPTIONS table.

- V2

In [None]:
antibiotics = [
    'GENTAMICIN', 'OXACILLIN', 'ERYTHROMYCIN', 'PENICILLIN',
       'LEVOFLOXACIN', 'NITROFURANTOIN', 'PIPERACILLIN/TAZO', 'MEROPENEM',
       'CEFTAZIDIME', 'CEFAZOLIN', 'CEFEPIME', 'TRIMETHOPRIM/SULFA',
       'TOBRAMYCIN', 'IMIPENEM', 'CEFTRIAXONE', 'CIPROFLOXACIN',
       'VANCOMYCIN', 'CLINDAMYCIN', 'TETRACYCLINE', 'RIFAMPIN',
       'CHLORAMPHENICOL', 'AMPICILLIN', 'LINEZOLID', 'PIPERACILLIN',
       'AMPICILLIN/SULBACTAM', 'CEFUROXIME', 'PENICILLIN G', 'DAPTOMYCIN',
       'AMIKACIN', 'CEFPODOXIME'
]

pattern = '|'.join(antibiotics)

antibiotics_prescriptions = prescriptions[prescriptions['drug'].str.contains(pattern, case=False, na=False)]

In [None]:
def data_rename(df, renames):
    renames_df = df[df['drug'].str.contains(renames, case=False, na=False)]

    unique_list = renames_df['drug'].unique()

    df['drug'] = df['drug'].replace(unique_list, renames)

    return df

In [None]:
for name in antibiotics:
    antibiotics_prescriptions = data_rename(antibiotics_prescriptions, name)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['drug'] = df['drug'].replace(unique_list, renames)


Step 3: Filter Infection-Related Lab Tests
- Extract relevant infection markers from the LABEVENTS table.

In [None]:
# Filter LABEVENTS for relevant markers (e.g., WBC, CRP, Procalcitonin)
infection_markers = [51300, 51301, 51200, 51000]  # Replace with actual ITEMIDs from LABEVENTS
infection_tests = labevents[labevents['itemid'].isin(infection_markers)]

# Keep relevant columns
infection_tests = infection_tests[['subject_id', 'hadm_id', 'itemid', 'valuenum', 'charttime']]
infection_tests.head()

Unnamed: 0,subject_id,hadm_id,itemid,valuenum,charttime
52,3,,51200,1.8,2101-10-14 03:00:00
69,3,,51301,9.9,2101-10-14 03:00:00
101,3,,51301,9.7,2101-10-15 03:30:00
136,3,,51301,10.5,2101-10-16 04:00:00
190,3,145834.0,51200,2.8,2101-10-22 04:00:00


Step 4: Merge Data
- Combine ICU stay information, antibiotic usage, and lab results into a single dataset.

In [None]:
# Merge ICU stays with antibiotics
data = icustays.merge(antibiotics_prescriptions, on=['subject_id', 'hadm_id'], how='inner')

# Merge with lab test results
data = data.merge(infection_tests, on=['subject_id', 'hadm_id'], how='left')

# Drop rows with missing survival information
data = data.dropna(subset=['hospital_expire_flag'])

# Ensure all date columns are datetime
data['startdate'] = pd.to_datetime(data['startdate'])
data['enddate'] = pd.to_datetime(data['enddate'])
data['charttime'] = pd.to_datetime(data['charttime'])
data.head()

Unnamed: 0,subject_id,hadm_id,icustay_id_x,los,hospital_expire_flag,icustay_id_y,startdate,enddate,drug_type,drug,...,ndc,prod_strength,dose_val_rx,dose_unit_rx,form_val_disp,form_unit_disp,route,itemid,valuenum,charttime
0,268,110404,280836,3.249,1,280836.0,2198-02-16,2198-02-18,MAIN,LEVOFLOXACIN,...,45152010.0,250mg Tab,250,mg,1,TAB,PO,51301.0,9.8,2198-02-16 20:50:00
1,268,110404,280836,3.249,1,280836.0,2198-02-16,2198-02-18,MAIN,LEVOFLOXACIN,...,45152010.0,250mg Tab,250,mg,1,TAB,PO,51200.0,0.0,2198-02-17 02:57:00
2,268,110404,280836,3.249,1,280836.0,2198-02-16,2198-02-18,MAIN,LEVOFLOXACIN,...,45152010.0,250mg Tab,250,mg,1,TAB,PO,51301.0,14.1,2198-02-17 02:57:00
3,268,110404,280836,3.249,1,280836.0,2198-02-16,2198-02-18,MAIN,LEVOFLOXACIN,...,45152010.0,250mg Tab,250,mg,1,TAB,PO,51200.0,1.0,2198-02-11 10:40:00
4,268,110404,280836,3.249,1,280836.0,2198-02-16,2198-02-18,MAIN,LEVOFLOXACIN,...,45152010.0,250mg Tab,250,mg,1,TAB,PO,51301.0,10.3,2198-02-11 10:40:00


Step 5: Feature Engineering

Prepare features for modeling, including:
- Aggregating lab test results.
- Creating antibiotic duration.

In [None]:
# Calculate duration of antibiotic therapy
data['antibiotic_duration'] = (data['enddate'] - data['startdate']).dt.days

# Aggregate lab test results (mean and max values for each lab test per patient)
lab_features = data.groupby(['subject_id', 'hadm_id', 'itemid'])['valuenum'].agg(['mean', 'max']).unstack(fill_value=0)
lab_features.columns = ['_'.join(map(str, col)) for col in lab_features.columns]

# Merge lab features back into the dataset
data = data.groupby(['subject_id', 'hadm_id']).first().reset_index()
data = data.merge(lab_features, on=['subject_id', 'hadm_id'], how='left')

Step 6: Modeling

In [None]:
# Prepare features and labels
X = data.drop(columns=['hospital_expire_flag', 'subject_id', 'hadm_id', 'icustay_id_y', 'drug'])
y = data['hospital_expire_flag']

In [None]:
data.columns

Index(['subject_id', 'hadm_id', 'icustay_id_x', 'los', 'hospital_expire_flag',
       'icustay_id_y', 'startdate', 'enddate', 'drug_type', 'drug',
       'drug_name_poe', 'drug_name_generic', 'formulary_drug_cd', 'gsn', 'ndc',
       'prod_strength', 'dose_val_rx', 'dose_unit_rx', 'form_val_disp',
       'form_unit_disp', 'route', 'itemid', 'valuenum', 'charttime',
       'antibiotic_duration', 'mean_51000.0', 'mean_51200.0', 'mean_51300.0',
       'mean_51301.0', 'max_51000.0', 'max_51200.0', 'max_51300.0',
       'max_51301.0'],
      dtype='object')

# 모델 적용(기존) & 하이퍼파라미터 튜닝(그리드 서치)
- Random Forest

In [None]:
rf_model = RandomForestClassifier(random_state=42)

# 클래스 가중치 설정 추가
# rf_model = RandomForestClassifier(random_state=42, class_weight={0: 1, 1: 6})

# 5배 : 0.7609081245818777
# 6배 : 0.763740095146064
# 7배 ~ : 성능 오히려 하락

rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)
y_pred_rf_proba = rf_model.predict_proba(X_test)[:, 1]

print("Classification Report:\n", classification_report(y_test, y_pred_rf))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_rf_proba))

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.99      0.93      6502
           1       0.61      0.07      0.13       938

    accuracy                           0.88      7440
   macro avg       0.75      0.53      0.53      7440
weighted avg       0.85      0.88      0.83      7440

ROC-AUC Score: 0.7600305367743171


In [None]:
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)
y_pred_xgb_proba = xgb_model.predict_proba(X_test)[:, 1]

print("Classification Report:\n", classification_report(y_test, y_pred_xgb))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_xgb_proba))

Parameters: { "use_label_encoder" } are not used.



Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.98      0.93      9784
           1       0.54      0.16      0.24      1375

    accuracy                           0.88     11159
   macro avg       0.72      0.57      0.59     11159
weighted avg       0.85      0.88      0.85     11159

ROC-AUC Score: 0.7686489630565674


# 하이퍼파라미터 튜닝
- GridSearch

- Random Forest Hyperparameter Tuning

In [None]:
# Define the model
rf_model = RandomForestClassifier(random_state=42)

# Define hyperparameters to tune
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Grid Search for hyperparameter tuning
grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid,
                              cv=5, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit model
grid_search_rf.fit(X_train, y_train)

# Get the best parameters
print("Best parameters:", grid_search_rf.best_params_)

# Use the best model
best_rf_model = grid_search_rf.best_estimator_

# Evaluate on test set
y_pred_rf = best_rf_model.predict(X_test)
y_pred_rf_proba = best_rf_model.predict_proba(X_test)[:, 1]

from sklearn.metrics import classification_report, roc_auc_score
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_rf_proba))

Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best parameters: {'bootstrap': False, 'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.99      0.94      9784
           1       0.65      0.09      0.16      1375

    accuracy                           0.88     11159
   macro avg       0.77      0.54      0.55     11159
weighted avg       0.86      0.88      0.84     11159

ROC-AUC Score: 0.7702528060655616


- XGBoost Hyperparameter Tuning

In [None]:
# Define the model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Define hyperparameters to tune
param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0]
}

# Grid Search for hyperparameter tuning
grid_search_xgb = GridSearchCV(estimator=xgb_model, param_grid=param_grid_xgb,
                               cv=5, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit model
grid_search_xgb.fit(X_train, y_train)

# Get the best parameters
print("Best parameters:", grid_search_xgb.best_params_)

# Use the best model
best_xgb_model = grid_search_xgb.best_estimator_

# Evaluate on test set
y_pred_xgb = best_xgb_model.predict(X_test)
y_pred_xgb_proba = best_xgb_model.predict_proba(X_test)[:, 1]

print("Classification Report:\n", classification_report(y_test, y_pred_xgb))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_xgb_proba))

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


Parameters: { "use_label_encoder" } are not used.



Best parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'subsample': 1.0}
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.99      0.94      9784
           1       0.65      0.12      0.20      1375

    accuracy                           0.88     11159
   macro avg       0.77      0.55      0.57     11159
weighted avg       0.86      0.88      0.85     11159

ROC-AUC Score: 0.7895984167100275


---

<br>

# 추가) 파라미터 최적화
- 먼저, 최적화된 파라미터 값을 찾은 다음 -> 모델에 적용
- optuna 사용
  - 기존 파라미터 튜닝 방식이 랜덤 포레스트의 경우 과도한 시간 소요가 발생하기 때문에 시간을 줄이기 위한 라이브러리를 찾음
  - 출처 : [Optuna로 하이퍼파라미터 튜닝하기](https://velog.io/@halinee/Optuna%EB%A1%9C-%ED%95%98%EC%9D%B4%ED%8D%BC%ED%8C%8C%EB%9D%BC%EB%AF%B8%ED%84%B0-%ED%8A%9C%EB%8B%9D%ED%95%98%EA%B8%B0)

In [None]:
# 설치
!pip install optuna xgboost

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.7-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.7-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.9/78.9 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [None]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

- 랜덤 포레스트

In [None]:
def objective_rf(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 5, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])

    class_weight = 'balanced'

    rf_model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        bootstrap=bootstrap,
        class_weight=class_weight,
        random_state=42
    )
    rf_model.fit(X_train, y_train)
    y_pred_rf = rf_model.predict(X_valid)
    y_pred_rf_proba = rf_model.predict_proba(X_valid)[:, 1]

    return roc_auc_score(y_valid, y_pred_rf_proba)

X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=42)

rf_study = optuna.create_study(direction='maximize')
rf_study.optimize(objective_rf, n_trials=20)
print("Best Random Forest Parameters:", rf_study.best_params)

best_rf_model = RandomForestClassifier(**rf_study.best_params, random_state=42)
best_rf_model.fit(X_train_full, y_train_full)

y_pred_rf = best_rf_model.predict(X_test)
y_pred_rf_proba = best_rf_model.predict_proba(X_test)[:, 1]

print("Classification Report:\n", classification_report(y_test, y_pred_rf))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_rf_proba))

[I 2024-12-05 03:47:49,993] A new study created in memory with name: no-name-54af0c43-ba9e-424f-93b2-7bf80472e6f4
[I 2024-12-05 03:48:21,927] Trial 0 finished with value: 0.770223488128498 and parameters: {'n_estimators': 221, 'max_depth': 17, 'min_samples_split': 9, 'min_samples_leaf': 1, 'bootstrap': True}. Best is trial 0 with value: 0.770223488128498.
[I 2024-12-05 03:48:29,025] Trial 1 finished with value: 0.7623907838150372 and parameters: {'n_estimators': 72, 'max_depth': 11, 'min_samples_split': 3, 'min_samples_leaf': 4, 'bootstrap': False}. Best is trial 0 with value: 0.770223488128498.
[I 2024-12-05 03:48:34,168] Trial 2 finished with value: 0.7587243150218659 and parameters: {'n_estimators': 65, 'max_depth': 11, 'min_samples_split': 3, 'min_samples_leaf': 2, 'bootstrap': True}. Best is trial 0 with value: 0.770223488128498.
[I 2024-12-05 03:48:51,386] Trial 3 finished with value: 0.7730583436354626 and parameters: {'n_estimators': 200, 'max_depth': 17, 'min_samples_split': 2

Best Random Forest Parameters: {'n_estimators': 255, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 3, 'bootstrap': True}
Classification Report:
               precision    recall  f1-score   support

           0       0.88      1.00      0.93      6502
           1       0.70      0.07      0.13       938

    accuracy                           0.88      7440
   macro avg       0.79      0.53      0.53      7440
weighted avg       0.86      0.88      0.83      7440

ROC-AUC Score: 0.7772099317972689


- XGB

In [None]:
def objective_xgb(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 5, 20)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 0.1)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 10)
    subsample = trial.suggest_uniform('subsample', 0.5, 1.0)
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.5, 1.0)
    gamma = trial.suggest_loguniform('gamma', 1e-4, 1e1)
    class_weight = {0: 1, 1: trial.suggest_int('class_weight', 1, 7)}

    xgb_model = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        min_child_weight=min_child_weight,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        gamma=gamma,
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42
    )
    xgb_model.fit(X_train, y_train)
    y_pred_xgb = xgb_model.predict(X_valid)
    y_pred_xgb_proba = xgb_model.predict_proba(X_valid)[:, 1]

    return roc_auc_score(y_valid, y_pred_xgb_proba)

X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=42)

xgb_study = optuna.create_study(direction='maximize')
xgb_study.optimize(objective_xgb, n_trials=20)
print("Best XGBoost Parameters:", xgb_study.best_params)

best_xgb_model = XGBClassifier(**xgb_study.best_params, use_label_encoder=False, eval_metric='logloss', random_state=42)
best_xgb_model.fit(X_train_full, y_train_full)

y_pred_xgb = best_xgb_model.predict(X_test)
y_pred_xgb_proba = best_xgb_model.predict_proba(X_test)[:, 1]

print("Classification Report:\n", classification_report(y_test, y_pred_xgb))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_xgb_proba))

[I 2024-12-05 03:55:05,165] A new study created in memory with name: no-name-b38b5a02-f32d-42c3-9d46-b6765599adce
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 0.1)
  subsample = trial.suggest_uniform('subsample', 0.5, 1.0)
  colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.5, 1.0)
  gamma = trial.suggest_loguniform('gamma', 1e-4, 1e1)
Parameters: { "use_label_encoder" } are not used.

[I 2024-12-05 03:55:10,280] Trial 0 finished with value: 0.7814365615758717 and parameters: {'n_estimators': 243, 'max_depth': 5, 'learning_rate': 0.017903200152353853, 'min_child_weight': 8, 'subsample': 0.601624312516992, 'colsample_bytree': 0.928174560247244, 'gamma': 0.0212818577647481, 'class_weight': 7}. Best is trial 0 with value: 0.7814365615758717.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 0.1)
  subsample = trial.suggest_uniform('subsample', 0.5, 1.0)
  colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.5, 1.0)
  gamma = tria

Best XGBoost Parameters: {'n_estimators': 271, 'max_depth': 12, 'learning_rate': 0.05530542777551273, 'min_child_weight': 7, 'subsample': 0.9241201201834832, 'colsample_bytree': 0.5273685647420798, 'gamma': 1.515287778996012, 'class_weight': 4}
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.99      0.94      6502
           1       0.67      0.13      0.21       938

    accuracy                           0.88      7440
   macro avg       0.78      0.56      0.57      7440
weighted avg       0.86      0.88      0.85      7440

ROC-AUC Score: 0.8033568808416501
