## Setup

In [1]:
import sys
#!{sys.executable} -m pip install pandas numpy scikit-learn lightgbm matplotlib duckdb pyarrow
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import warnings
import duckdb
from sklearn.model_selection import train_test_split
from sklearn.metrics import (confusion_matrix, auc, roc_curve, accuracy_score, 
                             precision_score, recall_score, f1_score, 
                             precision_recall_curve, roc_auc_score, brier_score_loss)
from lightgbm import LGBMClassifier
import lightgbm as lgb
# install pyarrow to work with parquet files
import pyarrow.parquet as pq

#pd.set_option('display.max_rows', None)
# Seed Python's and NumPy's global random number generators with a fixed value.
random.seed(37)
np.random.seed(37)
# NOTE: this silences every warning for the rest of the session, not just a specific one.
warnings.filterwarnings("ignore")

# In-memory DuckDB connection; the vitals and labs cells run their SQL through it.
con = duckdb.connect(database=":memory:")

print("Setup complete")

Setup complete


### Control panel - user input required

Update `root_location`, the input `filetype` and `site_name`, and confirm that the race/ethnicity mappings are correct for your site.

In [2]:
#Enter the location for your CLIF-1.0 directory
# (the input tables are read from its "rclif" subfolder and the pretrained model from
#  "projects/Mortality_model/models")
root_location = 'C:/Users/vchaudha/OneDrive - rush.edu/CLIF-1.0-main'
# either parquet or csv only
filetype = 'csv'
# Site identifier: embedded in every output file name and written into the result tables.
site_name='RUSH'

# Site race labels (keys) -> categories used in the fairness report (values).
# The mapping is applied with Series.map, so a label that is not a key here becomes NaN;
# add any site-specific labels that are missing.
# The np.nan key is intended to send missing values to 'Others'.
race_map = {
    'White': 'White',
    'Black or African American': 'Black',
    'Asian': 'Asian',
    'Other': 'Others',
    'Unknown': 'Others',
    'Did Not Encounter': 'Others',
    'Refusal': 'Others',
    'American Indian or Alaska Native': 'Others',
    'Native Hawaiian or Other Pacific Islander': 'Others',
    np.nan: 'Others'
}

# Site ethnicity labels (keys) -> categories used in the fairness report (values).
# Same Series.map behaviour as race_map: unlisted labels become NaN.
ethnicity_map = {
    'Not Hispanic or Latino': 'Not Hispanic or Latino',
    'Hispanic or Latino': 'Hispanic or Latino',
    'Did Not Encounter': 'Others',
    'Refusal': 'Others',
    '*Unspecified': 'Others',
    np.nan: 'Others'
}

# When True, the fine-tuning cell continues training the pretrained model on half of the
# site cohort and evaluates it on the other half.
finetune=True

In [3]:
# Every CLIF table sits in the same "rclif" folder and shares the configured extension,
# so the six paths differ only in the table name.
rclif_dir = f"{root_location}/rclif"
(adt_filepath,
 encounter_filepath,
 limited_filepath,
 demog_filepath,
 vitals_filepath,
 labs_filepath) = (
    f"{rclif_dir}/clif_{table_name}.{filetype}"
    for table_name in (
        "adt",
        "encounter_demographics_dispo",
        "limited_identifiers",
        "patient_demographics",
        "vitals",
        "labs",
    )
)

## Import data

In [4]:
def read_data(filepath, filetype):
    """
    Load a table from disk into a pandas DataFrame.
    Parameters:
        filepath (str): Location of the file to load.
        filetype (str): Format of the file, either 'csv' or 'parquet'.
    Returns:
        DataFrame: The contents of the file.
    Raises:
        ValueError: If filetype is neither 'csv' nor 'parquet'.
    """
    if filetype not in ('csv', 'parquet'):
        raise ValueError("Unsupported file type. Please provide either 'csv' or 'parquet'.")

    if filetype == 'parquet':
        # Parquet is read through pyarrow and converted to pandas afterwards.
        return pq.read_table(filepath).to_pandas()

    return pd.read_csv(filepath)
    

def standardize_datetime(df):
    """
    Strip timezone information from the datetime columns of a DataFrame.

    Timezone-aware datetime columns are passed through tz_convert(None), which leaves
    timezone-naive values; naive datetime columns and all non-datetime columns are
    kept as they are. The frame is modified in place and also returned.
    Parameters:
        DataFrame: DataFrame containing the data.
    Returns:
        DataFrame: The same DataFrame object, with timezone-naive datetime columns.
    """
    for name in df.columns:
        values = df[name]
        if not pd.api.types.is_datetime64_any_dtype(values):
            continue
        if values.dt.tz is None:
            # Already naive: write the column back unchanged.
            df[name] = values
        else:
            df[name] = values.dt.tz_convert(None)
        # To standardize to UTC and keep the timezone instead:
        # df[name] = values.dt.tz_localize('UTC') if values.dt.tz is None else values.dt.tz_convert('UTC')
    return df

def get_sql_import(filetype):
    """
    Return the name of the DuckDB table function that reads the given file type.
    Parameters:
        filetype (str): Type of the file ('csv' or 'parquet').
    Returns:
        str: 'read_csv_auto' for csv, 'read_parquet' for parquet.
    Raises:
        ValueError: If filetype is neither 'csv' nor 'parquet'. Previously this case
            returned None, which was then interpolated into the SQL text as "None(...)".
    """
    readers = {'parquet': 'read_parquet', 'csv': 'read_csv_auto'}
    try:
        return readers[filetype]
    except (KeyError, TypeError):
        # Same message as read_data so both entry points fail identically.
        raise ValueError("Unsupported file type. Please provide either 'csv' or 'parquet'.") from None

# Name of the DuckDB reader function for the configured file type; it is interpolated
# into the SQL text of the vitals and labs queries.
sql_import = get_sql_import(filetype=filetype)

# create output directory
# (an "output" folder under the current working directory; nothing happens if it already exists)
output_directory = os.path.join(os.getcwd(), 'output')
os.makedirs(output_directory, exist_ok=True)

In [5]:
# Read the four encounter-level tables and strip timezones from their datetime columns
# in one pass; each table is loaded and standardized independently of the others.
location, encounter, limited, demog = (
    standardize_datetime(read_data(table_path, filetype))
    for table_path in (adt_filepath, encounter_filepath, limited_filepath, demog_filepath)
)

### ICU close to Admission

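1. Keep encounters that have an ICU `location_category` whose `in_dttm` falls between `admission_dttm` and 48 hours after admission.
2. Require the first ICU stay to last at least 24 hours (an ICU → OR → ICU sequence is treated as one continuous ICU stay, so the OR time counts towards the 24 hours).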

In [12]:
# Attach the admission timestamp and the encounter-level age/disposition to every ADT row.
join = pd.merge(location[['encounter_id','location_category','in_dttm','out_dttm']],
                limited[['encounter_id','admission_dttm']], on=['encounter_id'], how='left')

icu_data = pd.merge(join,
                    encounter[['encounter_id','age_at_admission','disposition']], on=['encounter_id'], how='left')


icu_data['in_dttm'] = pd.to_datetime(icu_data['in_dttm'])
icu_data['admission_dttm'] = pd.to_datetime(icu_data['admission_dttm'])
icu_data['out_dttm'] = pd.to_datetime(icu_data['out_dttm'])

# The left merges can leave age missing, and astype(int) raises on NaN. Rows without an age
# can never pass the adult filter below, so drop them before casting.
icu_data = icu_data[icu_data['age_at_admission'].notna()].reset_index(drop=True)
icu_data['age_at_admission'] = icu_data['age_at_admission'].astype(int)

# Encounters with an ICU row that starts within 48 h after admission, in 2020-2022, age >= 18.
icu_48hr_check = icu_data[
    (icu_data['location_category'] == 'ICU') &
    (icu_data['in_dttm'] >= icu_data['admission_dttm']) &
    (icu_data['in_dttm'] <= icu_data['admission_dttm'] + pd.Timedelta(hours=48)) &
    (icu_data['in_dttm'].dt.year >= 2020) & (icu_data['in_dttm'].dt.year <= 2022) &
    (icu_data['age_at_admission'] >= 18) & (icu_data['age_at_admission'].notna())
]['encounter_id'].unique()

# For those encounters keep every location row that starts within 72 h of admission.
icu_data = icu_data[icu_data['encounter_id'].isin(icu_48hr_check) & (icu_data['in_dttm'] <= icu_data['admission_dttm'] + pd.Timedelta(hours=72))].reset_index(drop=True)

icu_data = icu_data.sort_values(by=['in_dttm']).reset_index(drop=True)

# Chronological position of each location row within its encounter (1 = earliest in_dttm).
icu_data["RANK"] = icu_data.sort_values(by=['in_dttm'], ascending=True).groupby("encounter_id")["in_dttm"].rank(method="first", ascending=True).astype(int)


# Drop everything that happened before the first ICU row of the encounter.
min_icu = icu_data[icu_data['location_category'] == 'ICU'].groupby('encounter_id')['RANK'].min()
icu_data = pd.merge(icu_data, min_icu.rename('min_icu').reset_index(), on='encounter_id', how='left')
icu_data = icu_data[icu_data['RANK'] >= icu_data['min_icu']].reset_index(drop=True)

# From the first ICU row onwards, OR time is counted as ICU time so ICU -> OR -> ICU is one stay.
icu_data.loc[icu_data['location_category'] == 'OR', 'location_category'] = 'ICU'

# group_id increases by one each time the location category changes within an encounter,
# so consecutive rows in the same category share a group_id.
icu_data['group_id'] = (icu_data.groupby('encounter_id')['location_category'].shift() != icu_data['location_category']).astype(int)
icu_data['group_id'] = icu_data.sort_values(by=['in_dttm'], ascending=True).groupby('encounter_id')['group_id'].cumsum()


# Collapse each run of consecutive same-category rows into one stay.
icu_data = icu_data.sort_values(by=['in_dttm'], ascending=True).groupby(['encounter_id', 'location_category', 'group_id']).agg(
    min_in_dttm=('in_dttm', 'min'),
    max_out_dttm=('out_dttm', 'max'),
    admission_dttm=('admission_dttm', 'first'),
    age=('age_at_admission', 'first'),
    dispo=('disposition', 'first')
).reset_index()

# Keep only the first ICU stay of each encounter, and only if it lasted at least 24 hours.
min_icu = icu_data[icu_data['location_category'] == 'ICU'].groupby('encounter_id')['group_id'].min()
icu_data = pd.merge(icu_data, min_icu.rename('min_icu').reset_index(), on='encounter_id', how='left')

icu_data = icu_data[(icu_data['min_icu'] == icu_data['group_id']) &
         (icu_data['max_out_dttm'] - icu_data['min_in_dttm'] >= pd.Timedelta(hours=24))
         ].reset_index(drop=True)


# End of the 24-hour window, counted from the start of the ICU stay, used to collect features.
icu_data['after_24hr'] = icu_data['min_in_dttm'] + pd.Timedelta(hours=24)

icu_data = icu_data[['encounter_id','min_in_dttm','after_24hr','age','dispo']]

icu_data = pd.merge(icu_data,
                    demog, on=['encounter_id'], how='left')[['encounter_id','min_in_dttm','after_24hr','age','dispo','sex','ethnicity','race']]
icu_data = icu_data[~icu_data['sex'].isna()].reset_index(drop=True)
icu_data['isfemale'] = (icu_data['sex'].str.lower() == 'female').astype(int)
# na=False: a missing disposition would otherwise yield NaN and make astype(int) raise.
# NOTE(review): encounters with a missing disposition are therefore labelled 0 (not a death);
# confirm this is the intended handling for your site.
icu_data['isdeathdispo'] = icu_data['dispo'].str.contains('dead|expired', case=False, regex=True, na=False).astype(int)

# Collapse site-specific labels into the reporting categories from the control panel.
icu_data['ethnicity'] = icu_data['ethnicity'].map(ethnicity_map)
icu_data['race'] = icu_data['race'].map(race_map)


# Free the raw tables. This cell cannot be re-run without re-running the load cell first.
del location,encounter,limited,demog

### Vitals

In [13]:
# Pull the vital signs used by the model, restricted to encounters in the ICU cohort.
# The SQL refers to the pandas DataFrame `icu_data` by its Python variable name, so that
# variable must keep its name. CAST (not TRY_CAST) is used for vital_value, so a
# non-numeric value makes the query fail.
vitals = con.execute(f'''
    SELECT 
        encounter_id,
        CAST(recorded_dttm AS datetime) AS recorded_dttm,
        CAST(vital_value AS float) AS vital_value,
        vital_category 
    FROM 
        {sql_import}('{vitals_filepath}')
    WHERE 
        vital_category IN ('weight_kg', 'pulse', 'sbp', 'dbp', 'temp_c','height_inches') 
        AND encounter_id IN (SELECT DISTINCT encounter_id FROM icu_data);
''').df()

# Reshape from long to wide: one row per (encounter_id, recorded_dttm) and one column per
# vital_category. first() keeps a single value when a category has several readings at the
# same timestamp. The query reads the `vitals` DataFrame defined just above by name.
vitals=con.execute('''
PIVOT vitals
ON vital_category
USING first(vital_value)
GROUP BY encounter_id,recorded_dttm;
''').df()

# Height is recorded in inches; 1 inch = 0.0254 m.
vitals['height_meters'] = vitals['height_inches'] * 0.0254

# Calculate BMI
# (kg / m^2, computed row by row, so it is NaN unless weight and height share a timestamp)
vitals['bmi'] = vitals['weight_kg'] / (vitals['height_meters'] ** 2)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [14]:
# Pair every cohort encounter with its vitals and keep only the readings recorded between
# the start of the ICU stay and 24 hours later (both ends inclusive).
icu_data_agg = icu_data.merge(vitals, on=['encounter_id'], how='left')
icu_data_agg = icu_data_agg.loc[
    icu_data_agg['recorded_dttm'].between(icu_data_agg['min_in_dttm'], icu_data_agg['after_24hr'])
].reset_index(drop=True)

# min_/max_/avg_ of each vital per encounter, in the column order the model expects.
vital_stats = {
    f'{prefix}_{vital}': (vital, func)
    for vital in ('bmi', 'weight_kg', 'pulse', 'sbp', 'dbp', 'temp_c')
    for prefix, func in (('min', 'min'), ('max', 'max'), ('avg', 'mean'))
}
icu_data_agg = icu_data_agg.groupby(['encounter_id']).agg(**vital_stats).reset_index()

# Encounters without any vitals in the window keep NaN for all of these features.
icu_data = icu_data.merge(icu_data_agg, on=['encounter_id'], how='left')

del vitals, icu_data_agg

### Labs

In [15]:
# Pull the lab results used by the model, restricted to encounters in the ICU cohort and to
# rows whose lab_type_name is 'standard'. TRY_CAST turns lab values that are not numeric
# into NULL instead of failing. The SQL refers to the pandas DataFrame `icu_data` by its
# Python variable name, so that variable must keep its name.
labs = con.execute(f'''
    SELECT 
        encounter_id,
        CAST(lab_order_dttm AS datetime) AS lab_order_dttm,
        TRY_CAST(lab_value AS float) AS lab_value,
        lab_category
    FROM 
         {sql_import}('{labs_filepath}')
    WHERE 
         ((lab_category='monocyte'               and lab_type_name='standard') OR
        (lab_category='lymphocyte'              and lab_type_name='standard') OR
        (lab_category='basophil'                and lab_type_name='standard') OR
        (lab_category='neutrophil'              and lab_type_name='standard') OR
        (lab_category='albumin'                 and lab_type_name='standard') OR
        (lab_category='ast'                     and lab_type_name='standard') OR
        (lab_category='total_protein'           and lab_type_name='standard') OR
        (lab_category='alkaline_phosphatase'    and lab_type_name='standard') OR
        (lab_category='bilirubin_total'         and lab_type_name='standard') OR
        (lab_category='bilirubin_conjugated'    and lab_type_name='standard') OR
        (lab_category='calcium'                 and lab_type_name='standard') OR
        (lab_category='chloride'                and lab_type_name='standard') OR
        (lab_category='potassium'               and lab_type_name='standard') OR
        (lab_category='sodium'                  and lab_type_name='standard') OR
        (lab_category='glucose_serum'           and lab_type_name='standard') OR
        (lab_category='hemoglobin'              and lab_type_name='standard') OR
        (lab_category='platelet count'          and lab_type_name='standard') OR
        (lab_category='wbc'                     and lab_type_name='standard'))
        AND encounter_id IN (SELECT DISTINCT encounter_id FROM icu_data);
''').df()

# Reshape from long to wide: one row per (encounter_id, lab_order_dttm) and one column per
# lab_category. first() keeps a single value when a category has several results for the
# same order time. The query reads the `labs` DataFrame defined just above by name.
labs=con.execute('''
PIVOT labs
ON lab_category
USING first(lab_value)
GROUP BY encounter_id,lab_order_dttm;
''').df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [16]:
# Pair every cohort encounter with its labs and keep only the results ordered between the
# start of the ICU stay and 24 hours later (both ends inclusive).
icu_data_agg = icu_data.merge(labs, on=['encounter_id'], how='left')
icu_data_agg = icu_data_agg.loc[
    icu_data_agg['lab_order_dttm'].between(icu_data_agg['min_in_dttm'], icu_data_agg['after_24hr'])
].reset_index(drop=True)


# Lab columns to summarise; each one must exist as a column after the pivot.
Lab_variables = [
    'albumin', 'alkaline_phosphatase', 'ast', 'basophil',
    'bilirubin_conjugated', 'bilirubin_total', 'calcium', 'chloride',
    'hemoglobin', 'lymphocyte', 'monocyte', 'glucose_serum',
    'neutrophil', 'potassium', 'sodium', 'total_protein',
    'platelet count', 'wbc',
]
agg_dict = {lab_name: ['min', 'max', 'mean'] for lab_name in Lab_variables}

icu_data_agg = icu_data_agg.groupby('encounter_id').agg(agg_dict).reset_index()

# Flatten the two-level (lab, statistic) header to "<lab>_<statistic>"; the key column has
# an empty second level and keeps its plain name.
icu_data_agg.columns = [
    '_'.join(pair).strip() if pair[1] else pair[0]
    for pair in icu_data_agg.columns.values
]

# Encounters without any labs in the window keep NaN for all of these features.
icu_data = icu_data.merge(icu_data_agg, on=['encounter_id'], how='left')

#### Model

In [18]:
# Feature columns handed to the model: sex flag, age, the vitals summaries
# (min_/max_/avg_ prefixes) and the lab summaries (_min/_max/_mean suffixes).
# NOTE(review): presumably this list and its order must match the features the pretrained
# model was trained on — confirm before reordering, adding or removing columns.
model_col=['isfemale','age', 'min_bmi', 'max_bmi', 'avg_bmi',
       'min_weight_kg', 'max_weight_kg', 'avg_weight_kg', 'min_pulse',
       'max_pulse', 'avg_pulse', 'min_sbp', 'max_sbp', 'avg_sbp', 'min_dbp',
       'max_dbp', 'avg_dbp', 'min_temp_c', 'max_temp_c', 'avg_temp_c',
       'albumin_min', 'albumin_max', 'albumin_mean',
       'alkaline_phosphatase_min', 'alkaline_phosphatase_max',
       'alkaline_phosphatase_mean', 'ast_min', 'ast_max', 'ast_mean',
       'basophil_min', 'basophil_max', 'basophil_mean',
       'bilirubin_conjugated_min', 'bilirubin_conjugated_max',
       'bilirubin_conjugated_mean', 'bilirubin_total_min',
       'bilirubin_total_max', 'bilirubin_total_mean', 'calcium_min',
       'calcium_max', 'calcium_mean', 'chloride_min', 'chloride_max',
       'chloride_mean', 'glucose_serum_min', 'glucose_serum_max',
       'glucose_serum_mean', 'hemoglobin_min', 'hemoglobin_max',
       'hemoglobin_mean', 'lymphocyte_min', 'lymphocyte_max',
       'lymphocyte_mean', 'monocyte_min', 'monocyte_max', 'monocyte_mean',
       'neutrophil_min', 'neutrophil_max', 'neutrophil_mean',
       'platelet count_min', 'platelet count_max', 'platelet count_mean',
       'potassium_min', 'potassium_max', 'potassium_mean', 'sodium_min',
       'sodium_max', 'sodium_mean', 'total_protein_min', 'total_protein_max',
       'total_protein_mean', 'wbc_min', 'wbc_max', 'wbc_mean']

# Load the pretrained LightGBM booster from its saved model file under root_location.
model=lgb.Booster(model_file=f'{root_location}/projects/Mortality_model/models/lgbm_model_20240429-083130.txt')

#### basic metrics

In [19]:
# Score the whole site cohort with the pretrained model.
X_test = icu_data[model_col]
y_test = icu_data['isdeathdispo']

y_pred_proba = model.predict(X_test)
icu_data['pred_proba'] = y_pred_proba

# Class predictions at the default threshold (0.5), computed once and reused below.
y_pred_label = (y_pred_proba >= 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred_label)
recall = recall_score(y_test, y_pred_label)
precision = precision_score(y_test, y_pred_label)
# These two are threshold-free and use the raw scores.
roc_auc = roc_auc_score(y_test, y_pred_proba)
brier_score = brier_score_loss(y_test, y_pred_proba)

metric_values = {
    'Accuracy': accuracy,
    'Recall': recall,
    'Precision': precision,
    'ROC AUC': roc_auc,
    'Brier Score Loss': brier_score,
}
results_Metric = pd.DataFrame({
    'Metric': list(metric_values.keys()),
    'Value': list(metric_values.values()),
    'SiteName': [f'{site_name}'] * len(metric_values)
})

results_Metric.to_csv(f'{output_directory}/result_metrics_{site_name}.csv',index=False)
results_Metric

Unnamed: 0,Metric,Value,SiteName
0,Accuracy,0.956161,RUSH
1,Recall,0.445563,RUSH
2,Precision,0.934741,RUSH
3,ROC AUC,0.950812,RUSH
4,Brier Score Loss,0.032848,RUSH


#### Probability table

In [20]:
# One row per cohort encounter: observed label, predicted score and the originating site.
# The label column is named 'site_label' with no trailing space; the previous key
# 'site_label ' put a stray space into the exported CSV header.
prob_df_lgbm = pd.DataFrame({'site_label': y_test, 'site_proba': y_pred_proba, 'Site_name': f"{site_name}"})
prob_df_lgbm.to_csv(f'{output_directory}/Model_probabilities_{site_name}.csv',index=False)
prob_df_lgbm.head()

Unnamed: 0,site_label,site_proba,Site_name
0,0,0.026871,RUSH
1,0,0.012222,RUSH
2,0,0.03655,RUSH
3,0,0.003238,RUSH
4,0,0.031219,RUSH


#### Model fairness test across 'race', 'ethnicity', 'sex'

In [21]:
def calculate_metrics(data, true_col, pred_prob_col, subgroup_cols, threshold=0.5):
    """
    Compute AUC and PPV of the predictions separately for every group of every subgroup column.

    Parameters:
        data (DataFrame): One row per scored encounter.
        true_col (str): Column holding the observed binary label.
        pred_prob_col (str): Column holding the predicted score.
        subgroup_cols (list of str): Columns whose distinct values define the groups.
            Rows with a missing value in a column are left out of that column's groups.
        threshold (float): Scores >= threshold count as positive predictions for PPV.
            Defaults to 0.5, using the same ">=" rule as the overall metrics cell.
    Returns:
        DataFrame: One row per (subgroup column, group) with AUC, PPV, the group size, the
        total number of rows in `data`, the group's share of that total, and the site name.
        When a group contains a single outcome class, AUC is 'Not defined' and PPV is
        'Not applicable'.

    The site name is read from the module-level `site_name` variable.
    """
    results = []
    total_count = len(data)

    for subgroup_col in subgroup_cols:
        filtered_data = data.dropna(subset=[subgroup_col])

        for group in filtered_data[subgroup_col].unique():
            subgroup_data = filtered_data[filtered_data[subgroup_col] == group]
            group_count = len(subgroup_data)
            proportion = group_count / total_count

            if np.unique(subgroup_data[true_col]).size > 1:  # Check if both classes are present
                y_true = subgroup_data[true_col]
                y_hat = (subgroup_data[pred_prob_col] >= threshold).astype(int)
                # Named group_auc so the imported sklearn `auc` function is not shadowed.
                group_auc = roc_auc_score(y_true, subgroup_data[pred_prob_col])
                tn, fp, fn, tp = confusion_matrix(y_true, y_hat).ravel()
                # PPV is reported as 0 when the group has no positive predictions.
                ppv = tp / (tp + fp) if (tp + fp) != 0 else 0
            else:
                group_auc = 'Not defined'
                ppv = 'Not applicable'

            results.append({
                'Subgroup': subgroup_col,
                'Group': group,
                'AUC': group_auc,
                'PPV': ppv,
                'Group Count': group_count,
                'Total Count': total_count,
                'Proportion': proportion,
                'site_name': f'{site_name}',
            })

    results_df = pd.DataFrame(results)
    return results_df

result_df = calculate_metrics(icu_data, 'isdeathdispo', 'pred_proba', ['race', 'ethnicity', 'sex'])

In [22]:
# Write the per-subgroup fairness table for this site to the output folder, then display it.
result_df.to_csv(f'{output_directory}/fairness_test_{site_name}.csv',index=False)
result_df

Unnamed: 0,Subgroup,Group,AUC,PPV,Group Count,Total Count,Proportion,site_name
0,race,White,0.951074,0.951351,5621,14599,0.385026,RUSH
1,race,Others,0.95264,0.900826,2731,14599,0.187068,RUSH
2,race,Black,0.947569,0.942105,5762,14599,0.394685,RUSH
3,race,Asian,0.960787,0.92,485,14599,0.033221,RUSH
4,ethnicity,Not Hispanic or Latino,0.952598,0.941476,11621,14599,0.796013,RUSH
5,ethnicity,Hispanic or Latino,0.945992,0.90678,2874,14599,0.196863,RUSH
6,ethnicity,Others,0.903353,1.0,104,14599,0.007124,RUSH
7,sex,Male,0.950428,0.925926,7779,14599,0.532845,RUSH
8,sex,Female,0.951067,0.944223,6820,14599,0.467155,RUSH


#### Threshold check at site

In [23]:
def top_n_percentile(target_var, pred_proba):
    """
    Tabulate confusion-matrix metrics when the top N% highest-scored rows are flagged positive.

    For each quantile level from 1.00 down in steps of 0.01, the score at that quantile is
    used as the cut-off: rows scoring >= cut-off are predicted positive.

    Parameters:
        target_var: Observed binary labels (0/1), one per row.
        pred_proba: Predicted scores, same length as target_var.
    Returns:
        DataFrame: One row per quantile level with the label ('Top N%'), the cut-off value,
        TN/FP/FN/TP counts, sensitivity, specificity, PPV, NPV, recall (identical to
        sensitivity), accuracy and the site name.

    The site name is read from the module-level `site_name` variable.
    """
    #thr_list = [0.99,0.97, 0.95,0.90,0.80,0.70,0.60,0.50,0.40,0.30,0.20,0.10]
    thr_list = np.arange(1, 0, -0.01)
    col = ['N Percentile', 'Thr Value','TN','FP','FN','TP','Sensitivity','Specificity','PPV', 'NPV' ,'Recall','Accuracy','site_name']

    # Labels and scores do not change between thresholds, so pair them up once.
    prob = pd.DataFrame()
    prob['target_var'] = target_var
    prob['pred_proba'] = pred_proba

    # Collect plain rows and build the frame once instead of growing it row by row.
    rows = []
    for thr in thr_list:
        thr_value = prob['pred_proba'].quantile(thr)
        pred_bin = np.where(prob['pred_proba'] >= thr_value, 1, 0)
        # labels=[0, 1] keeps the matrix 2x2 even if only one class occurs, so the
        # four-way unpacking below cannot fail.
        tn, fp, fn, tp = confusion_matrix(prob['target_var'], pred_bin, labels=[0, 1]).ravel()

        sensitivity = tp/(tp+fn)
        specificity = tn/(tn+fp)
        ppv = tp/(tp+fp)
        npv = tn/(tn+fn)
        recall = tp/(tp+fn)
        acc = (tp+tn)/(tp+fn+tn+fp)
        n_prec = 'Top '+ str(np.round((1 - thr) * 100,0))+ "%"
        rows.append([n_prec,thr_value,tn,fp,fn,tp,sensitivity,specificity ,ppv,npv, recall, acc,f'{site_name}'])

    return pd.DataFrame(rows, columns=col)
topn=top_n_percentile(y_test,y_pred_proba)

In [24]:
# Write the full threshold table for this site to the output folder, then display its first six rows.
topn.to_csv(f'{output_directory}/Top_N_percentile_PPV_{site_name}.csv',index=False)
topn.head(6)

Unnamed: 0,N Percentile,Thr Value,TN,FP,FN,TP,Sensitivity,Specificity,PPV,NPV,Recall,Accuracy,site_name
0,Top 0.0%,0.992234,13506,0,1092,1,0.000915,1.0,1.0,0.925195,0.000915,0.9252,RUSH
1,Top 1.0%,0.792637,13502,4,951,142,0.129918,0.999704,0.972603,0.934201,0.129918,0.934585,RUSH
2,Top 2.0%,0.647142,13494,12,813,280,0.256176,0.999112,0.958904,0.943175,0.256176,0.943489,RUSH
3,Top 3.0%,0.552321,13482,24,679,414,0.378774,0.998223,0.945205,0.952051,0.378774,0.951846,RUSH
4,Top 4.0%,0.465256,13462,44,553,540,0.494053,0.996742,0.924658,0.960542,0.494053,0.959107,RUSH
5,Top 5.0%,0.383166,13431,75,438,655,0.599268,0.994447,0.89726,0.968419,0.599268,0.964861,RUSH


#### Fine-tuning

In [25]:
if finetune:
    # Split the site cohort in half: one half continues training the pretrained model,
    # the other half is held out to evaluate the fine-tuned model.
    # NOTE: X_test / y_test are overwritten here and now refer to the held-out half only,
    # so these metrics are not computed on the same rows as the whole-cohort metrics above.
    train_data, test_data = train_test_split(icu_data, test_size=0.5, random_state=42)
    X_train=train_data[model_col]
    y_train=train_data['isdeathdispo']

    #test
    X_test=test_data[model_col]
    y_test=test_data['isdeathdispo']

    lgb_train = lgb.Dataset(X_train, y_train)

    params = {
        "boosting_type": "gbdt",
        "objective": "binary",
        "metric": "binary_logloss",
        "num_leaves": 31,
        "learning_rate": 0.05,
        "max_depth":-1}
    # init_model makes training continue from the pretrained booster: 10 further boosting
    # rounds are run on the site's training half.
    gbm = lgb.train(params, lgb_train, num_boost_round=10, init_model=f'{root_location}/projects/Mortality_model/models/lgbm_model_20240429-083130.txt')

    y_pred_proba_ft = gbm.predict(X_test)
    # Class predictions at the default threshold (0.5), computed once and reused below.
    y_pred_label_ft = (y_pred_proba_ft >= 0.5).astype(int)

    accuracy = accuracy_score(y_test, y_pred_label_ft)
    recall = recall_score(y_test, y_pred_label_ft)
    precision = precision_score(y_test, y_pred_label_ft)
    roc_auc = roc_auc_score(y_test, y_pred_proba_ft)
    brier_score = brier_score_loss(y_test, y_pred_proba_ft)


    results_Metric = pd.DataFrame({
        'Metric': ['Accuracy', 'Recall', 'Precision', 'ROC AUC', 'Brier Score Loss'],
        'Value': [accuracy, recall, precision, roc_auc, brier_score],
        'SiteName': [f'{site_name}'] * 5,
        'FineTune': ['Yes'] * 5,
    })
    results_Metric.to_csv(f'{output_directory}/result_metrics_{site_name}_ft.csv',index=False)


    model_filename = f"{output_directory}/lgbm_model_{site_name}_ft.txt"

    # Save the model using LightGBM's built-in function
    # Save the fine-tuned booster (gbm). Saving `model` here would write the unchanged
    # pretrained model under the "_ft" file name.
    gbm.save_model(model_filename)

    print(results_Metric)

[LightGBM] [Info] Number of positive: 555, number of negative: 6744
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002521 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12985
[LightGBM] [Info] Number of data points in the train set: 7299, number of used features: 74
             Metric     Value SiteName FineTune
0          Accuracy  0.956849     RUSH      Yes
1            Recall  0.455390     RUSH      Yes
2         Precision  0.917603     RUSH      Yes
3           ROC AUC  0.953190     RUSH      Yes
4  Brier Score Loss  0.032189     RUSH      Yes
