In [None]:
#python version 3.11.7
import pkg_resources
import pandas as pd  
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from boruta import BorutaPy 
import optuna 
import joblib 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor 
from lightgbm import LGBMRegressor
import pickle 
from statannot import add_stat_annotation 
import shap

#np.random.seed(42)

# Compatibility shim for Boruta: re-create the `np.int`, `np.float` and
# `np.bool` attributes on the numpy module (presumably because BorutaPy still
# references aliases that newer NumPy releases no longer provide -- confirm
# against the installed Boruta/NumPy versions).
for _alias, _dtype in (('int', np.int32), ('float', np.float64), ('bool', np.bool_)):
    setattr(np, _alias, _dtype)


In [None]:
import os

# Folder that holds the input tables, and the substring marking a KEGG table.
data_dir = 'dataset/'
KEGG_key = 'KEGG_'

# os.listdir() returns entries in arbitrary order; sorting keeps the order in
# which datasets are loaded, printed and trained identical across machines
# and re-runs.
all_files = sorted(os.listdir(data_dir))

# Keep only CSV files whose name contains the KEGG marker.
KEGG_files = [file for file in all_files if KEGG_key in file and file.endswith('.csv')]


In [None]:

# Load every KEGG CSV into a dict keyed by a shortened dataset name.
raw_dfs = {}
for file in KEGG_files:
    # Short name: the file name with '.csv', 'KEGG_' and '_frame' stripped out.
    df_name = file.replace('.csv', '').replace('KEGG_', '').replace('_frame', '')
    file_path = os.path.join(data_dir, file)
    # The second column of each CSV is used as the row index.
    raw_dfs[df_name] = pd.read_csv(file_path, index_col=1)

In [None]:
# Clean each table: discard the 'Unnamed: 0' column, drop every row that has a
# missing value, and report the shape before and after.
# NOTE: the cleaned frame is written back into `raw_dfs`, so after this cell
# `raw_dfs` no longer holds the untouched data, and `dfs` is a shallow copy
# that references the same cleaned frames.
for df_name in list(raw_dfs):
    df = raw_dfs[df_name]
    print(f'Raw Dim of {df_name} ::: {df.shape}')
    # Index.difference keeps every column label except 'Unnamed: 0'
    # (pandas sorts the resulting labels by default).
    kept_columns = df.columns.difference(['Unnamed: 0'])
    df = df[kept_columns].dropna(how = 'any')
    print(f'After processing {df_name} ::: {df.shape}')
    print('\n')
    raw_dfs[df_name] = df
dfs = dict(raw_dfs)

In [None]:
# Define XGBoost Regressor
def XGBRegressor_objective(trial):
    """Optuna objective: mean 5-fold cross-validated MSE of an XGBRegressor.

    The training data is not passed in: the function reads the module-level
    ``X_train`` and ``y_train``, which must be assigned before
    ``study.optimize`` calls it.

    Parameters
    ----------
    trial
        The Optuna trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean of the per-fold mean squared errors (lower is better).
    """
    params = {
        'eval_metric' : 'rmse',
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        # use exact for small dataset.
        "tree_method": "exact",
        # Step-size shrinkage. XGBoost treats `eta` and `learning_rate` as two
        # names for the same parameter, so it is sampled exactly once;
        # sampling both made the value actually used by the model ambiguous.
        # The range is the one previously used for `eta`.
        'learning_rate': trial.suggest_float('learning_rate', 1e-2, 0.1, log=True),
        # L2 regularization weight.
        "reg_lambda": trial.suggest_float('reg_lambda', 1e-3, 10.0),
        # L1 regularization weight.
        "reg_alpha": trial.suggest_float('reg_alpha', 1e-3, 10.0),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.6,1,step=0.1),
        # sampling ratio of columns per tree / per tree level.
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.9, step=0.1),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.2, 0.9, step=0.1),
        "max_depth" : trial.suggest_int("max_depth", 1, 9),
        'min_child_weight' :  trial.suggest_int("min_child_weight", 2, 10),
        'n_jobs' : -1,
    }
    model = XGBRegressor(**params)
    # scikit-learn reports negated MSE (higher is better); flip the sign so
    # the study can minimise a plain MSE.
    xg_cv = -1 * cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error', n_jobs= -1)
    return np.mean(xg_cv)


In [None]:
# Tune and fit one XGBoost model per dataset, then persist it to disk.

# Create the output folder up front so a missing directory cannot make
# joblib.dump fail after the whole hyperparameter search has already run.
model_dir = 'Models/KEGG_Model'
os.makedirs(model_dir, exist_ok=True)

for df_name, df in dfs.items():
    # 'ARID1A' is the regression target; every other column is a feature.
    X = df.drop('ARID1A', axis=1)
    y = df['ARID1A']
    # X_train / y_train are read as globals by XGBRegressor_objective, so
    # they must keep these exact names.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Seed the (default) TPE sampler so the search is repeatable.
    study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(XGBRegressor_objective, n_trials = 500)
    best_params = study.best_params
    # best_params only contains the values sampled through `trial.suggest_*`;
    # the settings fixed inside XGBRegressor_objective are repeated here so
    # the final model is trained under the configuration that was tuned.
    cur_model = XGBRegressor(tree_method='exact', eval_metric='rmse', n_jobs=-1, **best_params)
    cur_model.fit(X_train, y_train)
    joblib.dump(cur_model,f'{model_dir}/{df_name}_model.pkl')