In [None]:
#python version 3.11.7
import pkg_resources
import pandas as pd  
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from boruta import BorutaPy 
import optuna 
import joblib 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor 
from lightgbm import LGBMRegressor
import pickle 
from statannot import add_stat_annotation 
import shap

#np.random.seed(42)

# Compatibility shim for Boruta: recreate the np.int / np.float / np.bool
# aliases (dropped from recent NumPy releases) so that code still referring
# to them keeps working. Must run before any Boruta call.
np.bool = np.bool_
np.float = np.float64
np.int = np.int32


In [None]:

# Load the scaled TCGA UCEC table (first CSV column becomes the index), drop
# the 'RNA_count' column and discard every row that has a missing value.
# NOTE: Index.difference returns the remaining column labels in sorted order,
# so the columns of UCEC_full are reordered here; the positional slices used
# later in the notebook rely on that order.
UCEC_full = (
    pd.read_csv("dataset/TCGA_UCEC_scaled.csv", sep=',', index_col=0)
    .pipe(lambda frame: frame[frame.columns.difference(['RNA_count'])])
    .dropna(how='any')
)
UCEC_full.columns

In [None]:
# Split UCEC_full into the target and the feature groups by column POSITION.
# The positions are specific to this dataset's column layout (as produced by
# the loading cell) and must be re-checked by hand against UCEC_full.columns
# whenever the input file changes.
# NOTE(review): column 5 is not assigned to any group here - confirm intended.

# Target: TCPA protein expression column.
UCEC_ARID1A = UCEC_full.iloc[:, 0]

# Feature groups, listed in column order.
UCEC_CNV = UCEC_full.iloc[:, 1]
UCEC_Mut = UCEC_full.iloc[:, [2, 3, 4, 6]]
UCEC_Met = UCEC_full.iloc[:, 7:35]
UCEC_miRNA = UCEC_full.iloc[:, 35:68]
UCEC_RNA = UCEC_full.iloc[:, 68]

In [None]:
# Optuna objective used to tune the XGBoost regressor's hyperparameters
def XGBRegressor_objective(trial):
    """Optuna objective: mean 5-fold cross-validated MSE of an XGBoost regressor.

    Samples one hyperparameter configuration from ``trial``, builds an
    ``XGBRegressor`` with it and scores it with 5-fold cross-validation.
    Reads the notebook-level ``X_train`` and ``y_train``, which must exist
    before the study calls this function.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyperparameter values.

    Returns
    -------
    float
        Mean of the per-fold mean squared errors (lower is better).
    """
    params = {
        # --- settings held fixed for every trial ---
        'eval_metric': 'rmse',
        # use exact for small dataset.
        'tree_method': 'exact',
        'n_jobs': -1,
        # --- sampled search space ---
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        # Step-size shrinkage. XGBoost documents "eta" as an alias of
        # "learning_rate", so the value is sampled exactly once; sampling both
        # names would add a search dimension whose effect is undefined.
        'learning_rate': trial.suggest_float('learning_rate', 1e-2, 0.1, log=True),
        # L2 regularization weight.
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0),
        # L1 regularization weight.
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0),
        # sampling ratio for training data.
        'subsample': trial.suggest_float('subsample', 0.6, 1, step=0.1),
        # column sampling ratios (per tree / per depth level).
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.9, step=0.1),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.2, 0.9, step=0.1),
        'max_depth': trial.suggest_int('max_depth', 1, 9),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 10),
    }
    model = XGBRegressor(**params)
    # The scorer returns negated MSE, so flip the sign to get a quantity to minimize.
    fold_mse = -1 * cross_val_score(
        model, X_train, y_train, cv=5, scoring='neg_mean_squared_error', n_jobs=-1
    )
    return np.mean(fold_mse)

In [None]:
# Build the feature matrix from the RNA, Mut and Met groups; the target is
# the ARID1A column. 20% of the samples are held out as a test set.
X = pd.concat([UCEC_RNA, UCEC_Mut, UCEC_Met], axis = 1)
y = UCEC_ARID1A
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the sampler so that re-running the notebook repeats the same search.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(XGBRegressor_objective, n_trials=500)
best_params = study.best_params

# study.best_params contains only the values drawn through trial.suggest_*.
# The settings the objective keeps fixed (eval_metric, tree_method, n_jobs)
# are therefore passed again explicitly, so the final model is fitted with
# the same configuration that was scored during tuning.
cur_model = XGBRegressor(
    eval_metric='rmse',
    tree_method='exact',
    n_jobs=-1,
    **best_params,
)
cur_model.fit(X_train, y_train)
joblib.dump(cur_model, "Models/Omics_Model/Omics_model.pkl")