In [None]:
#python version 3.11.7
# Standard library
import os
import pickle

# Third-party
import pkg_resources
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from boruta import BorutaPy
import optuna
import joblib
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from statannot import add_stat_annotation
import shap

#np.random.seed(42)

# Compatibility shim for boruta (per the original note): bind the names
# `np.int`, `np.float` and `np.bool` on the numpy module to concrete dtypes.
np.int, np.float, np.bool = np.int32, np.float64, np.bool_


## Omics prediction

In [None]:
# Empty table of test-set metrics; one row is added per model / ensemble
# by the prediction cells of this notebook.
Result_matrix = pd.DataFrame(
    columns=['test_RMSE', 'test_R2', 'test_R'],
)


In [None]:

# Load the scaled TCGA-UCEC table and index the rows by the values of column 'X'.
UCEC_full = pd.read_csv("dataset/TCGA_UCEC_scaled.csv", sep=',', index_col=0)
UCEC_full.index = UCEC_full['X']
# Keep every column except 'RNA_count' (Index.difference returns the remaining
# labels sorted), then discard all rows containing at least one missing value.
UCEC_full = UCEC_full.loc[:, UCEC_full.columns.difference(['RNA_count'])].dropna(axis=0, how='any')

# The column where each feature is located may vary, so you'll need to manually adjust it
# NOTE(review): the positions below depend on the sorted column order of the
# input file -- verify them against UCEC_full.columns when the data changes.
UCEC_ARID1A = UCEC_full.iloc[:, 0]
UCEC_CNV = UCEC_full.iloc[:, 1]
UCEC_Mut = UCEC_full.iloc[:, [2, 3, 4, 6]]
UCEC_Met = UCEC_full.iloc[:, 7:35]
UCEC_miRNA = UCEC_full.iloc[:, 35:68]
UCEC_RNA = UCEC_full.iloc[:, 68]


In [None]:
# Features: UCEC_RNA, UCEC_Mut and UCEC_Met side by side; target: UCEC_ARID1A.
X = pd.concat([UCEC_RNA, UCEC_Mut, UCEC_Met], axis="columns")
y = UCEC_ARID1A
# 80/20 split with a fixed seed so the held-out rows are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# NOTE(review): this is an absolute path, unlike the relative "dataset/" paths
# used for the data -- confirm the models really live under the filesystem root.
cur_model = joblib.load("/Models/Omics_Model/Omics_model.pkl")
cur_y_pred = cur_model.predict(X_test)
Omics_y_pred = cur_y_pred

# Test-set metrics: RMSE, R^2 and the correlation coefficient between
# observed and predicted values.
cur_rmse = np.sqrt(mean_squared_error(y_test, cur_y_pred))
cur_r2 = r2_score(y_test, cur_y_pred)
cur_corr = np.corrcoef(y_test, cur_y_pred)[0, 1]
Result_matrix.loc["RNA_Mut_Met"] = [cur_rmse, cur_r2, cur_corr]


In [None]:
# Start the prediction table: observed test targets next to the omics model's
# predictions, indexed like y_test.
Prediction_matrix = pd.DataFrame(
    {
        "y_test": y_test,
        "RNA_Mut_Met": Omics_y_pred,
    },
    index=y_test.index,
)
Prediction_matrix

## KEGG prediction

In [None]:
# Load every KEGG pathway table found in data_dir (CSV files whose name
# contains KEGG_key). `os` is imported with the other modules at the top of
# the notebook.
data_dir = 'dataset/'
KEGG_key = 'KEGG_'
# os.listdir returns entries in arbitrary order; sorting makes the order of
# `dfs` -- and of the rows/columns later written in that order -- reproducible.
all_files = sorted(os.listdir(data_dir))
KEGG_files = [file for file in all_files if KEGG_key in file and file.endswith('.csv')]

raw_dfs = {}
for file in KEGG_files:
    file_path = os.path.join(data_dir, file)
    # Pathway name = file name without the extension, the KEGG prefix and the
    # '_frame' suffix.
    df_name = file.replace('.csv', '').replace(KEGG_key, '').replace('_frame', '')
    # The second CSV column becomes the row index.
    df = pd.read_csv(file_path, index_col=1)
    # Drop the 'Unnamed: 0' column (if present) and every row with a missing value.
    df = df[df.columns.difference(['Unnamed: 0'])].dropna(how='any')
    raw_dfs[df_name] = df
# As in the original cell, raw_dfs ends up holding the cleaned frames and dfs
# is a shallow copy of that mapping.
dfs = raw_dfs.copy()

In [None]:
# Evaluate the saved model of every KEGG pathway on its own 80/20 test split.
for df_name, df in dfs.items():
    X, y = df.drop(columns='ARID1A'), df['ARID1A']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Model files are looked up by pathway name.
    cur_model = joblib.load(f'/Models/KEGG_Model/{df_name}_model.pkl')
    cur_y_pred = cur_model.predict(X_test)
    # Also expose the predictions as a global named "<pathway>_y_pred".
    globals()[df_name + "_y_pred"] = cur_y_pred

    cur_rmse = np.sqrt(mean_squared_error(y_test, cur_y_pred))
    cur_r2 = r2_score(y_test, cur_y_pred)
    cur_corr = np.corrcoef(y_test, cur_y_pred)[0, 1]
    Result_matrix.loc["KEGG_" + df_name] = [cur_rmse, cur_r2, cur_corr]

    # NOTE(review): if cur_y_pred is a plain array it is stored positionally,
    # which presumes this split holds the same samples, in the same order, as
    # the rows of Prediction_matrix -- confirm for every pathway table.
    Prediction_matrix["KEGG_" + df_name] = cur_y_pred

## BioGRID prediction

In [None]:
# Load the BioGRID PPI table (second CSV column as row index), drop the
# 'Unnamed: 0' column if present and remove rows with missing values.
PPI_df = pd.read_csv("dataset/UCEC_ARID1A_BioGRID_PPI.csv", sep=',', index_col=1)
PPI_df = PPI_df.loc[:, PPI_df.columns.difference(['Unnamed: 0'])].dropna(axis=0, how='any')

X, y = PPI_df.drop(columns='ARID1A'), PPI_df['ARID1A']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

cur_model = joblib.load("/Models/BioGRID_Model/BioGRID_model.pkl")
cur_y_pred = cur_model.predict(X_test)
BioGRID_y_pred = cur_y_pred

# Test-set metrics, stored under the "BioGRID" row / column.
cur_rmse = np.sqrt(mean_squared_error(y_test, cur_y_pred))
cur_r2 = r2_score(y_test, cur_y_pred)
cur_corr = np.corrcoef(y_test, cur_y_pred)[0, 1]
Result_matrix.loc["BioGRID"] = [cur_rmse, cur_r2, cur_corr]
# NOTE(review): as for the KEGG columns, this presumes the BioGRID test rows
# line up with the rows of Prediction_matrix -- confirm.
Prediction_matrix["BioGRID"] = cur_y_pred

In [None]:
# Display the test metrics (test_RMSE, test_R2, test_R) recorded so far.
Result_matrix

In [None]:
# Rank the individual KEGG pathway models by their test metrics.
# An explicit copy is taken because adding columns to the object returned by
# `filter` can trigger pandas' SettingWithCopyWarning. The derived ensemble
# rows ("KEGG_average", "KEGG_Selected") are dropped when present, so that
# re-running this cell after the ensemble cells still ranks only the
# per-pathway models.
kegg_df = (
    Result_matrix
    .filter(like="KEGG_", axis=0)
    .drop(index=["KEGG_average", "KEGG_Selected"], errors="ignore")
    .copy()
)

# Rank each metric individually (lower RMSE is better, higher R2 and R are better)
kegg_df["RMSE_rank"] = kegg_df["test_RMSE"].rank(ascending=True)
kegg_df["R2_rank"] = kegg_df["test_R2"].rank(ascending=False)
kegg_df["R_rank"] = kegg_df["test_R"].rank(ascending=False)

# Compute mean rank
kegg_df["mean_rank"] = kegg_df[["RMSE_rank", "R2_rank", "R_rank"]].mean(axis=1)

# Select top 5 models based on mean rank
top_5_kegg = kegg_df.sort_values("mean_rank").head(5)


In [None]:
# Order the pathway models from best (lowest) to worst mean rank.
kegg_sorted = kegg_df.sort_values("mean_rank")
labels = kegg_sorted.index
mean_ranks = kegg_sorted["mean_rank"]

# One bar per pathway model, labelled with its row name.
plt.figure(figsize=(10, 6))
plt.bar(x=labels, height=mean_ranks, color='skyblue')
plt.title('Mean Rank of RMSE, R, R-squared', fontsize=14)
plt.ylabel('Mean Rank', fontsize=12)
plt.xticks(rotation=45, ha='right', fontsize=10)
plt.tight_layout()


## Prediction result analysis

In [None]:
# Row-wise mean over the predictions of all individual KEGG pathway models.
# The columns are selected by the labels under which they were stored
# ("KEGG_" + pathway name, one per entry of `dfs`) instead of by position, so
# the result stays correct when the number of pathway files is not exactly 15.
Prediction_matrix["KEGG_average"] = Prediction_matrix[["KEGG_" + name for name in dfs]].mean(axis=1)

In [None]:
# Ensemble of the omics model, the BioGRID model and the KEGG average.
# Columns are addressed by label rather than by position, so the selection does
# not depend on how many KEGG columns precede them.
Prediction_matrix["3_model_avg"] = Prediction_matrix[["RNA_Mut_Met", "BioGRID", "KEGG_average"]].mean(axis=1)

In [None]:
# Score the KEGG-average ensemble. The observed values are read from the
# "y_test" column stored next to the predictions, not from the global `y_test`,
# which only holds the split of the most recently executed model cell.
cur_y_pred = Prediction_matrix["KEGG_average"]
y_true = Prediction_matrix["y_test"]
cur_rmse = np.sqrt(mean_squared_error(y_true, cur_y_pred))
cur_r2 = r2_score(y_true, cur_y_pred)
cur_corr = np.corrcoef(y_true, cur_y_pred)[0, 1]
Result_matrix.loc["KEGG_average"] = [cur_rmse, cur_r2, cur_corr]
Result_matrix.loc['KEGG_average']

In [None]:
# Score the three-model average ensemble. The observed values are read from the
# "y_test" column stored next to the predictions, not from the global `y_test`,
# which only holds the split of the most recently executed model cell.
cur_y_pred = Prediction_matrix["3_model_avg"]
y_true = Prediction_matrix["y_test"]
cur_rmse = np.sqrt(mean_squared_error(y_true, cur_y_pred))
cur_r2 = r2_score(y_true, cur_y_pred)
cur_corr = np.corrcoef(y_true, cur_y_pred)[0, 1]
Result_matrix.loc["3_model_avg"] = [cur_rmse, cur_r2, cur_corr]
Result_matrix.loc['3_model_avg']

In [None]:
# Pathway names (without the "KEGG_" prefix) whose prediction columns are
# averaged into the "KEGG_Selected" ensemble.
High_performance_pathway = [
    'Proteasome',
    'Ribosome',
    'RNA_degradation',
    'Ubiquitin',
    'mRNA_surveillance_pathway',
]

In [None]:
# List the column labels currently stored in Prediction_matrix.
Prediction_matrix.columns

In [None]:
# Average the prediction columns of the high-performance pathways only.
Prediction_matrix['KEGG_Selected'] = Prediction_matrix.loc[:, ['KEGG_' + s for s in High_performance_pathway]].mean(axis=1)

# Ensemble of the omics model, the BioGRID model and the selected-KEGG average.
# Columns are addressed by label rather than by position, so the selection does
# not depend on how many columns were added before them.
Prediction_matrix["3_model_Selected"] = Prediction_matrix[["RNA_Mut_Met", "BioGRID", "KEGG_Selected"]].mean(axis=1)

In [None]:
# Score the selected-KEGG ensemble. The observed values are read from the
# "y_test" column stored next to the predictions, not from the global `y_test`,
# which only holds the split of the most recently executed model cell.
cur_y_pred = Prediction_matrix["KEGG_Selected"]
y_true = Prediction_matrix["y_test"]
cur_rmse = np.sqrt(mean_squared_error(y_true, cur_y_pred))
cur_r2 = r2_score(y_true, cur_y_pred)
cur_corr = np.corrcoef(y_true, cur_y_pred)[0, 1]
Result_matrix.loc["KEGG_Selected"] = [cur_rmse, cur_r2, cur_corr]
Result_matrix.loc['KEGG_Selected']

In [None]:
# Score the three-model "selected" ensemble. The observed values are read from
# the "y_test" column stored next to the predictions, not from the global
# `y_test`, which only holds the split of the most recently executed model cell.
cur_y_pred = Prediction_matrix["3_model_Selected"]
y_true = Prediction_matrix["y_test"]
cur_rmse = np.sqrt(mean_squared_error(y_true, cur_y_pred))
cur_r2 = r2_score(y_true, cur_y_pred)
cur_corr = np.corrcoef(y_true, cur_y_pred)[0, 1]
Result_matrix.loc["3_model_Selected"] = [cur_rmse, cur_r2, cur_corr]
Result_matrix.loc['3_model_Selected']

In [None]:
# Bar chart of test_RMSE for the single models and the ensembles.
fig, ax = plt.subplots(figsize=(6, 6))
# Rows are picked by label (same order as before) instead of by position, so
# the chart does not depend on the number of KEGG rows in Result_matrix.
df = Result_matrix.loc[["RNA_Mut_Met", "BioGRID", "KEGG_average", "KEGG_Selected", "3_model_avg", "3_model_Selected"]]
sns.barplot(df, x=df.index, y="test_RMSE", color="grey", ax=ax)
plt.xticks(rotation=45)
plt.title("Ensemble Result : test_RMSE")
# Print each bar's value on top of it.
ax.bar_label(ax.containers[0])
plt.tight_layout()


In [None]:
# Bar chart of test_R2 for the single models and the ensembles.
fig, ax = plt.subplots(figsize=(6, 6))
# Rows are picked by label (same order as before) instead of by position, so
# the chart does not depend on the number of KEGG rows in Result_matrix.
df = Result_matrix.loc[["RNA_Mut_Met", "BioGRID", "KEGG_average", "KEGG_Selected", "3_model_avg", "3_model_Selected"]]
sns.barplot(df, x=df.index, y="test_R2", color="grey", ax=ax)
plt.xticks(rotation=45)
plt.title("Ensemble Result : test_R2")
# Print each bar's value on top of it.
ax.bar_label(ax.containers[0])
plt.tight_layout()


In [None]:
# Bar chart of test_R for the single models and the ensembles.
fig, ax = plt.subplots(figsize=(6, 6))
# Rows are picked by label (same order as before) instead of by position, so
# the chart does not depend on the number of KEGG rows in Result_matrix.
df = Result_matrix.loc[["RNA_Mut_Met", "BioGRID", "KEGG_average", "KEGG_Selected", "3_model_avg", "3_model_Selected"]]
sns.barplot(df, x=df.index, y="test_R", color="grey", ax=ax)
plt.xticks(rotation=45)
plt.title("Ensemble Result : test_R")
# Print each bar's value on top of it.
ax.bar_label(ax.containers[0])
plt.tight_layout()


In [None]:
# Observed values against the three-model average, with a fitted regression line.
g = sns.regplot(
    data=Prediction_matrix,
    x='3_model_avg',
    y='y_test',
    color='#363737',
)
# Annotate the plot with the stored R^2 and R of this ensemble; the text
# positions are given in data coordinates.
g.text(
    x=-.4,
    y=0.55,
    s=r'${{R^2}}$ : {:3.4f}'.format(Result_matrix.loc['3_model_avg', "test_R2"]),
    fontsize=12,
)
g.text(
    x=-.4,
    y=0.45,
    s=r'${{R }}$ : {:3.4f}'.format(Result_matrix.loc['3_model_avg', "test_R"]),
    fontsize=12,
)

In [None]:
# Observed values against the three-model "selected" ensemble, with a fitted
# regression line.
g = sns.regplot(
    data=Prediction_matrix,
    x='3_model_Selected',
    y='y_test',
    color='#363737',
)
plt.xlabel('Predicted protein', fontsize=20)
plt.ylabel('Observed protein', fontsize=20)
# Annotate the plot with the stored R^2 and R of this ensemble; the text
# positions are given in data coordinates.
g.text(
    x=-.43,
    y=0.55,
    s=r'${{R^2}}$ : {:3.4f}'.format(Result_matrix.loc['3_model_Selected', "test_R2"]),
    fontsize=15,
)
g.text(
    x=-.43,
    y=0.45,
    s=r'${{R }}$ : {:3.4f}'.format(Result_matrix.loc['3_model_Selected', "test_R"]),
    fontsize=15,
)
# Stretch the current axes over the whole figure area.
plt.gca().set_position([0, 0, 1, 1])


In [None]:
# Box plot comparing the distribution of observed values with the predictions
# of the single models and the ensembles.
fig, ax = plt.subplots(figsize=(6, 6))
# Columns are picked by label (same order as before) instead of by position, so
# the plot does not depend on the number of KEGG columns in Prediction_matrix.
df = Prediction_matrix[["y_test", "RNA_Mut_Met", "BioGRID", "KEGG_average", "KEGG_Selected", "3_model_avg", "3_model_Selected"]]
sns.boxplot(df, color="grey", ax=ax)
plt.xticks(rotation=45)
plt.title("Prediction Result")
plt.tight_layout()
