In [None]:
#python version 3.11.7
import pkg_resources
import pandas as pd  
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from boruta import BorutaPy 
import optuna 
import joblib 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRFClassifier 
from lightgbm import LGBMClassifier 
import pickle 
from statannot import add_stat_annotation 
import shap
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import log_loss
from imblearn.over_sampling import SMOTE
from sklearn.metrics import r2_score

from imblearn.over_sampling import SMOTE
from sklearn.multioutput import MultiOutputClassifier
from imblearn.pipeline import make_pipeline


# Seed NumPy's global random generator so stochastic steps are repeatable.
np.random.seed(42)

# for boruta: re-create the np.int / np.float / np.bool names on the numpy module,
# bound to fixed-width dtypes.
for _alias, _dtype in (("int", np.int32), ("float", np.float64), ("bool", np.bool_)):
    setattr(np, _alias, _dtype)



In [None]:
# Load the scaled TCGA-UCEC table; the first CSV column becomes the initial index.
UCEC_full = pd.read_csv("dataset/TCGA_UCEC_scaled.csv",sep=',', index_col=0)
# Re-index the rows by the 'X' column (presumably the sample identifier - TODO confirm).
UCEC_full.index = UCEC_full['X']
# Remove 'RNA_count'. Index.difference returns the remaining column names sorted,
# so every positional slice below refers to that sorted column order.
UCEC_full = UCEC_full[UCEC_full.columns.difference(['RNA_count'])]
# Keep only rows without any missing value.
UCEC_full = UCEC_full.dropna(how = 'any')
# The column where each feature is located may vary, so you'll need to manually adjust it
# NOTE(review): the positions below are hard-coded; check them against UCEC_full.columns
# whenever the input file changes. The data types are inferred from the variable names only.
UCEC_ARID1A = UCEC_full.iloc[:,0]        # target column (ARID1A)
UCEC_RNA = UCEC_full.iloc[:,68]          # single column, presumably the RNA feature
UCEC_Mut = UCEC_full.iloc[:,[2,3,4,6]]   # presumably mutation features (position 5 is skipped)
UCEC_CNV = UCEC_full.iloc[:,1]           # presumably the copy-number feature
UCEC_Met = UCEC_full.iloc[:,7:35]        # presumably methylation features
UCEC_miRNA = UCEC_full.iloc[:,35:68]     # presumably miRNA features


In [None]:
import os

# Collect the per-pathway KEGG tables: every CSV in data_dir whose name contains KEGG_key.
data_dir = 'dataset/'
KEGG_key = 'KEGG_'
all_files = os.listdir(data_dir)
# os.listdir returns entries in arbitrary order; sorting makes the order of the
# dictionary (and of anything built by iterating over it) reproducible across machines.
KEGG_files = sorted(file for file in all_files if KEGG_key in file and file.endswith('.csv'))

# Map pathway name -> cleaned table, read and cleaned in a single pass.
raw_dfs = {}
for file in KEGG_files:
    file_path = os.path.join(data_dir, file)
    # Key = file name without the '.csv' extension, the 'KEGG_' prefix and the '_frame' tag.
    df_name = file.replace('.csv', '').replace('KEGG_', '').replace('_frame', '')
    # The second CSV column is used as the row index.
    df = pd.read_csv(file_path, index_col=1)
    # Discard the leftover 'Unnamed: 0' column (the remaining columns come back sorted by name).
    df = df[df.columns.difference(['Unnamed: 0'])]
    # Keep complete rows only.
    raw_dfs[df_name] = df.dropna(how='any')

# Shallow copy: dfs and raw_dfs share the same DataFrame objects.
dfs = raw_dfs.copy()

In [None]:
# Pathway names (without the 'KEGG_' prefix) singled out as high-performing.
High_performance_pathway = [
    'Proteasome',
    'Ribosome',
    'RNA_degradation',
    'Ubiquitin',
    'mRNA_surveillance_pathway',
]

In [None]:
# Read the BioGRID PPI table (second CSV column as row index) and discard the
# leftover 'Unnamed: 0' column; the remaining columns come back sorted by name.
ppi_raw = pd.read_csv("dataset/UCEC_ARID1A_BioGRID_PPI.csv", sep=',', index_col=1)
PPI_df = ppi_raw.loc[:, ppi_raw.columns.difference(['Unnamed: 0'])]

In [None]:
# Complete-case PPI rows, split into features (everything except ARID1A) and target (ARID1A).
ppi_complete = PPI_df.dropna(how='any')
X = ppi_complete.drop(columns='ARID1A')
y = ppi_complete['ARID1A']
for part in (X, y):
    print(part.shape)
# Fixed 80/20 split (random_state=42).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Predicted group: rows of UCEC_full whose index label is not in X.
# drop() raises KeyError if X contains a label that is missing from UCEC_full.
Predicted_group = UCEC_full.drop(index = X.index).index
# Observed group: the index labels of X (expected to be the complete-case PPI
# features built in the preceding cell - re-run that cell first if X was overwritten).
Observed_group = X.index

In [None]:
# Empty label table, one row per UCEC_full sample; all three columns start out missing.
full_label_df = pd.DataFrame(index = UCEC_full.index, columns=['Observed_ARID1A','Predicted_ARID1A','Activity_q1'])

In [None]:
# Fill the observed column with the ARID1A values taken from UCEC_full (aligned on the index).
full_label_df['Observed_ARID1A'] = UCEC_ARID1A


## Regression Prediction 

In [None]:
# Empty frame indexed like UCEC_full; each model's predictions are added as one column.
Prediction_matrix = pd.DataFrame(index=UCEC_full.index)

In [None]:
# Feature matrix for the omics model: the RNA, mutation and methylation slices side by side.
omics_feature_blocks = [UCEC_RNA, UCEC_Mut, UCEC_Met]
X = pd.concat(omics_feature_blocks, axis=1)
y = UCEC_ARID1A
# Sanity check: shape of the rows that will be used for fitting.
for observed_part in (X.loc[Observed_group, :], y.loc[Observed_group]):
    print(observed_part.shape)

In [None]:
# Load the saved omics model.
# NOTE(review): this path is absolute ("/Models/..."), unlike the relative "dataset/..."
# paths used for the data - confirm that is intended.
# joblib.load unpickles the file, which can execute arbitrary code: only load trusted files.
cur_model = joblib.load("/Models/Omics_Model/Omics_model.pkl")
cur_model.fit(X.loc[Observed_group,:], y.loc[Observed_group]) ## finalize the model with full dataset 
# Predict for every row of X, not only the observed group.
Omics_y_pred = cur_model.predict(X)
# Assumes predict() returns values in X's row order and that X shares Prediction_matrix's
# index order, because the result is stored positionally - TODO confirm.
Prediction_matrix['Omics'] = Omics_y_pred

In [None]:
# One saved model per KEGG pathway table: refit on the complete rows, predict every row,
# and store the result in a 'KEGG_<pathway>' column.
for df_name, df in dfs.items():
    complete_rows = df.dropna(how='any')
    X = df.drop(columns='ARID1A')
    y = df['ARID1A']
    X_filtered = complete_rows.drop(columns='ARID1A')
    y_filtered = complete_rows['ARID1A']
    cur_model = joblib.load(f'/Models/KEGG_Model/{df_name}_model.pkl')
    # Final fit on all complete rows of this pathway table.
    cur_model.fit(X_filtered, y_filtered)
    cur_y_pred = cur_model.predict(X)
    kegg_column = "KEGG_" + df_name
    # Samples absent from this pathway table stay NaN.
    Prediction_matrix[kegg_column] = np.nan
    Prediction_matrix.loc[X.index, kegg_column] = cur_y_pred

In [None]:
# BioGRID PPI model: refit on the complete rows, then predict for every row of PPI_df.
ppi_complete_rows = PPI_df.dropna(how='any')
X = PPI_df.drop(columns='ARID1A')
y = PPI_df['ARID1A']
X_filtered = ppi_complete_rows.drop(columns='ARID1A')
y_filtered = ppi_complete_rows['ARID1A']
cur_model = joblib.load("/Models/BioGRID_Model/BioGRID_model.pkl")
# Final fit on all complete rows.
cur_model.fit(X_filtered, y_filtered)
cur_y_pred = cur_model.predict(X)
# Samples absent from PPI_df stay NaN.
Prediction_matrix["BioGRID"] = np.nan
Prediction_matrix.loc[X.index, "BioGRID"] = cur_y_pred

In [None]:
# Total size of the prediction table versus the rows for which every model produced a value.
print(Prediction_matrix.shape)
print(Prediction_matrix.dropna(how = 'any').shape)

In [None]:
# Row-wise mean over the prediction columns of the high-performance KEGG pathways.
selected_kegg_columns = [f'KEGG_{pathway}' for pathway in High_performance_pathway]
Prediction_matrix['KEGG_Selected'] = Prediction_matrix[selected_kegg_columns].mean(axis=1)


In [None]:
# Ensemble of the three model families, selected by column name.
# The original used positions [0, 16, 17], which only point at 'Omics', 'BioGRID' and
# 'KEGG_Selected' when exactly 15 KEGG pathway columns sit between them; adding or
# removing a pathway file would silently average the wrong columns.
# NOTE(review): confirm these three columns are the intended ensemble members.
three_model_columns = ['Omics', 'BioGRID', 'KEGG_Selected']
# mean() skips NaN, so a sample missing from one model is averaged over the others.
Prediction_matrix['3_model_selected'] = Prediction_matrix[three_model_columns].mean(axis=1)

In [None]:
# Rows of the predicted group take the ensemble prediction as their ARID1A value.
full_label_df.loc[Predicted_group,'Predicted_ARID1A'] = Prediction_matrix.loc[Predicted_group,'3_model_selected']

In [None]:
# Rows of the observed group copy their observed value into Predicted_ARID1A,
# so that column holds a value for both groups.
full_label_df.loc[Observed_group,'Predicted_ARID1A'] = full_label_df.loc[Observed_group,'Observed_ARID1A']

In [None]:
# Export the observed and predicted ARID1A values.
# Columns are selected by name instead of position (iloc[:, 0:2]), so the export cannot
# silently pick other columns if full_label_df is reordered or reduced by a later cell.
full_label_df[['Observed_ARID1A', 'Predicted_ARID1A']].to_csv('dataset/UCEC_Observed_Predicted_ARID1A.csv')

### 3 level separation for Observed & Predicted group

In [None]:
def assign_label(value, Q1, Q3):
    """Map one numeric value to a three-level label.

    Returns np.nan when ``value`` is NaN, 'Low' when ``value <= Q1``,
    'High' when ``value >= Q3`` and 'Medium' otherwise. The 'Low' test
    runs first, so it wins when Q1 == Q3 == value.
    """
    if np.isnan(value):
        return np.nan
    if value <= Q1:
        return 'Low'
    return 'High' if value >= Q3 else 'Medium'

In [None]:
# Quartile cut-offs are taken from the observed group only.
observed_values = full_label_df.loc[Observed_group, 'Observed_ARID1A']
Q1 = np.percentile(observed_values, 25)
Q3 = np.percentile(observed_values, 75)
# Label every row. Series.map keeps missing values as real NaN; the original wrapped the
# labels in np.array, which coerces a mix of strings and NaN to a string array and turns
# NaN into the literal text 'nan'.
# This overwrites the numeric column, so the cell cannot be re-run without first
# re-running the cell that fills Observed_ARID1A.
full_label_df['Observed_ARID1A'] = full_label_df['Observed_ARID1A'].map(
    lambda value: assign_label(value, Q1, Q3)
)

In [None]:
# Predicted_ARID1A was filled row-by-row into a column created without data, so it is
# stored with object dtype; cast to float before any numeric work.
predicted_values = full_label_df['Predicted_ARID1A'].astype(float)
# Quartile cut-offs over the whole column (observed and predicted groups together).
Q1 = np.percentile(predicted_values, 25)
Q3 = np.percentile(predicted_values, 75)
# Series.map keeps missing values as real NaN; np.array over a mix of strings and NaN
# (as in the original) would turn NaN into the literal text 'nan'.
# This overwrites the numeric column, so the cell cannot be re-run on its own.
full_label_df['Predicted_ARID1A'] = predicted_values.map(
    lambda value: assign_label(value, Q1, Q3)
)

## Classification Labeling

In [None]:
# Per-sample functional data (one column per target) and the target summary table.
UCEC_label = pd.read_csv("dataset/UCEC_554_targets_Functional_data.csv", sep=',', index_col=0)
Targets = pd.read_csv("dataset/554_targets_summary.csv", sep=',', index_col=0)
# Keep only the summary rows whose gene symbol is a column of UCEC_label.
has_label_column = Targets['Gene Symbol'].isin(list(UCEC_label.columns))
Targets = Targets.loc[has_label_column, :]
# Keep the target columns whose most frequent value occurs in fewer than 75% of 539.
# NOTE(review): 539 is hard-coded (presumably the sample count) - confirm it matches len(UCEC_label).
top_value_frequency = UCEC_label.describe().loc['freq', :]
UCEC_label_q1 = UCEC_label.loc[:, top_value_frequency < (539 * 0.75)]






In [None]:
# Row-wise mode across the retained target columns ([0] = first mode per row),
# then the number of samples falling into each resulting value.
UCEC_label_q1.mode(axis=1)[0].value_counts()

## Add the classification label 

In [None]:
# Store each sample's row-wise mode as its activity label (aligned on the index;
# samples missing from UCEC_label_q1 become NaN).
full_label_df['Activity_q1'] = UCEC_label_q1.mode(axis=1)[0]



## Generated Label
* The Observed group uses the observed ARID1A expression.
* The Predicted group uses the predicted ARID1A expression.
* Both groups use the Activity_q1 label.

## 1. mRNA based label
- 3 level separation based on ARID1A processed mRNA

## 2. protein based label
- 3 level separation based on ARID1A observed protein (only observed group)

## 3. functional state based label
- Protein-based label extended with the predicted protein expression (adds the predicted group), combined with the Activity label

### Extract the intersection
- TRUE = ARID1A protein High ^ Activity Active 
- FALSE = ARID1A protein Low ^ Activity Inactive 

In [None]:
# Read the mRNA TPM matrix and keep only the ARID1A column, as a Series indexed like the matrix rows.
UCEC_RNA_TPM = pd.read_csv('dataset/UCEC_mRNA_TPM_matrix.csv', index_col=0).loc[:, 'ARID1A']


In [None]:
# Attach the ARID1A TPM values (aligned on the index; samples absent from the TPM table become NaN).
full_label_df['mRNA_based_label'] = UCEC_RNA_TPM
# Quartile cut-offs are computed over every sample of the TPM table,
# not only the samples present in full_label_df.
Q1 = np.percentile(UCEC_RNA_TPM, 25)
Q3 = np.percentile(UCEC_RNA_TPM, 75)
# Series.map keeps samples without a TPM value as real NaN. The original wrapped the labels
# in np.array, which coerces a mix of strings and NaN to a string array, so those samples
# received the literal label 'nan' and were counted as a fourth category below.
full_label_df['mRNA_based_label'] = full_label_df['mRNA_based_label'].map(
    lambda tpm: assign_label(tpm, Q1, Q3)
)
full_label_df['mRNA_based_label'].value_counts()

In [None]:
# The protein-based label is a copy of the Observed_ARID1A column as it stands when this
# cell runs (three-level labels once the quartile cell above has been executed).
full_label_df['protein_based_label'] = full_label_df['Observed_ARID1A']
full_label_df['protein_based_label'].value_counts()

In [None]:
# Functional-state label:
#   "High" = ARID1A level 'High' and Activity_q1 True
#   "Low"  = ARID1A level 'Low'  and Activity_q1 False
#   anything else stays missing.
# The label is built in an object-dtype Series. The original created the column as float NaN
# and then wrote strings into it, which newer pandas versions deprecate, and it selected
# rows through chained .loc calls that relied on implicit boolean-mask alignment.
activity = full_label_df['Activity_q1']
is_active = activity == True      # explicit comparison: a missing activity matches neither state
is_inactive = activity == False
observed_level = full_label_df['Observed_ARID1A']
predicted_level = full_label_df['Predicted_ARID1A']
in_predicted_group = full_label_df.index.isin(Predicted_group)

functional_state = pd.Series(np.nan, index=full_label_df.index, dtype=object)
# Rules are applied in the original order, so a later rule wins where two overlap.
# NOTE(review): as in the original, the two observed-level rules apply to every row,
# not only to the observed group - confirm that this is intended.
functional_state.loc[(observed_level == 'Low') & is_inactive] = "Low"
functional_state.loc[(observed_level == 'High') & is_active] = "High"
functional_state.loc[in_predicted_group & (predicted_level == 'Low') & is_inactive] = "Low"
functional_state.loc[in_predicted_group & (predicted_level == 'High') & is_active] = "High"
full_label_df['functional_state_based_label'] = functional_state


In [None]:
# Number of samples per functional-state label (missing labels are not counted).
full_label_df["functional_state_based_label"].value_counts()

In [None]:
# Reduce the table to the three final label columns and write it to disk.
# full_label_df itself is narrowed here, so earlier cells that read the dropped
# columns cannot be re-run after this one.
final_label_columns = ['mRNA_based_label', 'protein_based_label', 'functional_state_based_label']
full_label_df = full_label_df[final_label_columns]
full_label_df.to_csv("dataset/UCEC_DEG_labels_mRNA_protein_functional_state_based_label.csv")