In [None]:
#python version 3.11.7
import pkg_resources
import pandas as pd  
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from boruta import BorutaPy 
import optuna 
import joblib 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor 
from lightgbm import LGBMRegressor
import pickle 
from statannot import add_stat_annotation 
import shap

#np.random.seed(42)

# Compatibility shim for boruta: (re)bind the `np.int`, `np.float` and
# `np.bool` names to concrete NumPy scalar types before BorutaPy is used.
np.int, np.float, np.bool = np.int32, np.float64, np.bool_


In [None]:
# Directory (relative to the notebook) that holds the input CSV files.
data_dir = "dataset/"

In [None]:
# Read the BioGRID PPI table for ARID1A; the second CSV column is used as the row index.
PPI_df = pd.read_csv(
    "dataset/UCEC_ARID1A_BioGRID_PPI.csv",
    sep=',',
    index_col=1,
)
# Keep every column except 'Unnamed: 0'. Index.difference also returns the
# remaining labels in sorted order, so the column order is normalised here.
PPI_df = PPI_df.loc[:, PPI_df.columns.difference(['Unnamed: 0'])]


In [None]:
# Discard every row that contains at least one missing value, then separate
# the ARID1A column (regression target) from all other columns (predictors).
PPI_df = PPI_df.dropna(axis=0, how='any')
y = PPI_df['ARID1A']
X = PPI_df.drop(columns='ARID1A')


In [None]:
# Hold out 20% of the samples as a test set; the fixed random_state keeps
# the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Correlation coefficient between each PPI gene column and ARID1A expression,
# ordered from the lowest to the highest value.
correlations = X.apply(
    lambda gene_expr: np.corrcoef(gene_expr, y)[0][1]
).sort_values()

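## Random Forest & linear (Ridge) regression models for the BioGRID feature
* Random Forest (RF) model
* Linear (Ridge) model
* Compare RMSE, R², and correlation coefficient (R) on the train and test sets
* The predictions of the better-performing model become the feature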

In [None]:
# Empty results table: one row per model is added later, holding the
# train/test RMSE, R2 and correlation coefficient (R).
Result_matrix = pd.DataFrame(
    columns=[
        'train_RMSE', 'train_R2', 'train_R',
        'test_RMSE', 'test_R2', 'test_R',
    ]
)

In [None]:
def RandomForestRegressor_objective(trial):
    """Optuna objective for tuning a random-forest regressor.

    Draws one hyper-parameter configuration from ``trial``, evaluates it with
    5-fold cross-validation on ``X_train`` / ``y_train`` and returns the error
    the study should minimise.

    Args:
        trial: Optuna trial object used to draw the hyper-parameter values.

    Returns:
        The mean squared error averaged over the 5 folds (lower is better).
    """
    # Search space; the draws are made in this fixed order.
    n_estimators = trial.suggest_int('n_estimators', 100, 2000, step=100)
    max_depth = trial.suggest_int('max_depth', 5, 100)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])

    # n_jobs and random_state are fixed settings, not part of the search.
    forest = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        bootstrap=bootstrap,
        n_jobs=-1,
        random_state=42,
    )

    # The scorer yields negated MSE values; multiply by -1 to get plain MSE.
    neg_mse_per_fold = cross_val_score(
        forest, X_train, y_train,
        cv=5, scoring='neg_mean_squared_error', n_jobs=-1,
    )
    return np.mean(-1 * neg_mse_per_fold)


In [None]:
# Tune the random forest with Optuna (300 trials, minimising the CV MSE),
# refit it on the training split and record train/test metrics.
# The sampler is seeded so the search, and therefore the selected
# hyper-parameters, are repeatable after a kernel restart.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(RandomForestRegressor_objective, n_trials=300)
best_params = study.best_params

# study.best_params only contains the values drawn through trial.suggest_*,
# so the fixed settings of the objective (random_state, n_jobs) have to be
# passed again; otherwise the final forest is unseeded and its metrics
# change from run to run.
RF_model = RandomForestRegressor(**best_params, random_state=42, n_jobs=-1)
RF_model.fit(X_train, y_train)

# Metrics on the training split: RMSE, R2 and correlation coefficient.
train_y_pred = RF_model.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, train_y_pred))
train_r2 = r2_score(y_train, train_y_pred)
train_corr = np.corrcoef(y_train, train_y_pred)[0,1]

# Same metrics on the held-out test split.
test_y_pred = RF_model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, test_y_pred))
test_r2 = r2_score(y_test, test_y_pred)
test_corr = np.corrcoef(y_test, test_y_pred)[0,1]

# Value order must match the Result_matrix columns.
Result_matrix.loc['RF_model'] = [train_rmse,train_r2,train_corr,test_rmse,test_r2,test_corr]


In [None]:
# Optuna objective for the Ridge regressor
def ridge_objective(trial):
    """Optuna objective for tuning a Ridge regressor.

    Draws one configuration from ``trial``, evaluates it with 5-fold
    cross-validation on ``X_train`` / ``y_train`` and returns the error the
    study should minimise.

    Args:
        trial: Optuna trial object used to draw the hyper-parameter values.

    Returns:
        The mean squared error averaged over the 5 folds (lower is better).
    """
    # alpha is the regularization strength, sampled on a log scale.
    alpha = trial.suggest_float('alpha', 0.1, 100, log=True)
    fit_intercept = trial.suggest_categorical('fit_intercept', [True, False])

    regressor = Ridge(alpha=alpha, fit_intercept=fit_intercept, random_state=42)

    # The scorer yields negated MSE values; multiply by -1 to get plain MSE.
    neg_mse_per_fold = cross_val_score(
        regressor, X_train, y_train,
        cv=5, scoring='neg_mean_squared_error', n_jobs=-1,
    )
    return np.mean(-1 * neg_mse_per_fold)

In [None]:
# Tune the Ridge regressor with Optuna (300 trials, minimising the CV MSE),
# refit it on the training split and record train/test metrics.
# The sampler is seeded so the selected alpha / fit_intercept are repeatable
# after a kernel restart.
ridge_study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
ridge_study.optimize(ridge_objective, n_trials = 300)
ridge_best_params = ridge_study.best_params

# best_params only contains the values drawn through trial.suggest_*, so the
# fixed random_state of the objective is passed again to keep the final
# model configured exactly like the one that was cross-validated.
linear_model = Ridge(**ridge_best_params, random_state=42)
linear_model.fit(X_train, y_train)

# Metrics on the training split: RMSE, R2 and correlation coefficient.
train_y_pred = linear_model.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, train_y_pred))
train_r2 = r2_score(y_train, train_y_pred)
train_corr = np.corrcoef(y_train, train_y_pred)[0,1]

# Same metrics on the held-out test split.
test_y_pred = linear_model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, test_y_pred))
test_r2 = r2_score(y_test, test_y_pred)
test_corr = np.corrcoef(y_test, test_y_pred)[0,1]

# Value order must match the Result_matrix columns.
Result_matrix.loc['Ridge'] = [train_rmse,train_r2,train_corr,test_rmse,test_r2,test_corr]

In [None]:
Result_matrix
# Side-by-side train/test metrics of the RF and Ridge models.
# In the paper, the linear (Ridge) model was selected.

In [None]:
# Re-read the PPI table from disk (same file and column handling as the
# initial load) so predictions are made on the table as stored.
raw_PPI_df = pd.read_csv(
    "dataset/UCEC_ARID1A_BioGRID_PPI.csv",
    sep=',',
    index_col=1,
)
raw_PPI_df = raw_PPI_df.loc[:, raw_PPI_df.columns.difference(['Unnamed: 0'])]

# Refit the tuned Ridge model on all of X / y (train and test together).
# This overwrites the train-only fit held by `linear_model`.
linear_model.fit(X, y)

# NOTE(review): X / y had rows with missing values removed, raw_PPI_df did
# not - confirm the raw table has no NaN rows, otherwise predict may fail.
y_pred = linear_model.predict(raw_PPI_df.drop(columns='ARID1A'))


In [None]:
# Store the Ridge predictions as a single-column 'BioGRID' feature that
# shares the row index of the raw PPI table.
Biogrid_feature = pd.DataFrame(y_pred, columns = ['BioGRID'], index = raw_PPI_df.index)

# Write into the same relative "dataset/" directory the inputs are read from.
# The previous "/dataset/..." was an absolute path pointing at the filesystem
# root instead of the project's data folder.
Biogrid_feature.to_csv("dataset/Biogrid_feature.csv")
