In [None]:
#python version 3.11.7
import pkg_resources
import pandas as pd  
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from boruta import BorutaPy 
import optuna 
import joblib 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor 
from lightgbm import LGBMRegressor
import pickle 
from statannot import add_stat_annotation 
import shap

#np.random.seed(42)

#for boruta
# Bind the attribute names `np.int`, `np.float` and `np.bool` on the numpy module so
# that code which still looks them up keeps working (presumably BorutaPy, per the
# note above — TODO confirm against the installed boruta version).
# NOTE(review): `np.int` is bound to the fixed-width np.int32 here; the aliases NumPy
# removed pointed at the Python builtins, so confirm 32 bits is wide enough.
np.int = np.int32
np.float = np.float64
np.bool = np.bool_


In [None]:
import os

# Directory holding the per-pathway input tables and the substring that marks them.
data_dir = 'dataset/'
KEGG_key = 'KEGG_'
# The export at the end of this notebook writes 'KEGG_feature.csv' into this same
# directory. That name also contains 'KEGG_' and ends in '.csv', so it must be
# excluded here or a second "Run All" would load the output as an input table.
output_file = 'KEGG_feature.csv'

all_files = os.listdir(data_dir)
# os.listdir returns entries in arbitrary order; sorting keeps the table order (and
# therefore the row/column order of the result tables) stable across runs and machines.
KEGG_files = sorted(
    file for file in all_files
    if KEGG_key in file and file.endswith('.csv') and file != output_file
)


In [None]:

# Read every discovered CSV into a dict keyed by the file name with '.csv' removed.
# index_col=1 makes the second CSV column the row index of each table.
raw_dfs = {
    file.replace('.csv', ''): pd.read_csv(os.path.join(data_dir, file), index_col=1)
    for file in KEGG_files
}

In [None]:
# Remove the 'Unnamed: 0' column from every raw table and report the resulting shape.
# Index.difference returns the remaining labels sorted, so the columns come out in
# sorted order as well.
for df_name in list(raw_dfs):
    kept_columns = raw_dfs[df_name].columns.difference(['Unnamed: 0'])
    df = raw_dfs[df_name][kept_columns]
    print(f'{df_name}:{df.shape}')
    raw_dfs[df_name] = df
# Shallow copy: `dfs` starts out with the same DataFrame objects as `raw_dfs`, but
# its entries can be replaced without touching `raw_dfs`.
dfs = raw_dfs.copy()

In [None]:
# Preprocessing: keep only rows without any missing value in each table.
# The tables stored in `raw_dfs` are left untouched.
dfs = {df_name: df.dropna(how='any') for df_name, df in dfs.items()}

In [None]:
# Print the (rows, columns) shape of every table after incomplete rows were dropped.
for df_name in dfs:
    df = dfs[df_name]
    print(f'{df_name}:{df.shape}')

In [None]:
# Empty prediction table with one column per input table. Its row index is borrowed
# from the raw "KEGG_Aminoacyl_tRNA_biosynthesis_frame" table, used as the reference.
reference_index = raw_dfs['KEGG_Aminoacyl_tRNA_biosynthesis_frame'].index
pred_matrix = pd.DataFrame(columns=list(dfs), index=reference_index)

In [None]:
# Empty score table: one row per input table, one column per train/test metric.
metric_columns = ['train_RMSE', 'train_R2', 'train_R', 'test_RMSE', 'test_R2', 'test_R']
Result_matrix = pd.DataFrame(columns=metric_columns, index=list(dfs))

In [None]:
# Optuna objective for the Ridge regressor
def ridge_objective(trial):
    """Score one Ridge hyper-parameter candidate by 5-fold cross-validated MSE.

    Reads ``X_train`` and ``y_train`` from the notebook's global namespace, so both
    must be assigned before a study calls this function.

    Parameters
    ----------
    trial : object providing ``suggest_float`` and ``suggest_categorical``
        (an Optuna trial); used to sample ``alpha`` log-uniformly from [0.1, 100]
        and ``fit_intercept`` from {True, False}.

    Returns
    -------
    float
        Mean of the per-fold mean squared errors (lower is better).
    """
    # alpha = regularization strength
    alpha = trial.suggest_float('alpha', 0.1, 100, log=True)
    fit_intercept = trial.suggest_categorical('fit_intercept', [True, False])
    candidate = Ridge(alpha=alpha, fit_intercept=fit_intercept, random_state=42)

    # The scorer reports negated MSE, so flip the sign back before averaging.
    neg_mse_per_fold = cross_val_score(
        candidate, X_train, y_train,
        cv=5, scoring='neg_mean_squared_error', n_jobs=-1,
    )
    return np.mean(-1 * neg_mse_per_fold)

In [None]:
# Row count of every raw table (these still include rows with missing values), shown
# for all tables instead of only the one whose name an earlier loop left in `df_name`.
pd.Series({name: len(frame) for name, frame in raw_dfs.items()}, name='n_rows')

In [None]:
def _regression_metrics(y_true, y_pred):
    """Return [RMSE, R2, Pearson r] comparing predictions with observed values."""
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    pearson_r = np.corrcoef(y_true, y_pred)[0, 1]
    return [rmse, r2, pearson_r]


# Column that every table's remaining columns are used to predict.
TARGET_COLUMN = 'ARID1A'

# For each table: tune a Ridge model with Optuna on an 80/20 split, record train/test
# scores in Result_matrix, then refit on all complete rows and store predictions for
# the raw table in pred_matrix.
for df_name, df in dfs.items():
    X = df.drop(TARGET_COLUMN, axis=1)
    y = df[TARGET_COLUMN]
    # X_train / y_train must stay module-level names: ridge_objective reads them as globals.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Seeded sampler so the hyper-parameter search gives the same result on every run.
    ridge_study = optuna.create_study(
        direction='minimize',
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    ridge_study.optimize(ridge_objective, n_trials=300)
    ridge_best_params = ridge_study.best_params

    # best_params only holds the sampled values, so pass the fixed random_state
    # explicitly to build the same estimator configuration that was tuned.
    linear_model = Ridge(**ridge_best_params, random_state=42)
    linear_model.fit(X_train, y_train)

    train_y_pred = linear_model.predict(X_train)
    train_rmse, train_r2, train_corr = _regression_metrics(y_train, train_y_pred)
    test_y_pred = linear_model.predict(X_test)
    test_rmse, test_r2, test_corr = _regression_metrics(y_test, test_y_pred)
    Result_matrix.loc[df_name] = [train_rmse, train_r2, train_corr, test_rmse, test_r2, test_corr]

    # Refit on every complete row before predicting for the raw table. Predictions
    # for rows that were part of this fit are in-sample.
    linear_model.fit(X, y)
    raw_features = raw_dfs[df_name].drop(TARGET_COLUMN, axis=1)
    # The raw table still holds the rows removed from `dfs`; the model cannot score
    # rows with missing features, so those rows keep NaN as their prediction.
    complete_rows = raw_features.notna().all(axis=1).to_numpy()
    all_pred = np.full(len(raw_features), np.nan)
    if complete_rows.any():
        all_pred[complete_rows] = linear_model.predict(raw_features.loc[complete_rows])

    if raw_features.index.equals(pred_matrix.index):
        # Same labels in the same order as the reference index: positional assignment is safe.
        pred_matrix[df_name] = all_pred
    else:
        # Different row set or order: align on index labels instead of position.
        # Labels missing from this table become NaN; repeated labels keep their first row.
        by_label = pd.Series(all_pred, index=raw_features.index)
        by_label = by_label[~by_label.index.duplicated(keep='first')]
        pred_matrix[df_name] = by_label.reindex(pred_matrix.index).to_numpy()

In [None]:
# Display the per-table train/test RMSE, R2 and correlation scores filled in by the loop above.
Result_matrix

In [None]:
# Save the per-table predictions (one column per input table) as CSV.
# NOTE(review): this file lands in the input directory and its name contains 'KEGG_'
# and ends in '.csv'; the input discovery step must not treat it as an input table.
pred_matrix.to_csv('dataset/KEGG_feature.csv')
