## Test of different regression models

In [None]:
import numpy as np
import os
import pandas as pd
from sklearn.pipeline import make_pipeline
import itertools
from sklearn.model_selection import train_test_split, cross_val_score
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import joblib
from scipy.stats import pearsonr

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from tabpfn import TabPFNRegressor  
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from catboost import CatBoostRegressor

In [None]:
# please specify the path to the root directory of the repository
# (`merged_df.csv` is read from this directory, so it must exist there)
ROOT_PATH = '/Users/wolffjoa/data_local/hicgan/hyperparameter'

In [None]:
# Load the table holding the metric columns and the 'ELO' target from ROOT_PATH.
merged_csv_path = os.path.join(ROOT_PATH, 'merged_df.csv')
merged_df = pd.read_csv(merged_csv_path)

In [None]:
# Inspect the loaded table.
merged_df

In [None]:
# Prepare the data
features = ['pearson_AUC', 'hicrep', 'TAD_fraction', 'TAD_fraction_exact_match', 'TAD_score_MSE']
# Column name -> human-readable label used in printed output, result keys and file names
feature_mapping = {
    'pearson_AUC': 'Pearson AUC',
    'hicrep': 'HiCRep',
    'TAD_fraction': 'TAD fraction',
    'TAD_fraction_exact_match': 'TAD FEM',
    'TAD_score_MSE': 'TAD score MSE'
}

# Split the data into training and testing sets (80 % / 20 %, fixed seed)
X = merged_df[features]
y = merged_df['ELO']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fitted best pipeline per (algorithm, feature combination),
# keyed by "<algo>_<combo tuple>_degree_<best degree>"
models = {}

# Generate all combinations of 2 to all features
combinations = []
for r in range(2, len(features) + 1):
    combinations.extend(itertools.combinations(features, r))

# Held-out evaluation per (algorithm, feature combination): selected degree,
# test MSE, Pearson correlation and its p-value, keyed by "<algo>_<feature labels>"
results = {}

# Algorithms to compare; every name must have an entry in `model_factories`
algorithms = ['linear regression', 'random forest', 'gradientBoosting', 'xgboost', 'catboost', 'TabPFN']

# Each factory returns a fresh, unfitted estimator, so the candidate pipelines
# used for degree selection and the final pipeline never share fitted state.
model_factories = {
    'linear regression': lambda: LinearRegression(),
    'random forest': lambda: RandomForestRegressor(random_state=42),
    'gradientBoosting': lambda: GradientBoostingRegressor(random_state=42),
    'xgboost': lambda: xgb.XGBRegressor(random_state=42, verbosity=0),
    'catboost': lambda: CatBoostRegressor(random_state=42, verbose=0),
    'TabPFN': lambda: TabPFNRegressor(),
}

# Polynomial degrees to try for every algorithm / feature combination
degrees = range(1, 6)

for algo in algorithms:
    print(f"Running {algo} regression...")
    make_model = model_factories[algo]

    for combo in combinations:
        columns = list(combo)
        X_train_combo = X_train[columns]
        X_test_combo = X_test[columns]

        # Select the polynomial degree with 5-fold cross-validation on the
        # TRAINING data only. The test split must not take part in model
        # selection, otherwise the test metrics reported below would be
        # optimistically biased.
        mse_scores = []
        for degree in degrees:
            candidate = make_pipeline(PolynomialFeatures(degree), make_model())
            cv_scores = cross_val_score(
                candidate, X_train_combo, y_train, cv=5, scoring='neg_mean_squared_error'
            )
            # The scorer returns negated MSE; flip the sign back.
            mse_scores.append(-cv_scores.mean())

        # On ties, the lowest degree wins (first minimum).
        best_mse = min(mse_scores)
        best_degree = degrees[mse_scores.index(best_mse)]

        # Refit the winning configuration on the full training split
        best_model = make_pipeline(PolynomialFeatures(best_degree), make_model())
        best_model.fit(X_train_combo, y_train)

        combo_str = " ".join([feature_mapping[feat] for feat in combo])
        key = f"{algo}_{combo}_degree_{best_degree}"
        models[key] = best_model
        # Persisted to the current working directory
        joblib.dump(best_model, f"best_model_{algo}_{combo_str}_degree_{best_degree}.pkl")

        # Final, unbiased evaluation on the held-out test split
        y_pred = best_model.predict(X_test_combo)
        test_mse = mean_squared_error(y_test, y_pred)
        pearson_corr, p_value = pearsonr(y_test, y_pred)
        print(f"{algo} | {combo_str} | Degree {best_degree} | Test MSE: {test_mse:.2e} | Pearson r: {pearson_corr:.2f} | p-value: {p_value:.2e}")

        results[f"{algo}_{combo_str}"] = {
            'best_degree': best_degree,
            'test_mse': test_mse,
            'pearson_corr': pearson_corr,
            'p_value': p_value
        }



In [None]:
# Inspect the collected per-model evaluation results.
results