# Check features

In [1]:
import numpy as np
import pandas as pd
from pycaret.regression import *
from scipy.stats import spearmanr

# Load data to train with
df = pd.read_csv("/home/vmottaqi/cmipb_challenge/vincent_files/training_t_cell_processed_task4.1_nov5.csv", index_col=0)

# Load data to process
df = pd.read_csv('/home/vmottaqi/cmipb_challenge/challenge_data_processed/challenge_t_cell_processed_task4.1_nov11.csv', index_col=0)

In [2]:
df.columns

Index(['day_0_specimen_id', 'subject_id', 'infancy_vac', 'biological_sex',
       'dataset', 'age', 'day_0_IL17A_DMSO', 'day_0_IL17A_PHA',
       'day_0_IL17A_PT', 'day_0_Th1_DMSO', 'day_0_Th1_PHA', 'day_0_Th1_PT',
       'day_0_Th2_DMSO', 'day_0_Th2_PHA', 'day_0_Th2_PT',
       'day_0_DMSO_activation', 'day_0_PHA_activation', 'day_0_PT_activation',
       'day_0_TT_activation'],
      dtype='object')

Use ['day_0_specimen_id', 'subject_id', 'infancy_vac', 'biological_sex',
       'dataset', 'age', 'day_0_IL17A_DMSO', 'day_0_IL17A_PHA',
       'day_0_IL17A_PT', 'day_0_Th1_DMSO', 'day_0_Th1_PHA', 'day_0_Th1_PT',
       'day_0_Th2_DMSO', 'day_0_Th2_PHA', 'day_0_Th2_PT',
       'day_0_DMSO_activation', 'day_0_PHA_activation', 'day_0_PT_activation',
       'day_0_TT_activation']

In [3]:
sel_feat = ['day_0_Th2_PHA', 'day_0_IL17A_PT', 'biological_sex', 'day_0_Th2_PT', 'day_0_IL17A_PHA', 'day_0_Th1_PHA', 'day_0_Th1_PT', 'age', 'infancy_vac']

In [4]:
cat_feat = ['infancy_vac', 'biological_sex']

# Run models for task 4.1

In [7]:
import numpy as np
import pandas as pd
from pycaret.regression import *
from scipy.stats import spearmanr, rankdata

# Load data
df = pd.read_csv("/home/vmottaqi/cmipb_challenge/vincent_files/training_t_cell_processed_task4.1_nov5.csv", index_col=0)
df = df[df['FC_day_30_Th1/Th2'] >= 0]

# Define features and target
X = df[sel_feat]
y = df['FC_day_30_Th1/Th2']

# Combine features and target for PyCaret input
data = X.copy()
data['FC_day_30_Th1/Th2'] = y

# Initialize PyCaret with the dataset and target column
regression_setup = setup(
    data=data,
    target='FC_day_30_Th1/Th2',
    categorical_features=cat_feat,  # Specify any categorical features
    session_id=1,
    fold=5
)

# Custom Spearman correlation function with zero variance handling
def spearman_metric(y_true, y_pred):
    # Rank the data
    y_true_ranked = rankdata(y_true)
    y_pred_ranked = rankdata(y_pred)
    
    # Check for zero variance in ranks (constant values)
    if np.std(y_true_ranked) == 0 or np.std(y_pred_ranked) == 0:
        return np.nan  # If there's no variance, return NaN
    
    # Calculate Spearman correlation on ranked data
    return spearmanr(y_true_ranked, y_pred_ranked)[0]  # Only return the correlation coefficient

# Adding the custom Spearman metric
add_metric(
    'Spearman', 
    'Spearman Correlation', 
    spearman_metric, 
    greater_is_better=True
)

# Compare models using a supported metric (e.g., R2)
top_models = compare_models(sort='R2', n_select=15)  # Select top models based on R2, adjust as needed

# Calculate Spearman correlation for each top model
spearman_results = []
for model in top_models:
    predictions = predict_model(model)  # Get predictions
    
    # Identify the correct prediction column
    prediction_column = 'Label' if 'Label' in predictions.columns else predictions.columns[-1]
    
    # Calculate Spearman correlation, ensuring the target column exists in predictions
    if 'FC_day_30_Th1/Th2' in predictions.columns:
        spearman_corr = spearman_metric(predictions['FC_day_30_Th1/Th2'], predictions[prediction_column])
        spearman_results.append((model, spearman_corr))

# Sort and display models by Spearman correlation
sorted_spearman_results = sorted(spearman_results, key=lambda x: x[1] if x[1] is not None else -1, reverse=True)
for model, spearman_corr in sorted_spearman_results:
    print(f"Model: {model}, Spearman Correlation: {spearman_corr}")


Unnamed: 0,Description,Value
0,Session id,1
1,Target,FC_day_30_Th1/Th2
2,Target type,Regression
3,Original data shape,"(41, 10)"
4,Transformed data shape,"(41, 10)"
5,Transformed train set shape,"(28, 10)"
6,Transformed test set shape,"(13, 10)"
7,Numeric features,7
8,Categorical features,2
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation,TT (Sec)
lightgbm,Light Gradient Boosting Machine,4.8184,44.5722,6.361,-0.3762,1.1133,403.6571,0.0,0.102
dummy,Dummy Regressor,4.8184,44.5722,6.361,-0.3762,1.1133,403.6571,0.0,0.016
omp,Orthogonal Matching Pursuit,5.0767,51.7874,6.6579,-0.389,1.1429,887.4784,0.0,0.204
rf,Random Forest Regressor,5.0203,56.041,6.8178,-0.4404,1.1807,363.7226,0.0,0.05
en,Elastic Net,5.4421,56.5869,6.9423,-0.4614,1.2003,653.6781,0.0,0.206
lasso,Lasso Regression,5.462,56.3903,6.9619,-0.5012,1.2086,693.0169,0.0,0.204
llar,Lasso Least Angle Regression,5.4619,56.39,6.9619,-0.5012,1.2086,693.0191,0.0,0.206
br,Bayesian Ridge,5.4598,51.4821,6.7786,-0.5189,1.1939,415.5263,0.0,0.202
ridge,Ridge Regression,6.0633,61.2503,7.2504,-0.5676,1.2882,1193.1436,0.0,0.204
gbr,Gradient Boosting Regressor,5.2544,59.3641,6.9225,-0.6142,1.2155,846.2689,0.0,0.028


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Light Gradient Boosting Machine,7.4093,115.6577,10.7544,-0.0728,1.2021,13.5925,


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Dummy Regressor,7.4093,115.6577,10.7544,-0.0728,1.2021,13.5925,


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Orthogonal Matching Pursuit,7.399,126.5555,11.2497,-0.1739,1.2055,10.7477,0.1018


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Random Forest Regressor,7.2423,115.5199,10.748,-0.0716,1.0813,5.3663,0.4066


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Elastic Net,7.0899,118.4627,10.8841,-0.0989,1.1046,8.9562,0.4615


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Lasso Regression,7.0538,117.7586,10.8517,-0.0923,1.1079,9.7627,0.4341


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Lasso Least Angle Regression,7.0538,117.7586,10.8517,-0.0923,1.1079,9.7627,0.4341


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Bayesian Ridge,7.3513,114.9071,10.7195,-0.0659,1.1783,12.7497,0.3901


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Ridge Regression,7.7493,120.1216,10.96,-0.1143,1.1828,7.6785,0.3022


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Gradient Boosting Regressor,7.0453,123.0997,11.095,-0.1419,1.159,3.1242,0.489


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,K Neighbors Regressor,6.6826,101.4,10.0698,0.0594,1.0473,12.5123,0.4343


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Passive Aggressive Regressor,7.1495,143.3104,11.9712,-0.3294,1.3398,2.4277,0.3626


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Linear Regression,8.0102,121.7924,11.036,-0.1298,1.2302,8.2973,0.1758


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Huber Regressor,7.9448,144.5561,12.0231,-0.3409,1.352,3.7241,0.0165


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,AdaBoost Regressor,5.9309,106.4564,10.3178,0.0125,0.994,5.1086,0.5207


Model: LGBMRegressor(n_jobs=-1, random_state=1), Spearman Correlation: nan
Model: DummyRegressor(), Spearman Correlation: nan
Model: AdaBoostRegressor(random_state=1), Spearman Correlation: 0.5206631326947936
Model: GradientBoostingRegressor(random_state=1), Spearman Correlation: 0.48901098901098894
Model: ElasticNet(random_state=1), Spearman Correlation: 0.46153846153846156
Model: KNeighborsRegressor(n_jobs=-1), Spearman Correlation: 0.4343119072478333
Model: Lasso(random_state=1), Spearman Correlation: 0.4340659340659341
Model: LassoLars(random_state=1), Spearman Correlation: 0.4340659340659341
Model: RandomForestRegressor(n_jobs=-1, random_state=1), Spearman Correlation: 0.40659340659340654
Model: BayesianRidge(), Spearman Correlation: 0.39010989010989006
Model: PassiveAggressiveRegressor(random_state=1), Spearman Correlation: 0.3626373626373627
Model: Ridge(random_state=1), Spearman Correlation: 0.3021978021978022
Model: LinearRegression(n_jobs=-1), Spearman Correlation: 0.17582417

In [8]:
# Check the pearson correlation of predictions
pearson_results = []
for model in top_models:
    predictions = predict_model(model)  # Get predictions
    
    # Identify the correct prediction column
    prediction_column = 'Label' if 'Label' in predictions.columns else predictions.columns[-1]
    
    # Calculate Spearman correlation
    pearson_corr = np.corrcoef(predictions['FC_day_30_Th1/Th2'], predictions[prediction_column])[0,1]
    pearson_results.append((model, pearson_corr))
    
    # Print Spearman correlation
    print(f"Model: {model}, Pearson Correlation: {pearson_corr}")

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Light Gradient Boosting Machine,7.4093,115.6577,10.7544,-0.0728,1.2021,13.5925,


Model: LGBMRegressor(n_jobs=-1, random_state=1), Pearson Correlation: nan


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Dummy Regressor,7.4093,115.6577,10.7544,-0.0728,1.2021,13.5925,


Model: DummyRegressor(), Pearson Correlation: nan


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Orthogonal Matching Pursuit,7.399,126.5555,11.2497,-0.1739,1.2055,10.7477,0.1018


Model: OrthogonalMatchingPursuit(), Pearson Correlation: -0.15006006125336813


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Random Forest Regressor,7.2423,115.5199,10.748,-0.0716,1.0813,5.3663,0.4066


Model: RandomForestRegressor(n_jobs=-1, random_state=1), Pearson Correlation: 0.16983301845179918


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Elastic Net,7.0899,118.4627,10.8841,-0.0989,1.1046,8.9562,0.4615


Model: ElasticNet(random_state=1), Pearson Correlation: 0.09694969170216512


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Lasso Regression,7.0538,117.7586,10.8517,-0.0923,1.1079,9.7627,0.4341


Model: Lasso(random_state=1), Pearson Correlation: 0.10512960224647364


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Lasso Least Angle Regression,7.0538,117.7586,10.8517,-0.0923,1.1079,9.7627,0.4341


Model: LassoLars(random_state=1), Pearson Correlation: 0.10512948069804548


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Bayesian Ridge,7.3513,114.9071,10.7195,-0.0659,1.1783,12.7497,0.3901


Model: BayesianRidge(), Pearson Correlation: 0.10851318390831553


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Ridge Regression,7.7493,120.1216,10.96,-0.1143,1.1828,7.6785,0.3022


Model: Ridge(random_state=1), Pearson Correlation: 0.06619237333446101


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Gradient Boosting Regressor,7.0453,123.0997,11.095,-0.1419,1.159,3.1242,0.489


Model: GradientBoostingRegressor(random_state=1), Pearson Correlation: 0.18740420793967247


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,K Neighbors Regressor,6.6826,101.4,10.0698,0.0594,1.0473,12.5123,0.4343


Model: KNeighborsRegressor(n_jobs=-1), Pearson Correlation: 0.528370563557106


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Passive Aggressive Regressor,7.1495,143.3104,11.9712,-0.3294,1.3398,2.4277,0.3626


Model: PassiveAggressiveRegressor(random_state=1), Pearson Correlation: 0.2069168707638107


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Linear Regression,8.0102,121.7924,11.036,-0.1298,1.2302,8.2973,0.1758


Model: LinearRegression(n_jobs=-1), Pearson Correlation: 0.030535247252223164


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Huber Regressor,7.9448,144.5561,12.0231,-0.3409,1.352,3.7241,0.0165


Model: HuberRegressor(), Pearson Correlation: -0.11884463803987726


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,AdaBoost Regressor,5.9309,106.4564,10.3178,0.0125,0.994,5.1086,0.5207


Model: AdaBoostRegressor(random_state=1), Pearson Correlation: 0.42462796818358783


# Exploring more features

In [5]:
import numpy as np
import pandas as pd
from pycaret.regression import *
from scipy.stats import spearmanr

exp_feats = ['day_0_specimen_id', 'subject_id', 'specimen_type', 'infancy_vac',
       'biological_sex', 'ethnicity', 'race', 'dataset', 'age', 'day_0_IgG_PT',
       'day_0_IgG_PRN', 'day_0_IgG_FHA', 'day_0_IgG1_PT', 'day_0_IgG1_PRN',
       'day_0_IgG1_FHA', 'day_0_IgG1_FIM2/3', 'day_0_IgG1_TT', 'day_0_IgG1_DT',
       'day_0_IgG1_OVA', 'day_0_IgG2_PT', 'day_0_IgG2_PRN', 'day_0_IgG2_FHA',
       'day_0_IgG2_FIM2/3', 'day_0_IgG2_TT', 'day_0_IgG2_DT', 'day_0_IgG2_OVA',
       'day_0_IgG3_PT', 'day_0_IgG3_PRN', 'day_0_IgG3_FHA',
       'day_0_IgG3_FIM2/3', 'day_0_IgG3_TT', 'day_0_IgG3_DT', 'day_0_IgG3_OVA',
       'day_0_IgG4_PT', 'day_0_IgG4_PRN', 'day_0_IgG4_FHA',
       'day_0_IgG4_FIM2/3', 'day_0_IgG4_TT', 'day_0_IgG4_DT', 'day_0_IgG4_OVA']

In [28]:
# Load data
df = pd.read_csv("/home/vmottaqi/cmipb_challenge/vincent_files/training_t_cell_processed_task4.1_nov5.csv", index_col=0)
df = df[df['FC_day_30_Th1/Th2'] >= 0]

# Define features and target
X = df[sel_feat]
y1 = df['day_30_Th1_PT']
y2 = df['day_30_Th2_PT']

# Combine features and separate targets for PyCaret input
data_Th1 = X.copy()
data_Th1['day_30_Th1_PT'] = y1

data_Th2 = X.copy()
data_Th2['day_30_Th2_PT'] = y2




In [29]:
# Custom Spearman correlation function
def spearman_metric(y_true, y_pred):
    return spearmanr(y_true, y_pred)[0]  # Return the Spearman correlation coefficient

# Initialize PyCaret setup for predicting 'day_30_Th1_PT'
setup_Th1 = setup(
    data=data_Th1,
    target='day_30_Th1_PT',
    categorical_features=cat_feat,
    session_id=1,
    fold=5
)
add_metric(
    'Spearman', 
    'Spearman Correlation', 
    spearman_metric, 
    greater_is_better=True
)

#Compare models using a supported metric (e.g., R2) for both targets
top_models_Th1 = compare_models(sort='R2', n_select=15)


# Initialize PyCaret setup for predicting 'day_30_Th2_PT'
setup_Th2 = setup(
    data=data_Th2,
    target='day_30_Th2_PT',
    categorical_features=cat_feat,
    session_id=2,
    fold=5
)


# Add the custom Spearman metric
add_metric(
    'Spearman', 
    'Spearman Correlation', 
    spearman_metric, 
    greater_is_better=True
)

top_models_Th2 = compare_models(sort='R2', n_select=15)

Unnamed: 0,Description,Value
0,Session id,1
1,Target,day_30_Th1_PT
2,Target type,Regression
3,Original data shape,"(41, 10)"
4,Transformed data shape,"(41, 10)"
5,Transformed train set shape,"(28, 10)"
6,Transformed test set shape,"(13, 10)"
7,Numeric features,7
8,Categorical features,2
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation,TT (Sec)
rf,Random Forest Regressor,1.0905,1.7102,1.2791,-0.415,0.5983,12.3483,0.0,0.048
omp,Orthogonal Matching Pursuit,1.2294,1.994,1.354,-0.4213,0.5744,17.1393,0.0,0.016
gbr,Gradient Boosting Regressor,1.0558,1.8937,1.3323,-0.6124,0.5985,8.0375,0.0,0.02
et,Extra Trees Regressor,1.1389,1.8469,1.3222,-0.9015,0.6303,13.6733,0.0,0.04
lasso,Lasso Regression,1.3586,2.2397,1.4863,-0.9426,0.6428,19.8801,0.0,0.016
llar,Lasso Least Angle Regression,1.3586,2.2397,1.4863,-0.9426,0.6428,19.8785,0.0,0.016
dummy,Dummy Regressor,1.3332,2.1593,1.463,-0.9841,0.6299,18.3396,0.0,0.016
lightgbm,Light Gradient Boosting Machine,1.3332,2.1593,1.463,-0.9841,0.6299,18.3396,0.0,0.126
knn,K Neighbors Regressor,1.3321,2.44,1.4987,-0.9886,0.6729,18.1804,0.0,0.022
ada,AdaBoost Regressor,1.1949,2.2012,1.4591,-1.0083,0.6365,9.8161,0.0,0.032


Unnamed: 0,Description,Value
0,Session id,2
1,Target,day_30_Th2_PT
2,Target type,Regression
3,Original data shape,"(41, 10)"
4,Transformed data shape,"(41, 10)"
5,Transformed train set shape,"(28, 10)"
6,Transformed test set shape,"(13, 10)"
7,Numeric features,7
8,Categorical features,2
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation,TT (Sec)
knn,K Neighbors Regressor,1.0709,3.485,1.5084,-0.3855,0.4726,1.9474,0.0,0.022
ada,AdaBoost Regressor,0.8665,1.9431,1.1934,-0.4283,0.427,2.1533,0.0,0.022
rf,Random Forest Regressor,0.9522,2.6725,1.2985,-0.6548,0.463,2.874,0.0,0.048
dt,Decision Tree Regressor,1.3352,4.885,1.8225,-1.0703,0.6063,2.9804,0.0,0.012
gbr,Gradient Boosting Regressor,1.1335,3.4456,1.5803,-1.4431,0.5406,2.9466,0.0,0.02
et,Extra Trees Regressor,1.1003,2.5997,1.4083,-1.7866,0.5112,3.5308,0.0,0.04
omp,Orthogonal Matching Pursuit,1.292,3.8092,1.6956,-1.9642,0.5756,3.6308,0.0,0.016
llar,Lasso Least Angle Regression,1.2445,4.0534,1.6853,-2.0855,0.5515,3.026,0.0,0.016
lasso,Lasso Regression,1.2445,4.0534,1.6853,-2.0855,0.5515,3.026,0.0,0.016
en,Elastic Net,1.2079,3.1837,1.5834,-3.1104,0.5282,3.2518,0.0,0.012


In [39]:
data_Th1

Unnamed: 0,day_0_Th2_PHA,day_0_IL17A_PT,biological_sex,day_0_Th2_PT,day_0_IL17A_PHA,day_0_Th1_PHA,day_0_Th1_PT,age,infancy_vac,day_30_Th1_PT
0,0.038217,0.002116,Female,0.127659,0.025064,0.06549,0.006795,32,wP,1.762176
1,1.872613,0.392593,Female,2.489356,0.943766,0.511916,0.342016,25,wP,0.041901
2,2.13376,0.104762,Female,0.127659,0.804949,1.663089,1.255946,42,wP,2.134767
3,2.509555,1.423633,Female,5.063817,1.728149,1.64235,2.908268,47,wP,0.862967
4,2.01911,0.323457,Male,2.595741,0.418059,1.0,2.246886,47,wP,0.961495
5,0.503184,1.0,Male,0.319148,1.456298,0.26087,0.147225,21,aP,0.086069
6,0.038217,0.129806,Female,0.127659,0.286311,0.275787,0.259343,21,aP,3.291055
7,5.076437,0.057143,Female,0.127659,0.220437,1.566309,0.182333,28,wP,1.830126
8,0.700638,0.216226,Female,0.127659,0.480398,0.785155,1.920725,24,wP,0.117781
9,1.019108,0.21552,Female,0.127659,0.61054,1.13189,0.520951,24,wP,1.656853


In [40]:
def calculate_predictions(model, pred_column='Label'):
    """
    Calculate predictions for a model and return the predictions column.
    """
    predictions = predict_model(model)
    predictions = predictions.reset_index(drop=True)
    
    # Identify the prediction column
    pred_col = pred_column if pred_column in predictions.columns else predictions.select_dtypes(include=np.number).columns[-1]
    print(f"Predictions column for model {model}: {pred_col}")  # Debugging statement
    return predictions[[pred_col]].rename(columns={pred_col: f'Prediction_{model}'})

def merge_predictions(pred_Th1, pred_Th2):
    """
    Merge the predictions for two models and calculate the derived fold change (FC).
    """
    merged_predictions = pd.merge(
        pred_Th1, 
        pred_Th2, 
        left_index=True, 
        right_index=True
    )
    derived_FC = merged_predictions.iloc[:, 0] / merged_predictions.iloc[:, 1]
    return derived_FC

def calculate_spearman_correlation(model_Th1, model_Th2, original_target):
    """
    Calculate the Spearman correlation for two models.
    """
    predictions_Th1 = calculate_predictions(model_Th1)
    predictions_Th2 = calculate_predictions(model_Th2)
    
    # Calculate the derived fold change for Th1 and Th2 predictions
    derived_FC = merge_predictions(predictions_Th1, predictions_Th2)
    
    # Calculate Spearman correlation with the original target
    spearman_corr = spearman_metric(original_target, derived_FC)
    return model_Th1, model_Th2, spearman_corr

# Main code block
spearman_results = []
original_target = df[['FC_day_30_Th1/Th2']]

for model_Th1, model_Th2 in zip(top_models_Th1, top_models_Th2):
    result = calculate_spearman_correlation(model_Th1, model_Th2, original_target)
    spearman_results.append(result)

# Sort and display models by Spearman correlation
sorted_spearman_results = sorted(spearman_results, key=lambda x: x[2], reverse=True)
for model_Th1, model_Th2, spearman_corr in sorted_spearman_results:
    print(f"Model Th1: {model_Th1}, Model Th2: {model_Th2}, Spearman Correlation: {spearman_corr}")


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Random Forest Regressor,3.0009,35.3816,5.9482,-0.1115,1.0788,8.5902,0.0675


Predictions column for model RandomForestRegressor(n_jobs=-1, random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,K Neighbors Regressor,2.6516,30.3164,5.506,0.0476,0.8929,4.7857,0.2997


Predictions column for model KNeighborsRegressor(n_jobs=-1): prediction_label


ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 41 and the array at index 1 has size 13

In [44]:
def calculate_spearman_correlation(model_Th1, model_Th2, original_target):
    """
    Calculate the Spearman correlation for two models.
    """
    predictions_Th1 = calculate_predictions(model_Th1)
    predictions_Th2 = calculate_predictions(model_Th2)
    
    # Calculate the derived fold change for Th1 and Th2 predictions
    derived_FC = merge_predictions(predictions_Th1, predictions_Th2)
    
    # Align derived_FC with original_target
    aligned_data = pd.concat([original_target.reset_index(drop=True), derived_FC.reset_index(drop=True)], axis=1)
    aligned_data.dropna(inplace=True)  # Drop any rows with NaN values if predictions or targets don't align
    target_aligned, derived_FC_aligned = aligned_data.iloc[:, 0], aligned_data.iloc[:, 1]

    # Debugging output: print shapes to verify alignment
    print(f"Original target shape: {target_aligned.shape}, Derived FC shape: {derived_FC_aligned.shape}")
    
    # Calculate Spearman correlation
    spearman_corr = spearman_metric(target_aligned, derived_FC_aligned)
    return model_Th1, model_Th2, spearman_corr

# Main code block
spearman_results = []
original_target = df[['FC_day_30_Th1/Th2']]

for model_Th1, model_Th2 in zip(top_models_Th1, top_models_Th2):
    result = calculate_spearman_correlation(model_Th1, model_Th2, original_target)
    spearman_results.append(result)

# Sort and display models by Spearman correlation
sorted_spearman_results = sorted(spearman_results, key=lambda x: x[2], reverse=True)
for model_Th1, model_Th2, spearman_corr in sorted_spearman_results:
    print(f"Model Th1: {model_Th1}, Model Th2: {model_Th2}, Spearman Correlation: {spearman_corr}")


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Random Forest Regressor,3.0009,35.3816,5.9482,-0.1115,1.0788,8.5902,0.0675


Predictions column for model RandomForestRegressor(n_jobs=-1, random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,K Neighbors Regressor,2.6516,30.3164,5.506,0.0476,0.8929,4.7857,0.2997


Predictions column for model KNeighborsRegressor(n_jobs=-1): prediction_label
Original target shape: (13,), Derived FC shape: (13,)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Orthogonal Matching Pursuit,3.1775,32.7977,5.7269,-0.0303,1.013,10.1715,0.0458


Predictions column for model OrthogonalMatchingPursuit(): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,AdaBoost Regressor,2.5973,24.4577,4.9455,0.2317,0.8417,6.7666,0.0692


Predictions column for model AdaBoostRegressor(random_state=2): prediction_label
Original target shape: (13,), Derived FC shape: (13,)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Gradient Boosting Regressor,3.1608,36.7896,6.0654,-0.1558,1.1784,9.3884,-0.0352


Predictions column for model GradientBoostingRegressor(random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Random Forest Regressor,2.6454,24.0143,4.9004,0.2456,0.8158,7.8242,0.27


Predictions column for model RandomForestRegressor(n_jobs=-1, random_state=2): prediction_label
Original target shape: (13,), Derived FC shape: (13,)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Extra Trees Regressor,2.9597,36.0958,6.008,-0.134,1.1211,7.4801,0.1115


Predictions column for model ExtraTreesRegressor(n_jobs=-1, random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Decision Tree Regressor,2.985,27.3063,5.2255,0.1422,0.9845,10.0778,0.0928


Predictions column for model DecisionTreeRegressor(random_state=2): prediction_label
Original target shape: (13,), Derived FC shape: (13,)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Lasso Regression,3.0857,32.7598,5.7236,-0.0292,0.9892,9.6214,-0.2172


Predictions column for model Lasso(random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Gradient Boosting Regressor,2.7364,21.7204,4.6605,0.3176,0.9104,9.4035,0.2377


Predictions column for model GradientBoostingRegressor(random_state=2): prediction_label
Original target shape: (13,), Derived FC shape: (13,)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Lasso Least Angle Regression,3.0857,32.7598,5.7236,-0.0292,0.9892,9.6214,-0.2172


Predictions column for model LassoLars(random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Extra Trees Regressor,2.452,24.5548,4.9553,0.2286,0.7503,6.0588,0.3052


Predictions column for model ExtraTreesRegressor(n_jobs=-1, random_state=2): prediction_label
Original target shape: (13,), Derived FC shape: (13,)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Dummy Regressor,3.1332,32.6777,5.7164,-0.0266,1.0025,10.062,


Predictions column for model DummyRegressor(): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Orthogonal Matching Pursuit,3.2518,34.3643,5.8621,-0.0796,1.0764,9.6987,0.0458


Predictions column for model OrthogonalMatchingPursuit(): prediction_label
Original target shape: (13,), Derived FC shape: (13,)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Light Gradient Boosting Machine,3.1332,32.6777,5.7164,-0.0266,1.0025,10.062,


Predictions column for model LGBMRegressor(n_jobs=-1, random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Lasso Least Angle Regression,2.4727,14.4349,3.7993,0.5465,0.8786,8.372,0.0939


Predictions column for model LassoLars(random_state=2): prediction_label
Original target shape: (13,), Derived FC shape: (13,)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,K Neighbors Regressor,2.7042,28.6096,5.3488,0.1012,0.8295,6.9795,0.6386


Predictions column for model KNeighborsRegressor(n_jobs=-1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Lasso Regression,2.4727,14.4348,3.7993,0.5465,0.8786,8.372,0.0939


Predictions column for model Lasso(random_state=2): prediction_label
Original target shape: (13,), Derived FC shape: (13,)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,AdaBoost Regressor,3.0207,35.4451,5.9536,-0.1135,1.0905,8.3825,0.0059


Predictions column for model AdaBoostRegressor(random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Elastic Net,2.2456,10.4243,3.2287,0.6725,0.8706,8.226,0.0939


Predictions column for model ElasticNet(random_state=2): prediction_label
Original target shape: (13,), Derived FC shape: (13,)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Elastic Net,3.0029,33.0586,5.7497,-0.0385,0.9772,8.9067,-0.1849


Predictions column for model ElasticNet(random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Passive Aggressive Regressor,2.2102,9.7371,3.1204,0.6941,0.7797,9.0094,0.1233


Predictions column for model PassiveAggressiveRegressor(random_state=2): prediction_label
Original target shape: (13,), Derived FC shape: (13,)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Decision Tree Regressor,3.0753,36.9154,6.0758,-0.1597,1.1608,8.4957,0.0898


Predictions column for model DecisionTreeRegressor(random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Bayesian Ridge,2.0843,9.9657,3.1569,0.6869,0.7405,6.8728,0.1908


Predictions column for model BayesianRidge(): prediction_label
Original target shape: (13,), Derived FC shape: (13,)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Huber Regressor,2.8952,35.3754,5.9477,-0.1113,1.0775,7.5003,-0.1673


Predictions column for model HuberRegressor(): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Light Gradient Boosting Machine,3.0157,33.1268,5.7556,-0.0407,0.9749,8.571,


Predictions column for model LGBMRegressor(n_jobs=-1, random_state=2): prediction_label
Original target shape: (13,), Derived FC shape: (13,)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Ridge Regression,2.8924,35.056,5.9208,-0.1013,1.0463,7.6822,-0.226


Predictions column for model Ridge(random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Dummy Regressor,3.0157,33.1268,5.7556,-0.0407,0.9749,8.571,


Predictions column for model DummyRegressor(): prediction_label
Original target shape: (13,), Derived FC shape: (13,)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Linear Regression,2.8724,35.0366,5.9192,-0.1007,1.0431,7.5739,-0.2553


Predictions column for model LinearRegression(n_jobs=-1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Ridge Regression,1.8958,7.7968,2.7923,0.7551,0.6065,6.0743,0.27


Predictions column for model Ridge(random_state=2): prediction_label
Original target shape: (13,), Derived FC shape: (13,)
Model Th1: DummyRegressor(), Model Th2: OrthogonalMatchingPursuit(), Spearman Correlation: 0.0746905827751051
Model Th1: AdaBoostRegressor(random_state=1), Model Th2: ElasticNet(random_state=2), Spearman Correlation: 0.010989010989010988
Model Th1: Ridge(random_state=1), Model Th2: DummyRegressor(), Spearman Correlation: -0.1043956043956044
Model Th1: KNeighborsRegressor(n_jobs=-1), Model Th2: Lasso(random_state=2), Spearman Correlation: -0.1758241758241758
Model Th1: LinearRegression(n_jobs=-1), Model Th2: Ridge(random_state=2), Spearman Correlation: -0.23626373626373626
Model Th1: ElasticNet(random_state=1), Model Th2: PassiveAggressiveRegressor(random_state=2), Spearman Correlation: -0.2582417582417582
Model Th1: HuberRegressor(), Model Th2: LGBMRegressor(n_jobs=-1, random_state=2), Spearman Correlation: -0.2747252747252747
Model Th1: LGBMRegressor(n_jobs=-1, ra

In [48]:
def calculate_spearman_correlation(model_Th1, model_Th2, original_target, invert=False):
    """
    Calculate the Spearman correlation for two models with an optional inversion.
    """
    predictions_Th1 = calculate_predictions(model_Th1)
    predictions_Th2 = calculate_predictions(model_Th2)
    
    # Calculate the derived fold change, allowing for an inversion
    derived_FC = merge_predictions(predictions_Th1, predictions_Th2)
    if invert:
        derived_FC = 1 / derived_FC  # Invert the fold change if needed
    
    # Align derived_FC with original_target
    aligned_data = pd.concat([original_target.reset_index(drop=True), derived_FC.reset_index(drop=True)], axis=1)
    aligned_data.dropna(inplace=True)
    target_aligned, derived_FC_aligned = aligned_data.iloc[:, 0], aligned_data.iloc[:, 1]
    
    # Calculate Spearman correlation
    spearman_corr = spearman_metric(target_aligned, derived_FC_aligned)
    return spearman_corr

# Main code block
spearman_results = []
original_target = df[['FC_day_30_Th1/Th2']]

for model_Th1, model_Th2 in zip(top_models_Th1, top_models_Th2):
    spearman_corr = calculate_spearman_correlation(model_Th1, model_Th2, original_target, invert=False)
    
    # If the correlation is negative, try the inversion
    if spearman_corr < 0:
        spearman_corr = calculate_spearman_correlation(model_Th1, model_Th2, original_target, invert=True)
    
    spearman_results.append((model_Th1, model_Th2, spearman_corr))

# Sort and display positive Spearman correlations
sorted_spearman_results = sorted(spearman_results, key=lambda x: x[2], reverse=True)
for model_Th1, model_Th2, spearman_corr in sorted_spearman_results:
    print(f"Spearman Correlation between Original Target and Derived FC for {model_Th1} and {model_Th2}: {spearman_corr}")


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Random Forest Regressor,3.0009,35.3816,5.9482,-0.1115,1.0788,8.5902,0.0675


Predictions column for model RandomForestRegressor(n_jobs=-1, random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,K Neighbors Regressor,2.6516,30.3164,5.506,0.0476,0.8929,4.7857,0.2997


Predictions column for model KNeighborsRegressor(n_jobs=-1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Random Forest Regressor,3.0009,35.3816,5.9482,-0.1115,1.0788,8.5902,0.0675


Predictions column for model RandomForestRegressor(n_jobs=-1, random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,K Neighbors Regressor,2.6516,30.3164,5.506,0.0476,0.8929,4.7857,0.2997


Predictions column for model KNeighborsRegressor(n_jobs=-1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Orthogonal Matching Pursuit,3.1775,32.7977,5.7269,-0.0303,1.013,10.1715,0.0458


Predictions column for model OrthogonalMatchingPursuit(): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,AdaBoost Regressor,2.5973,24.4577,4.9455,0.2317,0.8417,6.7666,0.0692


Predictions column for model AdaBoostRegressor(random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Orthogonal Matching Pursuit,3.1775,32.7977,5.7269,-0.0303,1.013,10.1715,0.0458


Predictions column for model OrthogonalMatchingPursuit(): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,AdaBoost Regressor,2.5973,24.4577,4.9455,0.2317,0.8417,6.7666,0.0692


Predictions column for model AdaBoostRegressor(random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Gradient Boosting Regressor,3.1608,36.7896,6.0654,-0.1558,1.1784,9.3884,-0.0352


Predictions column for model GradientBoostingRegressor(random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Random Forest Regressor,2.6454,24.0143,4.9004,0.2456,0.8158,7.8242,0.27


Predictions column for model RandomForestRegressor(n_jobs=-1, random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Gradient Boosting Regressor,3.1608,36.7896,6.0654,-0.1558,1.1784,9.3884,-0.0352


Predictions column for model GradientBoostingRegressor(random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Random Forest Regressor,2.6454,24.0143,4.9004,0.2456,0.8158,7.8242,0.27


Predictions column for model RandomForestRegressor(n_jobs=-1, random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Extra Trees Regressor,2.9597,36.0958,6.008,-0.134,1.1211,7.4801,0.1115


Predictions column for model ExtraTreesRegressor(n_jobs=-1, random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Decision Tree Regressor,2.985,27.3063,5.2255,0.1422,0.9845,10.0778,0.0928


Predictions column for model DecisionTreeRegressor(random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Extra Trees Regressor,2.9597,36.0958,6.008,-0.134,1.1211,7.4801,0.1115


Predictions column for model ExtraTreesRegressor(n_jobs=-1, random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Decision Tree Regressor,2.985,27.3063,5.2255,0.1422,0.9845,10.0778,0.0928


Predictions column for model DecisionTreeRegressor(random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Lasso Regression,3.0857,32.7598,5.7236,-0.0292,0.9892,9.6214,-0.2172


Predictions column for model Lasso(random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Gradient Boosting Regressor,2.7364,21.7204,4.6605,0.3176,0.9104,9.4035,0.2377


Predictions column for model GradientBoostingRegressor(random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Lasso Regression,3.0857,32.7598,5.7236,-0.0292,0.9892,9.6214,-0.2172


Predictions column for model Lasso(random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Gradient Boosting Regressor,2.7364,21.7204,4.6605,0.3176,0.9104,9.4035,0.2377


Predictions column for model GradientBoostingRegressor(random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Lasso Least Angle Regression,3.0857,32.7598,5.7236,-0.0292,0.9892,9.6214,-0.2172


Predictions column for model LassoLars(random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Extra Trees Regressor,2.452,24.5548,4.9553,0.2286,0.7503,6.0588,0.3052


Predictions column for model ExtraTreesRegressor(n_jobs=-1, random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Lasso Least Angle Regression,3.0857,32.7598,5.7236,-0.0292,0.9892,9.6214,-0.2172


Predictions column for model LassoLars(random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Extra Trees Regressor,2.452,24.5548,4.9553,0.2286,0.7503,6.0588,0.3052


Predictions column for model ExtraTreesRegressor(n_jobs=-1, random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Dummy Regressor,3.1332,32.6777,5.7164,-0.0266,1.0025,10.062,


Predictions column for model DummyRegressor(): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Orthogonal Matching Pursuit,3.2518,34.3643,5.8621,-0.0796,1.0764,9.6987,0.0458


Predictions column for model OrthogonalMatchingPursuit(): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Light Gradient Boosting Machine,3.1332,32.6777,5.7164,-0.0266,1.0025,10.062,


Predictions column for model LGBMRegressor(n_jobs=-1, random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Lasso Least Angle Regression,2.4727,14.4349,3.7993,0.5465,0.8786,8.372,0.0939


Predictions column for model LassoLars(random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Light Gradient Boosting Machine,3.1332,32.6777,5.7164,-0.0266,1.0025,10.062,


Predictions column for model LGBMRegressor(n_jobs=-1, random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Lasso Least Angle Regression,2.4727,14.4349,3.7993,0.5465,0.8786,8.372,0.0939


Predictions column for model LassoLars(random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,K Neighbors Regressor,2.7042,28.6096,5.3488,0.1012,0.8295,6.9795,0.6386


Predictions column for model KNeighborsRegressor(n_jobs=-1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Lasso Regression,2.4727,14.4348,3.7993,0.5465,0.8786,8.372,0.0939


Predictions column for model Lasso(random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,K Neighbors Regressor,2.7042,28.6096,5.3488,0.1012,0.8295,6.9795,0.6386


Predictions column for model KNeighborsRegressor(n_jobs=-1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Lasso Regression,2.4727,14.4348,3.7993,0.5465,0.8786,8.372,0.0939


Predictions column for model Lasso(random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,AdaBoost Regressor,3.0207,35.4451,5.9536,-0.1135,1.0905,8.3825,0.0059


Predictions column for model AdaBoostRegressor(random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Elastic Net,2.2456,10.4243,3.2287,0.6725,0.8706,8.226,0.0939


Predictions column for model ElasticNet(random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Elastic Net,3.0029,33.0586,5.7497,-0.0385,0.9772,8.9067,-0.1849


Predictions column for model ElasticNet(random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Passive Aggressive Regressor,2.2102,9.7371,3.1204,0.6941,0.7797,9.0094,0.1233


Predictions column for model PassiveAggressiveRegressor(random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Elastic Net,3.0029,33.0586,5.7497,-0.0385,0.9772,8.9067,-0.1849


Predictions column for model ElasticNet(random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Passive Aggressive Regressor,2.2102,9.7371,3.1204,0.6941,0.7797,9.0094,0.1233


Predictions column for model PassiveAggressiveRegressor(random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Decision Tree Regressor,3.0753,36.9154,6.0758,-0.1597,1.1608,8.4957,0.0898


Predictions column for model DecisionTreeRegressor(random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Bayesian Ridge,2.0843,9.9657,3.1569,0.6869,0.7405,6.8728,0.1908


Predictions column for model BayesianRidge(): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Decision Tree Regressor,3.0753,36.9154,6.0758,-0.1597,1.1608,8.4957,0.0898


Predictions column for model DecisionTreeRegressor(random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Bayesian Ridge,2.0843,9.9657,3.1569,0.6869,0.7405,6.8728,0.1908


Predictions column for model BayesianRidge(): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Huber Regressor,2.8952,35.3754,5.9477,-0.1113,1.0775,7.5003,-0.1673


Predictions column for model HuberRegressor(): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Light Gradient Boosting Machine,3.0157,33.1268,5.7556,-0.0407,0.9749,8.571,


Predictions column for model LGBMRegressor(n_jobs=-1, random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Huber Regressor,2.8952,35.3754,5.9477,-0.1113,1.0775,7.5003,-0.1673


Predictions column for model HuberRegressor(): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Light Gradient Boosting Machine,3.0157,33.1268,5.7556,-0.0407,0.9749,8.571,


Predictions column for model LGBMRegressor(n_jobs=-1, random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Ridge Regression,2.8924,35.056,5.9208,-0.1013,1.0463,7.6822,-0.226


Predictions column for model Ridge(random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Dummy Regressor,3.0157,33.1268,5.7556,-0.0407,0.9749,8.571,


Predictions column for model DummyRegressor(): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Ridge Regression,2.8924,35.056,5.9208,-0.1013,1.0463,7.6822,-0.226


Predictions column for model Ridge(random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Dummy Regressor,3.0157,33.1268,5.7556,-0.0407,0.9749,8.571,


Predictions column for model DummyRegressor(): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Linear Regression,2.8724,35.0366,5.9192,-0.1007,1.0431,7.5739,-0.2553


Predictions column for model LinearRegression(n_jobs=-1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Ridge Regression,1.8958,7.7968,2.7923,0.7551,0.6065,6.0743,0.27


Predictions column for model Ridge(random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Linear Regression,2.8724,35.0366,5.9192,-0.1007,1.0431,7.5739,-0.2553


Predictions column for model LinearRegression(n_jobs=-1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Ridge Regression,1.8958,7.7968,2.7923,0.7551,0.6065,6.0743,0.27


Predictions column for model Ridge(random_state=2): prediction_label
Spearman Correlation between Original Target and Derived FC for LassoLars(random_state=1) and ExtraTreesRegressor(n_jobs=-1, random_state=2): 0.5879120879120878
Spearman Correlation between Original Target and Derived FC for OrthogonalMatchingPursuit() and AdaBoostRegressor(random_state=2): 0.47802197802197804
Spearman Correlation between Original Target and Derived FC for ExtraTreesRegressor(n_jobs=-1, random_state=1) and DecisionTreeRegressor(random_state=2): 0.46153846153846156
Spearman Correlation between Original Target and Derived FC for GradientBoostingRegressor(random_state=1) and RandomForestRegressor(n_jobs=-1, random_state=2): 0.4340659340659341
Spearman Correlation between Original Target and Derived FC for DecisionTreeRegressor(random_state=1) and BayesianRidge(): 0.4010989010989011
Spearman Correlation between Original Target and Derived FC for RandomForestRegressor(n_jobs=-1, random_state=1) and KNeighb

In [50]:
sorted_spearman_results[0]

(LassoLars(random_state=1),
 ExtraTreesRegressor(n_jobs=-1, random_state=2),
 0.5879120879120878)

In [52]:
# save sorted_spearman_results[0][0]
save_model(sorted_spearman_results[0][0], 'Th1_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['day_0_Th2_PHA', 'day_0_IL17A_PT',
                                              'day_0_Th2_PT', 'day_0_IL17A_PHA',
                                              'day_0_Th1_PHA', 'day_0_Th1_PT',
                                              'age'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=['infancy_vac', 'biological_sex'],
                                     transformer=SimpleImputer(strategy='mos...
                  TransformerWrapper(include=['infancy_vac', 'biological_sex'],
                                     transformer=OrdinalEncoder(cols=['infancy_vac',
                                                                      'biological_sex'],
                                                                handle_missing='return_nan',
          

In [53]:
# save sorted_spearman_results[0][1]
save_model(sorted_spearman_results[0][1], 'Th2_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['day_0_Th2_PHA', 'day_0_IL17A_PT',
                                              'day_0_Th2_PT', 'day_0_IL17A_PHA',
                                              'day_0_Th1_PHA', 'day_0_Th1_PT',
                                              'age'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=['infancy_vac', 'biological_sex'],
                                     transformer=SimpleImputer(strategy='mos...
                  TransformerWrapper(include=['infancy_vac', 'biological_sex'],
                                     transformer=OrdinalEncoder(cols=['infancy_vac',
                                                                      'biological_sex'],
                                                                handle_missing='return_nan',
          

In [47]:
def calculate_spearman_correlation(model_Th1, model_Th2, original_target):
    """
    Calculate the Spearman correlation for two models.
    """
    predictions_Th1 = calculate_predictions(model_Th1)
    predictions_Th2 = calculate_predictions(model_Th2)
    
    # Calculate the derived fold change for Th1 and Th2 predictions
    derived_FC = merge_predictions(predictions_Th1, predictions_Th2)
    
    # Align derived_FC with original_target
    aligned_data = pd.concat([original_target.reset_index(drop=True), derived_FC.reset_index(drop=True)], axis=1)
    aligned_data.dropna(inplace=True)  # Drop any rows with NaN values if predictions or targets don't align
    target_aligned, derived_FC_aligned = aligned_data.iloc[:, 0], aligned_data.iloc[:, 1]
    
    # Calculate Spearman correlation
    spearman_corr = spearman_metric(target_aligned, derived_FC_aligned)
    return spearman_corr

# Main code block
spearman_results = []
original_target = df[['FC_day_30_Th1/Th2']]

for model_Th1, model_Th2 in zip(top_models_Th1, top_models_Th2):
    spearman_corr = calculate_spearman_correlation(model_Th1, model_Th2, original_target)
    spearman_results.append((model_Th1, model_Th2, spearman_corr))

# Sort and display Spearman correlations only
sorted_spearman_results = sorted(spearman_results, key=lambda x: x[2], reverse=True)
for model_Th1, model_Th2, spearman_corr in sorted_spearman_results:
    print(f"Spearman Correlation between Original Target and Derived FC for {model_Th1} and {model_Th2}: {spearman_corr}")


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Random Forest Regressor,3.0009,35.3816,5.9482,-0.1115,1.0788,8.5902,0.0675


Predictions column for model RandomForestRegressor(n_jobs=-1, random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,K Neighbors Regressor,2.6516,30.3164,5.506,0.0476,0.8929,4.7857,0.2997


Predictions column for model KNeighborsRegressor(n_jobs=-1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Orthogonal Matching Pursuit,3.1775,32.7977,5.7269,-0.0303,1.013,10.1715,0.0458


Predictions column for model OrthogonalMatchingPursuit(): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,AdaBoost Regressor,2.5973,24.4577,4.9455,0.2317,0.8417,6.7666,0.0692


Predictions column for model AdaBoostRegressor(random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Gradient Boosting Regressor,3.1608,36.7896,6.0654,-0.1558,1.1784,9.3884,-0.0352


Predictions column for model GradientBoostingRegressor(random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Random Forest Regressor,2.6454,24.0143,4.9004,0.2456,0.8158,7.8242,0.27


Predictions column for model RandomForestRegressor(n_jobs=-1, random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Extra Trees Regressor,2.9597,36.0958,6.008,-0.134,1.1211,7.4801,0.1115


Predictions column for model ExtraTreesRegressor(n_jobs=-1, random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Decision Tree Regressor,2.985,27.3063,5.2255,0.1422,0.9845,10.0778,0.0928


Predictions column for model DecisionTreeRegressor(random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Lasso Regression,3.0857,32.7598,5.7236,-0.0292,0.9892,9.6214,-0.2172


Predictions column for model Lasso(random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Gradient Boosting Regressor,2.7364,21.7204,4.6605,0.3176,0.9104,9.4035,0.2377


Predictions column for model GradientBoostingRegressor(random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Lasso Least Angle Regression,3.0857,32.7598,5.7236,-0.0292,0.9892,9.6214,-0.2172


Predictions column for model LassoLars(random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Extra Trees Regressor,2.452,24.5548,4.9553,0.2286,0.7503,6.0588,0.3052


Predictions column for model ExtraTreesRegressor(n_jobs=-1, random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Dummy Regressor,3.1332,32.6777,5.7164,-0.0266,1.0025,10.062,


Predictions column for model DummyRegressor(): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Orthogonal Matching Pursuit,3.2518,34.3643,5.8621,-0.0796,1.0764,9.6987,0.0458


Predictions column for model OrthogonalMatchingPursuit(): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Light Gradient Boosting Machine,3.1332,32.6777,5.7164,-0.0266,1.0025,10.062,


Predictions column for model LGBMRegressor(n_jobs=-1, random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Lasso Least Angle Regression,2.4727,14.4349,3.7993,0.5465,0.8786,8.372,0.0939


Predictions column for model LassoLars(random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,K Neighbors Regressor,2.7042,28.6096,5.3488,0.1012,0.8295,6.9795,0.6386


Predictions column for model KNeighborsRegressor(n_jobs=-1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Lasso Regression,2.4727,14.4348,3.7993,0.5465,0.8786,8.372,0.0939


Predictions column for model Lasso(random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,AdaBoost Regressor,3.0207,35.4451,5.9536,-0.1135,1.0905,8.3825,0.0059


Predictions column for model AdaBoostRegressor(random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Elastic Net,2.2456,10.4243,3.2287,0.6725,0.8706,8.226,0.0939


Predictions column for model ElasticNet(random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Elastic Net,3.0029,33.0586,5.7497,-0.0385,0.9772,8.9067,-0.1849


Predictions column for model ElasticNet(random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Passive Aggressive Regressor,2.2102,9.7371,3.1204,0.6941,0.7797,9.0094,0.1233


Predictions column for model PassiveAggressiveRegressor(random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Decision Tree Regressor,3.0753,36.9154,6.0758,-0.1597,1.1608,8.4957,0.0898


Predictions column for model DecisionTreeRegressor(random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Bayesian Ridge,2.0843,9.9657,3.1569,0.6869,0.7405,6.8728,0.1908


Predictions column for model BayesianRidge(): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Huber Regressor,2.8952,35.3754,5.9477,-0.1113,1.0775,7.5003,-0.1673


Predictions column for model HuberRegressor(): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Light Gradient Boosting Machine,3.0157,33.1268,5.7556,-0.0407,0.9749,8.571,


Predictions column for model LGBMRegressor(n_jobs=-1, random_state=2): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Ridge Regression,2.8924,35.056,5.9208,-0.1013,1.0463,7.6822,-0.226


Predictions column for model Ridge(random_state=1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Dummy Regressor,3.0157,33.1268,5.7556,-0.0407,0.9749,8.571,


Predictions column for model DummyRegressor(): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Linear Regression,2.8724,35.0366,5.9192,-0.1007,1.0431,7.5739,-0.2553


Predictions column for model LinearRegression(n_jobs=-1): prediction_label


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Ridge Regression,1.8958,7.7968,2.7923,0.7551,0.6065,6.0743,0.27


Predictions column for model Ridge(random_state=2): prediction_label
Spearman Correlation between Original Target and Derived FC for DummyRegressor() and OrthogonalMatchingPursuit(): 0.0746905827751051
Spearman Correlation between Original Target and Derived FC for AdaBoostRegressor(random_state=1) and ElasticNet(random_state=2): 0.010989010989010988
Spearman Correlation between Original Target and Derived FC for Ridge(random_state=1) and DummyRegressor(): -0.1043956043956044
Spearman Correlation between Original Target and Derived FC for KNeighborsRegressor(n_jobs=-1) and Lasso(random_state=2): -0.1758241758241758
Spearman Correlation between Original Target and Derived FC for LinearRegression(n_jobs=-1) and Ridge(random_state=2): -0.23626373626373626
Spearman Correlation between Original Target and Derived FC for ElasticNet(random_state=1) and PassiveAggressiveRegressor(random_state=2): -0.2582417582417582
Spearman Correlation between Original Target and Derived FC for HuberRegressor

In [31]:

# Calculate Spearman correlation for each pair of top models
spearman_results = []
for model_Th1, model_Th2 in zip(top_models_Th1, top_models_Th2):
    predictions_Th1 = predict_model(model_Th1)  # Predictions for 'day_30_Th1_PT'
    predictions_Th2 = predict_model(model_Th2)  # Predictions for 'day_30_Th2_PT'
    
    # Reset index and ensure subject_id is included for alignment
    predictions_Th1 = predictions_Th1.reset_index(drop=True)
    predictions_Th2 = predictions_Th2.reset_index(drop=True)
    
    # Identify the prediction column
    pred_Th1 = 'Label' if 'Label' in predictions_Th1.columns else predictions_Th1.select_dtypes(include=np.number).columns[-1]
    pred_Th2 = 'Label' if 'Label' in predictions_Th2.columns else predictions_Th2.select_dtypes(include=np.number).columns[-1]
    
    # Print statements to verify selected columns for debugging
    print(f"Predictions column for Th1: {pred_Th1}, Predictions column for Th2: {pred_Th2}")
    
    # Merge predictions on index for alignment
    merged_predictions = pd.merge(
        predictions_Th1[[pred_Th1]], 
        predictions_Th2[[pred_Th2]], 
        left_index=True, 
        right_index=True,
        suffixes=('_Th1', '_Th2')
    )
    
    # Calculate the derived 'FC_day_30_Th1/Th2' as the ratio of predictions
    derived_FC = merged_predictions[pred_Th1] / merged_predictions[pred_Th2]
    
    # Align with original target using index
    original_target = df[['FC_day_30_Th1/Th2']]
    
    # Calculate Spearman correlation
    spearman_corr = spearman_metric(original_target, derived_FC)
    spearman_results.append((model_Th1, model_Th2, spearman_corr))

# Sort and display models by Spearman correlation
sorted_spearman_results = sorted(spearman_results, key=lambda x: x[2], reverse=True)
for model_Th1, model_Th2, spearman_corr in sorted_spearman_results:
    print(f"Model Th1: {model_Th1}, Model Th2: {model_Th2}, Spearman Correlation: {spearman_corr}")

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Random Forest Regressor,3.0009,35.3816,5.9482,-0.1115,1.0788,8.5902,0.0675


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,K Neighbors Regressor,2.6516,30.3164,5.506,0.0476,0.8929,4.7857,0.2997


Predictions column for Th1: prediction_label, Predictions column for Th2: prediction_label


KeyError: 'prediction_label'

In [36]:
predictions_Th1

Unnamed: 0,day_0_Th2_PHA,day_0_IL17A_PT,biological_sex,day_0_Th2_PT,day_0_IL17A_PHA,day_0_Th1_PHA,day_0_Th1_PT,age,infancy_vac,day_30_Th2_PT,prediction_label
0,0.051724,2.271345,Female,6.465512,1.0,0.09643,1.452289,20,aP,0.103448,1.257464
1,1.019108,0.21552,Female,0.127659,0.61054,1.13189,0.520951,24,wP,0.127659,1.408748
2,4.25478,1.916049,Male,1.0,2.855399,2.22958,4.074746,26,wP,7.382984,3.349277
3,0.038217,0.002116,Female,0.127659,0.025064,0.06549,0.006795,32,wP,0.127659,1.740664
4,2.13376,0.104762,Female,0.127659,0.804949,1.663089,1.255946,42,wP,0.127659,1.734901
5,0.051724,0.007018,Female,0.103448,0.003155,0.002782,0.00931,27,wP,0.103448,1.908869
6,7.27389,1.59224,Female,32.319077,1.0,0.135347,0.003398,27,wP,20.744637,0.192708
7,7.69827,2.837427,Male,1.965515,1.995268,1.33426,1.775019,25,aP,0.103448,1.200177
8,0.038217,0.360141,Female,0.127659,0.001928,0.668365,0.231031,22,aP,0.127659,1.555578
9,1.637929,0.007018,Male,0.103448,0.353838,0.014372,0.00931,31,wP,0.103448,0.798224


In [37]:
predictions_Th2

Unnamed: 0,day_0_Th2_PHA,day_0_IL17A_PT,biological_sex,day_0_Th2_PT,day_0_IL17A_PHA,day_0_Th1_PHA,day_0_Th1_PT,age,infancy_vac,day_30_Th2_PT,prediction_label
0,0.051724,2.271345,Female,6.465512,1.0,0.09643,1.452289,20,aP,0.103448,0.927058
1,1.019108,0.21552,Female,0.127659,0.61054,1.13189,0.520951,24,wP,0.127659,0.139256
2,4.25478,1.916049,Male,1.0,2.855399,2.22958,4.074746,26,wP,7.382984,0.519005
3,0.038217,0.002116,Female,0.127659,0.025064,0.06549,0.006795,32,wP,0.127659,1.23338
4,2.13376,0.104762,Female,0.127659,0.804949,1.663089,1.255946,42,wP,0.127659,3.230664
5,0.051724,0.007018,Female,0.103448,0.003155,0.002782,0.00931,27,wP,0.103448,0.176228
6,7.27389,1.59224,Female,32.319077,1.0,0.135347,0.003398,27,wP,20.744637,2.643725
7,7.69827,2.837427,Male,1.965515,1.995268,1.33426,1.775019,25,aP,0.103448,0.443143
8,0.038217,0.360141,Female,0.127659,0.001928,0.668365,0.231031,22,aP,0.127659,0.256864
9,1.637929,0.007018,Male,0.103448,0.353838,0.014372,0.00931,31,wP,0.103448,1.23338
