In [1]:
import numpy as np
import pandas as pd
from pycaret.regression import *
from scipy.stats import spearmanr

# Load data
df = pd.read_csv("/home/vmottaqi/cmipb_challenge/vincent_files/training_cellfreq_barchcorr_processed_oct29.csv", index_col=0)


In [3]:
df.columns

Index(['day_0_specimen_id', 'subject_id', 'specimen_type', 'infancy_vac',
       'biological_sex', 'ethnicity', 'race', 'dataset', 'age',
       'day_0_Monocytes', 'day_0_Classical_Monocytes',
       'day_0_Non-Classical_Monocytes', 'day_0_Intermediate_Monocytes',
       'day_0_Bcells', 'day_0_CD3CD19neg', 'day_0_CD3 Tcells',
       'day_0_CD4Tcells', 'day_0_CD8Tcells', 'day_0_TemraCD4',
       'day_0_NaiveCD4', 'day_0_TemCD4', 'day_0_TcmCD4', 'day_0_TemraCD8',
       'day_0_NaiveCD8', 'day_0_TemCD8', 'day_0_TcmCD8', 'day_0_NK',
       'day_0_Basophils', 'day_0_pDC', 'day_0_B cells (CD19+CD3-CD14-CD56-)',
       'day_0_B cells (CD19+CD20+CD3-CD14-CD56-)', 'day_0_Memory B cells',
       'day_0_Naive B cells', 'day_0_Proliferating B cells',
       'day_0_Activated B cells (ABCs)', 'day_0_CD56+CD3+T cells',
       'day_0_CD4+CD8+ T cells', 'day_0_CD4-CD8- T cells',
       'day_0_NK cells (CD3-CD19-CD56+)', 'day_0_CD3-CD19-CD56- cells',
       'day_0_non-pDCs',
       'day_0_CD3-CD19-CD56-

In [4]:
X = df.drop(columns=['day_1_specimen_id', 'day_1_Monocytes', 'FC_day_0_1_Monocytes'])
y = df['day_1_Monocytes']

In [7]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 74 entries, 0 to 73
Data columns (total 48 columns):
 #   Column                                                            Non-Null Count  Dtype  
---  ------                                                            --------------  -----  
 0   day_0_specimen_id                                                 74 non-null     int64  
 1   subject_id                                                        74 non-null     int64  
 2   specimen_type                                                     74 non-null     object 
 3   infancy_vac                                                       74 non-null     object 
 4   biological_sex                                                    74 non-null     object 
 5   ethnicity                                                         74 non-null     object 
 6   race                                                              74 non-null     object 
 7   dataset                                   

In [8]:
cat_feat = ['specimen_type', 'infancy_vac', 'biological_sex', 'ethnicity', 'race', 'dataset']

In [9]:
# Define features and target
# Combine features and target for PyCaret input
data = X.copy()
target = 'day_1_Monocytes'
data[target] = y

# Initialize PyCaret with the dataset and target column
regression_setup = setup(
    data=data,
    target=target,
    categorical_features=cat_feat,  # Specify any categorical features
    session_id=1,
    fold=5
)

# Custom Spearman correlation function
def spearman_metric(y_true, y_pred):
    return spearmanr(y_true, y_pred)[0]  # Return the Spearman correlation coefficient

# Adding the custom Spearman metric
add_metric(
    'Spearman', 
    'Spearman Correlation', 
    spearman_metric, 
    greater_is_better=True
)

# Compare models using a supported metric (e.g., R2)
top_models = compare_models(sort='R2', n_select=15)  # Select top models based on R2, adjust as needed

# Calculate Spearman correlation for each top model
spearman_results = []
for model in top_models:
    predictions = predict_model(model)  # Get predictions
    
    # Identify the correct prediction column
    prediction_column = 'Label' if 'Label' in predictions.columns else predictions.columns[-1]
    
    # Calculate Spearman correlation
    spearman_corr = spearman_metric(predictions[target], predictions[prediction_column])
    spearman_results.append((model, spearman_corr))

# Sort and display models by Spearman correlation
sorted_spearman_results = sorted(spearman_results, key=lambda x: x[1], reverse=True)
for model, spearman_corr in sorted_spearman_results:
    print(f"Model: {model}, Spearman Correlation: {spearman_corr}")


Unnamed: 0,Description,Value
0,Session id,1
1,Target,day_1_Monocytes
2,Target type,Regression
3,Original data shape,"(74, 49)"
4,Transformed data shape,"(74, 55)"
5,Transformed train set shape,"(51, 55)"
6,Transformed test set shape,"(23, 55)"
7,Numeric features,42
8,Categorical features,6
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation,TT (Sec)
et,Extra Trees Regressor,0.3,0.1514,0.363,0.1212,0.1656,0.2954,0.0,0.24
rf,Random Forest Regressor,0.2972,0.1499,0.3632,0.0773,0.1678,0.3022,0.0,0.242
gbr,Gradient Boosting Regressor,0.3052,0.1689,0.383,0.0498,0.1748,0.3069,0.0,0.23
lightgbm,Light Gradient Boosting Machine,0.3028,0.1652,0.3831,-0.0236,0.1716,0.2979,0.0,0.526
ada,AdaBoost Regressor,0.3099,0.1563,0.3785,-0.0383,0.1739,0.3086,0.0,0.226
br,Bayesian Ridge,0.3655,0.2227,0.4412,-0.4221,0.2008,0.3429,0.0,0.218
dummy,Dummy Regressor,0.3792,0.2134,0.4394,-0.4365,0.198,0.3593,0.0,0.216
llar,Lasso Least Angle Regression,0.3961,0.2363,0.4605,-0.5556,0.2068,0.3701,0.0,0.216
lasso,Lasso Regression,0.3961,0.2363,0.4605,-0.5556,0.2068,0.3701,0.0,0.214
en,Elastic Net,0.4004,0.2396,0.4637,-0.5797,0.2083,0.3742,0.0,0.216


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Extra Trees Regressor,0.3065,0.1469,0.3833,0.4359,0.1785,0.3502,0.6443


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Random Forest Regressor,0.3107,0.156,0.3949,0.4013,0.1795,0.3422,0.6423


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Gradient Boosting Regressor,0.3193,0.207,0.455,0.2054,0.2034,0.3586,0.5435


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Light Gradient Boosting Machine,0.3434,0.1797,0.424,0.3099,0.1972,0.3917,0.5761


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,AdaBoost Regressor,0.318,0.1514,0.3891,0.4189,0.1798,0.3516,0.6247


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Bayesian Ridge,0.3142,0.1679,0.4098,0.3554,0.1898,0.3598,0.6087


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Dummy Regressor,0.4213,0.2605,0.5104,-0.0001,0.2326,0.4572,


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Lasso Least Angle Regression,0.429,0.2648,0.5146,-0.0168,0.2331,0.4532,0.1413


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Lasso Regression,0.429,0.2648,0.5146,-0.0168,0.2331,0.4532,0.1413


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Elastic Net,0.4287,0.2645,0.5143,-0.0156,0.2329,0.4523,0.1591


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Orthogonal Matching Pursuit,0.4195,0.2609,0.5107,-0.0014,0.2277,0.4168,0.1838


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Ridge Regression,0.4572,0.4005,0.6328,-0.5374,0.2637,0.5065,0.5534


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Decision Tree Regressor,0.441,0.3216,0.5671,-0.2344,0.2484,0.4432,0.3955


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Huber Regressor,0.3768,0.2496,0.4996,0.0419,0.2235,0.4269,0.5227


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,K Neighbors Regressor,0.4145,0.2737,0.5232,-0.0507,0.2427,0.4844,0.0104


Model: ExtraTreesRegressor(n_jobs=-1, random_state=1), Spearman Correlation: 0.6442687747035574
Model: RandomForestRegressor(n_jobs=-1, random_state=1), Spearman Correlation: 0.642292490118577
Model: AdaBoostRegressor(random_state=1), Spearman Correlation: 0.6246602612245414
Model: BayesianRidge(), Spearman Correlation: 0.608695652173913
Model: LGBMRegressor(n_jobs=-1, random_state=1), Spearman Correlation: 0.5760869565217391
Model: GradientBoostingRegressor(random_state=1), Spearman Correlation: 0.5434782608695652
Model: DummyRegressor(), Spearman Correlation: nan
Model: Ridge(random_state=1), Spearman Correlation: 0.5533596837944664
Model: HuberRegressor(), Spearman Correlation: 0.5227272727272728
Model: DecisionTreeRegressor(random_state=1), Spearman Correlation: 0.3955453299592544
Model: OrthogonalMatchingPursuit(), Spearman Correlation: 0.18379446640316205
Model: ElasticNet(random_state=1), Spearman Correlation: 0.1590909090909091
Model: LassoLars(random_state=1), Spearman Correla

In [11]:
# Check the pearson correlation of predictions
pearson_results = []
for model in top_models:
    predictions = predict_model(model)  # Get predictions
    
    # Identify the correct prediction column
    prediction_column = 'Label' if 'Label' in predictions.columns else predictions.columns[-1]
    
    # Calculate Spearman correlation
    pearson_corr = np.corrcoef(predictions[target], predictions[prediction_column])[0,1]
    pearson_results.append((model, pearson_corr))
    
    # Print Spearman correlation
    print(f"Model: {model}, Pearson Correlation: {pearson_corr}")

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Extra Trees Regressor,0.3065,0.1469,0.3833,0.4359,0.1785,0.3502,0.6443


Model: ExtraTreesRegressor(n_jobs=-1, random_state=1), Pearson Correlation: 0.680082690446901


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Random Forest Regressor,0.3107,0.156,0.3949,0.4013,0.1795,0.3422,0.6423


Model: RandomForestRegressor(n_jobs=-1, random_state=1), Pearson Correlation: 0.6435056735535511


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Gradient Boosting Regressor,0.3193,0.207,0.455,0.2054,0.2034,0.3586,0.5435


Model: GradientBoostingRegressor(random_state=1), Pearson Correlation: 0.49743189850660874


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Light Gradient Boosting Machine,0.3434,0.1797,0.424,0.3099,0.1972,0.3917,0.5761


Model: LGBMRegressor(n_jobs=-1, random_state=1), Pearson Correlation: 0.5787300360074287


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,AdaBoost Regressor,0.318,0.1514,0.3891,0.4189,0.1798,0.3516,0.6247


Model: AdaBoostRegressor(random_state=1), Pearson Correlation: 0.6628564172287155


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Bayesian Ridge,0.3142,0.1679,0.4098,0.3554,0.1898,0.3598,0.6087


Model: BayesianRidge(), Pearson Correlation: 0.6184213181583382


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Dummy Regressor,0.4213,0.2605,0.5104,-0.0001,0.2326,0.4572,


Model: DummyRegressor(), Pearson Correlation: nan


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Lasso Least Angle Regression,0.429,0.2648,0.5146,-0.0168,0.2331,0.4532,0.1413


Model: LassoLars(random_state=1), Pearson Correlation: -0.03754221631656104


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Lasso Regression,0.429,0.2648,0.5146,-0.0168,0.2331,0.4532,0.1413


Model: Lasso(random_state=1), Pearson Correlation: -0.037542216316561156


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Elastic Net,0.4287,0.2645,0.5143,-0.0156,0.2329,0.4523,0.1591


Model: ElasticNet(random_state=1), Pearson Correlation: -0.02529191896218707


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Orthogonal Matching Pursuit,0.4195,0.2609,0.5107,-0.0014,0.2277,0.4168,0.1838


Model: OrthogonalMatchingPursuit(), Pearson Correlation: 0.16315866314024347


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Ridge Regression,0.4572,0.4005,0.6328,-0.5374,0.2637,0.5065,0.5534


Model: Ridge(random_state=1), Pearson Correlation: 0.4747896170631824


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Decision Tree Regressor,0.441,0.3216,0.5671,-0.2344,0.2484,0.4432,0.3955


Model: DecisionTreeRegressor(random_state=1), Pearson Correlation: 0.2916028494541159


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Huber Regressor,0.3768,0.2496,0.4996,0.0419,0.2235,0.4269,0.5227


Model: HuberRegressor(), Pearson Correlation: 0.5399235413089805


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,K Neighbors Regressor,0.4145,0.2737,0.5232,-0.0507,0.2427,0.4844,0.0104


Model: KNeighborsRegressor(n_jobs=-1), Pearson Correlation: 0.06962299103803224


In [13]:
top_models[0]

In [14]:
# save top_models[0]
save_model(top_models[0], 'task21_best_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['day_0_specimen_id', 'subject_id',
                                              'age', 'day_0_Monocytes',
                                              'day_0_Classical_Monocytes',
                                              'day_0_Non-Classical_Monocytes',
                                              'day_0_Intermediate_Monocytes',
                                              'day_0_Bcells', 'day_0_CD3CD19neg',
                                              'day_0_CD3 Tcells',
                                              'day_0_CD4Tcells',
                                              'day_0_CD8Tcells',
                                              'day_0_TemraCD4', 'da...
 dtype: int64}]))),
                 ('onehot_encoding',
                  TransformerWrapper(include=['specimen_type', 'race',
                                              'dataset'],
 

# Exploring more features

In [3]:
import numpy as np
import pandas as pd
from pycaret.regression import *
from scipy.stats import spearmanr

exp_feats = ['day_0_specimen_id', 'subject_id', 'specimen_type', 'infancy_vac',
       'biological_sex', 'ethnicity', 'race', 'dataset', 'age', 'day_0_IgG_PT',
       'day_0_IgG_PRN', 'day_0_IgG_FHA', 'day_0_IgG1_PT', 'day_0_IgG1_PRN',
       'day_0_IgG1_FHA', 'day_0_IgG1_FIM2/3', 'day_0_IgG1_TT', 'day_0_IgG1_DT',
       'day_0_IgG1_OVA', 'day_0_IgG2_PT', 'day_0_IgG2_PRN', 'day_0_IgG2_FHA',
       'day_0_IgG2_FIM2/3', 'day_0_IgG2_TT', 'day_0_IgG2_DT', 'day_0_IgG2_OVA',
       'day_0_IgG3_PT', 'day_0_IgG3_PRN', 'day_0_IgG3_FHA',
       'day_0_IgG3_FIM2/3', 'day_0_IgG3_TT', 'day_0_IgG3_DT', 'day_0_IgG3_OVA',
       'day_0_IgG4_PT', 'day_0_IgG4_PRN', 'day_0_IgG4_FHA',
       'day_0_IgG4_FIM2/3', 'day_0_IgG4_TT', 'day_0_IgG4_DT', 'day_0_IgG4_OVA']

In [4]:
# Load data
df = pd.read_csv("/home/vmottaqi/cmipb_challenge/vincent_files/training_abtier_batchcorrected_processed_oct24.csv", index_col=0)
df = df[df['FC_day_0_14_IgG_PT'] >= 0]

# Define features and target
X = df[exp_feats]
y = df['FC_day_0_14_IgG_PT']

# Combine features and target for PyCaret input
data = X.copy()
data['FC_day_0_14_IgG_PT'] = y

# Initialize PyCaret with the dataset and target column
regression_setup = setup(
    data=data,
    target='FC_day_0_14_IgG_PT',
    categorical_features=['specimen_type', 'infancy_vac', 'biological_sex', 'ethnicity', 'race', 'dataset'],  # Specify any categorical features
    session_id=1,
    fold=5
)

# Custom Spearman correlation function
def spearman_metric(y_true, y_pred):
    return spearmanr(y_true, y_pred)[0]  # Return the Spearman correlation coefficient

# Adding the custom Spearman metric
add_metric(
    'Spearman', 
    'Spearman Correlation', 
    spearman_metric, 
    greater_is_better=True
)

# Compare models using a supported metric (e.g., R2)
top_models = compare_models(sort='R2', n_select=15)  # Select top models based on R2, adjust as needed

# Calculate Spearman correlation for each top model
spearman_results = []
for model in top_models:
    predictions = predict_model(model)  # Get predictions
    
    # Identify the correct prediction column
    prediction_column = 'Label' if 'Label' in predictions.columns else predictions.columns[-1]
    
    # Calculate Spearman correlation
    spearman_corr = spearman_metric(predictions['FC_day_0_14_IgG_PT'], predictions[prediction_column])
    spearman_results.append((model, spearman_corr))

# Sort and display models by Spearman correlation
sorted_spearman_results = sorted(spearman_results, key=lambda x: x[1], reverse=True)
for model, spearman_corr in sorted_spearman_results:
    print(f"Model: {model}, Spearman Correlation: {spearman_corr}")


Unnamed: 0,Description,Value
0,Session id,1
1,Target,FC_day_0_14_IgG_PT
2,Target type,Regression
3,Original data shape,"(93, 41)"
4,Transformed data shape,"(93, 50)"
5,Transformed train set shape,"(65, 50)"
6,Transformed test set shape,"(28, 50)"
7,Numeric features,34
8,Categorical features,6
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation,TT (Sec)
ada,AdaBoost Regressor,8.2899,304.234,14.4331,-0.6856,0.8898,2.5197,0.0,0.046
et,Extra Trees Regressor,9.9649,372.4152,15.8866,-1.2296,0.9985,3.0019,0.0,0.056
rf,Random Forest Regressor,8.2793,333.1955,14.8631,-1.3967,0.8767,2.5708,0.0,0.062
br,Bayesian Ridge,11.758,443.7684,17.6791,-1.6908,1.2512,4.4298,0.0,0.212
dummy,Dummy Regressor,11.8398,450.3617,17.7636,-1.6909,1.2705,4.5486,0.0,0.032
lightgbm,Light Gradient Boosting Machine,10.7373,368.7215,16.1494,-1.8962,1.1321,4.3257,0.0,0.162
huber,Huber Regressor,12.1447,462.851,18.757,-2.0608,1.3168,5.4361,0.0,0.026
omp,Orthogonal Matching Pursuit,11.7686,441.9598,17.6979,-2.0977,1.2393,5.465,0.0,0.214
gbr,Gradient Boosting Regressor,8.5105,323.0224,16.0121,-2.5338,0.8212,2.272,0.0,0.048
knn,K Neighbors Regressor,11.4062,509.3115,20.0147,-3.3025,1.0903,2.2346,0.0,0.034


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,AdaBoost Regressor,7.4131,139.8895,11.8275,0.378,0.714,1.1994,0.7991


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Extra Trees Regressor,10.5108,211.4003,14.5396,0.0601,0.9012,1.8426,0.5627


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Random Forest Regressor,8.4416,148.578,12.1893,0.3394,0.7576,1.2519,0.7219


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Bayesian Ridge,10.6591,229.6061,15.1528,-0.0209,1.0606,2.3707,0.4921


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Dummy Regressor,10.7162,230.9339,15.1965,-0.0268,1.0673,2.4033,


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Light Gradient Boosting Machine,10.3184,190.1402,13.7891,0.1546,0.9662,2.4282,0.5435


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Huber Regressor,13.1406,315.9076,17.7738,-0.4046,1.1388,2.3171,0.0515


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Orthogonal Matching Pursuit,11.2362,255.9252,15.9977,-0.1379,0.9976,2.077,0.5064


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Gradient Boosting Regressor,10.2576,304.9197,17.462,-0.3557,0.8382,1.6082,0.7701


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,K Neighbors Regressor,10.5972,225.2064,15.0069,-0.0013,1.022,1.8016,0.3852


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Decision Tree Regressor,15.9162,1128.7836,33.5974,-4.0187,1.0322,1.0058,0.619


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Passive Aggressive Regressor,16.3921,857.3976,29.2814,-2.8121,1.1645,2.4894,0.3125


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Elastic Net,15.8792,388.2278,19.7035,-0.7261,1.2881,4.3655,0.3793


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Lasso Least Angle Regression,15.8758,403.2669,20.0815,-0.793,1.2958,4.8665,0.4559


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,Lasso Regression,15.8759,403.2671,20.0815,-0.793,1.2958,4.8666,0.4559


Model: AdaBoostRegressor(random_state=1), Spearman Correlation: 0.7991238351089663
Model: RandomForestRegressor(n_jobs=-1, random_state=1), Spearman Correlation: 0.7219485495347564
Model: DummyRegressor(), Spearman Correlation: nan
Model: GradientBoostingRegressor(random_state=1), Spearman Correlation: 0.7701149425287356
Model: DecisionTreeRegressor(random_state=1), Spearman Correlation: 0.6190360485809107
Model: ExtraTreesRegressor(n_jobs=-1, random_state=1), Spearman Correlation: 0.5626710454296661
Model: LGBMRegressor(n_jobs=-1, random_state=1), Spearman Correlation: 0.5435139573070608
Model: OrthogonalMatchingPursuit(), Spearman Correlation: 0.5063637655192589
Model: BayesianRidge(), Spearman Correlation: 0.4920634920634921
Model: LassoLars(random_state=1), Spearman Correlation: 0.4559386973180077
Model: Lasso(random_state=1), Spearman Correlation: 0.4559386973180077
Model: KNeighborsRegressor(n_jobs=-1), Spearman Correlation: 0.38521563823397276
Model: ElasticNet(random_state=1), 

In [5]:
sorted_spearman_results

[(AdaBoostRegressor(random_state=1), 0.7991238351089663),
 (RandomForestRegressor(n_jobs=-1, random_state=1), 0.7219485495347564),
 (DummyRegressor(), nan),
 (GradientBoostingRegressor(random_state=1), 0.7701149425287356),
 (DecisionTreeRegressor(random_state=1), 0.6190360485809107),
 (ExtraTreesRegressor(n_jobs=-1, random_state=1), 0.5626710454296661),
 (LGBMRegressor(n_jobs=-1, random_state=1), 0.5435139573070608),
 (OrthogonalMatchingPursuit(), 0.5063637655192589),
 (BayesianRidge(), 0.4920634920634921),
 (LassoLars(random_state=1), 0.4559386973180077),
 (Lasso(random_state=1), 0.4559386973180077),
 (KNeighborsRegressor(n_jobs=-1), 0.38521563823397276),
 (ElasticNet(random_state=1), 0.37931034482758613),
 (PassiveAggressiveRegressor(random_state=1), 0.31253420908593316),
 (HuberRegressor(), 0.051450465243568694)]

In [8]:
# Get top model and save it:
best_scc_model = sorted_spearman_results[0][0]

In [10]:
# save out best_scc_model
save_model(best_scc_model, '/home/jrollins/cmipb-challenge/task1-2_best_scc_model.pkl')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['day_0_specimen_id', 'subject_id',
                                              'age', 'day_0_IgG_PT',
                                              'day_0_IgG_PRN', 'day_0_IgG_FHA',
                                              'day_0_IgG1_PT', 'day_0_IgG1_PRN',
                                              'day_0_IgG1_FHA',
                                              'day_0_IgG1_FIM2/3',
                                              'day_0_IgG1_TT', 'day_0_IgG1_DT',
                                              'day_0_IgG1_OVA', 'day_0_IgG2_PT',
                                              'day_0_IgG2_PRN', 'day_0_IgG2_FHA',
                                              '...
 dtype: int64}]))),
                 ('onehot_encoding',
                  TransformerWrapper(include=['specimen_type', 'ethnicity',
                                              'race'

In [13]:
# Optimize the AdaBoost model
tuned_model = tune_model(best_scc_model, optimize='R2')


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,6.0123,96.9703,9.8474,0.4554,0.592,0.8164,0.0
1,8.5368,288.5051,16.9854,-0.3545,1.1287,4.9657,0.0
2,15.9394,1124.602,33.5351,0.2293,0.8739,1.3954,0.0
3,4.8094,41.2148,6.4199,0.616,0.8098,2.5701,0.0
4,4.1713,20.6261,4.5416,-1.2363,0.8041,1.8054,0.0
Mean,7.8938,314.3837,14.2659,-0.058,0.8417,2.3106,0.0
Std,4.2908,416.0064,10.5294,0.6748,0.1721,1.4452,0.0


Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [19]:
# Get the spearman correlation of the tuned model
predictions = predict_model(tuned_model)
spearman_corr = spearman_metric(predictions['FC_day_0_14_IgG_PT'], predictions['prediction_label'])
print(f"Tuned Model: {spearman_corr}, Spearman Correlation")

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Spearman Correlation
0,AdaBoost Regressor,8.0445,162.3859,12.7431,0.278,0.7731,1.2295,0.7043


Tuned Model: 0.7042717375933771, Spearman Correlation


# Vincent's comparison code

In [9]:
import numpy as np
import pandas as pd
import catboost as cb
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import spearmanr
from joblib import Parallel, delayed

# Load data
df = pd.read_csv("~/cmipb_challenge/vincent_files/training_abtier_normalized_processed.csv", index_col=0)
df = df[df['FC_day_0_14_IgG_PT'] >= 0]

features = ['infancy_vac', 'biological_sex', 'age', 'day_0_IgG_PT', 'day_0_IgG4_PRN', 'day_0_IgG1_PRN']
X = df[features]
y = df['FC_day_0_14_IgG_PT']
categorical_features_indices = np.where((X.dtypes != np.float64) & (X.dtypes != np.int64))[0]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Define parameter grid
param_grid = {
    'iterations': [25, 50, 75],
    'learning_rate': [0.01, 0.03, 0.05],
    'depth': [2, 3, 4, 5, 6],
    'l2_leaf_reg': [0, 1]
}
parameter_grid = list(ParameterGrid(param_grid))

# Function to train and evaluate a model
def evaluate_model(params, X_train, y_train, X_test, y_test, cat_features):
    train_data = Pool(data=X_train, label=y_train, cat_features=cat_features)
    test_data = Pool(data=X_test, label=y_test, cat_features=cat_features)
    model = CatBoostRegressor(**params, loss_function='RMSE', verbose=False)
    model.fit(train_data)
    preds = model.predict(test_data)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)
    spearman_corr, _ = spearmanr(y_test, preds)
    return rmse, r2, spearman_corr, params

# Parallel execution of grid search
results = Parallel(n_jobs=100)(delayed(evaluate_model)(params, X_train, y_train, X_test, y_test, categorical_features_indices) for params in parameter_grid)

# Find the best parameters based on RMSE or another metric
best_result = min(results, key=lambda x: x[0])
print("Best RMSE:", best_result[0])
print("R2 Score:", best_result[1])
print("Spearman Correlation:", best_result[2])
print("Best Hyperparameters:", best_result[3])


ModuleNotFoundError: No module named 'numpy.strings'