### Predicting age and assessment values from two domains using features derived from brain MRI images as inputs.


From the scikit-learn documentation for `BaggingRegressor`:
"A Bagging regressor is an ensemble meta-estimator that fits base regressors each on random subsets of the original dataset and then aggregate their individual predictions (either by voting or by averaging) to form a final prediction. Such a meta-estimator can typically be used as a way to reduce the variance of a black-box estimator (e.g., a decision tree), by introducing randomization into its construction procedure and then making an ensemble out of it."

In [2]:
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn import linear_model
from sklearn.model_selection import KFold
from sklearn.ensemble import BaggingRegressor

In [None]:
def metric(y_true, y_pred):
    """Return the normalized absolute error between targets and predictions.

    The absolute errors are summed along axis 0 and divided by the sum of
    ``y_true`` along axis 0; the mean of the resulting ratio(s) is returned.
    For 1-D inputs this is a single ratio, for 2-D inputs it is the average
    of the per-column ratios.
    """
    total_abs_error = np.sum(np.abs(y_true - y_pred), axis=0)
    total_true = np.sum(y_true, axis=0)
    return np.mean(total_abs_error / total_true)

In [4]:
# Load the FNC feature table (first column is used as "Id" in the later merge).
fnc_csv_path = 'MRI_dataset/fnc.csv'
fnc_df = pd.read_csv(fnc_csv_path)

In [6]:
# Load the loading feature table.
loading_df = pd.read_csv('MRI_dataset/loading.csv')

# Every column after the first one is treated as a model input feature.
fnc_features = fnc_df.columns[1:].tolist()
loading_features = loading_df.columns[1:].tolist()
features = loading_features + fnc_features

# One row per "Id" carrying both feature families.
df = pd.merge(fnc_df, loading_df, on="Id")

In [7]:
# Load the training targets and flag every labelled row as a training row.
labels_df = pd.read_csv('MRI_dataset/train_scores.csv').assign(is_train=True)

# Left merge keeps every feature row; rows absent from the score file get NaN in "is_train".
df = pd.merge(df, labels_df, on="Id", how="left")

# A single mask drives both splits: NaN compares unequal to True, so unlabelled rows go to the test set.
is_train_row = df["is_train"].eq(True)
test_df = df[~is_train_row].copy()
df = df[is_train_row].copy()

In [8]:
# Down-weight the FNC features: they are high-dimensional and therefore easier to overfit.
FNC_SCALE = 1/600

# Apply the same in-place scaling to the training and the test frame.
# NOTE: re-running this cell scales the columns again.
for frame in (df, test_df):
    frame[fnc_features] *= FNC_SCALE

In [9]:
# One independent copy of the main dataframe per model, used to report per-target scores for each model.
df_model1, df_model2, df_model3 = (df.copy() for _ in range(3))

# Shuffled K-fold split with a fixed seed so the folds are reproducible.
NUM_FOLDS = 7
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=0)

# Model inputs: loading features first, then the FNC features.
features = loading_features + fnc_features


In [10]:
# Blending weights between the three models are specified separately for the 5 targets.
# Each list is ordered as:
#                                 SVR,  Ridge, BaggingRegressor
blend_weights = {"age":          [0.4,  0.55,  0.05],
                 "domain1_var1": [0.55, 0.15,  0.3],
                 "domain1_var2": [0.45, 0.0,   0.55],
                 "domain2_var1": [0.55, 0.15,  0.3],
                 "domain2_var2": [0.5,  0.05,  0.45]}

# Sanity check: every target needs exactly one weight per model and the weights
# must sum to 1, otherwise the blended prediction would be silently rescaled.
assert all(
    len(weights) == 3 and abs(sum(weights) - 1.0) < 1e-9
    for weights in blend_weights.values()
), "each target needs three blend weights that sum to 1"

In [15]:
# Cross-validated training, blending and scoring.
#
# For every target, three models (SVR, Ridge, bagged Ridge) are fitted on each
# fold. Their out-of-fold (OOF) predictions are stored per model and as a
# weighted blend, and the blended test predictions are averaged over the folds.
# The overall score is the weighted sum of the per-target blend scores.

# Per-target settings: (target column, SVR C, weight of the target in the overall score).
target_settings = [
    ("age", 60, 0.3),
    ("domain1_var1", 12, 0.175),
    ("domain1_var2", 8, 0.175),
    ("domain2_var1", 9, 0.175),
    ("domain2_var2", 12, 0.175),
]

# The test feature matrix never changes inside the loops, so build it once.
X_test = test_df[features].values

overall_score = 0
for target, c, w in target_settings:
    w_svr, w_ridge, w_bagging = blend_weights[target]
    pred_col = "pred_{}".format(target)

    y_oof = np.zeros(df.shape[0])
    y_oof_model_1 = np.zeros(df.shape[0])
    y_oof_model_2 = np.zeros(df.shape[0])
    y_oof_model_3 = np.zeros(df.shape[0])
    y_test = np.zeros((test_df.shape[0], NUM_FOLDS))

    # KFold only needs the number of rows; it does not use a target argument.
    for f, (train_ind, val_ind) in enumerate(kf.split(df)):
        train_df, val_df = df.iloc[train_ind], df.iloc[val_ind]
        # Rows without a label for this target cannot be trained on. They still
        # receive OOF predictions when they fall into the validation fold.
        train_df = train_df[train_df[target].notnull()]

        # Plain arrays are used for both fit and predict so that scikit-learn
        # does not warn about feature names present at predict time only.
        X_train, y_train = train_df[features].values, train_df[target].values
        X_val = val_df[features].values

        model_1 = SVR(C=c, cache_size=3000.0)
        model_1.fit(X_train, y_train)
        model_2 = linear_model.Ridge(alpha=0.0001)
        model_2.fit(X_train, y_train)

        # scikit-learn's BaggingRegressor with Ridge regression as the base
        # estimator: each of the 30 estimators sees 30% of the rows and 30% of
        # the features.
        model_3 = BaggingRegressor(linear_model.Ridge(alpha=0.0001), n_estimators=30, random_state=42, max_samples=0.3, max_features=0.3)
        model_3.fit(X_train, y_train)

        val_pred_1 = model_1.predict(X_val)
        val_pred_2 = model_2.predict(X_val)
        val_pred_3 = model_3.predict(X_val)

        test_pred_1 = model_1.predict(X_test)
        test_pred_2 = model_2.predict(X_test)
        test_pred_3 = model_3.predict(X_test)

        # Targets are 1-D, so predictions are already 1-D and need no flattening.
        val_pred = w_svr*val_pred_1 + w_ridge*val_pred_2 + w_bagging*val_pred_3
        test_pred = w_svr*test_pred_1 + w_ridge*test_pred_2 + w_bagging*test_pred_3

        y_oof[val_ind] = val_pred
        y_oof_model_1[val_ind] = val_pred_1
        y_oof_model_2[val_ind] = val_pred_2
        y_oof_model_3[val_ind] = val_pred_3
        y_test[:, f] = test_pred

    df[pred_col] = y_oof
    df_model1[pred_col] = y_oof_model_1
    df_model2[pred_col] = y_oof_model_2
    df_model3[pred_col] = y_oof_model_3
    test_df[target] = y_test.mean(axis=1)

    # Score only the rows that have a label for this target. One mask is shared
    # by the blend and all three models, so every score uses the same rows.
    has_label = df[target].notnull().values
    y_true = df[target].values[has_label]

    score = metric(y_true, y_oof[has_label])
    overall_score += w*score

    score_model1 = metric(y_true, y_oof_model_1[has_label])
    score_model2 = metric(y_true, y_oof_model_2[has_label])
    score_model3 = metric(y_true, y_oof_model_3[has_label])

    print(f"For {target}:")
    print("SVR:", np.round(score_model1, 6))
    print("Ridge:", np.round(score_model2, 6))
    print("BaggingRegressor:", np.round(score_model3, 6))
    print("Ensemble:", np.round(score, 6))
    print()

print("Overall score:", np.round(overall_score, 6))

For age:
SVR: 0.144158
Ridge: 0.143463
BaggingRegressor: 0.15619
Ensemble: 0.142335

For domain1_var1:
SVR: 0.151293
Ridge: 0.1537
BaggingRegressor: 0.151566
Ensemble: 0.15067

For domain1_var2:
SVR: 0.151021
Ridge: 0.15582
BaggingRegressor: 0.151204
Ensemble: 0.15079

For domain2_var1:
SVR: 0.181687
Ridge: 0.185433
BaggingRegressor: 0.18241
Ensemble: 0.18116

For domain2_var2:
SVR: 0.17621
Ridge: 0.1804
BaggingRegressor: 0.176257
Ensemble: 0.175727

Overall score: 0.157911


In [37]:
# Reshape the per-target test predictions from wide to long format:
# one row per (Id, target) pair with the prediction in "Predicted".
submission_columns = ["Id", "age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2"]
sub_df = test_df[submission_columns].melt(id_vars=["Id"], value_name="Predicted")

In [38]:
# Build the "<Id>_<target>" row identifier from the subject Id and the melted target name.
id_part = sub_df["Id"].astype("str")
target_part = sub_df["variable"].astype("str")
sub_df["Id"] = id_part + "_" + target_part

# The target name now lives inside "Id", so the helper column can go.
sub_df = sub_df.drop(columns="variable").sort_values("Id")

# Every test row must contribute exactly one row per target (5 targets).
expected_rows = test_df.shape[0]*5
assert sub_df.shape[0] == expected_rows

In [39]:
# Display the first ten rows of sub_df as a visual check of the Id / Predicted layout.
sub_df.head(10)

Unnamed: 0,Id,Predicted
0,10003_age,55.886192
5877,10003_domain1_var1,50.338737
11754,10003_domain1_var2,59.508279
17631,10003_domain2_var1,48.823294
23508,10003_domain2_var2,56.867111
1,10006_age,63.080919
5878,10006_domain1_var1,54.534685
11755,10006_domain1_var2,59.363543
17632,10006_domain2_var1,48.85153
23509,10006_domain2_var2,52.202746
