# Dumbest of baselines: Predicting the mean

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from brain_age_prediction import sklearn_utils

Note: results for ICA (scores on the test split, presumably MAE) -->
- mean of train split, tested on test split: 6.495
- mean of test split, tested on test split: 6.591

In [2]:
def get_mean_predictions(input_age_array, test_age_array):
    """
    Builds constant "predictions" for the test subjects: every entry is the
    mean age of the input split.
    Input:
        input_age_array: array-like of subjects' ages; its mean is the predicted value.
        test_age_array: array-like of subjects' ages to predict for; only its
            shape is used, its values are ignored.
    Output:
        array with the shape of test_age_array, filled with the mean of input_age_array.
    """
    mean_age = np.mean(input_age_array)
    # np.shape (unlike the .shape attribute) also accepts plain lists and tuples.
    return np.full(np.shape(test_age_array), mean_age)

def fill_score(df, row, input_age_array, test_age_array):
    """
    Fills in all scores (MSE, MAE, R2) of one row of the scores dataframe.
    The scores compare the test ages against a constant prediction: the mean
    of input_age_array.
    Input:
        df: scores dataframe with columns 'mean of', 'MSE', 'MAE' and 'R2'.
            It is modified in place.
        row: value of the 'mean of' column identifying the row to fill,
            either 'train', 'train + val', or 'test'.
        input_age_array: corresponding to row, 1D array of subjects' ages.
        test_age_array: 1D array of subjects' ages from test split.
    Output:
        df: updated scores dataframe (the same object that was passed in).
    Raises:
        IndexError: if no row of df has 'mean of' equal to row.
    """
    matching_index = df[df['mean of'] == row].index
    if len(matching_index) == 0:
        raise IndexError(
            "no row with 'mean of' == {!r} in scores dataframe".format(row))
    # If several rows match, only the first one is filled.
    row_idx = matching_index[0]

    predictions = get_mean_predictions(input_age_array, test_age_array)
    scores = {
        'MSE': mean_squared_error(test_age_array, predictions),
        'MAE': mean_absolute_error(test_age_array, predictions),
        'R2': r2_score(test_age_array, predictions),
    }

    for column, value in scores.items():
        # Writing a float into an integer column through .loc relies on silent
        # upcasting, which pandas >= 2.1 deprecates; make the column float first.
        if not pd.api.types.is_float_dtype(df[column]):
            df[column] = df[column].astype(float)
        df.loc[row_idx, column] = value
    return df

def get_prediction_scores(train_age_array, val_age_array, test_age_array):
    """
    Scores the mean-age baseline on the test split for three choices of the
    split the mean is taken from: 'train', 'train + val' and 'test'.
    Prints a header line as a side effect.
    Input:
        train_age_array: 1D array of subjects' ages from train split.
        val_age_array: 1D array of subjects' ages from validation split.
        test_age_array: 1D array of subjects' ages from test split.
    Output:
        scores dataframe with columns 'mean of', 'MSE', 'MAE' and 'R2',
        one row per choice of split.
    """
    print('MODEL PERFORMANCE: MEAN OF SPLIT X TESTED ON TEST SPLIT')
    train_val_age_array = np.append(train_age_array, val_age_array)
    # Row label -> ages whose mean serves as the constant prediction.
    mean_sources = {
        'train': train_age_array,
        'train + val': train_val_age_array,
        'test': test_age_array,
    }
    # Score columns start as floats (not ints) so that filling in the float
    # metrics does not depend on pandas silently upcasting the columns.
    scores_df = pd.DataFrame({
        'mean of': list(mean_sources),
        'MSE': 0.0,
        'MAE': 0.0,
        'R2': 0.0,
    })
    # fill in scores_df
    for split_name, split_ages in mean_sources.items():
        scores_df = fill_score(scores_df, split_name, split_ages, test_age_array)
    return scores_df

In [3]:
# Load an exemplary data split ('7n100p'). The result of access_dataset is
# unpacked into six values, of which only the 2nd, 4th and 6th are kept
# (as y_train, y_val and y_test); the others are discarded.
# NOTE(review): the discarded values are presumably the matching feature sets -- confirm.
_, y_train, _, y_val, _, y_test = sklearn_utils.access_dataset('7n100p')

In [4]:
# Score the mean baselines against the test ages; the returned scores
# dataframe is the last expression and is therefore shown as the cell output.
get_prediction_scores(y_train, y_val, y_test)

MODEL PERFORMANCE: MEAN OF SPLIT X TESTED ON TEST SPLIT


Unnamed: 0,mean of,MSE,MAE,R2
0,train,62.097906,6.636025,-0.000126
1,train + val,62.100395,6.637262,-0.000166
2,test,62.090102,6.627693,0.0
