In [5]:
from pathlib import Path
import pprint

import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

def _keep_order(x, key=None):
    '''Identity stand-in for ``sorted``: returns its input untouched and ignores ``key``.'''
    return x


# Shadow the name ``sorted`` inside the pprint module so that pretty-printed
# dicts keep the order they were built in (the score dicts printed further
# down are ordered on purpose).
pprint.sorted = _keep_order
# No truncation of wide cell contents when pandas renders a frame.
pd.set_option('max_colwidth', None)

In [6]:
PATH = Path('data')


def _read_sorted(filename):
    '''Read one CSV from PATH and order its rows by subject, then visit.'''
    return pd.read_csv(PATH / filename).sort_values(['RID_HASH', 'VISCODE'], ignore_index=True)


df = _read_sorted('dev_set.csv')
df_dev_1 = _read_sorted('dev_1.csv')
df_dev_2 = _read_sorted('dev_2.csv')
df_dev_3 = _read_sorted('dev_3.csv')

df_test_a = _read_sorted('test_A.csv')
df_test_b = _read_sorted('test_B.csv')

# train/test were split by subject: no subject may appear in two of them
assert set(df['RID_HASH']).isdisjoint(df_test_a['RID_HASH'])
assert set(df['RID_HASH']).isdisjoint(df_test_b['RID_HASH'])
assert set(df_test_a['RID_HASH']).isdisjoint(df_test_b['RID_HASH'])

# dev sets are the train set but with different missing-value patterns,
# so each of them must contain exactly the train subjects
assert all(
    set(df['RID_HASH']) == set(dev_frame['RID_HASH'])
    for dev_frame in (df_dev_1, df_dev_2, df_dev_3)
)

# from here on every frame is indexed by (subject, visit)
df = df.set_index(['RID_HASH', 'VISCODE'])
df_dev_1 = df_dev_1.set_index(['RID_HASH', 'VISCODE'])
df_dev_2 = df_dev_2.set_index(['RID_HASH', 'VISCODE'])
df_dev_3 = df_dev_3.set_index(['RID_HASH', 'VISCODE'])
df_test_a = df_test_a.set_index(['RID_HASH', 'VISCODE'])
df_test_b = df_test_b.set_index(['RID_HASH', 'VISCODE'])

df

Unnamed: 0_level_0,Unnamed: 1_level_0,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,Ventricles,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp
RID_HASH,VISCODE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
001c7955017f905ccf78d55c94e81070a1cca7b1efb5bdc713271adea9eaa158,0,79.1,0,20,1.0,1.0,0.5,28.0,12.00,16636.0,7208.0,979010.0,3672.0,12661.0,18165.0
001c7955017f905ccf78d55c94e81070a1cca7b1efb5bdc713271adea9eaa158,6,79.6,0,20,1.0,1.0,1.5,28.0,17.33,16649.0,7205.0,970782.0,3331.0,12630.0,18085.0
00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8ea199212588d2d672c,0,72.9,1,12,1.0,1.0,1.0,30.0,9.00,27456.0,7000.0,864414.0,3952.0,15911.0,15686.0
00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8ea199212588d2d672c,6,73.4,1,12,1.0,1.0,1.0,30.0,12.00,27773.0,7213.0,860154.0,3508.0,15229.0,15672.0
00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8ea199212588d2d672c,12,73.9,1,12,1.0,1.0,1.0,29.0,8.00,29427.0,7024.0,848430.0,3807.0,15636.0,15283.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803bcd30ea99c58dcf91d7,60,79.8,1,19,1.0,0.0,3.0,28.0,16.33,31772.7,5510.2,934145.0,2804.0,16773.0,19587.0
ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803bcd30ea99c58dcf91d7,102,83.3,1,19,1.0,0.0,3.0,26.0,12.33,32894.0,5466.1,924182.0,2497.0,16242.0,18534.0
ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c81b37c40ecf646cb0c6,0,72.1,0,12,1.0,0.0,0.5,27.0,11.00,69297.3,7684.6,1190560.0,4501.0,23702.0,23405.0
ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c81b37c40ecf646cb0c6,12,73.1,0,12,1.0,0.0,1.0,29.0,11.33,66556.4,7738.5,1188930.0,4540.0,23421.0,23729.0


In [7]:
# One (rows, columns) line per frame: train, the three dev variants, then both test sets.
for frame in (df, df_dev_1, df_dev_2, df_dev_3, df_test_a, df_test_b):
    print(frame.shape)

(4101, 14)
(4101, 14)
(4101, 14)
(4101, 14)
(1328, 14)
(1470, 14)


In [8]:
def nmae(range_, y_true, y_pred):
    '''
    Mean absolute error between ``y_true`` and ``y_pred``, normalised by ``range_``.

    ``range_`` is the spread of the variable being scored; it has to be
    computed per variable and before the dataset is split. A small constant
    is added to it so that a zero range does not cause a division by zero.
    '''
    smoothing = 1e-3
    scale = 1 / (range_ + smoothing)
    abs_error = np.abs(y_true - y_pred)
    return (abs_error * scale).mean()

In [9]:
def round_off_rating(x):
    '''Snap a number to the closest multiple of 0.5 (ties resolved by the built-in ``round``).'''
    half_steps = round(x * 2)
    return half_steps / 2

In [10]:
# Columns handled as a separate group from the measurement features below.
CONSTANT_COLS = ['PTGENDER_num', 'PTEDUCAT', 'APOE4', 'DX_num']

# Every remaining column of the train frame, in the frame's own column order.
FEATURES = [
    name
    for name in df.columns
    if name not in ('RID_HASH', 'VISCODE', *CONSTANT_COLS)
]


def imputation(val_df, train: bool = True):
    '''
    Fill every missing value of ``val_df`` and, optionally, score the result.

    The filling happens in two stages:

    1. Within each subject (index level ``RID_HASH``) every column is
       linearly interpolated, then forward-filled, then backward-filled.
    2. Cells that are still missing afterwards are filled by an
       ``IterativeImputer`` (Huber regressor) that is fitted on the
       standardised, partially filled frame itself, with the visit code
       added as an extra predictor. Only the still-missing cells take the
       model output; all other cells keep their stage-1 values exactly.

    Finally the values are rounded (``CONSTANT_COLS`` and ``MMSE`` to
    integers, ``AGE`` to one decimal, ``CDRSB`` to half points) and clipped
    to the per-column min/max of the global train frame ``df``.

    Parameters
    ----------
    val_df : pandas.DataFrame
        Frame indexed by (``RID_HASH``, ``VISCODE``) holding the columns in
        ``CONSTANT_COLS + FEATURES``. It is copied, not modified.
    train : bool, default True
        When True the filled values are scored against the global ``df``,
        which must then share ``val_df``'s index.

    Returns
    -------
    (dict, pandas.DataFrame) when ``train`` is True
        Per-column NMAE computed only on the cells that were missing in
        ``val_df`` (worst column first), and the filled frame.
    pandas.DataFrame when ``train`` is False
        The filled frame only.

    Raises
    ------
    AssertionError
        If the column count differs from ``df`` or any value is still missing.
    '''
    columns = CONSTANT_COLS + FEATURES
    df_true = df.copy()
    df_pred = val_df.copy()

    # Stage 1: per-subject fill. The original gaps of every column are
    # remembered first, because the score is computed on those cells only.
    null_idxs = []
    for col in columns:
        null_idxs.append(df_pred[df_pred[col].isnull()].index)
        df_pred[col] = df_pred.groupby('RID_HASH')[col].transform(lambda x: x.interpolate(method='linear'))
        df_pred[col] = df_pred.groupby('RID_HASH')[col].ffill()
        df_pred[col] = df_pred.groupby('RID_HASH')[col].bfill()

    # Columns that belong to the result (the helper column below does not).
    value_cols = list(df_pred.columns)

    # Give the model-based imputer the visit code as an additional predictor.
    df_pred['VISCODE_temp'] = df_pred.index.get_level_values(1)

    # Stage 2: model-based imputer for whatever stage 1 could not fill.
    scaler = StandardScaler()
    df_pred_norm = pd.DataFrame(
        scaler.fit_transform(df_pred),
        columns=df_pred.columns,
        index=df_pred.index
    )

    huber = linear_model.HuberRegressor(
        epsilon=1.4 # controls the number of samples that should be classified as outliers. The smaller the epsilon, the more robust it is to outliers. default=1.35
    )

    # iterative imputer is sensitive to the tolerance and dependent on the estimator used internally
    imp = IterativeImputer(
        estimator=huber,
        tol=1e-4,
        max_iter=200,
        skip_complete=True,
        random_state=42
    )
    df_pred_imp = pd.DataFrame(
        imp.fit_transform(df_pred_norm),
        columns=df_pred.columns,
        index=df_pred.index
    )
    # Back to the original units.
    df_pred_imp = pd.DataFrame(
        scaler.inverse_transform(df_pred_imp),
        columns=df_pred.columns,
        index=df_pred.index
    )

    df_pred = df_pred.drop(columns='VISCODE_temp')

    # Take the model output only for cells that are still missing. Copying
    # whole rows instead would replace observed/interpolated values with
    # their scaler round-trip (float noise) and could upcast integer columns.
    df_pred = df_pred.fillna(df_pred_imp[value_cols])

    # rounds
    df_pred[CONSTANT_COLS + ['MMSE']] = df_pred[CONSTANT_COLS + ['MMSE']].round(0)
    df_pred['AGE'] = df_pred['AGE'].round(1)
    df_pred['CDRSB'] = df_pred['CDRSB'].apply(round_off_rating)

    # clip to the value range seen in the train frame
    for col in columns:
        df_pred[col] = df_pred[col].clip(df_true[col].min(), df_true[col].max())

    assert df_true.shape[1] == df_pred.shape[1]
    assert df_pred.isnull().sum().sum() == 0

    if train:
        scores = {}
        for col, null_idx in zip(columns, null_idxs):
            # evaluate only in missing values
            scores[col] = nmae(
                df_true[col].max() - df_true[col].min(),
                df_true.loc[null_idx, col],
                df_pred.loc[null_idx, col]
            )
        scores = dict(sorted(scores.items(), key=lambda x: -x[1]))  # worst first
        return scores, df_pred
    else:
        return df_pred
        
    
# Impute each dev variant, print its per-column scores followed by their mean,
# and keep the results so they can be bound to their usual names afterwards.
dev_runs = {}
for label, dev_frame in (('dev_1', df_dev_1), ('dev_2', df_dev_2)):
    per_column, filled = imputation(dev_frame)
    pprint.pprint(per_column)
    overall = np.mean(list(per_column.values()))
    pprint.pprint(f'NMAE: {overall}')
    print()
    dev_runs[label] = (per_column, filled, overall)

scores_dev_1, df_dev_1_filled, score_dev_1 = dev_runs['dev_1']
scores_dev_2, df_dev_2_filled, score_dev_2 = dev_runs['dev_2']

# Headline number: mean of the two dev-set scores.
print('NMAE:', np.mean([score_dev_1, score_dev_2]))

{'DX_num': 0.1442648841325194,
 'APOE4': 0.1322867977775818,
 'PTGENDER_num': 0.11315859638659836,
 'AGE': 0.07764756812453379,
 'Entorhinal': 0.07316156335813404,
 'MMSE': 0.07309536118564075,
 'PTEDUCAT': 0.07104819106700566,
 'ADAS13': 0.06963076623561737,
 'CDRSB': 0.061201285416898996,
 'Ventricles': 0.05497921653130812,
 'Fusiform': 0.054907371183479436,
 'Hippocampus': 0.04968770695163179,
 'MidTemp': 0.047107390069802406,
 'WholeBrain': 0.0426974793918755}
'NMAE: 0.07606244127233053'

{'APOE4': 0.15840294138644964,
 'DX_num': 0.13186863182917785,
 'PTGENDER_num': 0.11364968195846818,
 'AGE': 0.07376692486626893,
 'Entorhinal': 0.07368597509848841,
 'MMSE': 0.06726407229221897,
 'ADAS13': 0.06518262146892521,
 'Ventricles': 0.05763701381490592,
 'CDRSB': 0.05625092955161141,
 'PTEDUCAT': 0.05433186437408355,
 'Fusiform': 0.052406589275389916,
 'MidTemp': 0.05027437979061315,
 'Hippocampus': 0.049566465239392926,
 'WholeBrain': 0.0468141869420222}
'NMAE: 0.0750787341348583'

NMAE

### Submission

In [11]:
def wide_to_long(df, suffix: str):
    '''
    Reshape a (RID_HASH, VISCODE)-indexed frame into one row per cell.

    Returns a frame with two columns: ``Id``, built as
    ``<RID_HASH>_<VISCODE>_<column name><suffix>``, and ``Predicted``, the
    value of that cell.
    '''
    stacked = df.stack()
    long_frame = stacked.to_frame('Predicted').reset_index()
    visit_text = long_frame['VISCODE'].astype('str')
    # 'level_2' is the name reset_index gives the unnamed stacked column level
    long_frame['Id'] = long_frame['RID_HASH'] + '_' + visit_text + '_' + long_frame['level_2'] + suffix
    return long_frame[['Id', 'Predicted']]

In [12]:
# Impute each test set (no scoring) and reshape it straight into Id/Predicted rows.
df_test_a_filled = wide_to_long(imputation(df_test_a, train=False), '_test_A')
df_test_b_filled = wide_to_long(imputation(df_test_b, train=False), '_test_B')

# Keep only the cells that the sample submission asks for.
ids = pd.read_csv(PATH / 'sample_submission.csv')['Id'].values
df_test = pd.concat([df_test_a_filled, df_test_b_filled])
requested = df_test['Id'].isin(ids)
df_test = df_test[requested].reset_index(drop=True)

# expected number of submission rows
assert len(df_test) == 15026

df_test

Unnamed: 0,Id,Predicted
0,00d5e0050fbd3b6b610f6673347232eb0862df77b5b7a8f667526b0e4520129b_0_AGE_test_A,75.100000
1,00d5e0050fbd3b6b610f6673347232eb0862df77b5b7a8f667526b0e4520129b_0_PTGENDER_num_test_A,0.000000
2,00d5e0050fbd3b6b610f6673347232eb0862df77b5b7a8f667526b0e4520129b_0_Ventricles_test_A,40046.144448
3,00d5e0050fbd3b6b610f6673347232eb0862df77b5b7a8f667526b0e4520129b_0_Hippocampus_test_A,6622.208724
4,00d5e0050fbd3b6b610f6673347232eb0862df77b5b7a8f667526b0e4520129b_0_WholeBrain_test_A,990845.560830
...,...,...
15021,ffa86109ba8684f31325842d0ff26568e105f0f63b366acd4c77c0d2ece69a2f_0_Fusiform_test_B,14733.000000
15022,ffa86109ba8684f31325842d0ff26568e105f0f63b366acd4c77c0d2ece69a2f_0_MidTemp_test_B,19433.000000
15023,ffa86109ba8684f31325842d0ff26568e105f0f63b366acd4c77c0d2ece69a2f_24_AGE_test_B,66.300000
15024,ffa86109ba8684f31325842d0ff26568e105f0f63b366acd4c77c0d2ece69a2f_24_PTEDUCAT_test_B,13.000000


In [13]:
# Write the Id/Predicted rows to submission.csv, leaving out the row index.
df_test.to_csv('submission.csv', index=False)