In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_profiling
import os
import xgboost as xgb
import seaborn as sns; sns.set()
import polars as pl

from sklearn.model_selection import train_test_split
#from sklearn.model_selection import KFold
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.metrics import mean_squared_error

import lightgbm as lgb #LightGBM
from lightgbm import LGBMRegressor

from catboost import CatBoostRegressor,Pool

# 1. Define config and Function

In [2]:
class config:
    METRIC = 'RMSE' #RMSE or SMAPE
    RANDOM_STATE=100  
    FRAC = 0.2
    N_FOLD = 5
    
class paths:
    # kaggle環境ならTrue
    if 'KAGGLE_URL_BASE' in set(os.environ.keys()):
        common_path = "/kaggle/input/amp-parkinsons-disease-progression-prediction"
    
    # colaboratory環境ならTrue
    if 'COLAB_GPU' in set(os.environ.keys()):
        common_path = "/content/drive/MyDrive/AMP"
    
    SAMPLE_SUBMISSION = common_path+"/example_test_files/sample_submission.csv"
    SUPPLEMENTAL_CLINICAL_DATA = common_path+"/supplemental_clinical_data.csv"
    TRAIN_CLINICAL_DATA = common_path+"/train_clinical_data.csv"
    TRAIN_PEPTIDES = common_path+"/train_peptides.csv"
    TRAIN_PROTEINS = common_path+"/train_proteins.csv"
    TEST_CLINICAL_DATA = common_path+"/example_test_files/test.csv"
    TEST_PEPTIDES = common_path+"/example_test_files/test_peptides.csv"
    TEST_PROTEINS = common_path+"/example_test_files/test_proteins.csv"

In [3]:
def score_cal(y_true, y_pred):
  if config.METRIC == 'SMAPE':
    smap = np.zeros(len(y_true))
    num = np.abs(y_true - y_pred)
    dem = ((np.abs(y_true) + np.abs(y_pred)) / 2)
    pos_ind = (y_true!=0)|(y_pred!=0)
    smap[pos_ind] = num[pos_ind] / dem[pos_ind]
    ret = 100 * np.mean(smap)
  if config.METRIC == 'RMSE':
    ret = np.sqrt(mean_squared_error(y_true,y_pred))

  return ret

# 2. Load Train and Sample Test Data

In [4]:
train_clinical = pd.read_csv(paths.TRAIN_CLINICAL_DATA)
train_peptides = pd.read_csv(paths.TRAIN_PEPTIDES)
train_proteins = pd.read_csv(paths.TRAIN_PROTEINS)
print('Proteins shape:',train_proteins.shape,'Peptides shape:',train_peptides.shape,'Clinical shape:',train_clinical.shape)
test_clinical = pd.read_csv(paths.TEST_CLINICAL_DATA)
test_peptides = pd.read_csv(paths.TEST_PEPTIDES)
test_proteins = pd.read_csv(paths.TEST_PROTEINS)
print('Proteins shape:',test_proteins.shape,'Peptides shape:',test_peptides.shape,'Clinical shape:',test_clinical.shape)

Proteins shape: (232741, 5) Peptides shape: (981834, 6) Clinical shape: (2615, 8)
Proteins shape: (453, 6) Peptides shape: (2057, 7) Clinical shape: (16, 6)


# 4. Make dataset for training

## 4.1 Training only first month (0's visit_month)

In [5]:
df_0 = train_clinical[(train_clinical.visit_month == 0)][['visit_id','updrs_1']]
print('Train shape:', df_0.shape)
df_0.head()

Train shape: (248, 2)


Unnamed: 0,visit_id,updrs_1
0,55_0,10.0
13,942_0,3.0
28,1517_0,11.0
38,1923_0,2.0
45,2660_0,2.0


## 4.2 Feature Engineering

### 4.2.1 Proteins features

In [6]:
proteins_npx_ft = train_proteins.groupby('visit_id').agg(NPX_min=('NPX','min'), NPX_max=('NPX','max'), NPX_mean=('NPX','mean'), NPX_std=('NPX','std'))\
                .reset_index()
proteins_npx_ft.head()

Unnamed: 0,visit_id,NPX_min,NPX_max,NPX_mean,NPX_std
0,10053_0,2497.84,269126000.0,2856580.0,21316300.0
1,10053_12,5800.87,270030000.0,2728871.0,20921620.0
2,10053_18,1334.11,278835000.0,2509967.0,19694530.0
3,10138_12,2520.24,365582000.0,3002583.0,25161700.0
4,10138_24,1436.94,396894000.0,3068891.0,27168060.0


In [7]:
df_proteins = pd.merge(train_proteins, df_0, on = 'visit_id', how = 'inner').reset_index()
proteins_Uniprot_updrs = df_proteins.groupby('UniProt').agg(updrs_1_sum = ('updrs_1','mean')).reset_index()
proteins_Uniprot_updrs.head()

Unnamed: 0,UniProt,updrs_1_sum
0,O00391,4.971014
1,O00533,5.319588
2,O00584,5.286458
3,O14498,5.217877
4,O14773,5.371585


In [8]:
df_proteins = pd.merge(train_proteins, proteins_Uniprot_updrs, on = 'UniProt', how = 'left')
proteins_UniProt_ft = df_proteins.groupby('visit_id').agg(proteins_updrs_1_min=('updrs_1_sum','min'), proteins_updrs_1_max=('updrs_1_sum','max'),\
                                                          proteins_updrs_1_mean=('updrs_1_sum','mean'), proteins_updrs_1_std=('updrs_1_sum','std'))\
                .reset_index()
proteins_UniProt_ft.head()

Unnamed: 0,visit_id,proteins_updrs_1_min,proteins_updrs_1_max,proteins_updrs_1_mean,proteins_updrs_1_std
0,10053_0,4.892857,5.601449,5.300548,0.077355
1,10053_12,4.816794,5.652174,5.296073,0.099055
2,10053_18,4.297619,5.652174,5.272617,0.134631
3,10138_12,4.297619,5.652174,5.263118,0.143238
4,10138_24,4.297619,5.652174,5.269522,0.137776


### 4.2.2 Peptides features

In [9]:
peptides_PeptideAbundance_ft = train_peptides.groupby('visit_id').agg(Abe_min=('PeptideAbundance','min'), Abe_max=('PeptideAbundance','max'),\
                                                                Abe_mean=('PeptideAbundance','mean'), Abe_std=('PeptideAbundance','std'))\
                .reset_index()
peptides_PeptideAbundance_ft.head()

Unnamed: 0,visit_id,Abe_min,Abe_max,Abe_mean,Abe_std
0,10053_0,82.9679,66333900.0,726248.393431,3535602.0
1,10053_12,128.446,73059300.0,737183.385744,3799654.0
2,10053_18,108.5,64711200.0,601466.78432,3006568.0
3,10138_12,129.024,71652400.0,699099.199189,3379573.0
4,10138_24,142.648,123897000.0,732120.888877,4912602.0


In [10]:
df_peptides = pd.merge(train_peptides, df_0, on = 'visit_id', how = 'inner').reset_index()
peptides_PeptideAbundance_updrs = df_peptides.groupby('Peptide').agg(updrs_1_sum = ('updrs_1','mean')).reset_index()
peptides_PeptideAbundance_updrs.head()

Unnamed: 0,Peptide,updrs_1_sum
0,AADDTWEPFASGK,5.357143
1,AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K,5.296703
2,AAFTEC(UniMod_4)C(UniMod_4)QAADK,5.305699
3,AANEVSSADVK,5.36478
4,AATGEC(UniMod_4)TATVGKR,5.146497


In [11]:
df_peptides = pd.merge(train_peptides, peptides_PeptideAbundance_updrs, on = 'Peptide', how = 'left')
peptides_ft = df_peptides.groupby('visit_id').agg(peptides_updrs_1_min=('updrs_1_sum','min'), peptides_updrs_1_max=('updrs_1_sum','max'),\
                                                          peptides_updrs_1_mean=('updrs_1_sum','mean'), peptides_updrs_1_std=('updrs_1_sum','std'))\
                .reset_index()
peptides_ft

Unnamed: 0,visit_id,peptides_updrs_1_min,peptides_updrs_1_max,peptides_updrs_1_mean,peptides_updrs_1_std
0,10053_0,4.878788,5.661972,5.279278,0.092880
1,10053_12,4.816794,5.661972,5.277513,0.097712
2,10053_18,4.297619,5.661972,5.265384,0.116303
3,10138_12,4.297619,5.661972,5.253513,0.126117
4,10138_24,4.297619,5.661972,5.257710,0.123452
...,...,...,...,...,...
1108,8699_24,4.572519,5.661972,5.256902,0.123395
1109,942_12,4.572519,5.661972,5.254323,0.118205
1110,942_24,4.572519,5.652174,5.255565,0.117226
1111,942_48,4.572519,5.652174,5.253489,0.119653


### 4.2.3 Put it all together

In [12]:
df_0_1 = train_clinical[(train_clinical.visit_month == 3)][['visit_id','patient_id','updrs_1']]
df_0_2 = train_clinical[(train_clinical.visit_month == 3)][['visit_id','patient_id','updrs_2']]
df_0_3 = train_clinical[(train_clinical.visit_month == 3)][['visit_id','patient_id','updrs_3']]
df_0_4 = train_clinical[(train_clinical.visit_month == 3)][['visit_id','patient_id','updrs_4']]

df_proteins = pd.merge(train_proteins, df_0_1, on = 'visit_id', how = 'inner').reset_index()
proteins_Uniprot_updrs1 = df_proteins.groupby('UniProt').agg(updrs_1_sum = ('updrs_1','mean')).reset_index()

df_proteins = pd.merge(train_proteins, df_0_2, on = 'visit_id', how = 'inner').reset_index()
proteins_Uniprot_updrs2 = df_proteins.groupby('UniProt').agg(updrs_1_sum = ('updrs_2','mean')).reset_index()

df_proteins = pd.merge(train_proteins, df_0_3, on = 'visit_id', how = 'inner').reset_index()
proteins_Uniprot_updrs3 = df_proteins.groupby('UniProt').agg(updrs_1_sum = ('updrs_3','mean')).reset_index()

df_proteins = pd.merge(train_proteins, df_0_4, on = 'visit_id', how = 'inner').reset_index()
proteins_Uniprot_updrs4 = df_proteins.groupby('UniProt').agg(updrs_1_sum = ('updrs_4','mean')).reset_index()

df_peptides = pd.merge(train_peptides, df_0_1, on = 'visit_id', how = 'inner').reset_index()
peptides_PeptideAbundance_updrs1 = df_peptides.groupby('Peptide').agg(updrs_1_sum = ('updrs_1','mean')).reset_index()

df_peptides = pd.merge(train_peptides, df_0_2, on = 'visit_id', how = 'inner').reset_index()
peptides_PeptideAbundance_updrs2 = df_peptides.groupby('Peptide').agg(updrs_1_sum = ('updrs_2','mean')).reset_index()

df_peptides = pd.merge(train_peptides, df_0_3, on = 'visit_id', how = 'inner').reset_index()
peptides_PeptideAbundance_updrs3 = df_peptides.groupby('Peptide').agg(updrs_1_sum = ('updrs_3','mean')).reset_index()

df_peptides = pd.merge(train_peptides, df_0_4, on = 'visit_id', how = 'inner').reset_index()
peptides_PeptideAbundance_updrs4 = df_peptides.groupby('Peptide').agg(updrs_1_sum = ('updrs_4','mean')).reset_index()

df_proteins_fts = [proteins_Uniprot_updrs1, proteins_Uniprot_updrs2, proteins_Uniprot_updrs3, proteins_Uniprot_updrs4]
df_peptides_fts = [peptides_PeptideAbundance_updrs1, peptides_PeptideAbundance_updrs2, peptides_PeptideAbundance_updrs3, peptides_PeptideAbundance_updrs4]
df_lst = [df_0_1, df_0_2, df_0_3, df_0_4]

In [13]:
def features(df, proteins, peptides, classes):
    proteins_npx_ft = proteins.groupby('visit_id').agg(NPX_min=('NPX','min'), NPX_max=('NPX','max'), NPX_mean=('NPX','mean'), NPX_std=('NPX','std'))\
                    .reset_index()
    peptides_PeptideAbundance_ft = peptides.groupby('visit_id').agg(Abe_min=('PeptideAbundance','min'), Abe_max=('PeptideAbundance','max'),\
                                                                    Abe_mean=('PeptideAbundance','mean'), Abe_std=('PeptideAbundance','std'))\
                    .reset_index()

    df_proteins = pd.merge(proteins, df_proteins_fts[classes], on = 'UniProt', how = 'left')
    proteins_UniProt_ft = df_proteins.groupby('visit_id').agg(proteins_updrs_1_min=('updrs_1_sum','min'), proteins_updrs_1_max=('updrs_1_sum','max'),\
                                                              proteins_updrs_1_mean=('updrs_1_sum','mean'), proteins_updrs_1_std=('updrs_1_sum','std'))\
                    .reset_index()
    df_peptides = pd.merge(peptides, df_peptides_fts[classes], on = 'Peptide', how = 'left')
    peptides_ft = df_peptides.groupby('visit_id').agg(peptides_updrs_1_min=('updrs_1_sum','min'), peptides_updrs_1_max=('updrs_1_sum','max'),\
                                                              peptides_updrs_1_mean=('updrs_1_sum','mean'), peptides_updrs_1_std=('updrs_1_sum','std'))\
                    .reset_index()

    df = pd.merge(df, proteins_npx_ft, on = 'visit_id', how = 'left')
    df = pd.merge(df, peptides_PeptideAbundance_ft, on = 'visit_id', how = 'left')
    df = pd.merge(df, proteins_UniProt_ft, on = 'visit_id', how = 'left')
    df = pd.merge(df, peptides_ft, on = 'visit_id', how = 'left')
    df = df.fillna(df.mean())
    return df

# 5. Training

In [14]:
model = {}

for i in range(3):
    print('--------------------------------------------------------')
    print('Model {0}'.format(i + 1))
    train_0 = features(df_lst[i], train_proteins, train_peptides, i)
    X = train_0.drop(columns = ['visit_id','updrs_{0}'.format(i + 1)], axis = 1).values
    y = train_0['updrs_{0}'.format(i + 1)].values
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=config.FRAC, random_state=config.RANDOM_STATE,shuffle=True)

    models = [] 
    preds = []
    scores = 0.0   
    best_score = np.inf

    gf = GroupShuffleSplit(n_splits=config.N_FOLD,test_size=0.2,random_state=config.RANDOM_STATE)
    groups = X_train[:,0]

    for fold, (trn_idx, val_idx) in enumerate(gf.split(X_train,y_train,groups)):

        X_trn, y_trn = X_train[trn_idx,:], y_train[trn_idx]
        X_val, y_val = X_train[val_idx,:], y_train[val_idx]

        #pool
        train_pool = Pool(X_trn, y_trn)
        validate_pool = Pool(X_val, y_val)

        model = CatBoostRegressor(random_seed=config.RANDOM_STATE,logging_level='Silent',eval_metric = config.METRIC)
        model.fit(
                train_pool
                ,eval_set = validate_pool    # 検証用データ
                ,early_stopping_rounds = 10  # 10回以上精度が改善しなければ中止
                ,use_best_model=True       # 最も精度が高かったモデルを使用するかの設定
                ,plot=False
                ,verbose=False)                 # 誤差の推移を描画するか否かの設定

        val_pred = model.predict(X_val)
        score = score_cal(y_val,val_pred)

        scores += score / config.N_FOLD
        models.append(model)

        if score < best_score:
            best_score = score
            best_model = model
            best_fold = fold+1


        print("Fold :" , fold+1)

    print(f"fold average score : {scores:.4f}")
    print(f"fold best score : {best_score:.4f}"," FOLD ",best_fold)
    #****************************
    # test データ
    preds = np.zeros(len(X_test))
    # 各モデルで推論
    for model in models:
        pred = model.predict(X_test)
        preds += pred / len(models)

    test_score = score_cal(y_test, preds)
    print(f"Test data average score : {test_score:.4f}"," seed ", config.RANDOM_STATE)
    pred_best = best_model.predict(X_test)
    print(f"Test data best socre : {score_cal(y_test, pred_best):.4f}"," seed ", config.RANDOM_STATE)
    #*****************************
    
    if i+1 == 1:
        model_1 = best_model
    if i+1 == 2:
        model_2 = best_model
    if i+1 == 3:
        model_3 = best_model

--------------------------------------------------------
Model 1
Fold : 1
Fold : 2
Fold : 3
Fold : 4
Fold : 5
fold average score : 3.8979
fold best score : 2.8700  FOLD  2
Test data average score : 5.8102  seed  100
Test data best socre : 5.7723  seed  100
--------------------------------------------------------
Model 2
Fold : 1
Fold : 2
Fold : 3
Fold : 4
Fold : 5
fold average score : 4.8148
fold best score : 3.4295  FOLD  5
Test data average score : 4.2400  seed  100
Test data best socre : 4.2389  seed  100
--------------------------------------------------------
Model 3
Fold : 1
Fold : 2
Fold : 3
Fold : 4
Fold : 5
fold average score : 8.9630
fold best score : 7.7403  FOLD  3
Test data average score : 8.5532  seed  100
Test data best socre : 8.5857  seed  100


**Note** 'updrs_3 is the same for all visited_id

# 6. Inference

In [15]:
import amp_pd_peptide
env = amp_pd_peptide.make_env()
iter_test = env.iter_test()

In [16]:
def map_test(x):
    updrs = x.split('_')[2] + '_' + x.split('_')[3]
    month = int(x.split('_plus_')[1].split('_')[0])
    visit_id = x.split('_')[0] + '_' + x.split('_')[1]
    # set all predictions 0 where updrs equals 'updrs_4'
    if updrs=='updrs_4':
        rating = 0
    elif updrs =='updrs_1':
        rating = df[df.visit_id == visit_id]['pred0'].values[0]
    elif updrs =='updrs_2':
        rating = df[df.visit_id == visit_id]['pred1'].values[0]
    else:
        rating = df[df.visit_id == visit_id]['pred2'].values[0]
    return rating

counter = 0
# The API will deliver four dataframes in this specific order:
for (test, test_peptides, test_proteins, sample_submission) in iter_test:
    df = test[['visit_id']].drop_duplicates('visit_id')
    
    pred_0 = features(df[['visit_id']], test_proteins, test_peptides, 0)
    pred_0 = model_1.predict(pred_0.drop(columns = ['visit_id'], axis = 1))
    df['pred0'] = pred_0
    
    pred_1 = features(df[['visit_id']], test_proteins, test_peptides, 1)
    pred_1 = model_2.predict(pred_1.drop(columns = ['visit_id'], axis = 1))
    df['pred1'] = pred_1
    
    pred_2 = features(df[['visit_id']], test_proteins, test_peptides, 2)
    pred_2 = model_3.predict(pred_2.drop(columns = ['visit_id'], axis = 1))
    df['pred2'] = pred_2
    
    sample_submission['rating'] = sample_submission['prediction_id'].apply(map_test)
    env.predict(sample_submission)
    
    if counter == 0:
        display(test)
        display(sample_submission)
        
    counter += 1

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


Unnamed: 0,visit_id,visit_month,patient_id,updrs_test,row_id
0,3342_0,0,3342,updrs_1,3342_0_updrs_1
1,3342_0,0,3342,updrs_2,3342_0_updrs_2
2,3342_0,0,3342,updrs_3,3342_0_updrs_3
3,3342_0,0,3342,updrs_4,3342_0_updrs_4
4,50423_0,0,50423,updrs_1,50423_0_updrs_1
5,50423_0,0,50423,updrs_2,50423_0_updrs_2
6,50423_0,0,50423,updrs_3,50423_0_updrs_3
7,50423_0,0,50423,updrs_4,50423_0_updrs_4


Unnamed: 0,prediction_id,rating
0,3342_0_updrs_1_plus_0_months,5.46551
1,3342_0_updrs_1_plus_6_months,5.46551
2,3342_0_updrs_1_plus_12_months,5.46551
3,3342_0_updrs_1_plus_24_months,5.46551
4,3342_0_updrs_2_plus_0_months,6.75768
5,3342_0_updrs_2_plus_6_months,6.75768
6,3342_0_updrs_2_plus_12_months,6.75768
7,3342_0_updrs_2_plus_24_months,6.75768
8,3342_0_updrs_3_plus_0_months,18.986881
9,3342_0_updrs_3_plus_6_months,18.986881
