# Import Libraries

In [2]:
#!pip install pytorch_tabnet

In [3]:
import os
import warnings
warnings.filterwarnings("ignore", message="Device used : cpu", category=UserWarning)

import numpy as np
import pandas as pd
import polars as pl


# visual
import pandas_profiling
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import mean_squared_error

import lightgbm as lgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor,Pool
import xgboost as xgb

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Configuration

In [4]:
class config:
    """Experiment-wide settings shared by the cells below."""

    # Evaluation metric name understood by score_cal: 'RMSE' or 'SMAPE'.
    METRIC = 'RMSE'

    # Seed passed to the hold-out split and to CatBoost.
    RANDOM_STATE = 100

    # Fraction of rows held out for validation by train_test_split.
    FRAC = 0.2

    # Number of cross-validation folds.
    N_FOLD = 5
    
class paths:
    """Locations of the competition CSV files for the current runtime.

    The data root is chosen from the environment:
      * Kaggle (KAGGLE_URL_BASE is set)  -> the competition input directory
      * Colab  (COLAB_GPU is set)        -> the Drive folder
      * anything else                    -> $AMP_DATA_DIR, or "." when unset
    """
    # Fallback for runtimes that are neither Kaggle nor Colab. Without it
    # common_path stays unbound and the class body fails with NameError.
    common_path = os.environ.get("AMP_DATA_DIR", ".")

    # True when running on Kaggle
    if 'KAGGLE_URL_BASE' in os.environ:
        common_path = "/kaggle/input/amp-parkinsons-disease-progression-prediction"

    # True when running on Google Colaboratory (checked last, so it wins if both are set)
    if 'COLAB_GPU' in os.environ:
        common_path = "/content/drive/MyDrive/AMP"

    SAMPLE_SUBMISSION = common_path + "/example_test_files/sample_submission.csv"
    SUPPLEMENTAL_CLINICAL_DATA = common_path + "/supplemental_clinical_data.csv"
    TRAIN_CLINICAL_DATA = common_path + "/train_clinical_data.csv"
    TRAIN_PEPTIDES = common_path + "/train_peptides.csv"
    TRAIN_PROTEINS = common_path + "/train_proteins.csv"
    TEST_CLINICAL_DATA = common_path + "/example_test_files/test.csv"
    TEST_PEPTIDES = common_path + "/example_test_files/test_peptides.csv"
    TEST_PROTEINS = common_path + "/example_test_files/test_proteins.csv"

# Function Definition

## display_feature_importance 

In [5]:
def display_feature_importance(feature_list,model):
    """Display the 20 features with the highest mean importance.

    Args:
        feature_list: feature names, in the same order as the model's inputs.
        model: fitted estimator exposing ``feature_importances_``.
    """
    # Pair every feature name with its importance value.
    importance_table = pd.DataFrame({
        'feature': feature_list,
        'importance': model.feature_importances_,
    })

    print('****************************')

    # Average per feature name, then keep the 20 largest.
    mean_importance = importance_table.groupby("feature")["importance"].mean()
    top_features = mean_importance.sort_values(ascending=False)[:20]
    display(top_features)

    # Optional bar chart of the top 30 features (disabled):
    # order = list(importance_table.groupby("feature")["importance"].mean().sort_values(ascending=False).index)[:30]
    # plt.figure(figsize=(10, 10))
    # sns.barplot(x="importance",y="feature",data=importance_table,order=order)
    # plt.title('importance')
    # plt.tight_layout()
    # plt.show()

## score_cal

In [6]:
def score_cal(y_true, y_pred,metric = config.METRIC):
  """Score predictions against the observed values.

  Args:
    y_true: 1-D array-like of observed values.
    y_pred: 1-D array-like of predicted values, same length as ``y_true``.
    metric: 'SMAPE' or 'RMSE'. The default is ``config.METRIC`` as it was
      when this function was defined.

  Returns:
    'SMAPE': 100 * mean(|t - p| / ((|t| + |p|) / 2)), computed after adding 1
      to both inputs; elements where both shifted values are 0 contribute 0.
    'RMSE': square root of sklearn's ``mean_squared_error``.

  Raises:
    ValueError: if ``metric`` is neither 'SMAPE' nor 'RMSE'. Previously this
      surfaced as an UnboundLocalError on the return statement.
  """
  if metric == 'SMAPE':
    # Convert up front so plain lists work and the arithmetic is positional.
    shifted_true = np.asarray(y_true) + 1
    shifted_pred = np.asarray(y_pred) + 1
    num = np.abs(shifted_true - shifted_pred)
    dem = (np.abs(shifted_true) + np.abs(shifted_pred)) / 2
    # Only divide where the denominator cannot be zero.
    pos_ind = (shifted_true != 0) | (shifted_pred != 0)
    smap = np.zeros(len(shifted_true))
    smap[pos_ind] = num[pos_ind] / dem[pos_ind]
    return 100 * np.mean(smap)

  if metric == 'RMSE':
    return np.sqrt(mean_squared_error(y_true,y_pred))

  raise ValueError("Unknown metric {0!r}; expected 'RMSE' or 'SMAPE'".format(metric))

## fit_catboost_fold1

In [7]:
def fit_catboost_fold1(X,y):
  """Fit a CatBoostRegressor on one random hold-out split and print its score.

  The split size is ``config.FRAC`` and both the split and the model are
  seeded with ``config.RANDOM_STATE``. The held-out part is used only for
  scoring with ``score_cal``; it is not passed to ``fit``.

  Args:
    X: feature matrix.
    y: target values aligned with ``X``.

  Returns:
    (model, test_score): the fitted regressor and its hold-out score.
  """
  split = train_test_split(
      X, y,
      test_size=config.FRAC,
      random_state=config.RANDOM_STATE,
      shuffle=True,
  )
  X_train, X_val, y_train, y_val = split

  train_pool = Pool(X_train, y_train)
  model = CatBoostRegressor(
      random_seed=config.RANDOM_STATE,
      logging_level='Silent',
      eval_metric=config.METRIC,
  )
  model.fit(train_pool, plot=False, verbose=False)

  val_pred = model.predict(X_val)
  test_score = score_cal(y_val, val_pred)
  print('****************************')
  print(f"Test data average score : {test_score:.4f}"," seed ", config.RANDOM_STATE)

  return model,test_score

# Load Data

In [36]:
# Training-side tables (read in the order clinical, peptides, proteins, supplemental).
train_clinical, train_peptides, train_proteins, supplemental_clinical = (
    pd.read_csv(csv_path)
    for csv_path in (
        paths.TRAIN_CLINICAL_DATA,
        paths.TRAIN_PEPTIDES,
        paths.TRAIN_PROTEINS,
        paths.SUPPLEMENTAL_CLINICAL_DATA,
    )
)
# Reference shapes: Proteins (232741, 5), Peptides (981834, 6), Clinical (2615, 8)
for label, frame in (
    ('Proteins shape:', train_proteins),
    ('Peptides shape:', train_peptides),
    ('Clinical shape:', train_clinical),
    ('supplemental_clinical shape:', supplemental_clinical),
):
    print(label, frame.shape)

# Example test files shipped with the competition.
test_clinical, test_peptides, test_proteins, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        paths.TEST_CLINICAL_DATA,
        paths.TEST_PEPTIDES,
        paths.TEST_PROTEINS,
        paths.SAMPLE_SUBMISSION,
    )
)
for label, frame in (
    ('Proteins shape:', test_proteins),
    ('Peptides shape:', test_peptides),
    ('Clinical shape:', test_clinical),
):
    print(label, frame.shape)


Unnamed: 0_level_0,visit_month,patient_id,UniProt,NPX,group_key
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3342_6,6,3342,Q9Y6R7,295860000.0,6
50423_0,0,50423,Q9Y6R7,439208000.0,0


# Data Cleaning

In [9]:
# Encode the 'On' / 'Off' strings as 1 / 0, in place, in both clinical tables.
for clinical_frame in (train_clinical, supplemental_clinical):
    clinical_frame.replace({'On':1,'Off':0},inplace=True)

# Missing values: drop training visits lacking any of updrs_1..3 (to revise).
train_clinical.dropna(subset=['updrs_1','updrs_2','updrs_3'],inplace=True)

# Experiment: log-transform the abundance columns.
# NOTE: this overwrites the columns, so re-running the cell applies the log twice.
for frame, column in (
    (train_peptides, 'PeptideAbundance'),
    (test_peptides, 'PeptideAbundance'),
    (train_proteins, 'NPX'),
    (test_proteins, 'NPX'),
):
    frame[column] = frame[column].apply(np.log)
#train_peptides[['PeptideAbundance']].describe()
#train_proteins[['NPX']].describe()

# common

In [10]:
# Keep only the key columns (visit_id, patient_id, visit_month); this frame is
# passed as df_key to the feature builders below for merging and sorting.
train_sort = train_clinical[['visit_id','patient_id','visit_month']]

# Feature Engineering

## target encoding

In [11]:
# Independent copies of the cleaned tables, used only by the target-encoding cells.
train_clinical_te, train_peptides_te, train_proteins_te, supplemental_te = (
    frame.copy()
    for frame in (train_clinical, train_peptides, train_proteins, supplemental_clinical)
)

### pep,uni <- updrs1-4

In [12]:
#与えられたdfを用いてvisit_idに集約
def pep_te_agg(df_key,df_peptide):
  """Aggregate the per-peptide target statistics up to visit level.

  Every (visit_id, Peptide) row of ``df_peptide`` is joined with the global
  table ``train_pep_te_agg_`` (defined in a later cell, so it must exist when
  this is called). The joined statistics are then summarised per visit_id
  with mean and standard deviation.

  Args:
    df_key: frame with at least visit_id and patient_id; its rows define the output rows.
    df_peptide: frame with at least visit_id and Peptide.

  Returns:
    ``df_key`` left-merged with the per-visit statistics, missing values
    forward-filled within each patient_id group. The features shown in the
    notebook output do not carry patient_id after this group-wise fill.
  """
  visit_peptides = pl.DataFrame(df_peptide[['visit_id','Peptide']])
  encoded = visit_peptides.join(train_pep_te_agg_,on='Peptide',how='left').drop('Peptide')

  per_visit = encoded.groupby(['visit_id']).agg(
      [
          pl.mean("*").suffix("_mean_by_id"),
          pl.std("*").suffix("_std_by_id"),
      ]
  ).to_pandas()

  keyed = df_key.merge(per_visit,on='visit_id',how='left')
  return keyed.groupby('patient_id').fillna(method='ffill')

def pro_te_agg(df_key,df_protein):
  """Aggregate the per-protein target statistics up to visit level.

  Every (visit_id, UniProt) row of ``df_protein`` is joined with the global
  table ``train_uni_te_agg_`` (defined in a later cell, so it must exist when
  this is called). The joined statistics are then summarised per visit_id
  with mean and standard deviation.

  Args:
    df_key: frame with at least visit_id and patient_id; its rows define the output rows.
    df_protein: frame with at least visit_id and UniProt.

  Returns:
    ``df_key`` left-merged with the per-visit statistics, missing values
    forward-filled within each patient_id group.
  """
  visit_proteins = pl.DataFrame(df_protein[['visit_id','UniProt']])
  encoded = visit_proteins.join(train_uni_te_agg_,on='UniProt',how='left').drop('UniProt')

  per_visit = encoded.groupby(['visit_id']).agg(
      [
          pl.mean("*").suffix("_mean_by_id"),
          pl.std("*").suffix("_std_by_id"),
      ]
  ).to_pandas()

  keyed = df_key.merge(per_visit,on='visit_id',how='left')
  return keyed.groupby('patient_id').fillna(method='ffill')

In [13]:
# Attach each visit's clinical targets to every peptide / protein measured at that visit.
train_pep_te = pd.merge(
    train_peptides_te[['visit_id','Peptide']],
    train_clinical_te[['visit_id','updrs_1','updrs_2','updrs_3','updrs_4','upd23b_clinical_state_on_medication']],
    on='visit_id',
    how='inner',
)
train_uni_te = pd.merge(
    train_proteins_te[['visit_id','UniProt']],
    train_clinical_te[['visit_id','updrs_1','updrs_2','updrs_3','updrs_4','upd23b_clinical_state_on_medication']],
    on='visit_id',
    how='inner',
)

# Target statistics per Peptide: mean, std, median, max (min is deliberately left out).
train_pep_te_agg_ = pl.DataFrame(train_pep_te.drop(columns=['visit_id'])).groupby(['Peptide']).agg(
    [
        stat("*").suffix("_{0}_by_Pep".format(label))
        for label, stat in (('mean', pl.mean), ('std', pl.std), ('median', pl.median), ('max', pl.max))
    ]
)

# Target statistics per UniProt: mean, std, max (min is deliberately left out).
train_uni_te_agg_ = pl.DataFrame(train_uni_te.drop(columns=['visit_id'])).groupby(['UniProt']).agg(
    [
        stat("*").suffix("_{0}_by_Uni".format(label))
        for label, stat in (('mean', pl.mean), ('std', pl.std), ('max', pl.max))
    ]
)

# Visit-level versions of both tables for the training visits.
train_pep_te_agg_fill = pep_te_agg(df_key=train_sort, df_peptide=train_peptides_te)
train_pro_te_agg_fill = pro_te_agg(df_key=train_sort, df_protein=train_proteins_te)

### updrs <- updrs

In [14]:
# Pool the clinical and supplemental visits for the score-to-score statistics.
train_clisup_ = pd.concat([train_clinical_te,supplemental_te])
#train_clisup_ = train_clinical_te.copy() # use this instead to leave the supplemental data out

#train_clisup = train_clisup_[['updrs_1','updrs_2','updrs_3','updrs_4']]
train_clisup = train_clisup_[['updrs_1','updrs_2','updrs_3']]

# For each score updrs_k: group by its value and summarise the other two scores.
# The tables go into an explicit dict instead of being written into locals(),
# which Python does not guarantee to work and which breaks inside a function.
te_updrs_tables = {}
for i in range(3) :
  key_col = 'updrs_{0}'.format(i+1)
  train_clisup_tmp = pl.DataFrame(train_clisup).groupby([key_col]).agg(
    [
        pl.mean("*").suffix("_mean_by_"+key_col),
        #pl.std("*").suffix("_std_by_"+key_col),
        pl.median("*").suffix("_median_by_"+key_col),
        #pl.max("*").suffix("_max_by_"+key_col),
        pl.min("*").suffix("_min_by_"+key_col),
    ]
  ).to_pandas()
  # Drop rows containing a missing value.
  te_updrs_tables[key_col] = train_clisup_tmp.dropna()

# Names used by the training and validation cells further down.
te_updrs_1 = te_updrs_tables['updrs_1']
te_updrs_2 = te_updrs_tables['updrs_2']
te_updrs_3 = te_updrs_tables['updrs_3']

te_updrs_1.head()

Unnamed: 0,updrs_1,updrs_2_mean_by_updrs_1,updrs_3_mean_by_updrs_1,updrs_2_median_by_updrs_1,updrs_3_median_by_updrs_1,updrs_2_min_by_updrs_1,updrs_3_min_by_updrs_1
0,5.0,5.421546,19.676815,5.0,19.0,0.0,0.0
1,14.0,11.177778,26.255556,11.0,27.0,1.0,0.0
2,18.0,15.975,32.425,16.0,33.5,4.0,7.0
3,17.0,15.509091,31.218182,16.0,30.0,3.0,5.0
4,27.0,20.166667,40.333333,18.0,36.0,12.0,21.0


### month <- updrs1-4

In [15]:
# Target statistics per visit_month over the pooled clinical + supplemental visits.
train_clisup = train_clisup_[['visit_month','updrs_1','updrs_2','updrs_3','updrs_4']]

train_te_month_agg = (
    pl.DataFrame(train_clisup)
    .groupby(['visit_month'])
    .agg(
        [
            # mean, std and median of every score column (min is left out)
            stat("*").suffix("_{0}_by_month".format(label))
            for label, stat in (('mean', pl.mean), ('std', pl.std), ('median', pl.median))
        ]
    )
    .to_pandas()
)

train_te_month_agg.head()

Unnamed: 0,visit_month,updrs_1_mean_by_month,updrs_2_mean_by_month,updrs_3_mean_by_month,updrs_4_mean_by_month,updrs_1_std_by_month,updrs_2_std_by_month,updrs_3_std_by_month,updrs_4_std_by_month,updrs_1_median_by_month,updrs_2_median_by_month,updrs_3_median_by_month,updrs_4_median_by_month
0,84,7.768421,8.189474,22.589474,2.919355,5.680224,7.859531,18.387956,2.771627,7.0,8.0,21.0,3.0
1,96,7.714286,7.625,21.267857,4.37931,5.55866,8.203242,21.044102,2.932744,7.0,5.0,19.5,4.0
2,108,9.416667,8.166667,25.25,2.571429,8.836477,9.093787,21.62963,2.699206,5.5,6.5,34.5,2.0
3,12,5.745798,5.766807,19.726891,0.957364,4.617642,5.057379,13.349433,2.152305,5.0,5.0,19.0,0.0
4,48,7.642487,7.11399,19.378238,1.985185,6.228434,7.158726,16.538684,3.056642,6.0,5.0,19.0,0.0


## Protein

In [16]:
def feature_proteins(df_key,df_proteins):
  """Build visit-level protein features for the visits listed in ``df_key``.

  Features are the NPX mean / std / median within each visit and within each
  UniProt (the per-UniProt values are then averaged over the visit's rows),
  plus the target-encoding columns returned by ``pro_te_agg``.

  Args:
    df_key: frame with visit_id, patient_id and visit_month; defines the output rows.
    df_proteins: frame with at least visit_id, UniProt and NPX.

  Returns:
    One row per ``df_key`` row, ordered by patient_id then visit_month, with
    missing protein features forward-filled within each patient.
  """
  # Window statistics, one value per input row. visit_id is carried through
  # the select so rows are matched by value; the earlier pandas .join matched
  # on index and misaligned whenever df_proteins lacked a default RangeIndex.
  df_proteins_ = pl.DataFrame(df_proteins).select([
      pl.col('visit_id')
    ,pl.mean('NPX').over('visit_id').alias('NPX_mean_by_id')
    ,pl.std('NPX').over('visit_id').alias('NPX_std_by_id')
    ,pl.median('NPX').over('visit_id').alias('NPX_median_by_id')
    ,pl.mean('NPX').over('UniProt').alias('NPX_mean_by_Uni')
    ,pl.std('NPX').over('UniProt').alias('NPX_std_by_Uni')
    ,pl.median('NPX').over('UniProt').alias('NPX_median_by_Uni')
  ]).to_pandas()

  # Collapse to one row per visit, then lay the result onto the key frame.
  visit_means = df_proteins_.groupby('visit_id').mean().reset_index()
  df_proteins_agg = (
      df_key.merge(visit_means,on='visit_id',how='left')
      .sort_values(['patient_id','visit_month'])
      .reset_index(drop=True)
      .drop(['visit_month'],axis=1)
  )

  # Fill missing values with the previous visit's value of the same patient.
  df_proteins_agg_fill = df_proteins_agg.groupby('patient_id').fillna(method='ffill')
  df_proteins_toall = df_proteins_agg_fill.merge(pro_te_agg(df_key,df_proteins),on='visit_id',how='left').copy()

  return df_proteins_toall

# Build the protein features for the training visits and preview the first rows.
train_proteins_feature = feature_proteins(train_sort,train_proteins)
train_proteins_feature.head()

Unnamed: 0,visit_id,NPX_mean_by_id,NPX_std_by_id,NPX_median_by_id,NPX_mean_by_Uni,NPX_std_by_Uni,NPX_median_by_Uni,visit_month,updrs_1_mean_by_Uni_mean_by_id,updrs_2_mean_by_Uni_mean_by_id,...,updrs_1_std_by_Uni_std_by_id,updrs_2_std_by_Uni_std_by_id,updrs_3_std_by_Uni_std_by_id,updrs_4_std_by_Uni_std_by_id,upd23b_clinical_state_on_medication_std_by_Uni_std_by_id,updrs_1_max_by_Uni_std_by_id,updrs_2_max_by_Uni_std_by_id,updrs_3_max_by_Uni_std_by_id,updrs_4_max_by_Uni_std_by_id,upd23b_clinical_state_on_medication_max_by_Uni_std_by_id
0,55_0,12.070977,2.034095,11.726075,11.782605,0.418512,11.830588,0,6.540107,5.780976,...,0.062679,0.0788,0.155715,0.082443,0.001159,1.069003,0.263228,2.332944,1.618408,0.0
1,55_3,12.070977,2.034095,11.726075,11.782605,0.418512,11.830588,3,6.540107,5.780976,...,0.062679,0.0788,0.155715,0.082443,0.001159,1.069003,0.263228,2.332944,1.618408,0.0
2,55_6,12.018358,2.053036,11.767932,11.79064,0.418797,11.838714,6,6.541114,5.781368,...,0.062071,0.078784,0.156349,0.079954,0.001147,1.071346,0.271265,2.227249,1.606156,0.0
3,55_9,12.018358,2.053036,11.767932,11.79064,0.418797,11.838714,9,6.541114,5.781368,...,0.062071,0.078784,0.156349,0.079954,0.001147,1.071346,0.271265,2.227249,1.606156,0.0
4,55_12,12.084764,2.024976,11.731776,11.765437,0.417521,11.813118,12,6.534384,5.774895,...,0.066756,0.083855,0.162132,0.082774,0.001186,1.109143,0.276098,2.470506,1.593888,0.0


## Peptide

In [17]:
def feature_peptide(df_key,df_peptides):
  """Build visit-level peptide features for the visits listed in ``df_key``.

  Features are the PeptideAbundance mean / std / median within each visit and
  within each Peptide (the per-Peptide values are then averaged over the
  visit's rows), plus the target-encoding columns returned by ``pep_te_agg``.

  Args:
    df_key: frame with visit_id, patient_id and visit_month; defines the output rows.
    df_peptides: frame with at least visit_id, Peptide and PeptideAbundance.

  Returns:
    One row per ``df_key`` row, ordered by patient_id then visit_month, with
    missing peptide features forward-filled within each patient.
  """
  # Window statistics, one value per input row. visit_id is carried through
  # the select so rows are matched by value; the earlier pandas .join matched
  # on index and misaligned whenever df_peptides lacked a default RangeIndex.
  df_peptides_ = pl.DataFrame(df_peptides).select([
      pl.col('visit_id')
    ,pl.mean('PeptideAbundance').over('visit_id').alias('Abu_mean_by_id')
    ,pl.std('PeptideAbundance').over('visit_id').alias('Abu_std_by_id')
    ,pl.median('PeptideAbundance').over('visit_id').alias('Abu_median_by_id')
    ,pl.mean('PeptideAbundance').over('Peptide').alias('Abu_mean_by_Pep')
    ,pl.std('PeptideAbundance').over('Peptide').alias('Abu_std_by_Pep')
    ,pl.median('PeptideAbundance').over('Peptide').alias('Abu_median_by_Pep')
  ]).to_pandas()

  # Collapse to one row per visit, then lay the result onto the key frame.
  visit_means = df_peptides_.groupby('visit_id').mean().reset_index()
  df_peptides_agg_ = (
      df_key.merge(visit_means,on='visit_id',how='left')
      .sort_values(['patient_id','visit_month'])
      .reset_index(drop=True)
      .drop(['visit_month'],axis=1)
  )

  # Fill missing values with the previous visit's value of the same patient.
  df_peptides_agg_fill = df_peptides_agg_.groupby('patient_id').fillna(method='ffill')
  df_peptides_agg_toall = df_peptides_agg_fill.merge(pep_te_agg(df_key,df_peptides),on='visit_id',how='left').copy()
  return df_peptides_agg_toall

# Build the peptide features for the training visits and preview the first rows.
train_peptides_feature = feature_peptide(train_sort,train_peptides)
train_peptides_feature.head()

Unnamed: 0,visit_id,Abu_mean_by_id,Abu_std_by_id,Abu_median_by_id,Abu_mean_by_Pep,Abu_std_by_Pep,Abu_median_by_Pep,visit_month,updrs_1_mean_by_Pep_mean_by_id,updrs_2_mean_by_Pep_mean_by_id,...,updrs_1_median_by_Pep_std_by_id,updrs_2_median_by_Pep_std_by_id,updrs_3_median_by_Pep_std_by_id,updrs_4_median_by_Pep_std_by_id,upd23b_clinical_state_on_medication_median_by_Pep_std_by_id,updrs_1_max_by_Pep_std_by_id,updrs_2_max_by_Pep_std_by_id,updrs_3_max_by_Pep_std_by_id,updrs_4_max_by_Pep_std_by_id,upd23b_clinical_state_on_medication_max_by_Pep_std_by_id
0,55_0,11.588525,1.613698,11.441803,11.319405,0.420705,11.35889,0,6.52462,5.777552,...,0.475611,0.159602,0.426175,0.0,0.484294,1.3171,0.212778,3.240491,1.951132,0.0
1,55_3,11.588525,1.613698,11.441803,11.319405,0.420705,11.35889,3,6.52462,5.777552,...,0.475611,0.159602,0.426175,0.0,0.484294,1.3171,0.212778,3.240491,1.951132,0.0
2,55_6,11.530496,1.629913,11.348227,11.323267,0.420642,11.362635,6,6.526324,5.779549,...,0.476213,0.155831,0.415266,0.0,0.483457,1.291932,0.212234,3.135952,1.944267,0.0
3,55_9,11.530496,1.629913,11.348227,11.323267,0.420642,11.362635,9,6.526324,5.779549,...,0.476213,0.155831,0.415266,0.0,0.483457,1.291932,0.212234,3.135952,1.944267,0.0
4,55_12,11.610614,1.617102,11.424916,11.304093,0.420415,11.34343,12,6.523275,5.77642,...,0.474786,0.161945,0.450148,0.0,0.483756,1.34352,0.215993,3.263424,1.952518,0.0


## Clinical

In [18]:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.ewm.html

# Moving averages of updrs_1..3 within each patient.

train_clinical_rolling = train_clinical.copy()

for i in range(3):
  score_col = 'updrs_{0}'.format(i+1)
  scores_by_patient = train_clinical_rolling.groupby('patient_id')[score_col]

  # transform() returns values on the frame's own index, so the assignments
  # below line up row by row. The earlier reset_index(drop=True) renumbered
  # the results 0..n-1, which no longer matches train_clinical's index once
  # dropna has removed rows: values shifted and the tail became NaN.
  rolling_mean = scores_by_patient.transform(lambda x: x.rolling(3,min_periods=1).mean())
  ewm_mean = scores_by_patient.transform(lambda x: x.ewm(halflife=10).mean())

  train_clinical_rolling[score_col + '_rolling'] = rolling_mean
  train_clinical_rolling[score_col + '_ewm'] = ewm_mean
train_clinical_rolling.head()

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,updrs_1_rolling,updrs_1_ewm,updrs_2_rolling,updrs_2_ewm,updrs_3_rolling,updrs_3_ewm
0,55_0,55,0,10.0,6.0,15.0,,,10.0,10.0,6.0,6.0,15.0,15.0
1,55_3,55,3,10.0,7.0,25.0,,,10.0,10.0,6.5,6.517322,20.0,20.173217
2,55_6,55,6,8.0,10.0,34.0,,,9.333333,9.286627,7.666667,7.759546,24.666667,25.105042
3,55_9,55,9,8.0,9.0,30.0,0.0,1.0,8.666667,8.930796,8.666667,8.102607,29.666667,26.458798
4,55_12,55,12,10.0,10.0,41.0,0.0,1.0,8.666667,9.175259,9.666667,8.536426,35.0,29.783493


## supplemental_clinical

In [19]:
# No supplemental-specific feature engineering is implemented yet; preview only.
supplemental_clinical.head()

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,35_0,35,0,5.0,3.0,16.0,0.0,
1,35_36,35,36,6.0,4.0,20.0,0.0,
2,75_0,75,0,4.0,6.0,26.0,0.0,
3,75_36,75,36,1.0,8.0,38.0,0.0,1.0
4,155_0,155,0,,,0.0,,


# Train ALL: build the training dataset

In [20]:
def make_df_all(df_cli,df_pro,df_pep):
  """Left-join the protein and peptide features onto the clinical table by visit_id.

  The peptide features are first left-joined onto the protein features, so a
  visit that is missing from ``df_pro`` receives no peptide features either.
  """
  protein_peptide = df_pro.merge(df_pep,on='visit_id',how='left')
  return df_cli.merge(protein_peptide,on='visit_id',how='left')

# Attach the per-visit_month target statistics to every clinical row.
train_clinical_toall = pd.merge(
    train_clinical,
    train_te_month_agg,
    on='visit_month',
    how='left',
)

# Debug helper: train_clinical_toall.shape,train_proteins_feature.shape,train_peptides_feature.shape

# Combine clinical, protein and peptide features into the training table.
train_all = make_df_all(
    df_cli=train_clinical_toall,
    df_pro=train_proteins_feature,
    df_pep=train_peptides_feature,
)
print('shape',train_all.shape)

train_all.head()

# Further debug helpers:
#train_all.describe()
#train_all.columns
#train_proteins_feature.columns
#train_peptides_feature.columns

shape (2588, 104)


Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,updrs_1_mean_by_month,updrs_2_mean_by_month,...,updrs_1_median_by_Pep_std_by_id,updrs_2_median_by_Pep_std_by_id,updrs_3_median_by_Pep_std_by_id,updrs_4_median_by_Pep_std_by_id,upd23b_clinical_state_on_medication_median_by_Pep_std_by_id,updrs_1_max_by_Pep_std_by_id,updrs_2_max_by_Pep_std_by_id,updrs_3_max_by_Pep_std_by_id,updrs_4_max_by_Pep_std_by_id,upd23b_clinical_state_on_medication_max_by_Pep_std_by_id
0,55_0,55,0,10.0,6.0,15.0,,,5.725108,5.734848,...,0.475611,0.159602,0.426175,0.0,0.484294,1.3171,0.212778,3.240491,1.951132,0.0
1,55_3,55,3,10.0,7.0,25.0,,,5.412281,6.596491,...,0.475611,0.159602,0.426175,0.0,0.484294,1.3171,0.212778,3.240491,1.951132,0.0
2,55_6,55,6,8.0,10.0,34.0,,,5.952164,6.678815,...,0.476213,0.155831,0.415266,0.0,0.483457,1.291932,0.212234,3.135952,1.944267,0.0
3,55_9,55,9,8.0,9.0,30.0,0.0,1.0,6.142857,7.204082,...,0.476213,0.155831,0.415266,0.0,0.483457,1.291932,0.212234,3.135952,1.944267,0.0
4,55_12,55,12,10.0,10.0,41.0,0.0,1.0,5.745798,5.766807,...,0.474786,0.161945,0.450148,0.0,0.483756,1.34352,0.215993,3.263424,1.952518,0.0


# Fit

In [21]:
# Copy of the assembled training table; the cells below read from df_train.
df_train = train_all.copy()

## Make Test_Val Data

In [22]:
# Pivot to one row per patient and one column per visit_month holding the
# visit_id; only visits with all four UPDRS scores present are kept.
df_train_pv = (
    df_train[['visit_id','patient_id','visit_month','updrs_1','updrs_2','updrs_3','updrs_4']]
    .dropna(axis=0)
    .pivot(index="patient_id",columns='visit_month',values='visit_id')
    .reset_index()
)
df_train_pv.head()

# Validation candidates: patients with a complete visit at months 0, 6, 12 and 24.
has_all_months = df_train_pv[[0, 6, 12, 24]].notna().all(axis=1)
test_patient_candidate_list = list(df_train_pv.loc[has_all_months, 'patient_id'].values)
len(test_patient_candidate_list)

56

In [23]:
# Hand-picked validation patients.
#test_patient_list = [1517,3863] # top 1-2 only
test_patient_list = [1517,3863,4923,7265] # top 1-4

# Same visit months as the TEST files.
test_month_list = [0,6,12,24]
#test_month_list = [0,3,6,9,12,18,24]

is_val_patient = df_train['patient_id'].isin(test_patient_list)
is_val_month = df_train['visit_month'].isin(test_month_list)
df_test_val = df_train[is_val_patient & is_val_month].reset_index(drop=True)

# Ground truth in long format: one row per (visit, score) with id '<visit_id>_updrs_<k>'.
df_test_val_true = pd.concat(
    [
        pd.DataFrame({
            'id': df_test_val['visit_id'] + '_updrs_{0}'.format(k),
            'updrs': df_test_val['updrs_{0}'.format(k)],
        })
        for k in (1, 2, 3)
    ],
    axis=0,
)

## predict test val

In [24]:
def predict_test_val(df_test_val,predict_order_list:list,models:list,te_updrs_s:list):
  """Predict updrs_1..3 for the validation rows, print the scores and show the errors.

  Scores are predicted one after another in ``predict_order_list`` order.
  Each prediction, plus the target-encoding columns looked up from its
  rounded value, is appended to the feature frame before the next score is
  predicted. The result is compared against the global ``df_test_val_true``.

  Args:
    df_test_val: validation rows with the same columns as the training table.
      It must have a default RangeIndex, because the predictions are joined
      back by index.
    predict_order_list: score numbers (1, 2 or 3) in prediction order; it
      must match the order used when the models were trained.
    models: fitted models; ``models[k-1]`` predicts updrs_k.
    te_updrs_s: target-encoding tables; ``te_updrs_s[k-1]`` is keyed by
      updrs_k (or by updrs_k_te if already renamed). The tables are no longer
      modified in place.

  Raises:
    ValueError: if ``predict_order_list`` contains anything other than 1, 2 or 3.
  """
  df_test_val_ = df_test_val.drop(['visit_id','patient_id','updrs_1','updrs_2','updrs_3','updrs_4','upd23b_clinical_state_on_medication'],axis=1)

  for order in predict_order_list:
    if order not in (1, 2, 3):
      raise ValueError('predict_order_list may only contain 1, 2 or 3, got {0!r}'.format(order))
    target = 'updrs_{0}'.format(order)
    model_ = models[order-1]
    # Rename on a copy: the caller's table keeps its column names, and a
    # table that was already renamed passes through unchanged.
    df_te_val = te_updrs_s[order-1].rename(columns={target:target+'_te'})

    test_X = df_test_val_.values
    df_test_val_[target] = model_.predict(test_X)
    df_test_val_[target+'_round'] = df_test_val_[target].round(0)
    # Join the target-encoding table on the rounded prediction.
    df_test_val_ = df_test_val_.merge(df_te_val,left_on=target+'_round',right_on=target+'_te',how='left')
    # Drop the helper columns used only for the join.
    df_test_val_ = df_test_val_.drop([target+'_round',target+'_te'],axis=1)

  df_test_val_pred_ = df_test_val[['visit_id']].join(df_test_val_[['updrs_1','updrs_2','updrs_3']])

  # Long format with the same id scheme and row order as df_test_val_true.
  df_test_val_pred = pd.concat(
      [
          pd.DataFrame({
              'id': df_test_val_pred_['visit_id'] + '_updrs_{0}'.format(i+1),
              'updrs': df_test_val_pred_['updrs_{0}'.format(i+1)],
          })
          for i in range(0,3)
      ],
      axis=0,
  )

  print('Score Test Val Predict : ',score_cal(df_test_val_true['updrs'].values,df_test_val_pred['updrs'].values).round(3))
  print('Score Test Val Predict SMAPE : ',score_cal(df_test_val_true['updrs'].values,df_test_val_pred['updrs'].values,'SMAPE').round(3))

  # Show how far each prediction is from the true value.
  df_test_val_truepred = df_test_val_true.merge(df_test_val_pred,on='id',how='left').rename(columns={'updrs_x':'true','updrs_y':'pred'}).round(1)
  df_test_val_truepred['diff'] = abs(df_test_val_truepred['true']-df_test_val_truepred['pred'])
  # A true value of 0 makes this ratio infinite or NaN; it is for display only.
  df_test_val_truepred['diff_ratio'] = round((df_test_val_truepred['diff'] / df_test_val_truepred['true']) * 100,0)
  display(df_test_val_truepred)

## fit catboost

In [25]:
# Order in which the scores are modelled; each fitted model's predictions
# become features for the scores that follow.
#predict_order_list = [2,1,3]
predict_order_list = [1,2,3]
#predict_order_list = [3,1,2]
#predict_order_list = [1]

scores = []
fitted_models = {}
# Target-encoding table for each score number.
te_tables = {1: te_updrs_1, 2: te_updrs_2, 3: te_updrs_3}

df_train_ = df_train.drop(['visit_id','patient_id','updrs_1','updrs_2','updrs_3','updrs_4','upd23b_clinical_state_on_medication'],axis=1)

for order in predict_order_list:
  target = 'updrs_{0}'.format(order)

  train_X = df_train_.values
  train_y = df_train[target].values
  model,score = fit_catboost_fold1(train_X,train_y)
  scores.append(score)
  fitted_models[order] = model

  # Show the feature importances of this model.
  display_feature_importance(df_train_.columns,model)

  # Store the model's prediction for this score as a feature.
  df_train_[target] = model.predict(train_X)
  df_train_[target+'_round'] = df_train_[target].round(0)
  # Join the target-encoding table on the rounded prediction. rename()
  # returns a new frame, so te_updrs_N keep their column names and the cell
  # can be re-run. The earlier inplace rename changed the shared tables.
  df_te = te_tables[order].rename(columns={target:target+'_te'})
  df_train_ = df_train_.merge(df_te,left_on=target+'_round',right_on=target+'_te',how='left')
  # Drop the helper columns used only for the join.
  df_train_ = df_train_.drop([target+'_round',target+'_te'],axis=1)
  print('train shape:',df_train_.shape)

# Names used by the validation cell; None for a score that was not trained.
model_1, model_2, model_3 = (fitted_models.get(k) for k in (1, 2, 3))

****************************
Test data average score : 3.7372  seed  100
****************************


feature
updrs_1_std_by_Pep_mean_by_id       6.387558
updrs_2_std_by_Pep_mean_by_id       4.968793
updrs_2_max_by_Pep_mean_by_id       2.973657
updrs_1_max_by_Uni_std_by_id        2.197365
updrs_1_max_by_Uni_mean_by_id       2.111346
Abu_mean_by_id                      1.903034
updrs_1_median_by_Pep_mean_by_id    1.859336
updrs_4_max_by_Uni_std_by_id        1.851628
updrs_1_mean_by_Pep_mean_by_id      1.669213
updrs_2_max_by_Uni_mean_by_id       1.661833
updrs_1_max_by_Pep_std_by_id        1.659940
NPX_std_by_id                       1.606891
updrs_2_max_by_Uni_std_by_id        1.589266
NPX_std_by_Uni                      1.554238
updrs_3_std_by_Pep_mean_by_id       1.437333
updrs_4_max_by_Pep_std_by_id        1.395071
Abu_std_by_id                       1.390201
updrs_4_std_by_Uni_mean_by_id       1.336244
updrs_4_mean_by_Uni_std_by_id       1.267396
updrs_4_mean_by_Pep_mean_by_id      1.219074
Name: importance, dtype: float64

train shape: (2588, 104)
****************************
Test data average score : 3.8178  seed  100
****************************


feature
updrs_1                                                       11.034346
updrs_2_std_by_Pep_mean_by_id                                  4.292956
updrs_3_median_by_updrs_1                                      4.158214
updrs_3_mean_by_updrs_1                                        3.878519
updrs_2_median_by_updrs_1                                      3.764192
updrs_2_mean_by_updrs_1                                        3.336175
NPX_median_by_id                                               2.081063
updrs_2_std_by_Uni_mean_by_id                                  2.054589
updrs_3_std_by_Uni_mean_by_id                                  1.899736
updrs_3_std_by_Pep_mean_by_id                                  1.534634
upd23b_clinical_state_on_medication_mean_by_Uni_mean_by_id     1.505573
updrs_3_max_by_Uni_mean_by_id                                  1.391186
NPX_std_by_Uni                                                 1.361238
upd23b_clinical_state_on_medication_std_by_Uni_std_by_id

train shape: (2588, 111)
****************************
Test data average score : 9.4211  seed  100
****************************


feature
updrs_2                                                        10.662759
updrs_3_mean_by_updrs_2                                         9.285255
updrs_1_mean_by_updrs_2                                         7.917938
updrs_3_median_by_updrs_2                                       4.383608
updrs_3_std_by_Pep_mean_by_id                                   3.527945
updrs_1_median_by_updrs_2                                       2.292577
updrs_1                                                         1.831895
updrs_3_std_by_Uni_mean_by_id                                   1.680480
NPX_median_by_id                                                1.447831
NPX_std_by_id                                                   1.410893
Abu_std_by_id                                                   1.295799
updrs_4_max_by_Pep_std_by_id                                    1.218305
Abu_median_by_id                                                1.200922
updrs_4_std_by_Uni_mean_by_id              

train shape: (2588, 118)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [26]:
# Position k-1 of each list belongs to updrs_k, as predict_test_val expects.
models = [model_1,model_2,model_3]
te_updrs_s = [te_updrs_1,te_updrs_2,te_updrs_3]
# Score the chained models on the held-out validation visits.
predict_test_val(df_test_val,predict_order_list,models,te_updrs_s) 

Score Test Val Predict :  7.201
Score Test Val Predict SMAPE :  25.513


Unnamed: 0,id,true,pred,diff,diff_ratio
0,1517_0_updrs_1,11.0,17.0,6.0,55.0
1,1517_6_updrs_1,17.0,16.3,0.7,4.0
2,1517_12_updrs_1,20.0,18.6,1.4,7.0
3,1517_24_updrs_1,19.0,18.5,0.5,3.0
4,3863_0_updrs_1,8.0,5.9,2.1,26.0
5,3863_6_updrs_1,9.0,7.1,1.9,21.0
6,3863_12_updrs_1,11.0,9.8,1.2,11.0
7,3863_24_updrs_1,11.0,9.8,1.2,11.0
8,4923_0_updrs_1,2.0,6.7,4.7,235.0
9,4923_6_updrs_1,6.0,10.1,4.1,68.0


# get_predictions

In [27]:
#とりあえずMock
def get_predictions(test, test_peptides, test_proteins, sample_submission):
    """Placeholder predictor: hand back ``sample_submission`` untouched.

    ``test``, ``test_peptides`` and ``test_proteins`` are accepted but not
    used yet; none of the fitted models is applied.
    """
    return sample_submission

# Submitting to API

In [28]:
# TODO: for some reason the submission file gets appended to rather than
# rewritten; fix this somewhere.

import sys
# The competition's Python API module lives in the Kaggle input directory.
sys.path.append('/kaggle/input/amp-parkinsons-disease-progression-prediction/')

import amp_pd_peptide
# NOTE(review): presumably resets the API's "already called" guard so that
# make_env() can be invoked again in the same kernel - confirm.
amp_pd_peptide.make_env.func_dict['__called__'] = False
env = amp_pd_peptide.make_env()

iter_test = env.iter_test() 

# NOTE: the loop variables rebind the global test_peptides, test_proteins and
# sample_submission that were loaded from CSV in the "Load Data" cell.
for (test, test_peptides, test_proteins, sample_submission) in iter_test:
    #display(test)
    
    submission = get_predictions(test, test_peptides, test_proteins, sample_submission)
    #submission = submission.drop_duplicates(subset=['prediction_id', 'rating'])
    
    # Hand the predictions for this batch back to the API.
    env.predict(submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
