<a href="https://colab.research.google.com/github/kunai-3txk/Compe_tonyobyo/blob/main/AMP_FeatureStore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [235]:
# Install polars, which the aggregation cells below use (imported as `pl`).
# NOTE(review): no version is pinned; the cells call DataFrame.groupby and Expr.suffix,
# so confirm the installed polars release still provides them.
!pip install polars

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [236]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_profiling
import os
import seaborn as sns; sns.set()
import polars as pl

# Configuration

In [237]:
class config:
    """Notebook-wide switches and limits."""
    # Flag for the profiling report (not referenced in the cells visible here)
    PROFILE_REPORT = True
    # Protein limit (not referenced in the cells visible here)
    MAX_PROTEINS = 20
    

class paths:
    """Locations of the competition CSV files for the current runtime.

    ``common_path`` is the data root. It is set to the Kaggle input directory
    when ``KAGGLE_URL_BASE`` is present in the environment and to the Google
    Drive folder when ``COLAB_GPU`` is present (Colab is checked last, so it
    wins if both are set). In any other environment the root falls back to the
    ``AMP_DATA_DIR`` environment variable, or the current directory, so that
    defining this class no longer fails with a NameError.
    """
    # Fallback data root for runs outside Kaggle / Colab
    common_path = os.environ.get('AMP_DATA_DIR', '.')

    # True when running on Kaggle
    if 'KAGGLE_URL_BASE' in os.environ:
        common_path = "/kaggle/input/amp-parkinsons-disease-progression-prediction"

    # True when running on Colaboratory
    if 'COLAB_GPU' in os.environ:
        common_path = "/content/drive/MyDrive/AMP"

    SAMPLE_SUBMISSION = common_path + "/example_test_files/sample_submission.csv"
    SUPPLEMENTAL_CLINICAL_DATA = common_path + "/supplemental_clinical_data.csv"
    TRAIN_CLINICAL_DATA = common_path + "/train_clinical_data.csv"
    TRAIN_PEPTIDES = common_path + "/train_peptides.csv"
    TRAIN_PROTEINS = common_path + "/train_proteins.csv"
    TEST_CLINICAL_DATA = common_path + "/example_test_files/test.csv"
    TEST_PEPTIDES = common_path + "/example_test_files/test_peptides.csv"
    TEST_PROTEINS = common_path + "/example_test_files/test_proteins.csv"

# Function Definition

In [238]:
# Build summary statistics
def make_statistic(df,group_col:str,agg_list:list):
    """Group ``df`` by ``group_col`` and aggregate ``agg_list`` three ways.

    The aggregated columns are suffixed ``_mean``, ``_std`` and ``_med``.
    Max, min and quartile aggregations were drafted for this function but are
    not enabled.

    Args:
        df: object exposing ``groupby(...).agg(...)`` that accepts polars
            expressions (presumably a polars DataFrame).
        group_col: name of the column to group by.
        agg_list: column selection passed to ``pl.mean`` / ``pl.std`` / ``pl.median``.

    Returns:
        The result of the grouped aggregation.
    """
    summary_exprs = [
        pl.mean(agg_list).suffix('_mean'),
        pl.std(agg_list).suffix("_std"),
        pl.median(agg_list).suffix("_med"),
    ]
    grouped = df.groupby(group_col)
    return grouped.agg(summary_exprs)

# Load Data

train

In [239]:
# Read the training tables and the supplemental clinical table
train_clinical = pd.read_csv(paths.TRAIN_CLINICAL_DATA)
train_peptides = pd.read_csv(paths.TRAIN_PEPTIDES)
train_proteins = pd.read_csv(paths.TRAIN_PROTEINS)
supplemental_clinical = pd.read_csv(paths.SUPPLEMENTAL_CLINICAL_DATA)

# Expected shapes: Proteins (232741, 5), Peptides (981834, 6), Clinical (2615, 8)
for table_label, table in (
    ('Proteins', train_proteins),
    ('Peptides', train_peptides),
    ('Clinical', train_clinical),
    ('supplemental_clinical', supplemental_clinical),
):
    print(table_label + ' shape:', table.shape)

Proteins shape: (232741, 5)
Peptides shape: (981834, 6)
Clinical shape: (2615, 8)
supplemental_clinical shape: (2223, 8)


test

In [240]:
# Read the example test tables
test_clinical = pd.read_csv(paths.TEST_CLINICAL_DATA)
test_peptides = pd.read_csv(paths.TEST_PEPTIDES)
test_proteins = pd.read_csv(paths.TEST_PROTEINS)

for table_label, table in (
    ('Proteins', test_proteins),
    ('Peptides', test_peptides),
    ('Clinical', test_clinical),
):
    print(table_label + ' shape:', table.shape)

Proteins shape: (453, 6)
Peptides shape: (2057, 7)
Clinical shape: (16, 6)


# Common

In [241]:
# Keep only the key columns; later cells merge onto this frame and sort by patient / visit month
train_sort = train_clinical.loc[:, ['visit_id','patient_id','visit_month']]

target encoding

In [242]:
# Work on separate copies because the target-encoding features are still under evaluation
on_off_codes = {'On':1,'Off':0}
train_clinical_te = train_clinical.replace(on_off_codes)
train_peptides_te = train_peptides.copy()
train_proteins_te = train_proteins.copy()
supplemental_te = supplemental_clinical.replace(on_off_codes)

# clinical + supplemental: has no NPX / abundance values, so it cannot be used for target encoding
train_clinical_sup_te  =  pd.concat([train_clinical_te,supplemental_te])

# Missing-value handling: dropping rows without updrs_1 / updrs_2 / updrs_3 is currently disabled

# Pair every peptide / protein measurement with the clinical columns of the same visit
te_clinical_cols = ['visit_id','updrs_1','updrs_2','updrs_3','updrs_4','upd23b_clinical_state_on_medication']
visit_targets = train_clinical_te[te_clinical_cols]
train_pep_te = train_peptides_te[['visit_id','Peptide']].merge(visit_targets,on='visit_id',how='inner')
train_uni_te = train_proteins_te[['visit_id','UniProt']].merge(visit_targets,on='visit_id',how='inner')

# Aggregate the clinical columns per Peptide and per UniProt
pep_te_frame = pl.DataFrame(train_pep_te.drop(['visit_id'],axis=1))
pep_te_exprs = [
    pl.mean("*").suffix("_mean_by_Pep"),
    pl.std("*").suffix("_std_by_Pep"),
    pl.median("*").suffix("_median_by_Pep"),
    pl.max("*").suffix("_max_by_Pep"),
]
train_pep_te_agg_ = pep_te_frame.groupby(['Peptide']).agg(pep_te_exprs)

# The per-UniProt aggregation has no median term (min is disabled for both)
uni_te_frame = pl.DataFrame(train_uni_te.drop(['visit_id'],axis=1))
uni_te_exprs = [
    pl.mean("*").suffix("_mean_by_Uni"),
    pl.std("*").suffix("_std_by_Uni"),
    pl.max("*").suffix("_max_by_Uni"),
]
train_uni_te_agg_ = uni_te_frame.groupby(['UniProt']).agg(uni_te_exprs)

# Collapse the per-Peptide / per-UniProt encodings to one row per visit_id
peptide_visit_keys = pl.DataFrame(train_peptides_te[['visit_id','Peptide']])
train_pep_te_agg = (
    peptide_visit_keys
    .join(train_pep_te_agg_,on='Peptide',how='left')
    .drop('Peptide')
    .groupby(['visit_id'])
    .agg([
        pl.mean("*").suffix("_mean_by_id"),
        pl.std("*").suffix("_std_by_id"),
    ])
    .to_pandas()
)

protein_visit_keys = pl.DataFrame(train_proteins_te[['visit_id','UniProt']])
train_uni_te_agg = (
    protein_visit_keys
    .join(train_uni_te_agg_,on='UniProt',how='left')
    .drop('UniProt')
    .groupby(['visit_id'])
    .agg([
        pl.mean("*").suffix("_mean_by_id"),
        pl.std("*").suffix("_std_by_id"),
    ])
    .to_pandas()
)

# Forward-fill the visit-level target encodings within each patient.
# Rows are ordered by patient and visit month first, so the value carried forward is
# the one from the patient's previous visit. visit_month is dropped afterwards so it
# is not duplicated (visit_month_x / visit_month_y) when these frames are merged
# with the protein / peptide aggregates and the clinical table.
train_pep_te_agg_fill = (
    train_sort.merge(train_pep_te_agg,on='visit_id',how='left')
    .sort_values(['patient_id','visit_month'])
    .reset_index(drop=True)
    .drop(['visit_month'],axis=1)
    .groupby('patient_id').fillna(method='ffill')
)
train_pro_te_agg_fill = (
    train_sort.merge(train_uni_te_agg,on='visit_id',how='left')
    .sort_values(['patient_id','visit_month'])
    .reset_index(drop=True)
    .drop(['visit_month'],axis=1)
    .groupby('patient_id').fillna(method='ffill')
)


# Protein

In [243]:
# Count how often each UniProt ID occurs and attach that count to every protein row.
# rename_axis / reset_index(name=...) pin the column names to ('index', 'UniProt'):
# a bare value_counts().reset_index() names them differently on pandas 1.x and 2.x,
# which breaks the rename and the right_on='index' join below.
pro_Uniprot_count = (
    train_proteins['UniProt']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='UniProt')
)
# Dropping columns left by a previous run keeps the cell re-runnable without
# accumulating index_x / value_counts_x style duplicates.
train_proteins = (
    train_proteins
    .drop(columns=['index','value_counts'],errors='ignore')
    .merge(pro_Uniprot_count.rename(columns={'UniProt':'value_counts'}),left_on='UniProt',right_on='index',how='left')
)
train_proteins.head()

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX,index,value_counts
0,55_0,0,55,O00391,11254.3,O00391,764
1,55_0,0,55,O00533,732430.0,O00533,1112
2,55_0,0,55,O00584,39585.8,O00584,1100
3,55_0,0,55,O14498,41526.9,O14498,1032
4,55_0,0,55,O14773,31238.0,O14773,1047


In [244]:
# Window statistics per protein row: NPX over its visit, NPX over its UniProt ID,
# and the UniProt occurrence count over its visit (mean / std / median each).
protein_window_exprs = [
    stat_fn(source_col).over(partition_col).alias(f'{prefix}_{stat_name}_by_{tag}')
    for source_col, partition_col, prefix, tag in (
        ('NPX', 'visit_id', 'NPX', 'id'),
        ('NPX', 'UniProt', 'NPX', 'Uni'),
        ('value_counts', 'visit_id', 'Unicount', 'id'),
    )
    for stat_name, stat_fn in (('mean', pl.mean), ('std', pl.std), ('median', pl.median))
]
train_proteins_ = pl.DataFrame(train_proteins).select(protein_window_exprs)
train_proteins_.head()

NPX_mean_by_id,NPX_std_by_id,NPX_median_by_id,NPX_mean_by_Uni,NPX_std_by_Uni,NPX_median_by_Uni,Unicount_mean_by_id,Unicount_std_by_id,Unicount_median_by_id
f64,f64,f64,f64,f64,f64,f64,f64,f64
3180500.0,26727000.0,123757.0,11641.264435,2817.00353,11546.4,1034.324201,126.934968,1103.0
3180500.0,26727000.0,123757.0,511164.883903,235735.664694,483442.5,1034.324201,126.934968,1103.0
3180500.0,26727000.0,123757.0,26505.529157,10705.15254,26529.7,1034.324201,126.934968,1103.0
3180500.0,26727000.0,123757.0,27305.934884,8446.187506,26624.0,1034.324201,126.934968,1103.0
3180500.0,26727000.0,123757.0,17688.295406,7166.325369,17399.6,1034.324201,126.934968,1103.0


In [245]:
# Put visit_id back next to the window statistics (joined on the row index)
train_proteins_agg_ = train_proteins[['visit_id']].join(train_proteins_.to_pandas())

# Average to one row per visit, attach to the clinical visit keys and order by patient / month
protein_visit_means = train_proteins_agg_.groupby('visit_id').mean().reset_index()
train_proteins_agg = (
    train_sort
    .merge(protein_visit_means,on='visit_id',how='left')
    .sort_values(['patient_id','visit_month'])
    .reset_index(drop=True)
    .drop(['visit_month'],axis=1)
)

# Fill missing values with the previous value within each patient
# (an explicit drop of patient_id was drafted here but is disabled)
train_proteins_agg_fill = train_proteins_agg.groupby('patient_id').fillna(method='ffill')

train_proteins_agg_fill.head()

Unnamed: 0,visit_id,NPX_mean_by_id,NPX_std_by_id,NPX_median_by_id,NPX_mean_by_Uni,NPX_std_by_Uni,NPX_median_by_Uni,Unicount_mean_by_id,Unicount_std_by_id,Unicount_median_by_id
0,55_0,3180508.0,26727000.0,123757.0,2596938.0,702016.931653,2534106.0,1034.324201,126.934968,1103.0
1,55_3,3180508.0,26727000.0,123757.0,2596938.0,702016.931653,2534106.0,1034.324201,126.934968,1103.0
2,55_6,2942039.0,23914690.0,129048.0,2608845.0,705203.512224,2545728.0,1037.215596,122.425466,1103.5
3,55_9,2942039.0,23914690.0,129048.0,2608845.0,705203.512224,2545728.0,1037.215596,122.425466,1103.5
4,55_12,3145608.0,26126740.0,124498.0,2562630.0,692742.468271,2500628.0,1031.878378,129.82811,1101.5


In [246]:
# Combine the protein aggregates with the protein target-encoding features by visit_id
train_proteins_toall = pd.merge(train_proteins_agg_fill,train_pro_te_agg_fill,on='visit_id',how='left').copy()

# Peptide

In [247]:
# Count how often each Peptide occurs and attach that count to every peptide row.
# rename_axis / reset_index(name=...) pin the column names to ('index', 'Peptide'):
# a bare value_counts().reset_index() names them differently on pandas 1.x and 2.x,
# which breaks the rename and the right_on='index' join below.
pep_Peptide_count = (
    train_peptides['Peptide']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='Peptide')
)
# Dropping columns left by a previous run keeps the cell re-runnable without
# accumulating index_x / value_counts_x style duplicates.
train_peptides = (
    train_peptides
    .drop(columns=['index','value_counts'],errors='ignore')
    .merge(pep_Peptide_count.rename(columns={'Peptide':'value_counts'}),left_on='Peptide',right_on='index',how='left')
)
train_peptides.head()

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance,index,value_counts
0,55_0,0,55,O00391,NEQEQPLGQWHLS,11254.3,NEQEQPLGQWHLS,764
1,55_0,0,55,O00533,GNPEPTFSWTK,102060.0,GNPEPTFSWTK,1030
2,55_0,0,55,O00533,IEIPSSVQQVPTIIK,174185.0,IEIPSSVQQVPTIIK,1048
3,55_0,0,55,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.9,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,1066
4,55_0,0,55,O00533,SMEQNGPGLEYR,30838.7,SMEQNGPGLEYR,1094


In [248]:
# Window statistics per peptide row: abundance over its visit, abundance over its
# Peptide, and the Peptide occurrence count over its visit (mean / std / median each).
peptide_window_exprs = [
    stat_fn(source_col).over(partition_col).alias(f'{prefix}_{stat_name}_by_{tag}')
    for source_col, partition_col, prefix, tag in (
        ('PeptideAbundance', 'visit_id', 'Abu', 'id'),
        ('PeptideAbundance', 'Peptide', 'Abu', 'Pep'),
        ('value_counts', 'visit_id', 'Pepcount', 'id'),
    )
    for stat_name, stat_fn in (('mean', pl.mean), ('std', pl.std), ('median', pl.median))
]
train_peptides_ = pl.DataFrame(train_peptides).select(peptide_window_exprs)
train_peptides_.head()

Abu_mean_by_id,Abu_std_by_id,Abu_median_by_id,Abu_mean_by_Pep,Abu_std_by_Pep,Abu_median_by_Pep,Pepcount_mean_by_id,Pepcount_std_by_id,Pepcount_median_by_id
f64,f64,f64,f64,f64,f64,f64,f64,f64
748153.907014,4231800.0,93134.8,11641.264435,2817.00353,11546.4,1020.748657,110.580857,1066.0
748153.907014,4231800.0,93134.8,76367.164728,32366.189011,72213.95,1020.748657,110.580857,1066.0
748153.907014,4231800.0,93134.8,145751.527996,64302.85968,138427.0,1020.748657,110.580857,1066.0
748153.907014,4231800.0,93134.8,25062.222917,11221.845903,23616.2,1020.748657,110.580857,1066.0
748153.907014,4231800.0,93134.8,24059.520484,11002.026192,22063.85,1020.748657,110.580857,1066.0


In [249]:
# Put visit_id back next to the window statistics (joined on the row index)
train_peptides_agg_ = train_peptides[['visit_id']].join(train_peptides_.to_pandas())

# Average to one row per visit, attach to the clinical visit keys and order by patient / month
peptide_visit_means = train_peptides_agg_.groupby('visit_id').mean().reset_index()
train_peptides_agg  = (
    train_sort
    .merge(peptide_visit_means,on='visit_id',how='left')
    .sort_values(['patient_id','visit_month'])
    .reset_index(drop=True)
    .drop(['visit_month'],axis=1)
)

# Fill missing values with the previous value within each patient
# (an explicit drop of patient_id was drafted here but is disabled)
train_peptides_agg_fill = train_peptides_agg.groupby('patient_id').fillna(method='ffill')
train_peptides_agg_fill.head()

Unnamed: 0,visit_id,Abu_mean_by_id,Abu_std_by_id,Abu_median_by_id,Abu_mean_by_Pep,Abu_std_by_Pep,Abu_median_by_Pep,Pepcount_mean_by_id,Pepcount_std_by_id,Pepcount_median_by_id
0,55_0,748153.907014,4231806.0,93134.8,619334.666897,202617.423261,597764.341992,1020.748657,110.580857,1066.0
1,55_3,748153.907014,4231806.0,93134.8,619334.666897,202617.423261,597764.341992,1020.748657,110.580857,1066.0
2,55_6,685218.599872,3491859.0,84814.9,620241.001591,203152.040437,598682.013953,1021.979701,108.646257,1066.0
3,55_9,685218.599872,3491859.0,84814.9,620241.001591,203152.040437,598682.013953,1021.979701,108.646257,1066.0
4,55_12,740535.423563,3882536.0,91575.2,612046.308868,200234.524653,590729.986479,1018.652174,112.724341,1065.0


In [250]:
# Combine the peptide aggregates with the peptide target-encoding features by visit_id
train_peptides_toall = pd.merge(train_peptides_agg_fill,train_pep_te_agg_fill,on='visit_id',how='left').copy()

# Clinical

In [251]:
#train_clinical['visit_month'].nunique()
#train_clinical['visit_month'].unique()
#train_clinical['visit_month'].value_counts()
#[  0,   3,   6,   9,  12,  18,  24,  30,  36,  42,  48,  54,  60, 72,  84,  96, 108] 17 distinct values
# => Phase_1 (early): [0,3,6,9,12,18,24]; Phase_2 (middle): [30,36,42,48,54,60]; Phase_3 (late): [72,84,96,108]

In [252]:
# 0309: moved to Fit

# #pivot
# train_clinical_pv=train_clinical.dropna(axis=0).pivot(index="patient_id",columns='visit_month',values='visit_id').reset_index()
# train_clinical_pv.head()

# # Look for candidate patients to use as test data
# test_patient_list = list(train_clinical_pv[
#     ~train_clinical_pv[0].isna() & ~train_clinical_pv[6].isna() & ~train_clinical_pv[12].isna() & ~train_clinical_pv[24].isna()
# ]['patient_id'].values)
# len(test_patient_list)

In [253]:
#欠損値処理
train_clinical.dropna(subset=['updrs_1'],inplace=True)
train_clinical.dropna(subset=['updrs_2'],inplace=True)
train_clinical.dropna(subset=['updrs_3'],inplace=True) #とりあえず

#upd23dの置き換え
train_clinical.replace({'On':1,'Off':2},inplace=True)

train_clinical.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2588 entries, 0 to 2614
Data columns (total 8 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   visit_id                             2588 non-null   object 
 1   patient_id                           2588 non-null   int64  
 2   visit_month                          2588 non-null   int64  
 3   updrs_1                              2588 non-null   float64
 4   updrs_2                              2588 non-null   float64
 5   updrs_3                              2588 non-null   float64
 6   updrs_4                              1562 non-null   float64
 7   upd23b_clinical_state_on_medication  1280 non-null   float64
dtypes: float64(5), int64(2), object(1)
memory usage: 182.0+ KB


In [254]:
# Mean / std / median of updrs_1 per visit_id
updrs1_visit_exprs = [
    pl.mean('updrs_1').suffix('_mean'),
    pl.std('updrs_1').suffix("_std"),
    pl.median('updrs_1').suffix("_med"),
]
train_clinical_ = pl.DataFrame(train_clinical).groupby('visit_id').agg(updrs1_visit_exprs)

train_clinical_.head()

visit_id,updrs_1_mean,updrs_1_std,updrs_1_med
str,f64,f64,f64
"""42086_42""",29.0,0.0,29.0
"""13360_9""",2.0,0.0,2.0
"""31121_30""",6.0,0.0,6.0
"""45181_12""",9.0,0.0,9.0
"""40751_0""",15.0,0.0,15.0


In [255]:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.ewm.html

# Per-patient smoothed versions of updrs_1..3: a 3-visit rolling mean and an
# exponentially weighted mean (halflife=10).
train_clinical_rolling = train_clinical.copy()

for i in range(3):
  target_col = 'updrs_{0}'.format(i+1)
  per_patient = train_clinical_rolling.groupby('patient_id')[target_col]
  # transform() returns results on the frame's own index, so every value lands on the
  # visit it was computed for. Renumbering the result with reset_index(drop=True)
  # would misalign it, because the frame index has gaps after dropna and assignment
  # aligns on index labels.
  rolling_mean = per_patient.transform(lambda s: s.rolling(3,min_periods=1).mean())
  ewm_mean = per_patient.transform(lambda s: s.ewm(halflife=10).mean())
  train_clinical_rolling[target_col + '_rolling'] = rolling_mean
  train_clinical_rolling[target_col + '_ewm'] = ewm_mean
train_clinical_rolling.head()


Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,updrs_1_rolling,updrs_1_ewm,updrs_2_rolling,updrs_2_ewm,updrs_3_rolling,updrs_3_ewm
0,55_0,55,0,10.0,6.0,15.0,,,10.0,10.0,6.0,6.0,15.0,15.0
1,55_3,55,3,10.0,7.0,25.0,,,10.0,10.0,6.5,6.517322,20.0,20.173217
2,55_6,55,6,8.0,10.0,34.0,,,9.333333,9.286627,7.666667,7.759546,24.666667,25.105042
3,55_9,55,9,8.0,9.0,30.0,0.0,1.0,8.666667,8.930796,8.666667,8.102607,29.666667,26.458798
4,55_12,55,12,10.0,10.0,41.0,0.0,1.0,8.666667,9.175259,9.666667,8.536426,35.0,29.783493


# supplemental_clinical

In [256]:
# Display the supplemental clinical table for inspection
supplemental_clinical

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,35_0,35,0,5.0,3.0,16.0,0.0,
1,35_36,35,36,6.0,4.0,20.0,0.0,
2,75_0,75,0,4.0,6.0,26.0,0.0,
3,75_36,75,36,1.0,8.0,38.0,0.0,On
4,155_0,155,0,,,0.0,,
...,...,...,...,...,...,...,...,...
2218,65382_0,65382,0,,,0.0,,
2219,65405_0,65405,0,5.0,16.0,31.0,0.0,
2220,65405_5,65405,5,,,57.0,,
2221,65530_0,65530,0,10.0,6.0,24.0,0.0,


# Train All

In [258]:
def make_train_all(df_cli,df_pro,df_pep):
  """Left-join protein and peptide features onto the clinical rows by ``visit_id``.

  The peptide features are first left-joined onto the protein features, and
  that combined frame is then left-joined onto ``df_cli``. Every row of
  ``df_cli`` is kept; peptide features are only attached for visits that also
  appear in ``df_pro``.

  Args:
      df_cli: clinical frame with a ``visit_id`` column.
      df_pro: protein feature frame with a ``visit_id`` column.
      df_pep: peptide feature frame with a ``visit_id`` column.

  Returns:
      The merged DataFrame.
  """
  biomarker_features = pd.merge(df_pro,df_pep,on='visit_id',how='left')
  return pd.merge(df_cli,biomarker_features,on='visit_id',how='left')


# Build the final training table: clinical rows plus protein and peptide features
train_all = make_train_all(train_clinical,train_proteins_toall,train_peptides_toall)
print('shape',train_all.shape)

# display() is needed here: a bare expression is only rendered when it is the last line of a cell
display(train_all.head())
display(train_all.describe())

# On Colab the file is written next to the input data (the same Drive location as before);
# elsewhere it goes to the working directory, since the Drive path only exists on Colab.
output_dir = paths.common_path if 'COLAB_GPU' in os.environ else os.getcwd()
train_all.to_csv(os.path.join(output_dir,'train_all.csv'),index=False)

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,NPX_mean_by_id,NPX_std_by_id,...,updrs_1_median_by_Pep_std_by_id,updrs_2_median_by_Pep_std_by_id,updrs_3_median_by_Pep_std_by_id,updrs_4_median_by_Pep_std_by_id,upd23b_clinical_state_on_medication_median_by_Pep_std_by_id,updrs_1_max_by_Pep_std_by_id,updrs_2_max_by_Pep_std_by_id,updrs_3_max_by_Pep_std_by_id,updrs_4_max_by_Pep_std_by_id,upd23b_clinical_state_on_medication_max_by_Pep_std_by_id
0,55_0,55,0,10.0,6.0,15.0,,,3.180508e+06,2.672700e+07,...,0.309644,0.148859,0.426175,0.0,0.491248,1.317100,0.212778,3.240491,1.951132,0.0
1,55_3,55,3,10.0,7.0,25.0,,,3.180508e+06,2.672700e+07,...,0.309644,0.148859,0.426175,0.0,0.491248,1.317100,0.212778,3.240491,1.951132,0.0
2,55_6,55,6,8.0,10.0,34.0,,,2.942039e+06,2.391469e+07,...,0.308946,0.144019,0.415266,0.0,0.490972,1.291932,0.212234,3.135952,1.944267,0.0
3,55_9,55,9,8.0,9.0,30.0,0.0,1.0,2.942039e+06,2.391469e+07,...,0.308946,0.144019,0.415266,0.0,0.490972,1.291932,0.212234,3.135952,1.944267,0.0
4,55_12,55,12,10.0,10.0,41.0,0.0,1.0,3.145608e+06,2.612674e+07,...,0.307975,0.150702,0.450148,0.0,0.491258,1.343520,0.215993,3.263424,1.952518,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2583,65043_48,65043,48,7.0,6.0,13.0,0.0,2.0,2.917391e+06,2.171818e+07,...,0.311186,0.155418,0.445967,0.0,0.490867,1.264117,0.197362,3.153475,1.963058,0.0
2584,65043_54,65043,54,4.0,8.0,11.0,1.0,2.0,2.917391e+06,2.171818e+07,...,0.311186,0.155418,0.445967,0.0,0.490867,1.264117,0.197362,3.153475,1.963058,0.0
2585,65043_60,65043,60,6.0,6.0,16.0,1.0,2.0,2.917391e+06,2.171818e+07,...,0.311186,0.155418,0.445967,0.0,0.490867,1.264117,0.197362,3.153475,1.963058,0.0
2586,65043_72,65043,72,3.0,9.0,14.0,1.0,2.0,2.917391e+06,2.171818e+07,...,0.311186,0.155418,0.445967,0.0,0.490867,1.264117,0.197362,3.153475,1.963058,0.0
