<a href="https://colab.research.google.com/github/kunai-3txk/Compe_tonyobyo/blob/main/AMP_FeatureStore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [2]:
!pip install polars

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting polars
  Downloading polars-0.16.12-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.2/16.2 MB[0m [31m97.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: polars
Successfully installed polars-0.16.12


In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_profiling
import os
import seaborn as sns; sns.set()
import polars as pl

# Configuration

In [4]:
class config:
    # Notebook-wide settings. Neither value is referenced by any cell visible in this notebook.
    # Presumably an upper bound on the number of proteins used as features - TODO confirm.
    MAX_PROTEINS = 20
    # Presumably toggles generation of a pandas_profiling report - TODO confirm.
    PROFILE_REPORT = True
    

class paths:
    """Input/output file locations, resolved once from the runtime environment.

    Resolution order (a later match overrides an earlier one, as in the original):
    local fallback -> Kaggle -> Colab.
    """
    # Local fallback so that defining this class no longer raises NameError outside
    # Kaggle/Colab: honour AMP_DATA_DIR when set, otherwise use the working directory.
    common_path = os.environ.get("AMP_DATA_DIR", ".")
    OUTPUT = common_path

    # True when running on Kaggle
    if 'KAGGLE_URL_BASE' in os.environ:
        common_path = "/kaggle/input/amp-parkinsons-disease-progression-prediction"
        # /kaggle/input is read-only; generated CSVs must go to the writable working directory.
        OUTPUT = "/kaggle/working"

    # True when running on Google Colaboratory
    if 'COLAB_GPU' in os.environ:
        common_path = "/content/drive/MyDrive/AMP"
        OUTPUT = common_path

    SAMPLE_SUBMISSION = common_path + "/example_test_files/sample_submission.csv"
    SUPPLEMENTAL_CLINICAL_DATA = common_path + "/supplemental_clinical_data.csv"
    TRAIN_CLINICAL_DATA = common_path + "/train_clinical_data.csv"
    TRAIN_PEPTIDES = common_path + "/train_peptides.csv"
    TRAIN_PROTEINS = common_path + "/train_proteins.csv"
    TEST_CLINICAL_DATA = common_path + "/example_test_files/test.csv"
    TEST_PEPTIDES = common_path + "/example_test_files/test_peptides.csv"
    TEST_PROTEINS = common_path + "/example_test_files/test_proteins.csv"

# Function Definition

## make_statistic

In [5]:
# Build per-group summary statistics
def make_statistic(df,group_col:str,agg_list:list):
    """Summarise the `agg_list` columns of `df` within each `group_col` group.

    Parameters
    ----------
    df : frame exposing the polars-style ``groupby(...).agg(...)`` API
    group_col : name of the column to group by
    agg_list : names of the columns to aggregate

    Returns
    -------
    The aggregated frame with ``<col>_mean``, ``<col>_std`` and ``<col>_med`` columns.
    Max / min / quartile aggregations are intentionally not produced.
    """
    mean_expr = pl.mean(agg_list).suffix('_mean')
    std_expr = pl.std(agg_list).suffix("_std")
    median_expr = pl.median(agg_list).suffix("_med")

    grouped = df.groupby(group_col)
    return grouped.agg([mean_expr, std_expr, median_expr])

# Load Data

## train

In [6]:
# Load the training tables (clinical, peptides, proteins) plus the supplemental clinical records.
train_clinical = pd.read_csv(paths.TRAIN_CLINICAL_DATA)
train_peptides = pd.read_csv(paths.TRAIN_PEPTIDES)
train_proteins = pd.read_csv(paths.TRAIN_PROTEINS)
supplemental_clinical = pd.read_csv(paths.SUPPLEMENTAL_CLINICAL_DATA)
# Expected shapes for reference: Proteins (232741, 5), Peptides (981834, 6), Clinical (2615, 8)
print('Proteins shape:',train_proteins.shape)
print('Peptides shape:',train_peptides.shape)
print('Clinical shape:',train_clinical.shape)
print('supplemental_clinical shape:',supplemental_clinical.shape)

# Encode 'On' -> 1 and 'Off' -> 0. DataFrame.replace is applied to the whole frame, so any cell
# equal to 'On'/'Off' in any column is rewritten (presumably only
# upd23b_clinical_state_on_medication holds these labels - TODO confirm).
train_clinical.replace({'On':1,'Off':0},inplace=True)
supplemental_clinical.replace({'On':1,'Off':0},inplace=True)

Proteins shape: (232741, 5)
Peptides shape: (981834, 6)
Clinical shape: (2615, 8)
supplemental_clinical shape: (2223, 8)


## test

In [7]:
# Load the example test files and the sample submission from the example_test_files folder.
test_clinical = pd.read_csv(paths.TEST_CLINICAL_DATA)
test_peptides = pd.read_csv(paths.TEST_PEPTIDES)
test_proteins = pd.read_csv(paths.TEST_PROTEINS)
sample_submission= pd.read_csv(paths.SAMPLE_SUBMISSION)
# Quick sanity check of the loaded test table sizes.
print('Proteins shape:',test_proteins.shape)
print('Peptides shape:',test_peptides.shape)
print('Clinical shape:',test_clinical.shape)

Proteins shape: (453, 6)
Peptides shape: (2057, 7)
Clinical shape: (16, 6)


# Common

## make train_sort

In [8]:
# Extract only the key columns (visit / patient / month); later cells use this frame as the
# row skeleton that feature tables are merged onto and sorted by.
train_sort = train_clinical[['visit_id','patient_id','visit_month']]

In [9]:
# Inspect which visit_ids are present in the example test peptide file.
test_peptides['visit_id'].unique()

array(['50423_0', '3342_6'], dtype=object)

# target encoding

In [10]:
# Work on copies for the target-encoding section so the loaded frames are not modified by it.
train_clinical_te = train_clinical.copy()
train_peptides_te = train_peptides.copy()
train_proteins_te = train_proteins.copy()
supplemental_te = supplemental_clinical.copy()

## pep,uni <- updrs1-4

In [11]:
# Missing-value handling (currently disabled: rows with missing UPDRS scores are kept)
#train_clinical_te.dropna(subset=['updrs_1'],inplace=True)
#train_clinical_te.dropna(subset=['updrs_2'],inplace=True)
#train_clinical_te.dropna(subset=['updrs_3'],inplace=True) 

# Step 1: attach the visit's clinical targets (updrs_1-4 and the medication state) to every
# peptide / protein measurement row of that visit. Inner join: measurements whose visit_id
# has no clinical record are discarded.
train_pep_te = train_peptides_te[['visit_id','Peptide']].merge(train_clinical_te[['visit_id','updrs_1','updrs_2','updrs_3','updrs_4','upd23b_clinical_state_on_medication']]
                                                ,on='visit_id',how='inner')
train_uni_te = train_proteins_te[['visit_id','UniProt']].merge(train_clinical_te[['visit_id','updrs_1','updrs_2','updrs_3','updrs_4','upd23b_clinical_state_on_medication']]
                                                ,on='visit_id',how='inner')

# Step 2: aggregate the targets per Peptide / per UniProt (the target-encoding tables).
# NOTE(review): these statistics use every training row, including the row later being
# encoded - confirm this leakage is acceptable for the validation scheme.
train_pep_te_agg_ = pl.DataFrame(train_pep_te.drop(['visit_id'],axis=1)).groupby(['Peptide']).agg(
    [
        pl.mean("*").suffix("_mean_by_Pep"),
        pl.std("*").suffix("_std_by_Pep"),
        pl.median("*").suffix("_median_by_Pep"),
        pl.max("*").suffix("_max_by_Pep"),
        #pl.min("*").suffix("_min_by_Pep"),
    ]
)

# NOTE(review): unlike the Peptide table above, no median is computed per UniProt - confirm intended.
train_uni_te_agg_ = pl.DataFrame(train_uni_te.drop(['visit_id'],axis=1)).groupby(['UniProt']).agg(
    [
        pl.mean("*").suffix("_mean_by_Uni"),
        pl.std("*").suffix("_std_by_Uni"),
        pl.max("*").suffix("_max_by_Uni"),
        #pl.min("*").suffix("_min_by_Uni"),
    ]
)

# Step 3: collapse to one row per visit_id. Each measurement row picks up the encoding of its
# Peptide / UniProt, then the encodings are averaged (and their std taken) within the visit.
train_pep_te_agg = pl.DataFrame(train_peptides_te[['visit_id','Peptide']]).join(train_pep_te_agg_,on='Peptide',how='left').drop('Peptide').groupby(['visit_id']).agg(
    [
        pl.mean("*").suffix("_mean_by_id"),
        pl.std("*").suffix("_std_by_id"),
    ]
).to_pandas()

train_uni_te_agg = pl.DataFrame(train_proteins_te[['visit_id','UniProt']]).join(train_uni_te_agg_,on='UniProt',how='left').drop('UniProt').groupby(['visit_id']).agg(
    [
        pl.mean("*").suffix("_mean_by_id"),
        pl.std("*").suffix("_std_by_id"),
    ]
).to_pandas()

# Step 4: align to every clinical visit and forward-fill gaps within each patient.
# The forward fill follows the row order of train_sort; no explicit sort by visit_month is done
# here (presumably the CSV is already ordered by patient and month - TODO confirm).
# NOTE(review): the grouped fillna result appears to omit patient_id but keep visit_month, so the
# later merges on visit_id carry extra visit_month columns into the feature tables - verify.
train_pep_te_agg_fill = train_sort.merge(train_pep_te_agg,on='visit_id',how='left').groupby('patient_id').fillna(method='ffill')
train_pro_te_agg_fill = train_sort.merge(train_uni_te_agg,on='visit_id',how='left').groupby('patient_id').fillna(method='ffill')


## updrs <- updrs

In [12]:
# Pool the main and supplemental clinical records and keep only the four UPDRS scores.
train_clisup_ = pd.concat([train_clinical_te,supplemental_te])
train_clisup = train_clisup_[['updrs_1','updrs_2','updrs_3','updrs_4']]

# For each UPDRS part k: group the pooled rows by their updrs_k value, summarise the UPDRS
# columns (mean / median / min) and persist the table as CSV.
# Results are kept in an explicit dict instead of being written through locals(), whose
# modification is not a supported way to create variables.
train_te_updrs = {}
for part in range(1, 5):
    key = 'updrs_{0}'.format(part)
    train_te_updrs[part] = pl.DataFrame(train_clisup).groupby([key]).agg(
        [
            pl.mean("*").suffix("_mean_by_"+key),
            pl.median("*").suffix("_median_by_"+key),
            pl.min("*").suffix("_min_by_"+key),
        ]
    ).to_pandas()
    train_te_updrs[part].to_csv(paths.OUTPUT+'/train_te_{0}.csv'.format(key),index=False)

# Keep the original per-part variable names available for any cell that refers to them.
train_te_updrs_1 = train_te_updrs[1]
train_te_updrs_2 = train_te_updrs[2]
train_te_updrs_3 = train_te_updrs[3]
train_te_updrs_4 = train_te_updrs[4]

## month <- updrs1-4

In [13]:
# Pooled clinical rows restricted to the visit month and the four UPDRS scores.
# (Rebinds train_clisup, which previously held the UPDRS-only selection.)
train_clisup = train_clisup_[['visit_month','updrs_1','updrs_2','updrs_3','updrs_4']]

# One row per visit_month with mean / std / median of the UPDRS columns.
train_te_month_agg = (
    pl.DataFrame(train_clisup)
    .groupby(['visit_month'])
    .agg(
        [
            pl.mean("*").suffix("_mean_by_month"),
            pl.std("*").suffix("_std_by_month"),
            pl.median("*").suffix("_median_by_month"),
        ]
    )
    .to_pandas()
)

# Protein

In [113]:
def feature_proteins(df_key,df_proteins,df_te=None):
  """Build per-visit protein (NPX) features for the visits listed in `df_key`.

  Parameters
  ----------
  df_key : pandas.DataFrame
      One row per visit; must contain `visit_id`, `patient_id` and `visit_month`.
  df_proteins : pandas.DataFrame
      Long-format protein measurements with `visit_id`, `UniProt` and `NPX` columns.
  df_te : pandas.DataFrame, optional
      Target-encoding table merged on `visit_id`. When omitted, the notebook-level
      `train_pro_te_agg_fill` is used, which is what the original implementation did.

  Returns
  -------
  pandas.DataFrame
      `visit_id`, the six NPX statistics (forward-filled within each patient in
      visit_month order) and the columns of `df_te`. `visit_month` from `df_key` is
      dropped explicitly, and the grouped forward fill returns only non-grouping
      columns, so `patient_id` is not part of the result.
  """
  if df_te is None:
    df_te = train_pro_te_agg_fill

  # Row-level statistics: NPX summarised within the row's visit and within the row's protein.
  npx_stats = pl.DataFrame(df_proteins).select([
      pl.mean('NPX').over('visit_id').alias('NPX_mean_by_id')
    ,pl.std('NPX').over('visit_id').alias('NPX_std_by_id')
    ,pl.median('NPX').over('visit_id').alias('NPX_median_by_id')
    ,pl.mean('NPX').over('UniProt').alias('NPX_mean_by_Uni')
    ,pl.std('NPX').over('UniProt').alias('NPX_std_by_Uni')
    ,pl.median('NPX').over('UniProt').alias('NPX_median_by_Uni')
  ])

  # to_pandas() yields a fresh 0..n-1 index and DataFrame.join aligns on index labels, so the
  # visit_id column is renumbered as well; otherwise an input with any other index would be
  # silently misaligned.
  visit_ids = df_proteins[['visit_id']].reset_index(drop=True)
  per_row = visit_ids.join(npx_stats.to_pandas())

  # Collapse to one row per visit, then lay the rows out in (patient, month) order.
  per_visit = per_row.groupby('visit_id').mean().reset_index()
  keyed = (
      df_key.merge(per_visit,on='visit_id',how='left')
      .sort_values(['patient_id','visit_month'])
      .reset_index(drop=True)
      .drop(['visit_month'],axis=1)
  )

  # Fill missing values with the patient's previous visit. GroupBy.ffill() is the documented
  # replacement for the deprecated GroupBy.fillna(method='ffill').
  filled = keyed.groupby('patient_id').ffill()

  return filled.merge(df_te,on='visit_id',how='left').copy()

# Protein features for every training visit listed in train_sort.
train_proteins_feature = feature_proteins(train_sort,train_proteins)

# Peptide

In [100]:
def feature_peptide(df_key,df_peptides,df_te=None):
  """Build per-visit peptide-abundance features for the visits listed in `df_key`.

  Parameters
  ----------
  df_key : pandas.DataFrame
      One row per visit; must contain `visit_id`, `patient_id` and `visit_month`.
  df_peptides : pandas.DataFrame
      Long-format peptide measurements with `visit_id`, `Peptide` and
      `PeptideAbundance` columns.
  df_te : pandas.DataFrame, optional
      Target-encoding table merged on `visit_id`. When omitted, the notebook-level
      `train_pep_te_agg_fill` is used, which is what the original implementation did.

  Returns
  -------
  pandas.DataFrame
      `visit_id`, the six abundance statistics (forward-filled within each patient in
      visit_month order) and the columns of `df_te`. `visit_month` from `df_key` is
      dropped explicitly, and the grouped forward fill returns only non-grouping
      columns, so `patient_id` is not part of the result.
  """
  if df_te is None:
    df_te = train_pep_te_agg_fill

  # Row-level statistics: abundance summarised within the row's visit and within the row's peptide.
  abundance_stats = pl.DataFrame(df_peptides).select([
      pl.mean('PeptideAbundance').over('visit_id').alias('Abu_mean_by_id')
    ,pl.std('PeptideAbundance').over('visit_id').alias('Abu_std_by_id')
    ,pl.median('PeptideAbundance').over('visit_id').alias('Abu_median_by_id')
    ,pl.mean('PeptideAbundance').over('Peptide').alias('Abu_mean_by_Pep')
    ,pl.std('PeptideAbundance').over('Peptide').alias('Abu_std_by_Pep')
    ,pl.median('PeptideAbundance').over('Peptide').alias('Abu_median_by_Pep')
  ])

  # to_pandas() yields a fresh 0..n-1 index and DataFrame.join aligns on index labels, so the
  # visit_id column is renumbered as well; otherwise an input with any other index would be
  # silently misaligned.
  visit_ids = df_peptides[['visit_id']].reset_index(drop=True)
  per_row = visit_ids.join(abundance_stats.to_pandas())

  # Collapse to one row per visit, then lay the rows out in (patient, month) order.
  per_visit = per_row.groupby('visit_id').mean().reset_index()
  keyed = (
      df_key.merge(per_visit,on='visit_id',how='left')
      .sort_values(['patient_id','visit_month'])
      .reset_index(drop=True)
      .drop(['visit_month'],axis=1)
  )

  # Fill missing values with the patient's previous visit. GroupBy.ffill() is the documented
  # replacement for the deprecated GroupBy.fillna(method='ffill').
  filled = keyed.groupby('patient_id').ffill()

  return filled.merge(df_te,on='visit_id',how='left').copy()

# Peptide features for every training visit listed in train_sort.
train_peptides_feature = feature_peptide(train_sort,train_peptides)

Unnamed: 0,visit_id,Abu_mean_by_id,Abu_std_by_id,Abu_median_by_id,Abu_mean_by_Pep,Abu_std_by_Pep,Abu_median_by_Pep,visit_month,updrs_1_mean_by_Pep_mean_by_id,updrs_2_mean_by_Pep_mean_by_id,...,updrs_1_median_by_Pep_std_by_id,updrs_2_median_by_Pep_std_by_id,updrs_3_median_by_Pep_std_by_id,updrs_4_median_by_Pep_std_by_id,upd23b_clinical_state_on_medication_median_by_Pep_std_by_id,updrs_1_max_by_Pep_std_by_id,updrs_2_max_by_Pep_std_by_id,updrs_3_max_by_Pep_std_by_id,updrs_4_max_by_Pep_std_by_id,upd23b_clinical_state_on_medication_max_by_Pep_std_by_id
0,55_0,748153.907014,4.231806e+06,93134.8,619334.666897,202617.423261,597764.341992,0,6.499588,5.775430,...,0.309644,0.148859,0.426175,0.0,0.491248,1.317100,0.212778,3.240491,1.951132,0.0
1,55_3,748153.907014,4.231806e+06,93134.8,619334.666897,202617.423261,597764.341992,3,6.499588,5.775430,...,0.309644,0.148859,0.426175,0.0,0.491248,1.317100,0.212778,3.240491,1.951132,0.0
2,55_6,685218.599872,3.491859e+06,84814.9,620241.001591,203152.040437,598682.013953,6,6.501296,5.777437,...,0.308946,0.144019,0.415266,0.0,0.490972,1.291932,0.212234,3.135952,1.944267,0.0
3,55_9,685218.599872,3.491859e+06,84814.9,620241.001591,203152.040437,598682.013953,9,6.501296,5.777437,...,0.308946,0.144019,0.415266,0.0,0.490972,1.291932,0.212234,3.135952,1.944267,0.0
4,55_12,740535.423563,3.882536e+06,91575.2,612046.308868,200234.524653,590729.986479,12,6.498244,5.774333,...,0.307975,0.150702,0.450148,0.0,0.491258,1.343520,0.215993,3.263424,1.952518,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2610,65043_48,669411.457384,3.243698e+06,87860.4,616804.742484,201873.668733,595463.081888,48,6.501298,5.780114,...,0.311186,0.155418,0.445967,0.0,0.490867,1.264117,0.197362,3.153475,1.963058,0.0
2611,65043_54,669411.457384,3.243698e+06,87860.4,616804.742484,201873.668733,595463.081888,54,6.501298,5.780114,...,0.311186,0.155418,0.445967,0.0,0.490867,1.264117,0.197362,3.153475,1.963058,0.0
2612,65043_60,669411.457384,3.243698e+06,87860.4,616804.742484,201873.668733,595463.081888,60,6.501298,5.780114,...,0.311186,0.155418,0.445967,0.0,0.490867,1.264117,0.197362,3.153475,1.963058,0.0
2613,65043_72,669411.457384,3.243698e+06,87860.4,616804.742484,201873.668733,595463.081888,72,6.501298,5.780114,...,0.311186,0.155418,0.445967,0.0,0.490867,1.264117,0.197362,3.153475,1.963058,0.0


# Clinical

In [16]:
#train_clinical['visit_month'].nunique()
#train_clinical['visit_month'].unique()
#train_clinical['visit_month'].value_counts()
#[  0,   3,   6,   9,  12,  18,  24,  30,  36,  42,  48,  54,  60, 72,  84,  96, 108] 17types
#⇒ Phase_1 初期 [0,3,6,9,12,18,24] Phase_2 中期 [30,36,42,48,54,60] Phase_3 後期[72,84,96,108]

In [17]:
#欠損値処理
train_clinical.dropna(subset=['updrs_1'],inplace=True)
train_clinical.dropna(subset=['updrs_2'],inplace=True)
train_clinical.dropna(subset=['updrs_3'],inplace=True) #とりあえず

#upd23dの置き換え
#train_clinical.replace({'On':1,'Off':2},inplace=True)

train_clinical.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2588 entries, 0 to 2614
Data columns (total 8 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   visit_id                             2588 non-null   object 
 1   patient_id                           2588 non-null   int64  
 2   visit_month                          2588 non-null   int64  
 3   updrs_1                              2588 non-null   float64
 4   updrs_2                              2588 non-null   float64
 5   updrs_3                              2588 non-null   float64
 6   updrs_4                              1562 non-null   float64
 7   upd23b_clinical_state_on_medication  1280 non-null   float64
dtypes: float64(5), int64(2), object(1)
memory usage: 182.0+ KB


In [18]:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.ewm.html

# Per-patient smoothed UPDRS scores: a 3-visit rolling mean and an exponentially weighted mean.
train_clinical_rolling = train_clinical.copy()

for i in range(3):
  score_col = 'updrs_{0}'.format(i+1)
  # transform() returns a Series carrying the frame's own index labels, so the assignment lines
  # up row by row. The previous `.reset_index(drop=True)` renumbered the result 0..n-1 while the
  # frame keeps the gapped index left by dropna, which shifted values onto the wrong visits
  # after the first dropped row and left NaN at the end.
  train_clinical_rolling[score_col+'_rolling'] = train_clinical_rolling.groupby('patient_id')[score_col].transform(lambda s: s.rolling(3,min_periods=1).mean())
  train_clinical_rolling[score_col+'_ewm'] = train_clinical_rolling.groupby('patient_id')[score_col].transform(lambda s: s.ewm(halflife=10).mean())
train_clinical_rolling.head()


Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,updrs_1_rolling,updrs_1_ewm,updrs_2_rolling,updrs_2_ewm,updrs_3_rolling,updrs_3_ewm
0,55_0,55,0,10.0,6.0,15.0,,,10.0,10.0,6.0,6.0,15.0,15.0
1,55_3,55,3,10.0,7.0,25.0,,,10.0,10.0,6.5,6.517322,20.0,20.173217
2,55_6,55,6,8.0,10.0,34.0,,,9.333333,9.286627,7.666667,7.759546,24.666667,25.105042
3,55_9,55,9,8.0,9.0,30.0,0.0,1.0,8.666667,8.930796,8.666667,8.102607,29.666667,26.458798
4,55_12,55,12,10.0,10.0,41.0,0.0,1.0,8.666667,9.175259,9.666667,8.536426,35.0,29.783493


# supplemental_clinical

In [19]:
# Preview the supplemental clinical records (On/Off already encoded as 1/0 by the load cell).
supplemental_clinical.head()

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,35_0,35,0,5.0,3.0,16.0,0.0,
1,35_36,35,36,6.0,4.0,20.0,0.0,
2,75_0,75,0,4.0,6.0,26.0,0.0,
3,75_36,75,36,1.0,8.0,38.0,0.0,1.0
4,155_0,155,0,,,0.0,,


# Train All

In [126]:
def make_df_all(df_cli,df_pro,df_pep):
  """Left-join protein and peptide features onto the clinical frame by `visit_id`.

  Peptide features are first attached to the protein features (left join, so visits
  missing from `df_pro` contribute no peptide features), and that combined table is
  then attached to `df_cli`. Every row of `df_cli` is kept; unmatched features are NaN.
  """
  join_opts = {'on': 'visit_id', 'how': 'left'}
  protein_peptide = df_pro.merge(df_pep, **join_opts)
  return df_cli.merge(protein_peptide, **join_opts)

# Attach the per-visit_month UPDRS statistics to each clinical row.
# NOTE(review): this starts from train_clinical, not train_clinical_rolling, so the rolling/ewm
# columns computed earlier are not part of train_all - confirm that is intended.
train_clinical_toall = train_clinical.merge(train_te_month_agg,on='visit_month',how='left')

# Combine the clinical rows with the protein and peptide feature tables (left joins on visit_id).
train_all = make_df_all(train_clinical_toall,train_proteins_feature,train_peptides_feature)
print('shape',train_all.shape)
# NOTE(review): the two bare expressions below are not the last statement of the cell, so a
# notebook does not render their output.
train_all.head()

train_all.describe()

# Persist the assembled training table.
train_all.to_csv(paths.OUTPUT+'/train_all.csv',index=False)

# Test All

In [128]:
#test_clinical.shape
#test_peptides
#test_proteins
#display(sample_submission)
#********************************
#3342_0_plus0
#3342_0_plus6     3342_6_plus0
#3342_0_plus12    3342_6_plus6
#                 3342_6_plus12
#3342_0_plus24
#                 3342_6_plus24

#50423_0_plus0
#50423_0_plus6     50423_6_plus0
#50423_0_plus12    50423_6_plus6
#                  50423_6_plus12
#50423_0_plus24
#                  50423_6_plus24
#*************************************

In [127]:
# Key columns of the example test visits; used as the row skeleton for the feature tables.
test_sort = test_clinical[['visit_id','visit_month','patient_id']]

# Peptide features are computed over train and test rows stacked together. group_key is dropped
# from the test rows first (presumably it exists only in the test files - TODO confirm).
test_train_peptides = pd.concat([train_peptides,test_peptides.drop(['group_key'],axis=1)],axis=0).reset_index(drop=True)
test_peptide_feature = feature_peptide(test_sort,test_train_peptides)

# Same construction for the protein features.
test_train_proteins = pd.concat([train_proteins,test_proteins.drop(['group_key'],axis=1)],axis=0).reset_index(drop=True)
test_protein_feature = feature_proteins(test_sort,test_train_proteins)
# NOTE(review): both feature functions merge the train-derived target-encoding tables on
# visit_id, so a test visit_id absent from train_sort gets NaN for those columns - verify.

# Attach the per-visit_month UPDRS statistics learned from the training data.
test_clinical_toall = test_clinical.merge(train_te_month_agg,on='visit_month',how='left')

test_all = make_df_all(test_clinical_toall,test_protein_feature,test_peptide_feature)

# Remove the test-harness bookkeeping columns from the assembled table.
test_all.drop(['updrs_test','row_id','group_key'],axis=1,inplace=True)

# NOTE(review): not the last statement of the cell, so a notebook does not render this value.
test_all.shape

# Persist the assembled test table.
test_all.to_csv(paths.OUTPUT+'/test_all.csv',index=False)
