In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Read the peptide, protein and clinical CSV files into DataFrames
# (one read_csv call per path, unpacked in the same order).
peptides, proteins, clinical = (
    pd.read_csv(csv_path)
    for csv_path in (
        "PD-datasets/train_peptides.csv",
        "PD-datasets/train_proteins.csv",
        "PD-datasets/train_clinical_data.csv",
    )
)

In [3]:
# Reshape the long peptide table to wide form: one row per visit_id, one
# column per Peptide, cell values taken from PeptideAbundance.
pep_pivot = (
    peptides
    .pivot(
        index="visit_id",
        columns="Peptide",
        values="PeptideAbundance",
    )
    .sort_values(by="visit_id")
)

In [4]:
# Reshape the long protein table to wide form: one row per visit_id, one
# column per UniProt identifier, cell values taken from NPX.
pro_pivot = proteins.pivot(
    values="NPX",
    columns="UniProt",
    index="visit_id",
)

In [5]:
# Attach the protein columns to the peptide table, matching on visit_id.
# A left join keeps every row of pep_pivot even when no protein row matches.
merged_pivots = pd.merge(
    left=pep_pivot,
    right=pro_pivot,
    on='visit_id',
    how='left',
)
print(merged_pivots.shape)
merged_pivots.head()

(1113, 1195)


Unnamed: 0_level_0,AADDTWEPFASGK,AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K,AAFTEC(UniMod_4)C(UniMod_4)QAADK,AANEVSSADVK,AATGEC(UniMod_4)TATVGKR,AATVGSLAGQPLQER,AAVYHHFISDGVR,ADDKETC(UniMod_4)FAEEGK,ADDKETC(UniMod_4)FAEEGKK,ADDLGKGGNEESTKTGNAGSR,...,Q9HDC9,Q9NQ79,Q9NYU2,Q9UBR2,Q9UBX5,Q9UHG2,Q9UKV8,Q9UNU6,Q9Y646,Q9Y6R7
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10053_0,6580710.0,31204.4,7735070.0,,,,46620.3,236144.0,,,...,,9469.45,94237.6,,23016.0,177983.0,65900.0,15382.0,,19017.4
10053_12,6333510.0,52277.6,5394390.0,,,,57554.5,108298.0,45885.4,,...,,14408.4,,,28537.0,171733.0,65668.1,,9295.65,25697.8
10053_18,7129640.0,61522.0,7011920.0,35984.7,17188.0,19787.3,36029.4,708729.0,5067790.0,30838.2,...,317477.0,38667.2,111107.0,,37932.6,245188.0,59986.1,10813.3,,29102.7
10138_12,7404780.0,46107.2,10610900.0,,20910.2,66662.3,55253.9,79575.5,6201210.0,26720.0,...,557904.0,44556.9,155619.0,14647.9,36927.7,229232.0,106564.0,26077.7,21441.8,7642.42
10138_24,13788300.0,56910.3,6906160.0,13785.5,11004.2,63672.7,36819.8,34160.9,2117430.0,15645.2,...,,47836.7,177619.0,17061.1,25510.4,176722.0,59471.4,12639.2,15091.4,6168.55


In [6]:
# Add the clinical columns to the wide abundance table, matching on visit_id.
# A left join keeps every row of merged_pivots; visits with no clinical row
# get NaN in the clinical columns.
full_merge = pd.merge(
    left=merged_pivots,
    right=clinical,
    on='visit_id',
    how='left',
)
print(full_merge.shape)
full_merge.head()

(1113, 1203)


Unnamed: 0,visit_id,AADDTWEPFASGK,AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K,AAFTEC(UniMod_4)C(UniMod_4)QAADK,AANEVSSADVK,AATGEC(UniMod_4)TATVGKR,AATVGSLAGQPLQER,AAVYHHFISDGVR,ADDKETC(UniMod_4)FAEEGK,ADDKETC(UniMod_4)FAEEGKK,...,Q9UNU6,Q9Y646,Q9Y6R7,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,10053_0,6580710.0,31204.4,7735070.0,,,,46620.3,236144.0,,...,15382.0,,19017.4,10053.0,0.0,3.0,0.0,13.0,0.0,
1,10053_12,6333510.0,52277.6,5394390.0,,,,57554.5,108298.0,45885.4,...,,9295.65,25697.8,10053.0,12.0,4.0,2.0,8.0,0.0,
2,10053_18,7129640.0,61522.0,7011920.0,35984.7,17188.0,19787.3,36029.4,708729.0,5067790.0,...,10813.3,,29102.7,10053.0,18.0,2.0,2.0,0.0,0.0,
3,10138_12,7404780.0,46107.2,10610900.0,,20910.2,66662.3,55253.9,79575.5,6201210.0,...,26077.7,21441.8,7642.42,10138.0,12.0,3.0,6.0,31.0,0.0,On
4,10138_24,13788300.0,56910.3,6906160.0,13785.5,11004.2,63672.7,36819.8,34160.9,2117430.0,...,12639.2,15091.4,6168.55,10138.0,24.0,4.0,7.0,19.0,10.0,On


In [7]:
# Replace the medication-state column with integer codes from a LabelEncoder.
# In the output shown for this cell, rows with a missing state come out as 2
# and "On" rows as 1.
le = LabelEncoder()
encoded_mm = pd.DataFrame(
    {
        "upd23b_clinical_state_on_medication": le.fit_transform(
            full_merge["upd23b_clinical_state_on_medication"]
        )
    },
    index=full_merge.index,
)
# Overwrites the original column of full_merge in place.
full_merge["upd23b_clinical_state_on_medication"] = encoded_mm[
    "upd23b_clinical_state_on_medication"
]
full_merge.head()

Unnamed: 0,visit_id,AADDTWEPFASGK,AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K,AAFTEC(UniMod_4)C(UniMod_4)QAADK,AANEVSSADVK,AATGEC(UniMod_4)TATVGKR,AATVGSLAGQPLQER,AAVYHHFISDGVR,ADDKETC(UniMod_4)FAEEGK,ADDKETC(UniMod_4)FAEEGKK,...,Q9UNU6,Q9Y646,Q9Y6R7,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,10053_0,6580710.0,31204.4,7735070.0,,,,46620.3,236144.0,,...,15382.0,,19017.4,10053.0,0.0,3.0,0.0,13.0,0.0,2
1,10053_12,6333510.0,52277.6,5394390.0,,,,57554.5,108298.0,45885.4,...,,9295.65,25697.8,10053.0,12.0,4.0,2.0,8.0,0.0,2
2,10053_18,7129640.0,61522.0,7011920.0,35984.7,17188.0,19787.3,36029.4,708729.0,5067790.0,...,10813.3,,29102.7,10053.0,18.0,2.0,2.0,0.0,0.0,2
3,10138_12,7404780.0,46107.2,10610900.0,,20910.2,66662.3,55253.9,79575.5,6201210.0,...,26077.7,21441.8,7642.42,10138.0,12.0,3.0,6.0,31.0,0.0,1
4,10138_24,13788300.0,56910.3,6906160.0,13785.5,11004.2,63672.7,36819.8,34160.9,2117430.0,...,12639.2,15091.4,6168.55,10138.0,24.0,4.0,7.0,19.0,10.0,1


In [8]:
# Keep only visits whose medication state was actually recorded.
#
# The keep-mask is derived from the raw `clinical` table instead of comparing
# the encoded column to a hard-coded 2: that code only means "missing" when
# both 'Off' and 'On' occur in the data and the encoder places NaN last. If
# either assumption fails, missing-state rows would survive the filter and be
# silently zero-filled below.
visits_with_state = clinical.loc[
    clinical["upd23b_clinical_state_on_medication"].notna(), "visit_id"
]
has_state = full_merge["visit_id"].isin(visits_with_state)

# NOTE(review): fillna(0) also zero-fills missing updrs_* scores and absent
# peptide/protein abundances — confirm 0 is the intended placeholder.
clean_df = full_merge.loc[has_state].fillna(0)
clean_df.head()

Unnamed: 0,visit_id,AADDTWEPFASGK,AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K,AAFTEC(UniMod_4)C(UniMod_4)QAADK,AANEVSSADVK,AATGEC(UniMod_4)TATVGKR,AATVGSLAGQPLQER,AAVYHHFISDGVR,ADDKETC(UniMod_4)FAEEGK,ADDKETC(UniMod_4)FAEEGKK,...,Q9UNU6,Q9Y646,Q9Y6R7,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
3,10138_12,7404780.0,46107.2,10610900.0,0.0,20910.2,66662.3,55253.9,79575.5,6201210.0,...,26077.7,21441.8,7642.42,10138.0,12.0,3.0,6.0,31.0,0.0,1
4,10138_24,13788300.0,56910.3,6906160.0,13785.5,11004.2,63672.7,36819.8,34160.9,2117430.0,...,12639.2,15091.4,6168.55,10138.0,24.0,4.0,7.0,19.0,10.0,1
5,10138_36,6924040.0,64313.5,9098610.0,26400.9,18784.6,78724.1,50155.8,508579.0,5601500.0,...,0.0,22910.5,9596.48,10138.0,36.0,5.0,2.0,11.0,0.0,1
13,10541_60,5578010.0,61435.3,4902210.0,0.0,11377.5,107775.0,40710.1,330576.0,2868310.0,...,0.0,23121.7,17482.5,10541.0,60.0,5.0,3.0,29.0,3.0,0
14,10541_84,4339140.0,18940.0,4370350.0,26725.7,8668.14,94758.2,36451.5,325450.0,2418020.0,...,0.0,16998.2,16885.9,10541.0,84.0,5.0,9.0,38.0,3.0,0


In [9]:
# Names of the four updrs_* columns treated as outcomes.
OUTCOME_COLUMNS = [
    'updrs_1',
    'updrs_2',
    'updrs_3',
    'updrs_4',
]
# The entry at index 2 ('updrs_3') is selected as the target.
TARGET_COLUMN = OUTCOME_COLUMNS[2]

In [None]:
# NOTE(review): disabled helper, never defined at runtime. If enabled it would
# split a visit_id string on "_" and return the second piece as an int
# (e.g. "10053_12" -> 12). Judging by the sample rows, that piece looks like
# the visit month rather than an id — confirm before reviving, or remove.
# def clean_visit_id(raw_visit_id):
#     chunks = raw_visit_id.split("_")
#     visitstr = chunks[1]
#     visit_id = int(visitstr)
#     return visit_id