In [2]:
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

import os
# Work from the project root so the relative 'data/output/...' paths used by the
# cells below resolve. The author's local checkout stays the default; set the
# PERTUSSIS_PROJECT_DIR environment variable to run the notebook from another
# machine or checkout without editing this cell.
os.chdir(os.environ.get('PERTUSSIS_PROJECT_DIR', '/Users/jhou2/Documents/GitHub/PertussisVaccine_Prediction/'))

In [None]:
# Read RNAseq.csv and Cell_Freq.csv from data/output/ (paths are relative to
# the current working directory).
RNAseq_df, Cell_Freq_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/output/RNAseq.csv', 'data/output/Cell_Freq.csv')
)

In [19]:
# Read the antibody-titer table.
Ab_titer_df = pd.read_csv('data/output/Ab_titer.csv')

# Antibody single-modality data: select the timepoint-0 rows unchanged, and the
# timepoint-14 rows with 'IgG_PT' renamed to 'IgG_PT_day14' so the day-14 value
# is distinguishable from the day-0 column of the same name.
Ab_titer_day0 = Ab_titer_df.loc[Ab_titer_df['timepoint'].eq(0)]
Ab_titer_day14 = (
    Ab_titer_df.loc[Ab_titer_df['timepoint'].eq(14)]
    .rename(columns={'IgG_PT': 'IgG_PT_day14'})
)

In [20]:
# Feature engineering: flag every column whose name contains 'TT', 'DT' or
# 'OVA' (the non-specific antibodies) and drop those columns from the day-0 frame.
nospecific_ab = Ab_titer_day0.columns.str.contains('TT|DT|OVA')
Ab_titer_day0 = Ab_titer_day0.drop(columns=Ab_titer_day0.columns[nospecific_ab])

In [21]:
# Split into train and test datasets: rows labelled "2023_dataset" form the
# test split, rows from every other dataset form the training split.
# Explicit boolean masks state which side is which, instead of relying on a
# groupby over a boolean key yielding the False group before the True group.
# They also keep working (yielding an empty frame) when one side has no rows,
# where unpacking the two groupby groups would raise ValueError.
is_2023 = Ab_titer_day0['dataset'] == "2023_dataset"
Ab_day0_train = Ab_titer_day0[~is_2023]
Ab_day0_test = Ab_titer_day0[is_2023]

In [None]:
# Attach the day-14 target ('IgG_PT_day14') to the day-0 training features,
# matching rows on 'subject_id'.
# Inner join: only subjects present in both frames are kept.
Ab_titer = Ab_day0_train.merge(
    Ab_titer_day14.loc[:, ['IgG_PT_day14', 'subject_id']],
    how='inner',
    on='subject_id',
)

In [None]:
# Remove identifier / bookkeeping columns that are not used as model inputs.
# NOTE: this rebinds Ab_titer, so re-running the cell without re-running the
# merge cell first raises KeyError (the columns are already gone).
Ab_titer = Ab_titer.drop(
    labels=[
        'specimen_id',
        'subject_id',
        'timepoint',
        'dataset',
        'date_of_boost',
        'race',
        'age_at_boost',
    ],
    axis='columns',
)

In [31]:
# One-hot encode the two categorical covariates. drop_first=True drops the
# first level of each, so a single indicator column per variable remains.
Ab_titer_encoded = pd.get_dummies(
    data=Ab_titer,
    columns=['infancy_vac', 'biological_sex'],
    drop_first=True,
)
Ab_titer_encoded

Unnamed: 0,IgG_PT,IgG_PRN,IgG_FHA,IgG1_PT,IgG1_PRN,IgG1_FHA,IgG1_FIM2.3,IgG2_PT,IgG2_PRN,IgG2_FHA,...,IgG3_FHA,IgG3_FIM2.3,IgG4_PT,IgG4_PRN,IgG4_FHA,IgG4_FIM2.3,age,IgG_PT_day14,infancy_vac_wP,biological_sex_Male
0,2.979295,2.006372,26.636688,10.098853,2.044475,4.616537,0.155137,2.409254,1.330562,2.229043,...,0.470632,-17.267982,-3.679833,9.723329,0.701456,0.644320,30,10.089424,True,False
1,1.232919,1.096891,1.743372,1.899791,1.328943,1.105913,-1.473059,2.409254,2.559018,1.385129,...,1.032893,-17.267982,-3.732561,0.710648,1.348163,0.644320,19,2.515514,False,True
2,0.945724,1.977415,1.425324,-1.648747,1.938158,0.308922,1.449089,2.409254,-2.816451,-0.214036,...,0.470632,-17.267982,-3.488708,0.474926,0.224647,4.296317,23,11.828905,True,True
3,0.196815,1.005308,1.823017,-5.022841,-0.000014,0.833370,-1.610675,2.409254,57.184495,-0.494971,...,0.576771,-17.267982,-3.732561,0.434722,0.085908,0.644320,27,4.388773,True,True
4,2.279289,0.602752,2.685105,4.185719,1.361650,3.642262,-1.340657,2.409254,-3.565405,-0.712774,...,1.542051,-15.381595,-3.732561,0.434722,0.041880,0.644320,29,4.342456,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,6.051650,-1.904160,2.341280,9.861642,3.007505,1.557882,5.908761,-1.731454,0.485349,4.683775,...,9.962974,753.723649,39.893896,26.502647,1.627105,13.158537,19,1.222487,False,False
107,5.720888,0.785478,0.549210,7.326589,1.489968,1.561535,3.993959,22.803705,3.453364,2.838332,...,-0.875346,38.052262,80.085182,-2.054115,21.339125,7.896994,21,7.337052,False,True
108,2.029688,4.075797,8.247274,9.366217,2.391124,5.487408,3.325470,-2.651417,0.145091,0.483205,...,-3.192818,14.846072,28.038609,-3.797454,5.172956,7.718030,27,4.008829,False,False
109,1.581305,-3.371448,-4.091363,5.761383,-0.321361,-1.453895,-1.736421,-2.560220,0.043994,0.931262,...,-0.784303,38.273989,16.999704,1.254939,-0.929505,5.534668,24,3.409768,False,True


In [32]:
# Display the un-encoded frame (string-valued 'infancy_vac' / 'biological_sex'),
# presumably for comparison with Ab_titer_encoded.
Ab_titer

Unnamed: 0,IgG_PT,IgG_PRN,IgG_FHA,IgG1_PT,IgG1_PRN,IgG1_FHA,IgG1_FIM2.3,IgG2_PT,IgG2_PRN,IgG2_FHA,...,IgG3_FHA,IgG3_FIM2.3,IgG4_PT,IgG4_PRN,IgG4_FHA,IgG4_FIM2.3,infancy_vac,biological_sex,age,IgG_PT_day14
0,2.979295,2.006372,26.636688,10.098853,2.044475,4.616537,0.155137,2.409254,1.330562,2.229043,...,0.470632,-17.267982,-3.679833,9.723329,0.701456,0.644320,wP,Female,30,10.089424
1,1.232919,1.096891,1.743372,1.899791,1.328943,1.105913,-1.473059,2.409254,2.559018,1.385129,...,1.032893,-17.267982,-3.732561,0.710648,1.348163,0.644320,aP,Male,19,2.515514
2,0.945724,1.977415,1.425324,-1.648747,1.938158,0.308922,1.449089,2.409254,-2.816451,-0.214036,...,0.470632,-17.267982,-3.488708,0.474926,0.224647,4.296317,wP,Male,23,11.828905
3,0.196815,1.005308,1.823017,-5.022841,-0.000014,0.833370,-1.610675,2.409254,57.184495,-0.494971,...,0.576771,-17.267982,-3.732561,0.434722,0.085908,0.644320,wP,Male,27,4.388773
4,2.279289,0.602752,2.685105,4.185719,1.361650,3.642262,-1.340657,2.409254,-3.565405,-0.712774,...,1.542051,-15.381595,-3.732561,0.434722,0.041880,0.644320,wP,Female,29,4.342456
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,6.051650,-1.904160,2.341280,9.861642,3.007505,1.557882,5.908761,-1.731454,0.485349,4.683775,...,9.962974,753.723649,39.893896,26.502647,1.627105,13.158537,aP,Female,19,1.222487
107,5.720888,0.785478,0.549210,7.326589,1.489968,1.561535,3.993959,22.803705,3.453364,2.838332,...,-0.875346,38.052262,80.085182,-2.054115,21.339125,7.896994,aP,Male,21,7.337052
108,2.029688,4.075797,8.247274,9.366217,2.391124,5.487408,3.325470,-2.651417,0.145091,0.483205,...,-3.192818,14.846072,28.038609,-3.797454,5.172956,7.718030,aP,Female,27,4.008829
109,1.581305,-3.371448,-4.091363,5.761383,-0.321361,-1.453895,-1.736421,-2.560220,0.043994,0.931262,...,-0.784303,38.273989,16.999704,1.254939,-0.929505,5.534668,aP,Male,24,3.409768


In [None]:
# One-hot encode the categorical covariates of the day-0 training split.
# NOTE(review): this cell originally read `Ab_day0_train_filtered`, a name no
# visible cell defines, so it failed on Restart & Run All. The previously
# rendered output has the same rows and columns as `Ab_day0_train`, so that
# frame is used here -- confirm this is the intended input.
df_encoded = pd.get_dummies(Ab_day0_train, columns=['infancy_vac', 'biological_sex'], drop_first=True)

# Scale 'age' column (mean=0, std=1). The scaler is fitted on this frame and is
# kept in `scaler`, so the same fitted transform can be reused on other data.
scaler = StandardScaler()
df_encoded['age_scaled'] = scaler.fit_transform(df_encoded[['age']])

# Keep only the scaled version of age.
df_encoded = df_encoded.drop(columns=['age'])
df_encoded

Unnamed: 0,specimen_id,IgG_PT,IgG_PRN,IgG_FHA,IgG1_PT,IgG1_PRN,IgG1_FHA,IgG1_FIM2.3,IgG2_PT,IgG2_PRN,...,IgG4_FIM2.3,subject_id,dataset,timepoint,date_of_boost,race,age_at_boost,infancy_vac_wP,biological_sex_Male,age_scaled
0,1,2.979295,2.006372,26.636688,10.098853,2.044475,4.616537,0.155137,2.409254,1.330562,...,0.644320,1,2020_dataset,0,2016-09-12,White,30,True,False,0.814390
9,102,1.232919,1.096891,1.743372,1.899791,1.328943,1.105913,-1.473059,2.409254,2.559018,...,0.644320,13,2020_dataset,0,2016-07-25,White,19,False,True,-0.983747
36,109,0.945724,1.977415,1.425324,-1.648747,1.938158,0.308922,1.449089,2.409254,-2.816451,...,4.296317,14,2020_dataset,0,2016-08-15,White,23,True,True,-0.329879
54,114,0.196815,1.005308,1.823017,-5.022841,-0.000014,0.833370,-1.610675,2.409254,57.184495,...,0.644320,15,2020_dataset,0,2016-08-15,Asian,27,True,True,0.323989
80,121,2.279289,0.602752,2.685105,4.185719,1.361650,3.642262,-1.340657,2.409254,-3.565405,...,0.644320,16,2020_dataset,0,2016-07-25,Unknown or Not Reported,29,True,False,0.650923
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
813,902,6.051650,-1.904160,2.341280,9.861642,3.007505,1.557882,5.908761,-1.731454,0.485349,...,13.158537,115,2022_dataset,0,2021-11-01,Asian,19,False,False,-0.983747
822,912,5.720888,0.785478,0.549210,7.326589,1.489968,1.561535,3.993959,22.803705,3.453364,...,7.896994,116,2022_dataset,0,2021-11-29,White,21,False,True,-0.656813
831,922,2.029688,4.075797,8.247274,9.366217,2.391124,5.487408,3.325470,-2.651417,0.145091,...,7.718030,117,2022_dataset,0,2021-11-29,More Than One Race,27,False,False,0.323989
839,932,1.581305,-3.371448,-4.091363,5.761383,-0.321361,-1.453895,-1.736421,-2.560220,0.043994,...,5.534668,118,2022_dataset,0,2022-01-24,Asian,24,False,True,-0.166412


In [42]:
# Display the day-0 training split for inspection (original row index retained).
Ab_day0_train

Unnamed: 0,specimen_id,IgG_PT,IgG_PRN,IgG_FHA,IgG1_PT,IgG1_PRN,IgG1_FHA,IgG1_FIM2.3,IgG2_PT,IgG2_PRN,...,IgG4_FIM2.3,subject_id,dataset,timepoint,infancy_vac,biological_sex,date_of_boost,race,age,age_at_boost
0,1,2.979295,2.006372,26.636688,10.098853,2.044475,4.616537,0.155137,2.409254,1.330562,...,0.644320,1,2020_dataset,0,wP,Female,2016-09-12,White,30,30
9,102,1.232919,1.096891,1.743372,1.899791,1.328943,1.105913,-1.473059,2.409254,2.559018,...,0.644320,13,2020_dataset,0,aP,Male,2016-07-25,White,19,19
36,109,0.945724,1.977415,1.425324,-1.648747,1.938158,0.308922,1.449089,2.409254,-2.816451,...,4.296317,14,2020_dataset,0,wP,Male,2016-08-15,White,23,23
54,114,0.196815,1.005308,1.823017,-5.022841,-0.000014,0.833370,-1.610675,2.409254,57.184495,...,0.644320,15,2020_dataset,0,wP,Male,2016-08-15,Asian,27,27
80,121,2.279289,0.602752,2.685105,4.185719,1.361650,3.642262,-1.340657,2.409254,-3.565405,...,0.644320,16,2020_dataset,0,wP,Female,2016-07-25,Unknown or Not Reported,29,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
813,902,6.051650,-1.904160,2.341280,9.861642,3.007505,1.557882,5.908761,-1.731454,0.485349,...,13.158537,115,2022_dataset,0,aP,Female,2021-11-01,Asian,19,19
822,912,5.720888,0.785478,0.549210,7.326589,1.489968,1.561535,3.993959,22.803705,3.453364,...,7.896994,116,2022_dataset,0,aP,Male,2021-11-29,White,21,21
831,922,2.029688,4.075797,8.247274,9.366217,2.391124,5.487408,3.325470,-2.651417,0.145091,...,7.718030,117,2022_dataset,0,aP,Female,2021-11-29,More Than One Race,27,27
839,932,1.581305,-3.371448,-4.091363,5.761383,-0.321361,-1.453895,-1.736421,-2.560220,0.043994,...,5.534668,118,2022_dataset,0,aP,Male,2022-01-24,Asian,24,24
