This notebook is to add additional covariates (health, stool info, medication, dietary history) to the mapping file.

In [1]:
import pandas as pd
import numpy as np

In [2]:
mf = pd.read_csv('../data/mapping_MrOS.txt', sep='\t', dtype=str, index_col='#SampleID') 

In [3]:
print(mf.shape)
mf.head()

(599, 64)


Unnamed: 0_level_0,BarcodeSequence,LinkerPrimerSequence,Experiment_Design_Description,Library_Construction_Protocol,Linker,Platform,Center_Name,Center_Project,Instrument_Model,Title,...,OHVD3,OHVD2,OHV1D2,OHV1D2CT,OHVD2CT,OHVDTOT,OHV1DTOT,OHSEAS,VDstatus,Description
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BI0023,TCTGGTGACATT,GGACTACHVGGGTWTCTAAT,16S stool samples sequenced for MrOS Vitamin D...,16S rRNA v4,GT,Illumina,BI,MrOS,Illumina MiSeq,MrOS_VitaminD,...,25.8,0.0,0.0,1: Yes,1: Yes,25.8,39.3,3:SUMMER,sufficiency,Orwoll.BI0023.BI
BI0056,CAAGCATGCCTA,GGACTACHVGGGTWTCTAAT,16S stool samples sequenced for MrOS Vitamin D...,16S rRNA v4,GT,Illumina,BI,MrOS,Illumina MiSeq,MrOS_VitaminD,...,39.2,0.0,0.0,1: Yes,1: Yes,39.2,61.9,2:SPRING,sufficiency,Orwoll.BI0056.BI
BI0131,CTATTTGCGACA,GGACTACHVGGGTWTCTAAT,16S stool samples sequenced for MrOS Vitamin D...,16S rRNA v4,GT,Illumina,BI,MrOS,Illumina MiSeq,MrOS_VitaminD,...,23.1,0.0,0.0,1: Yes,1: Yes,23.1,52.1,2:SPRING,sufficiency,Orwoll.BI0131.BI
BI0153,ATCGGCGTTACA,GGACTACHVGGGTWTCTAAT,16S stool samples sequenced for MrOS Vitamin D...,16S rRNA v4,GT,Illumina,BI,MrOS,Illumina MiSeq,MrOS_VitaminD,...,27.3,0.0,0.0,1: Yes,1: Yes,27.3,43.1,2:SPRING,sufficiency,Orwoll.BI0153.BI
BI0215,CCTCTCGTGATC,GGACTACHVGGGTWTCTAAT,16S stool samples sequenced for MrOS Vitamin D...,16S rRNA v4,GT,Illumina,BI,MrOS,Illumina MiSeq,MrOS_VitaminD,...,33.0,0.0,0.0,1: Yes,1: Yes,33.0,50.2,4:FALL,sufficiency,Orwoll.BI0215.BI


## convert covariates to proper continuous or categorical format

In [4]:
vars_cat = np.array(['BarcodeSequence', 'LinkerPrimerSequence', 'Experiment_Design_Description',
             'Library_Construction_Protocol', 'Linker', 'Platform', 'Center_Name', 'Center_Project', 'Instrument_Model',
             'Title', 'Anonymized_Name', 'Scientific_Name', 'Taxon_ID', 'Sample_Type', 'Geo_Loc_Name', 'Elevation', 'Env_Biome',
             'Env_Feature', 'Env_Material', 'Env_Package', 'Collection_Timestamp', 'DNA_Extracted', 'Physical_Specimen_Location',
             'Physical_Specimen_Remaining', 'Age_Units', 'Host_Subject_ID', 'Host_Taxid','Host_Scientific_Name', 'Host_Common_Name',
             'Life_Stage', 'Sex', 'Height_Units', 'Weight_Units', 'Body_Habitat', 'Body_Site', 'Body_Product', 'GIERACE', 'SITE',
             'TUDRAMT', 'TURSMOKE', 'M1ADEPR', 'M1VITMND', 'M1ANTIB', 'M1PROBI', 'OHSEAS', 'VDstatus', 'Description',
             'OHV1D2CT', 'OHVD2CT'])
vars_cts = np.array(['Latitude', 'Longitude', 'Age', 'Height', 'Weight', 'BMI', 'PASCORE', 'DTVITD', 
             'OHV1D3', 'OHV24D3', 'OHVD3', 'OHVD2', 'OHV1D2',  'OHVDTOT', 'OHV1DTOT'])

In [5]:
# convert vars_cts to numeric and vars_cat to factors
df = mf.copy()
df[vars_cts] = df[vars_cts].apply(pd.to_numeric, errors='coerce')
df[vars_cat] = df[vars_cat].apply(lambda x: x.astype('category'))

In [6]:
# convert all pg/ml to ng/ml note: 1 ng/ml = 1000 pg/ml
df.OHV1D3 = df.OHV1D3/1000
df.OHV1D2 = df.OHV1D2/1000
df.OHV1DTOT = df.OHV1DTOT/1000

## add two ratio variables of Vitamin D

In [7]:
df['ratio_activation'] = df.OHV1D3/df.OHVD3
df['ratio_catabolism'] = df.OHV24D3/df.OHVD3 
vars_cts = np.append(vars_cts, ['ratio_activation', 'ratio_catabolism'])

In [8]:
df[vars_cts].describe()

Unnamed: 0,Latitude,Longitude,Age,Height,Weight,BMI,PASCORE,DTVITD,OHV1D3,OHV24D3,OHVD3,OHVD2,OHV1D2,OHVDTOT,OHV1DTOT,ratio_activation,ratio_catabolism
count,599.0,599.0,599.0,599.0,599.0,599.0,599.0,584.0,567.0,567.0,556.0,567.0,567.0,556.0,567.0,556.0,556.0
mean,39.131492,-105.850862,84.237062,172.143573,80.173957,27.009647,122.482829,164.395957,0.057775,3.430864,35.229137,0.769136,0.000177,36.013489,0.057951,0.001772,0.094776
std,5.226215,17.221362,4.061471,6.80362,12.994918,3.788662,66.66765,127.4586,0.019773,1.834771,12.450758,4.26621,0.002232,12.410942,0.019656,0.000735,0.02977
min,32.715738,-122.6765,78.0,153.85,51.4,17.599566,0.0,0.429,0.0107,0.3,7.8,0.0,0.0,7.8,0.0107,0.000398,0.018788
25%,33.520661,-122.143,81.0,167.325,71.0,24.506948,70.142857,77.9625,0.0441,2.175,27.4,0.0,0.0,28.1,0.04425,0.001316,0.074216
50%,40.440625,-117.1611,83.0,172.1,79.0,26.724552,116.75,128.125,0.0555,3.18,33.65,0.0,0.0,34.15,0.0555,0.00166,0.092821
75%,44.977753,-86.80249,87.0,176.725,87.5,28.97548,165.785714,211.6425,0.0663,4.235,41.825,0.0,0.0,42.3,0.06645,0.002081,0.112849
max,45.523062,-79.99589,98.0,194.65,127.6,42.931509,359.142857,982.83,0.156,14.07,104.0,59.0,0.0378,104.0,0.156,0.006727,0.197786


In [9]:
df[vars_cat].describe()

Unnamed: 0,BarcodeSequence,LinkerPrimerSequence,Experiment_Design_Description,Library_Construction_Protocol,Linker,Platform,Center_Name,Center_Project,Instrument_Model,Title,...,TURSMOKE,M1ADEPR,M1VITMND,M1ANTIB,M1PROBI,OHSEAS,VDstatus,Description,OHV1D2CT,OHVD2CT
count,599,599,599,599,599,599,599,599,599,599,...,599,599,599,599,599,599,588,599,599,599
unique,599,1,1,1,1,1,6,1,1,1,...,4,2,2,2,2,5,3,599,3,3
top,TTGTCTGGAAGC,GGACTACHVGGGTWTCTAAT,16S stool samples sequenced for MrOS Vitamin D...,16S rRNA v4,GT,Illumina,SD,MrOS,Illumina MiSeq,MrOS_VitaminD,...,1:PAST,0: No,1: Yes,0: No,0: No,3:SUMMER,sufficiency,Orwoll.SD9009.SD,1: Yes,1: Yes
freq,1,599,599,599,599,599,134,599,599,599,...,289,546,445,558,573,220,516,1,563,448


In [10]:
# count missing values
df[vars_cts].isnull().sum()

Latitude             0
Longitude            0
Age                  0
Height               0
Weight               0
BMI                  0
PASCORE              0
DTVITD              15
OHV1D3              32
OHV24D3             32
OHVD3               43
OHVD2               32
OHV1D2              32
OHVDTOT             43
OHV1DTOT            32
ratio_activation    43
ratio_catabolism    43
dtype: int64

In [11]:
# check
df.shape

(599, 66)

## add health variables

In [12]:
health = pd.read_csv('../data/MrOS_healthvari.csv', sep=',', dtype=str)
health = health.rename(columns={'ID': '#SampleID'}).set_index('#SampleID')
health['QLCOMP'] = health['QLCOMP'].astype('category')

In [13]:
df = pd.merge(df, health, left_index=True, right_index=True)

In [14]:
print(df.shape)
df.head()

(599, 67)


Unnamed: 0_level_0,BarcodeSequence,LinkerPrimerSequence,Experiment_Design_Description,Library_Construction_Protocol,Linker,Platform,Center_Name,Center_Project,Instrument_Model,Title,...,OHV1D2CT,OHVD2CT,OHVDTOT,OHV1DTOT,OHSEAS,VDstatus,Description,ratio_activation,ratio_catabolism,QLCOMP
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BI0023,TCTGGTGACATT,GGACTACHVGGGTWTCTAAT,16S stool samples sequenced for MrOS Vitamin D...,16S rRNA v4,GT,Illumina,BI,MrOS,Illumina MiSeq,MrOS_VitaminD,...,1: Yes,1: Yes,25.8,0.0393,3:SUMMER,sufficiency,Orwoll.BI0023.BI,0.001523,0.068605,1:GOOD/EXCELLENT
BI0056,CAAGCATGCCTA,GGACTACHVGGGTWTCTAAT,16S stool samples sequenced for MrOS Vitamin D...,16S rRNA v4,GT,Illumina,BI,MrOS,Illumina MiSeq,MrOS_VitaminD,...,1: Yes,1: Yes,39.2,0.0619,2:SPRING,sufficiency,Orwoll.BI0056.BI,0.001579,0.099745,1:GOOD/EXCELLENT
BI0131,CTATTTGCGACA,GGACTACHVGGGTWTCTAAT,16S stool samples sequenced for MrOS Vitamin D...,16S rRNA v4,GT,Illumina,BI,MrOS,Illumina MiSeq,MrOS_VitaminD,...,1: Yes,1: Yes,23.1,0.0521,2:SPRING,sufficiency,Orwoll.BI0131.BI,0.002255,0.064502,1:GOOD/EXCELLENT
BI0153,ATCGGCGTTACA,GGACTACHVGGGTWTCTAAT,16S stool samples sequenced for MrOS Vitamin D...,16S rRNA v4,GT,Illumina,BI,MrOS,Illumina MiSeq,MrOS_VitaminD,...,1: Yes,1: Yes,27.3,0.0431,2:SPRING,sufficiency,Orwoll.BI0153.BI,0.001579,0.078388,1:GOOD/EXCELLENT
BI0215,CCTCTCGTGATC,GGACTACHVGGGTWTCTAAT,16S stool samples sequenced for MrOS Vitamin D...,16S rRNA v4,GT,Illumina,BI,MrOS,Illumina MiSeq,MrOS_VitaminD,...,1: Yes,1: Yes,33.0,0.0502,4:FALL,sufficiency,Orwoll.BI0215.BI,0.001521,0.109697,1:GOOD/EXCELLENT


## add stool sample variables

In [15]:
stool = pd.read_csv('../data/Stool600comments_JS.txt', sep='\t', dtype=str)
stool = stool.rename(columns={'ID': '#SampleID'}).set_index('#SampleID')

In [16]:
id_stool = list(stool.index)
id_mapping = list(df.index)

for i in id_mapping:
    if i not in id_stool:
        print(i)
        
for i in id_stool:
    if i not in id_mapping:
        print(i)

PO7020
PO7029
PO7020 
PO7029 
PO7100


In [17]:
stool['Timestool '] = stool['Timestool '].astype('category') # notice a space in name
stool['Timestoolc'] = stool['Timestoolc'].astype('category')
stool['StoolCollected'] = stool['StoolCollected'].astype('category')
stool['StoolShipped'] = stool['StoolShipped'].astype('category')
stool['Days( between collection and shipment)'] = stool['Days( between collection and shipment)'].astype('category')
stool['StoolQualityC'] = stool['StoolQualityC'].astype('category')

## add medication variables

In [18]:
med = pd.read_csv('../data/Microbiom_medsupdate.csv', sep=',', dtype=str)

In [19]:
med.head()

Unnamed: 0,ID,M1STATIN,Antihistamine,Laxative
0,BI0023,0: No,0:No,0:No
1,BI0056,0: No,0:No,0:No
2,BI0131,1: Yes,0:No,0:No
3,BI0153,1: Yes,0:No,0:No
4,BI0215,0: No,0:No,0:No


In [20]:
med = med.rename(columns={'ID': '#SampleID'}).set_index('#SampleID')
med = med.apply(lambda x: x.astype('category'))

In [21]:
med.head()

Unnamed: 0_level_0,M1STATIN,Antihistamine,Laxative
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BI0023,0: No,0:No,0:No
BI0056,0: No,0:No,0:No
BI0131,1: Yes,0:No,0:No
BI0153,1: Yes,0:No,0:No
BI0215,0: No,0:No,0:No


In [22]:
df = pd.merge(df, med, left_index=True, right_index=True)

In [23]:
df.shape

(599, 70)

In [24]:
df.head()

Unnamed: 0_level_0,BarcodeSequence,LinkerPrimerSequence,Experiment_Design_Description,Library_Construction_Protocol,Linker,Platform,Center_Name,Center_Project,Instrument_Model,Title,...,OHV1DTOT,OHSEAS,VDstatus,Description,ratio_activation,ratio_catabolism,QLCOMP,M1STATIN,Antihistamine,Laxative
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BI0023,TCTGGTGACATT,GGACTACHVGGGTWTCTAAT,16S stool samples sequenced for MrOS Vitamin D...,16S rRNA v4,GT,Illumina,BI,MrOS,Illumina MiSeq,MrOS_VitaminD,...,0.0393,3:SUMMER,sufficiency,Orwoll.BI0023.BI,0.001523,0.068605,1:GOOD/EXCELLENT,0: No,0:No,0:No
BI0056,CAAGCATGCCTA,GGACTACHVGGGTWTCTAAT,16S stool samples sequenced for MrOS Vitamin D...,16S rRNA v4,GT,Illumina,BI,MrOS,Illumina MiSeq,MrOS_VitaminD,...,0.0619,2:SPRING,sufficiency,Orwoll.BI0056.BI,0.001579,0.099745,1:GOOD/EXCELLENT,0: No,0:No,0:No
BI0131,CTATTTGCGACA,GGACTACHVGGGTWTCTAAT,16S stool samples sequenced for MrOS Vitamin D...,16S rRNA v4,GT,Illumina,BI,MrOS,Illumina MiSeq,MrOS_VitaminD,...,0.0521,2:SPRING,sufficiency,Orwoll.BI0131.BI,0.002255,0.064502,1:GOOD/EXCELLENT,1: Yes,0:No,0:No
BI0153,ATCGGCGTTACA,GGACTACHVGGGTWTCTAAT,16S stool samples sequenced for MrOS Vitamin D...,16S rRNA v4,GT,Illumina,BI,MrOS,Illumina MiSeq,MrOS_VitaminD,...,0.0431,2:SPRING,sufficiency,Orwoll.BI0153.BI,0.001579,0.078388,1:GOOD/EXCELLENT,1: Yes,0:No,0:No
BI0215,CCTCTCGTGATC,GGACTACHVGGGTWTCTAAT,16S stool samples sequenced for MrOS Vitamin D...,16S rRNA v4,GT,Illumina,BI,MrOS,Illumina MiSeq,MrOS_VitaminD,...,0.0502,4:FALL,sufficiency,Orwoll.BI0215.BI,0.001521,0.109697,1:GOOD/EXCELLENT,0: No,0:No,0:No


## add starch variables

In [25]:
st = pd.read_csv('../data/Microbiome_DietaryStarchesnew.csv', sep=',')

In [26]:
st.shape

(600, 4)

In [27]:
st = st.rename(columns={'ID': '#SampleID', 'Rstarches_c (0: <5 and 1>=5)': 'Rstarches_c'}).set_index('#SampleID')

In [28]:
st.head()

Unnamed: 0_level_0,MIDATA,Rstarch_total,Rstarches_c
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BI0023,1: Yes,3.066569,0.0
BI0056,1: Yes,3.038136,0.0
BI0131,1: Yes,2.840599,0.0
BI0153,1: Yes,4.106798,0.0
BI0215,1: Yes,0.971114,0.0


In [29]:
df = pd.merge(df, st, left_index=True, right_index=True)

In [30]:
df.shape

(599, 73)

### add PPI medication information

In [31]:
ppi = pd.read_csv('../data/PPI_info_mros.txt', sep='\t')

In [32]:
ppi.shape

(599, 3)

In [33]:
ppi = ppi.rename(columns={'ID': '#SampleID'}).set_index('#SampleID')

In [34]:
ppi.head()

Unnamed: 0_level_0,MIDATA,M1PPUMP
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1
BI0023,1: Yes,0: No
BI0056,1: Yes,1: Yes
BI0131,1: Yes,0: No
BI0153,1: Yes,0: No
BI0215,1: Yes,0: No


In [35]:
ppi = ppi.drop(columns=['MIDATA'])
df = pd.merge(df, ppi, left_index=True, right_index=True)

In [36]:
df.shape

(599, 74)

## output new mapping file

In [37]:
df.to_csv('../data/mapping_MrOS_add.txt', sep='\t', index_label='#SampleID')

In [38]:
df.head()

Unnamed: 0_level_0,BarcodeSequence,LinkerPrimerSequence,Experiment_Design_Description,Library_Construction_Protocol,Linker,Platform,Center_Name,Center_Project,Instrument_Model,Title,...,ratio_activation,ratio_catabolism,QLCOMP,M1STATIN,Antihistamine,Laxative,MIDATA,Rstarch_total,Rstarches_c,M1PPUMP
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BI0023,TCTGGTGACATT,GGACTACHVGGGTWTCTAAT,16S stool samples sequenced for MrOS Vitamin D...,16S rRNA v4,GT,Illumina,BI,MrOS,Illumina MiSeq,MrOS_VitaminD,...,0.001523,0.068605,1:GOOD/EXCELLENT,0: No,0:No,0:No,1: Yes,3.066569,0.0,0: No
BI0056,CAAGCATGCCTA,GGACTACHVGGGTWTCTAAT,16S stool samples sequenced for MrOS Vitamin D...,16S rRNA v4,GT,Illumina,BI,MrOS,Illumina MiSeq,MrOS_VitaminD,...,0.001579,0.099745,1:GOOD/EXCELLENT,0: No,0:No,0:No,1: Yes,3.038136,0.0,1: Yes
BI0131,CTATTTGCGACA,GGACTACHVGGGTWTCTAAT,16S stool samples sequenced for MrOS Vitamin D...,16S rRNA v4,GT,Illumina,BI,MrOS,Illumina MiSeq,MrOS_VitaminD,...,0.002255,0.064502,1:GOOD/EXCELLENT,1: Yes,0:No,0:No,1: Yes,2.840599,0.0,0: No
BI0153,ATCGGCGTTACA,GGACTACHVGGGTWTCTAAT,16S stool samples sequenced for MrOS Vitamin D...,16S rRNA v4,GT,Illumina,BI,MrOS,Illumina MiSeq,MrOS_VitaminD,...,0.001579,0.078388,1:GOOD/EXCELLENT,1: Yes,0:No,0:No,1: Yes,4.106798,0.0,0: No
BI0215,CCTCTCGTGATC,GGACTACHVGGGTWTCTAAT,16S stool samples sequenced for MrOS Vitamin D...,16S rRNA v4,GT,Illumina,BI,MrOS,Illumina MiSeq,MrOS_VitaminD,...,0.001521,0.109697,1:GOOD/EXCELLENT,0: No,0:No,0:No,1: Yes,0.971114,0.0,0: No
