# Goal

Jacobo de la Cuesta. November 2018.

The goal of this notebook is to gather and organize the metadata files for the association analyses of vadinCA11 and cardiovascular disease.

# Init

In [1]:
library(tidyverse)

Loading tidyverse: ggplot2
Loading tidyverse: tibble
Loading tidyverse: tidyr
Loading tidyverse: readr
Loading tidyverse: purrr
Loading tidyverse: dplyr
“package ‘dplyr’ was built under R version 3.4.3”Conflicts with tidy packages ---------------------------------------------------
filter(): dplyr, stats
lag():    dplyr, stats


In [2]:
source("/ebio/abt3_projects/vadinCA11/data/V11/R_utils/utils.R")

# Var

In [3]:
# Blood pressure data (long format: one row per subject / visit / phenotype code)
E788_16S_file = '/ebio/abt3_projects/TwinsUK/PhenotypeData/E788/E788_230317.1_16S.txt'

E788 = read.delim(E788_16S_file, sep='\t') %>%
    # Parse both date columns with the same explicit format.
    # (The original call misspelled `format` for Visit_Date, so the argument
    # was silently ignored and parsing relied on as.Date's default formats.)
    mutate(DATE_BORN = DATE_BORN %>% as.Date(format='%Y-%m-%d'),
           Visit_Date = Visit_Date %>% as.Date(format='%Y-%m-%d')) %>%
    # Approximate age in whole years at the visit (days / 365, leap days ignored).
    # Units are pinned to days: with the default units = "auto", difftime picks
    # the unit from the smallest difference, which would make the division by
    # 365 meaningless if any difference were shorter than one day.
    mutate(calculated_age = floor(as.numeric(difftime(Visit_Date, DATE_BORN,
                                                      units = 'days')) / 365))
E788 %>% dfhead

“package ‘bindrcpp’ was built under R version 3.4.3”

[1] 162575     13


MicrobiomeID,DATE_BORN,SEX,ACTUAL_ZYGOSITY,Anomaly,Ethnic_Origin,Visit_Date,PhenID,Phen_value,FPBarcode,SequencingSpecificName,delta_days,calculated_age
NA52071,1931-07-15,F,MZ,,White,2008-05-07,P001394,125,1002741,Plate6.49,1042,76
NA52071,1931-07-15,F,MZ,,White,2008-05-07,P001395,62,1002741,Plate6.49,1042,76
NA52071,1931-07-15,F,MZ,,White,2008-05-07,P001396,125,1002741,Plate6.49,1042,76


In [4]:
# Sample sheet of the BGI250 set: maps BGI IDs to TwinsUK names and barcodes
BGI250_names_file <- '/ebio/abt3_projects/TwinsUK/BGItwin250/2015-BGItwin250_sample_info_for_Ilana.txt'
BGI250_names <- read.delim(file = BGI250_names_file, sep = '\t')
BGI250_names %>% dfhead()

[1] 250  56


BGI_ID,BGI_DNA_Concentration_ng_per_ul,BGI_DNA_Total_Quantity_ug,TwinsUK_ID_on_sample,CornellSampleBarcode,CollectionMethod,SampleShipmentNum,SampleTwinsUKReceiveDate,SampleCollectionDate,SampleCollectionTime,⋯,Anxiety_Q17A_12,Anxiety_Q17A_13,Anxiety_Q17A_14,Anxiety_Q17A_15,Anxiety_Q17A_16,Anxiety_Q17A_17,Anxiety_Q17A_18,Anxiety_Q17A_19,Anomaly,Ethnic_Origin
35577,62.341,3.117,10601,1002409,Visit,2,1/4/11,1/3/11,21:20,⋯,0,1,0,3,0,0,0,0,,White
35578,135.406,6.77,10602,1002425,Visit,2,1/4/11,1/2/11,11:10,⋯,0,0,0,0,0,0,0,0,,White
37096,31.06946,1.553473,1161,1003428,Visit,5,9/13/11,9/12/11,11:00,⋯,2,2,0,0,0,1,0,0,,White


In [5]:
# BGI250 metadata, stored as two tab-separated tables
xu_metadata_file1 <- '/ebio/abt3_projects/Anxiety_Twins_Metagenomes/data/metadata/Xu_BGI250MG_metadata1.txt'
xu_metadata_file2 <- '/ebio/abt3_projects/Anxiety_Twins_Metagenomes/data/metadata/Xu_BGI250MG_metadata2.txt'

xu_metadata1 <- read.delim(xu_metadata_file1, sep = '\t')
xu_metadata2 <- read.delim(xu_metadata_file2, sep = '\t')

# Keep every row of table 1 and attach the matching columns of table 2.
# The ID column is spelled differently in the two tables (BGI.ID vs BGI.id);
# the remaining keys carry the same name on both sides.
BGI250_metadata <- xu_metadata1 %>%
    left_join(xu_metadata2,
              by = c("BGI.ID" = "BGI.id",
                     "Twin.Pair.Number" = "Twin.Pair.Number",
                     "Zygosity" = "Zygosity",
                     "Sex" = "Sex",
                     "Age.at.metagenomics.sample" = "Age.at.metagenomics.sample"))
BGI250_metadata %>% dfhead()

[1] 254  28


BGI.ID,KCL.ID,Twin.Pair.Number,Zygosity,Sex,Age.at.metagenomics.sample,Age.at.twins.start.living.apart,Birth.Order,Type.of.delivery,BMI,⋯,Currently..how.many.minutes.per.week.do.you.spend.walking.briskly...gardening.vigorously.,Diabetes,Drugs.Diabetic.Tablets.,Drugs.Insulin.,FamilyID,TwinID,BirthDate,Age,Ethnic_Origin,X2008_AS
35365,81971,89,MZ,False,62,28,,,38.35,⋯,600.0,Y,N,GLARGINE;NOVORAPID,NA8197,NA81971,11/12/48,61.0,White,3.0
35366,81972,89,MZ,False,62,28,,,40.99,⋯,,Y,N,N,NA8197,NA81972,11/12/48,61.0,White,2.0
35387,21081,18,DZ,False,50,16,Second,Natural Birth,19.69,⋯,240.0,N,N,N,,,,,,


In [6]:
# Merge the two BGI250 tables
# Keep only the identifier and subject-level columns of the sample sheet,
# renaming BGI_ID so it matches the key used in the metadata table.
BGI_tmp_names = BGI250_names %>% select(BGI.ID = BGI_ID, CornellSampleBarcode, IndividualID,
                                  IndividualGender, IndividualZygosity, IndividualFamilyID, 
                                  IndividualTwinID, IndividualAge, IndividualBMI) 

# Join on BGI.ID explicitly instead of relying on whatever columns happen to
# share a name; BGI.ID is the key the implicit join reported using.
BGI_merged = left_join(BGI_tmp_names, BGI250_metadata, by = "BGI.ID") 
BGI_merged %>% dfhead

Joining, by = "BGI.ID"


[1] 250  36


BGI.ID,CornellSampleBarcode,IndividualID,IndividualGender,IndividualZygosity,IndividualFamilyID,IndividualTwinID,IndividualAge,IndividualBMI,KCL.ID,⋯,Currently..how.many.minutes.per.week.do.you.spend.walking.briskly...gardening.vigorously.,Diabetes,Drugs.Diabetic.Tablets.,Drugs.Insulin.,FamilyID,TwinID,BirthDate,Age,Ethnic_Origin,X2008_AS
35577,1002409,NA10601,False,MZ,NA1060,NA10601,59,38.04688,10601,⋯,240,N,N,N,NA1060,NA10601,7/17/51,59,White,1
35578,1002425,NA10602,False,MZ,NA1060,NA10602,59,31.79819,10602,⋯,0,N,N,N,NA1060,NA10602,7/17/51,59,White,1
37096,1003428,NA1161,False,DZ,NA116,NA1161,64,35.74951,1161,⋯,80,N,N,N,NA116,NA1161,4/1/47,64,White,1


In [7]:
# # BGI250 metadata
# BGI250_metadata_file = "/ebio/abt3_projects/vadinCA11/data/V11/TUK_samples/BGI250_metadata.txt"
# BGI250_metadata = read.delim(BGI250_metadata_file, sep='\t')
# # Remove duplicated columns
# duplicated_cols = colnames(BGI250_metadata) %>% grep(x = .,pattern =  ".y", fixed = T)
# BGI250_metadata = BGI250_metadata %>% select(-duplicated_cols)
# # fix column names
# colnames(BGI250_metadata) = colnames(BGI250_metadata) %>% 
#     sub(x = ., ".x", "", fixed = T)

# BGI250_metadata %>% dfhead

In [8]:
# Metadata of the anxiety sample set
anxiety_file <- '/ebio/abt3_projects/Anxiety_Twins_Metagenomes/data/metadata/TwinsUK_MZ-disAnx_metadata.txt'
anxiety_metadata <- read.delim(anxiety_file, sep = '\t')

# Strip a leading "A" from the sample names (after converting them to text)
anon_names <- as.character(anxiety_metadata$s.NameOnSampleWithAnon)
anxiety_metadata$s.NameOnSampleWithAnon <- sub("^A", "", anon_names)
rm(anon_names)

anxiety_metadata %>% dfhead()

[1] 19 24


s.NameOnSampleWithAnon,s.NameOnSampleWithOutAnon,Anxiety_score,i.IndividualID,i.TwinID,i.zygosity,age,i.DateOfBirth,i.FamilyID,i.gender,⋯,s.FPBarcode,s.IsNameOnSampleAnon,s.MapSampleToIndividualID,s.ShipmentNum,s.TwinsUKCollectionDate,s.TwinsUKCollectionTime,s.TwinsUKReceiveDate,s.TwinsUKReceiveTime,Extractedby,SequencingRunID
61154,61154,3,A61154,A1202,MZ,44,1967-09-07,A120,False,⋯,1003127,1,A61154,7,2012-07-02,14:00,2012-07-02,,Jess,TwinsUKPlate14
49283,49283,1,A49283,A1201,MZ,44,1967-09-07,A120,False,⋯,1003130,1,A49283,7,2012-07-02,9:15,2012-07-02,,Jess,TwinsUKPlate14
31520,31520,3,A31520,A2601,MZ,58,1954-04-21,A260,False,⋯,1006101,1,A31520,8,2013-02-18,4:40,2013-02-18,,Jess,TwinsUKPlate16


In [9]:
# vadinCA11 TUK sample names file (columns: old_name, new_name, changed)
vadinCA11_names_file = "/ebio/abt3_projects/vadinCA11/data/V11/TUK_samples/fixed_samplenames.txt"
v11_names = read.delim(vadinCA11_names_file, sep='\t')
v11_names %>% dfhead

# Load vadinCA11 metadata
vadinCA11_map_file = "/ebio/abt3_projects/vadinCA11/data/metadata/HiSeqRuns83-91_TUK-metadata.tsv"
v11_map = read.delim(vadinCA11_map_file, sep='\t')

# Rename barcode column (the renamed column ends up last in the table)
v11_map$FPBarcode = v11_map$s.FPBarcode
v11_map$s.FPBarcode = NULL

# Keep only the samples listed in the names file, matching the metadata
# barcode against the new sample name, and retain the first row per Sample.
# ungroup() so that the grouping by Sample does not leak into later cells.
v11_map = inner_join(v11_map, v11_names, by = c("FPBarcode" = "new_name")) %>%
    group_by(Sample) %>% 
    filter(row_number(Sample) == 1) %>%
    ungroup()
v11_map %>% dfhead

[1] 128   3


old_name,new_name,changed
50491,1002428,True
1171,1002424,True
32031,1002537,True


[1] 128  52


Sample,s.NameOnSampleWithAnon,s.NameOnSampleWithOutAnon,s.IsNameOnSampleAnon,s.CollectionMethod,s.TwinsUKReceiveDate,s.TwinsUKReceiveTime,s.TwinsUKCollectionTime,s.TwinsUKCollectionDate,s.CornellRecordedCollectionDate,⋯,Extractedby,gDNAConcentration,ExtractionPlate,WellInExtractionPlate,NameOnExtractionPlate,ExtractionComments,SequencingRunID,FPBarcode,old_name,changed
1002424,NA1171,1171,0,Visit,2010-12-14,,,,2010-12-10,⋯,Jess,209.42,p10,G12,1171,,TwinsUKHigh1,1002424,1171,True
1002874,NA4632,4632,0,Visit,2011-05-18,,,,2011-05-16,⋯,Nick,,S4p2,D4,4632,,TwinsUKPlate8,1002874,4632,True
1002875,NA8571,8571,0,Visit,2011-05-19,,,,2011-05-17,⋯,Nick,,S4p2,C5,8571,,TwinsUKPlate8,1002875,8571,True


In [10]:
# Barcodes of the flagellin metagenome samples (32 values)
Flagellin_barcodes <- c(
    1002372, 1003344, 1003430, 1003466,
    1003585, 1005954, 1005995, 1006002,
    1006095, 1006102, 1006675, 1006723,
    1012268, 1012289, 1012307, 1012313,
    1012485, 1012548, 1012620, 1013576,
    1013577, 1013662, 1013663, 1013861,
    1013862, 1014063, 1014064, 1014133,
    1014134, 1014231, 1014285, 1014291
)

# Single-column table so the barcodes can be used in joins
Flagellin_samples <- data.frame(FPBarcode = Flagellin_barcodes)
Flagellin_samples %>% dfhead()

[1] 32  1


FPBarcode
1002372
1003344
1003430


# Organize data

## TUK blood pressure

In [11]:
# How many distinct subjects (MicrobiomeID) appear in the blood pressure table
n_distinct(E788$MicrobiomeID)

In [12]:
# Check whether any visit has more than one value for the same BP phenotype

# Lookup table: phenotype code -> blood pressure measurement label
bp_ids <- c('P001394','P001395','P001396','P001397')
bp_text <- c('BP_SYS_2', 'BP_DIAS_2', 'BP_SYS_3', 'BP_DIAS_3')
df_bp <- data.frame(bp_ids, bp_text)

# Number of subject / visit / phenotype combinations that occur more than once
E788 %>%
    filter(PhenID %in% df_bp$bp_ids) %>%
    mutate(Phen_value = as.Num(Phen_value)) %>%
    count(MicrobiomeID, Visit_Date, PhenID) %>%
    filter(n > 1) %>%
    nrow()

In [13]:
# Keep only the blood pressure phenotypes and attach their text label
# (bp_text) by matching PhenID against the lookup table
TUK_BP <- inner_join(E788, df_bp, by = c('PhenID' = 'bp_ids')) %>%
    mutate(PhenID = as.character(PhenID),
           Phen_value = as.Num(Phen_value))

“Column `PhenID`/`bp_ids` joining factors with different levels, coercing to character vector”

## VadinCA11 samples

In [14]:
# Blood pressure rows of the vadinCA11 samples: keep only barcodes that are
# present in both the BP table and the vadinCA11 metadata
v11_BP <- TUK_BP %>%
    inner_join(v11_map, by = "FPBarcode")
v11_BP %>% dfhead()

[1] 472  65


MicrobiomeID,DATE_BORN,SEX,ACTUAL_ZYGOSITY,Anomaly,Ethnic_Origin,Visit_Date,PhenID,Phen_value,FPBarcode,⋯,ExtractionPlateLoadedby,Extractedby,gDNAConcentration,ExtractionPlate,WellInExtractionPlate,NameOnExtractionPlate,ExtractionComments,SequencingRunID,old_name,changed
A71040,1944-11-25,F,DZ,,White,2014-09-22,P001394,142,1019314,⋯,Noah,Jess,,Plate36,C9,1019314,,TwinsUKPlate35_36,1019314,False
A71040,1944-11-25,F,DZ,,White,2014-09-22,P001395,72,1019314,⋯,Noah,Jess,,Plate36,C9,1019314,,TwinsUKPlate35_36,1019314,False
A71040,1944-11-25,F,DZ,,White,2014-09-22,P001396,137,1019314,⋯,Noah,Jess,,Plate36,C9,1019314,,TwinsUKPlate35_36,1019314,False


In [15]:
# Mean and sd of the two readings per sample and BP type.
# bp_group drops the trailing _2 / _3 from the label, so that both
# systolic (or both diastolic) readings fall into the same group.
v11_mBP <- v11_BP %>%
    mutate(PhenID = as.character(PhenID),
           Phen_value = as.Num(Phen_value),
           bp_group = gsub(pattern = '_[23]$', replacement = '', x = bp_text)) %>%
    group_by(FPBarcode, bp_group, ACTUAL_ZYGOSITY,
             MicrobiomeID, calculated_age, s.BMI, age) %>%
    summarise(mean_value = mean(Phen_value, na.rm = TRUE),
              sd_value = sd(Phen_value, na.rm = TRUE)) %>%
    ungroup()

In [16]:
# Inspect the summarised vadinCA11 blood pressure table
v11_mBP %>% dfhead()

[1] 236   9


FPBarcode,bp_group,ACTUAL_ZYGOSITY,MicrobiomeID,calculated_age,s.BMI,age,mean_value,sd_value
1002319,BP_DIAS,MZ,NA55891,70,37.84576,73,69.5,2.12132
1002319,BP_SYS,MZ,NA55891,70,37.84576,73,118.5,2.12132
1002320,BP_DIAS,MZ,NA55892,70,36.13234,73,77.0,1.414214


## Anxiety samples

In [17]:
# Blood pressure rows of the anxiety samples: keep only barcodes present in
# both the BP table (FPBarcode) and the anxiety metadata (s.FPBarcode)
anx_BP <- TUK_BP %>%
    inner_join(anxiety_metadata, by = c("FPBarcode" = "s.FPBarcode"))
anx_BP %>% dfhead()

[1] 36 37


MicrobiomeID,DATE_BORN,SEX,ACTUAL_ZYGOSITY,Anomaly,Ethnic_Origin,Visit_Date,PhenID,Phen_value,FPBarcode,⋯,s.CollectionTime,s.IsNameOnSampleAnon,s.MapSampleToIndividualID,s.ShipmentNum,s.TwinsUKCollectionDate,s.TwinsUKCollectionTime,s.TwinsUKReceiveDate,s.TwinsUKReceiveTime,Extractedby,SequencingRunID
A55827,1946-11-20,F,MZ,,White,2013-01-24,P001394,137,1006002,⋯,8:30,1,A55827,8,2013-01-24,8:30,2013-01-24,,Jess,TwinsUKPlate15
A55827,1946-11-20,F,MZ,,White,2013-01-24,P001395,85,1006002,⋯,8:30,1,A55827,8,2013-01-24,8:30,2013-01-24,,Jess,TwinsUKPlate15
A55827,1946-11-20,F,MZ,,White,2013-01-24,P001396,140,1006002,⋯,8:30,1,A55827,8,2013-01-24,8:30,2013-01-24,,Jess,TwinsUKPlate15


In [18]:
# Mean and sd of the two readings per anxiety sample and BP type.
# bp_group drops the trailing _2 / _3 so both readings of a type are pooled.
anx_mBP <- anx_BP %>%
    mutate(PhenID = as.character(PhenID),
           Phen_value = as.Num(Phen_value),
           bp_group = gsub(pattern = '_[23]$', replacement = '', x = bp_text)) %>%
    group_by(FPBarcode, bp_group, ACTUAL_ZYGOSITY,
             MicrobiomeID, s.BMI, age, i.FamilyID) %>%
    summarise(mean_value = mean(Phen_value, na.rm = TRUE),
              sd_value = sd(Phen_value, na.rm = TRUE)) %>%
    ungroup()

In [19]:
# Inspect the summarised anxiety blood pressure table
anx_mBP %>% dfhead()

[1] 18  9


FPBarcode,bp_group,ACTUAL_ZYGOSITY,MicrobiomeID,s.BMI,age,i.FamilyID,mean_value,sd_value
1006002,BP_DIAS,MZ,A55827,20.83471,66,A259,82.5,3.535534
1006002,BP_SYS,MZ,A55827,20.83471,66,A259,138.5,2.12132
1006101,BP_DIAS,MZ,A31520,27.01636,58,A260,80.0,2.828427


## BGI250 data

In [20]:
# Some subjects have several samples (i.e. several barcodes), and each
# barcode carries its own copy of the blood pressure values.

# Subjects with more than the four BP rows expected for a single sample
TUK_BP %>%
    count(MicrobiomeID) %>%
    filter(n > 4)

# Example subject
TUK_BP %>% filter(MicrobiomeID == "NA1372")

# Consequently BGI250 and TUK_BP have to be joined by barcode, not by subject

MicrobiomeID,n
A12397,8
A19032,8
A23825,8
A28196,8
A49876,8
A68710,8
A82394,8
NA10321,8
NA10601,8
NA10602,8


MicrobiomeID,DATE_BORN,SEX,ACTUAL_ZYGOSITY,Anomaly,Ethnic_Origin,Visit_Date,PhenID,Phen_value,FPBarcode,SequencingSpecificName,delta_days,calculated_age,bp_text
NA1372,1959-04-05,F,DZ,,White,2009-03-09,P001394,127,1002896,Low2.14,812,49,BP_SYS_2
NA1372,1959-04-05,F,DZ,,White,2009-03-09,P001394,127,1002858,Plate8.30,808,49,BP_SYS_2
NA1372,1959-04-05,F,DZ,,White,2009-03-09,P001395,88,1002896,Low2.14,812,49,BP_DIAS_2
NA1372,1959-04-05,F,DZ,,White,2009-03-09,P001395,88,1002858,Plate8.30,808,49,BP_DIAS_2
NA1372,1959-04-05,F,DZ,,White,2009-03-09,P001396,150,1002896,Low2.14,812,49,BP_SYS_3
NA1372,1959-04-05,F,DZ,,White,2009-03-09,P001396,150,1002858,Plate8.30,808,49,BP_SYS_3
NA1372,1959-04-05,F,DZ,,White,2009-03-09,P001397,92,1002896,Low2.14,812,49,BP_DIAS_3
NA1372,1959-04-05,F,DZ,,White,2009-03-09,P001397,92,1002858,Plate8.30,808,49,BP_DIAS_3
NA1372,1959-04-05,F,DZ,,White,2014-04-14,P001394,134,1014073,Plate30.89,2,55,BP_SYS_2
NA1372,1959-04-05,F,DZ,,White,2014-04-14,P001395,78,1014073,Plate30.89,2,55,BP_DIAS_2


In [21]:
# Attach blood pressure values to the BGI250 samples, matching by barcode
BP_data <- select(TUK_BP, MicrobiomeID, Phen_value, FPBarcode, bp_text)

# Left join: BGI250 samples without BP data are kept, with NA values
BGI250_complete <- BGI_merged %>%
    left_join(BP_data, by = c("CornellSampleBarcode" = "FPBarcode"))

# Reduce to the columns of interest and harmonise some names
BGI250_BP <- BGI250_complete %>%
    select(BGI.ID,
           age = Age.at.metagenomics.sample,
           Type.of.delivery, vegetarian.or.Vegan, Smoking, Diabetes,
           Drugs.Diabetic.Tablets., Drugs.Insulin.,
           IndividualID, IndividualZygosity, IndividualFamilyID,
           BMI = IndividualBMI,
           Phen_value,
           FPBarcode = CornellSampleBarcode,
           bp_text)

# Mean and sd of the readings per sample and BP type (systolic / diastolic);
# bp_group drops the trailing _2 / _3 so both readings of a type are pooled
BGI250_mBP <- BGI250_BP %>%
    mutate(Phen_value = as.Num(Phen_value),
           bp_group = gsub(pattern = '_[23]$', replacement = '', x = bp_text)) %>%
    group_by(FPBarcode, BGI.ID, IndividualID, bp_group, BMI, age,
             vegetarian.or.Vegan, Smoking, Diabetes,
             IndividualZygosity, IndividualFamilyID) %>%
    summarise(mean_value = mean(Phen_value, na.rm = TRUE),
              sd_value = sd(Phen_value, na.rm = TRUE)) %>%
    ungroup()

BGI250_mBP %>% dfhead()

[1] 452  13


FPBarcode,BGI.ID,IndividualID,bp_group,BMI,age,vegetarian.or.Vegan,Smoking,Diabetes,IndividualZygosity,IndividualFamilyID,mean_value,sd_value
1002314,35391,NA3241,BP_DIAS,21.42317,65,N,Never Smoked,N,DZ,NA324,61.0,1.414214
1002314,35391,NA3241,BP_SYS,21.42317,65,N,Never Smoked,N,DZ,NA324,124.0,2.828427
1002317,35387,NA21081,,19.68891,50,N,Never Smoked,N,DZ,NA2108,,


In [22]:
# Rows per barcode in the BGI250 summary: 2 rows = systolic and diastolic
# present, 1 row = sample without BP data
BGI250_mBP %>%
    count(FPBarcode) %>%
    pull(n) %>%
    table()

.
  1   2 
 48 202 

In [23]:
# Barcodes that occur in both the vadinCA11 and the BGI250 sets
overlap_v11_BGI <- intersect(BGI250_mBP$FPBarcode, v11_names$new_name)
print(overlap_v11_BGI)

[1] 1002409 1002424 1002474 1002503 1002521 1002774 1002868 1003472


In [24]:
# Drop from the vadinCA11 table the barcodes already covered by BGI250,
# so they are not duplicated when the tables are merged
is_shared <- v11_mBP$FPBarcode %in% overlap_v11_BGI
v11_mBP <- v11_mBP[!is_shared, ]
rm(is_shared)
v11_mBP %>% dfhead()

[1] 222   9


FPBarcode,bp_group,ACTUAL_ZYGOSITY,MicrobiomeID,calculated_age,s.BMI,age,mean_value,sd_value
1002319,BP_DIAS,MZ,NA55891,70,37.84576,73,69.5,2.12132
1002319,BP_SYS,MZ,NA55891,70,37.84576,73,118.5,2.12132
1002320,BP_DIAS,MZ,NA55892,70,36.13234,73,77.0,1.414214


## Flagellin samples

In [25]:
# Blood pressure rows of the flagellin samples: keep only barcodes present
# in both the BP table and the flagellin barcode list
flag_BP <- TUK_BP %>%
    inner_join(Flagellin_samples, by = "FPBarcode")
flag_BP %>% dfhead()

[1] 124  14


MicrobiomeID,DATE_BORN,SEX,ACTUAL_ZYGOSITY,Anomaly,Ethnic_Origin,Visit_Date,PhenID,Phen_value,FPBarcode,SequencingSpecificName,delta_days,calculated_age,bp_text
NA92292,1947-01-23,F,DZ,,White,2012-10-16,P001394,145,1003430,Plate8.60,400,65,BP_SYS_2
NA92292,1947-01-23,F,DZ,,White,2012-10-16,P001395,83,1003430,Plate8.60,400,65,BP_DIAS_2
NA92292,1947-01-23,F,DZ,,White,2012-10-16,P001396,137,1003430,Plate8.60,400,65,BP_SYS_3


In [26]:
# Mean and sd of the two readings per flagellin sample and BP type.
# bp_group drops the trailing _2 / _3 so both readings of a type are pooled.
flag_mBP <- flag_BP %>%
    mutate(PhenID = as.character(PhenID),
           Phen_value = as.Num(Phen_value),
           bp_group = gsub(pattern = '_[23]$', replacement = '', x = bp_text)) %>%
    group_by(FPBarcode, bp_group, ACTUAL_ZYGOSITY, MicrobiomeID) %>%
    summarise(mean_value = mean(Phen_value, na.rm = TRUE),
              sd_value = sd(Phen_value, na.rm = TRUE)) %>%
    ungroup()

In [27]:
# Inspect the summarised flagellin blood pressure table
flag_mBP %>% dfhead()

[1] 62  6


FPBarcode,bp_group,ACTUAL_ZYGOSITY,MicrobiomeID,mean_value,sd_value
1002372,BP_DIAS,DZ,NA59111,68.5,0.7071068
1002372,BP_SYS,DZ,NA59111,106.0,4.2426407
1003344,BP_DIAS,MZ,NA81561,67.5,0.7071068


# Single table with age, BMI and BP data

In [28]:
# Combine the vadinCA11 and BGI250 summaries into one table.
# Every column the two tables have in common (some under different names)
# is used as a join key, so shared columns are not duplicated; columns that
# exist in only one table are filled with NA for rows of the other.
TUK_merged_metadata <- v11_mBP %>%
    full_join(BGI250_mBP,
              by = c("FPBarcode" = "FPBarcode",
                     "bp_group" = "bp_group",
                     "mean_value" = "mean_value",
                     "sd_value" = "sd_value",
                     "ACTUAL_ZYGOSITY" = "IndividualZygosity",
                     "s.BMI" = "BMI",
                     "MicrobiomeID" = "IndividualID",
                     "age" = "age")) %>%
    # Harmonised column names for the combined table
    rename(Barcode = FPBarcode,
           BP_measure = bp_group,
           Zygosity = ACTUAL_ZYGOSITY,
           BP_mean_value = mean_value,
           BP_sd_value = sd_value,
           BGI_ID = BGI.ID,
           Subject_ID = MicrobiomeID,
           Age = age,
           Vegetarian_Vegan = vegetarian.or.Vegan,
           Family_ID = IndividualFamilyID,
           BMI = s.BMI)

# Number of distinct barcodes, then a preview
TUK_merged_metadata %>% distinct(Barcode) %>% nrow()
TUK_merged_metadata %>% dfhead()

“Column `MicrobiomeID`/`IndividualID` joining factors with different levels, coercing to character vector”

[1] 674  14


Barcode,BP_measure,Zygosity,Subject_ID,calculated_age,BMI,Age,BP_mean_value,BP_sd_value,BGI_ID,Vegetarian_Vegan,Smoking,Diabetes,Family_ID
1002319,BP_DIAS,MZ,NA55891,70,37.84576,73,69.5,2.12132,,,,,
1002319,BP_SYS,MZ,NA55891,70,37.84576,73,118.5,2.12132,,,,,
1002320,BP_DIAS,MZ,NA55892,70,36.13234,73,77.0,1.414214,,,,,


In [29]:
# Add the anxiety summary to the combined table.
# All anxiety columns are matched to their harmonised counterpart, so the
# join adds rows (or matches identical ones) without adding columns.
anx_keys <- c('Barcode' = 'FPBarcode',
              'BP_measure' = 'bp_group',
              'Zygosity' = 'ACTUAL_ZYGOSITY',
              'Subject_ID' = 'MicrobiomeID',
              'BP_mean_value' = 'mean_value',
              'BP_sd_value' = 'sd_value',
              'BMI' = 's.BMI',
              'Age' = 'age',
              'Family_ID' = 'i.FamilyID')
TUK_merged_metadata <- TUK_merged_metadata %>% full_join(anx_mBP, by = anx_keys)
rm(anx_keys)

# Number of distinct barcodes, then a preview
TUK_merged_metadata %>% distinct(Barcode) %>% nrow()
TUK_merged_metadata %>% dfhead()

“Column `Family_ID`/`i.FamilyID` joining factors with different levels, coercing to character vector”

[1] 692  14


Barcode,BP_measure,Zygosity,Subject_ID,calculated_age,BMI,Age,BP_mean_value,BP_sd_value,BGI_ID,Vegetarian_Vegan,Smoking,Diabetes,Family_ID
1002319,BP_DIAS,MZ,NA55891,70,37.84576,73,69.5,2.12132,,,,,
1002319,BP_SYS,MZ,NA55891,70,37.84576,73,118.5,2.12132,,,,,
1002320,BP_DIAS,MZ,NA55892,70,36.13234,73,77.0,1.414214,,,,,


In [30]:
# Add the flagellin summary to the combined table.
# All flagellin columns are matched to their harmonised counterpart, so the
# join adds rows (or matches identical ones) without adding columns.
flag_keys <- c('Barcode' = 'FPBarcode',
               'BP_measure' = 'bp_group',
               'Zygosity' = 'ACTUAL_ZYGOSITY',
               'Subject_ID' = 'MicrobiomeID',
               'BP_mean_value' = 'mean_value',
               'BP_sd_value' = 'sd_value')
TUK_merged_metadata <- TUK_merged_metadata %>% full_join(flag_mBP, by = flag_keys)
rm(flag_keys)

# Number of distinct barcodes, then a preview
TUK_merged_metadata %>% distinct(Barcode) %>% nrow()
TUK_merged_metadata %>% dfhead()

“Column `Subject_ID`/`MicrobiomeID` joining character vector and factor, coercing into character vector”

[1] 748  14


Barcode,BP_measure,Zygosity,Subject_ID,calculated_age,BMI,Age,BP_mean_value,BP_sd_value,BGI_ID,Vegetarian_Vegan,Smoking,Diabetes,Family_ID
1002319,BP_DIAS,MZ,NA55891,70,37.84576,73,69.5,2.12132,,,,,
1002319,BP_SYS,MZ,NA55891,70,37.84576,73,118.5,2.12132,,,,,
1002320,BP_DIAS,MZ,NA55892,70,36.13234,73,77.0,1.414214,,,,,


In [31]:
# Reshape to one row per barcode, with systolic and diastolic mean/sd as
# separate columns instead of separate rows.

# Systolic summary per barcode
mean_sd_sys <- TUK_merged_metadata %>%
    filter(!is.na(BP_measure), BP_measure == "BP_SYS") %>%
    select(Barcode, BP_mean_value, BP_sd_value)

# Diastolic summary per barcode
mean_sd_dias <- TUK_merged_metadata %>%
    filter(!is.na(BP_measure), BP_measure == "BP_DIAS") %>%
    select(Barcode, BP_mean_value, BP_sd_value)

# Samples with BP data: keep one metadata row per barcode, then attach the
# systolic and diastolic columns.
# NOTE: both joins are inner joins, so a barcode lacking either the systolic
# or the diastolic summary is dropped from this table.
TUK_merged_metadata_tmp <- TUK_merged_metadata %>%
    filter(!is.na(BP_measure)) %>%
    select(-BP_measure, -BP_mean_value, -BP_sd_value) %>%
    distinct(Barcode, .keep_all = TRUE) %>%
    inner_join(mean_sd_sys, by = "Barcode") %>%
    rename(sys_mean = BP_mean_value, sys_sd = BP_sd_value) %>%
    inner_join(mean_sd_dias, by = "Barcode") %>%
    rename(dias_mean = BP_mean_value, dias_sd = BP_sd_value)

TUK_merged_metadata_tmp %>% dfhead()

# Samples without BP data come first, followed by the reshaped samples;
# the four BP columns are NA for the former
TUK_merged_metadata <- TUK_merged_metadata %>%
    filter(is.na(BP_measure)) %>%
    select(-BP_measure, -BP_mean_value, -BP_sd_value) %>%
    bind_rows(TUK_merged_metadata_tmp)

TUK_merged_metadata %>% dfhead()

[1] 350  15


Barcode,Zygosity,Subject_ID,calculated_age,BMI,Age,BGI_ID,Vegetarian_Vegan,Smoking,Diabetes,Family_ID,sys_mean,sys_sd,dias_mean,dias_sd
1002319,MZ,NA55891,70,37.84576,73,,,,,,118.5,2.1213203,69.5,2.12132
1002320,MZ,NA55892,70,36.13234,73,,,,,,139.5,0.7071068,77.0,1.414214
1002420,MZ,NA66981,59,21.60833,57,,,,,,140.0,5.6568542,79.0,1.414214


[1] 398  15


Barcode,Zygosity,Subject_ID,calculated_age,BMI,Age,BGI_ID,Vegetarian_Vegan,Smoking,Diabetes,Family_ID,sys_mean,sys_sd,dias_mean,dias_sd
1002317,DZ,NA21081,,19.68891,50,35387,N,Never Smoked,N,NA2108,,,,
1002357,MZ,NA262,,21.84265,72,35520,N,Ex-Cigarette Smoker,N,NA26,,,,
1002445,MZ,NA30792,,22.77945,66,35636,N,Ex-Cigarette Smoker,N,NA3079,,,,


In [32]:
# Retain one age column
# Use the age calculated from the TUK_BP dates when it is available;
# otherwise fall back to the age reported in the sample metadata (Age).
# (Tables without a calculated_age column, such as BGI250, therefore keep
# their metadata age.)

# Vectorised over the two columns; avoids coercing the data frame to a
# matrix and looping over rows with apply()
TUK_tmp_age = with(TUK_merged_metadata,
                   ifelse(is.na(calculated_age), Age, calculated_age))

# Replace the two source columns with the single combined column (placed last)
TUK_merged_metadata = TUK_merged_metadata %>% 
          select(-calculated_age, -Age) %>% 
          mutate(age = TUK_tmp_age)
          
TUK_merged_metadata %>% dfhead

[1] 398  14


Barcode,Zygosity,Subject_ID,BMI,BGI_ID,Vegetarian_Vegan,Smoking,Diabetes,Family_ID,sys_mean,sys_sd,dias_mean,dias_sd,age
1002317,DZ,NA21081,19.68891,35387,N,Never Smoked,N,NA2108,,,,,50
1002357,MZ,NA262,21.84265,35520,N,Ex-Cigarette Smoker,N,NA26,,,,,72
1002445,MZ,NA30792,22.77945,35636,N,Ex-Cigarette Smoker,N,NA3079,,,,,66


In [33]:
# The final table does not contain the total number of samples
# Find differences

# Barcodes in final table
obs = TUK_merged_metadata %>% distinct(Barcode) %>% pull

# Input samples (unique barcodes over the four datasets)
exp = c(BGI_merged$CornellSampleBarcode, v11_names$new_name, 
    anxiety_metadata$s.FPBarcode, Flagellin_samples$FPBarcode) %>% unique

# Lengths
obs %>% length
exp %>% length

# Samples missing from final table
dif = exp[!exp %in% obs] 
dif %>% length

# Origin of missing samples

# BGI250
# The barcode column lives in BGI_merged (taken from the sample sheet);
# BGI250_metadata has no CornellSampleBarcode column, so testing against it
# could never report a missing BGI250 sample.
dif[dif %in% BGI_merged$CornellSampleBarcode]
# vadinCA11
dif[dif %in% v11_names$new_name]
# Anxiety
dif[dif %in% anxiety_metadata$s.FPBarcode]
# Flagellin
dif[dif %in% Flagellin_samples$FPBarcode]

# Map file names to barcode

In [34]:
# Old names, new names, dataset of origin and whether the name should be changed to barcode
# The four vectors below are built in the same dataset order
# (V11, BGI250, Anx, Flagellin) so that they line up row by row.

# Old names are converted to text explicitly before concatenation: c() on a
# factor returns its integer codes in R < 4.1, which would silently replace
# names by level numbers if any of these columns were read as a factor.
TUK_old_names = c(as.character(v11_names$old_name), 
                  as.character(BGI250_names$BGI_ID), 
                  as.character(anxiety_metadata$s.NameOnSampleWithAnon), 
                  as.character(Flagellin_samples$FPBarcode))

# New names are the sample barcodes
TUK_new_names = c(v11_names$new_name, BGI250_names$CornellSampleBarcode, 
                  anxiety_metadata$s.FPBarcode, Flagellin_samples$FPBarcode)

# Dataset of origin, repeated once per sample of each dataset
TUK_subset = rep(c("V11", "BGI250", "Anx", "Flagellin"), 
                 times = c(nrow(v11_names), nrow(BGI250_names), 
                           nrow(anxiety_metadata), nrow(Flagellin_samples)))

# Whether the old name has to be replaced: taken from the names file for V11,
# always for BGI250 and Anx, never for Flagellin (old and new name are equal)
TUK_replace = c(v11_names$changed, 
                rep(TRUE, times = nrow(BGI250_names)), 
                rep(TRUE, times = nrow(anxiety_metadata)),
                rep(FALSE, times = nrow(Flagellin_samples)))

# Create a single table; keep text columns as character rather than factor
TUK_name_map = data.frame(old_name = TUK_old_names, 
                          new_name = TUK_new_names, 
                          subset = TUK_subset, 
                          change = TUK_replace,
                          stringsAsFactors = FALSE)


In [35]:
# Append the missing samples to the final table.
# Only barcodes missing from the vadinCA11, anxiety and flagellin sets are
# appended; their old sample name is stored as Subject_ID and every other
# field is NA.
no_metadata = data.frame(Barcode = c(dif[dif %in% v11_names$new_name], 
                                     dif[dif %in% anxiety_metadata$s.FPBarcode],
                                     dif[dif %in% Flagellin_samples$FPBarcode]))

# A barcode can be missing from more than one set; keep it once
no_metadata = no_metadata %>% unique

# Look up the old name of each barcode and keep the first match per barcode.
# ungroup() so that the grouping by Barcode does not leak into later steps.
no_metadata = left_join(no_metadata, TUK_name_map,  by = c("Barcode" = "new_name")) %>%
    select(Barcode, old_name) %>%
    group_by(Barcode) %>% 
    filter(row_number(Barcode) == 1) %>%
    ungroup()

# Join
TUK_merged_metadata = full_join(TUK_merged_metadata, no_metadata, 
                                by = c("Barcode", "Subject_ID" ="old_name"))


“Column `Subject_ID`/`old_name` joining character vector and factor, coercing into character vector”

In [36]:
# Inspect the final combined metadata table
TUK_merged_metadata %>% dfhead()

[1] 417  14


Barcode,Zygosity,Subject_ID,BMI,BGI_ID,Vegetarian_Vegan,Smoking,Diabetes,Family_ID,sys_mean,sys_sd,dias_mean,dias_sd,age
1002317,DZ,NA21081,19.68891,35387,N,Never Smoked,N,NA2108,,,,,50
1002357,MZ,NA262,21.84265,35520,N,Ex-Cigarette Smoker,N,NA26,,,,,72
1002445,MZ,NA30792,22.77945,35636,N,Ex-Cigarette Smoker,N,NA3079,,,,,66


# Write files 

In [37]:
# Save the old-name / barcode map as a tab-separated file with a header,
# without row names and without quoting
TUK_name_file <- "/ebio/abt3_projects/vadinCA11/data/V11/TUK_samples/TUK_name_map.txt"
write.table(TUK_name_map,
            file = TUK_name_file,
            sep = "\t",
            quote = FALSE,
            row.names = FALSE,
            col.names = TRUE)

In [38]:
# Save the combined metadata as a tab-separated file with a header,
# without row names and without quoting
TUK_merged_metadata_file <- "/ebio/abt3_projects/vadinCA11/data/V11/TUK_samples/TUK_combined_metadata.txt"
write.table(TUK_merged_metadata,
            file = TUK_merged_metadata_file,
            sep = "\t",
            quote = FALSE,
            row.names = FALSE,
            col.names = TRUE)


# Session info

In [39]:
sessionInfo()

R version 3.4.1 (2017-06-30)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.1 LTS

Matrix products: default
BLAS: /ebio/abt3_projects/software/miniconda3_gt4.4/envs/UpSetR/lib/R/lib/libRblas.so
LAPACK: /ebio/abt3_projects/software/miniconda3_gt4.4/envs/UpSetR/lib/R/lib/libRlapack.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] bindrcpp_0.2    dplyr_0.7.4     purrr_0.2.4     readr_1.1.1    
[5] tidyr_0.7.2     tibble_1.4.1    ggplot2_2.2.1   tidyverse_1.1.1

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.14     cellra