In [None]:
library(dplyr)
library(data.table)
library(tidyr)

# Import data

To obtain a token, log in at https://copdgene.hms.harvard.edu/transmart/login/auth, then open Utilities > User Profile and copy the IRCT Token.

In [None]:
# Run functions.R (presumably where reloadDfFormFile, used below, is defined),
# then set the connection parameters. The token is left blank on purpose:
# paste your personal IRCT token here before running the notebook.
source("functions.R")
token <- ""
IRCT_REST_BASE_URL <- "https://copdgene.hms.harvard.edu"

Phenotypes of interest for all patients in freeze 5

In [None]:
# Build the phenotype table via reloadDfFormFile, passing the path of the ID
# file together with the token and server URL.
# NOTE(review): reloadDfFormFile is not defined in this notebook; presumably
# it returns one row per patient with Patient.Id, Affection_status and the
# phenotype columns. Confirm against functions.R.
pheno.df <- reloadDfFormFile("/path_to_ids.txt", token, IRCT_REST_BASE_URL)

Remove the "other" and "exclusionary disease" categories from Affection status so that only a binary Case/Control value remains


In [None]:
# Keep only rows whose affection status is "Case" or "Control".
# %in% never returns NA, so rows with a missing status are dropped; comparing
# with `==` inside `[` would instead return an all-NA row for each of them.
pheno.df <- pheno.df[pheno.df$Affection_status %in% c("Case", "Control"), ]
summary(as.factor(pheno.df$Affection_status)) # check

Change phenotypic scores to 1s, 0s and NAs 


In [None]:
# Recode the answers wherever they occur in the table:
#   "Yes" -> 1, "No" -> 0, "Do not know" -> NA.
# In character columns the 1/0 are stored as the strings "1"/"0"; the
# conversion to numeric happens in a later step.
pheno.df[pheno.df == "Yes"] <- 1
pheno.df[pheno.df == "No"] <- 0
pheno.df[pheno.df == "Do not know"] <- NA

Change affection status to 1s, 0s, but keep as class character

In [None]:
# Recode the case/control label in place: "Case" becomes 1, "Control" becomes
# 0. Because the column holds text, the values end up as "1" and "0".
pheno.df$Affection_status <- replace(
  pheno.df$Affection_status,
  pheno.df$Affection_status == "Case",
  1
)
pheno.df$Affection_status <- replace(
  pheno.df$Affection_status,
  pheno.df$Affection_status == "Control",
  0
)

Change phenotypic scores from character to numeric


In [None]:
# Convert every character column from the third onward to numeric.
# - Non-character columns are returned unchanged; without the else branch the
#   function would return NULL for them.
# - drop = FALSE keeps a data frame even when there is a single phenotype
#   column, so lapply always iterates over columns rather than over values.
# as.numeric() turns any remaining non-numeric text into NA (with a warning).
pheno.df[, 3:ncol(pheno.df)] <- lapply(
  pheno.df[, 3:ncol(pheno.df), drop = FALSE],
  function(x) {
    if (is.character(x)) as.numeric(x) else x
  }
)
str(pheno.df) # check

s.df has samples of interest: freeze 4 (n = 2000)

In [None]:
# Read the tab-separated sample annotation file (with a header row), keeping
# text columns as character vectors rather than factors.
s.df <- read.delim("data/COPDannotations.txt", stringsAsFactors = FALSE)

Subset the samples in pheno.df based on samples in s.df

In [None]:
# Restrict the phenotype table to the patients whose ID appears in the
# dbGaP_Subject_ID column of the sample annotations.
final.df <- pheno.df[
  pheno.df$Patient.Id %in% s.df$dbGaP_Subject_ID,
]

# Building Output

Vector of candidate phenotype names (every column from the third onward)

In [None]:
# Names of the phenotype columns: everything after the first two columns.
# NOTE(review): assumes columns 1-2 are the patient ID and affection status;
# confirm against the table returned by reloadDfFormFile.
# Indexing the name vector (instead of subsetting the data frame first) still
# works with a single phenotype column and yields an empty vector when there
# are none.
pheno.names <- names(final.df)[-(1:2)]

Create df to store results in

In [None]:
# Result container: a data frame with seven columns (V1..V7) and a single
# all-NA placeholder row. Result rows are later bound on top of it.
phewasOutput.df <- as.data.frame(matrix(NA, nrow = 1, ncol = 7))


In [None]:
# Association scan. For every phenotype in pheno.names:
#   1. fit a logistic regression of the phenotype on case/control status,
#   2. derive the log-odds coefficient, odds ratio, 95% Wald confidence
#      interval and p-value for the "Affection_status1" term,
#   3. count distinct patients in each phenotype x case/control cell,
#   4. prepend one row with these values to phewasOutput.df.
# seq_along() makes the loop a no-op when pheno.names is empty
# (1:length(x) would iterate over c(1, 0)).
for (i in seq_along(pheno.names)) {

  # The response is deliberately written as final.df[, pheno.names[i]]:
  # names in a model formula are looked up in `data` first, so an
  # intermediate variable could be shadowed by a column of the same name.
  POI.glm <- glm(
    formula = final.df[, pheno.names[i]] ~ Affection_status,
    family = binomial(),
    data = final.df,
    na.action = na.omit
  )

  # Summarise the fit once and reuse the coefficient table below.
  coef.tab <- summary(POI.glm)$coefficients
  estimate <- coef.tab["Affection_status1", 1]
  std.err <- coef.tab["Affection_status1", 2]
  p.value <- coef.tab["Affection_status1", 4]

  # 95% Wald confidence interval on the odds-ratio scale
  ci <- exp(estimate + qnorm(c(0.025, 0.975)) * std.err)

  # all p-values lower than 10e-150 will be noted as 10e-150
  if (p.value <= 10e-150) {
    p.value <- 10e-150
  }

  # caco.df holds this phenotype, the patient ID and case/ctrl status,
  # restricted to rows without missing values
  caco.df <- final.df[, c(pheno.names[i], "Patient.Id", "Affection_status")]
  caco.df <- na.omit(caco.df)

  # Distinct patients per cell: phenotype present (1) / absent (0) crossed
  # with case ("1") / control ("0")
  has.pheno <- caco.df[, 1] == 1
  no.pheno <- caco.df[, 1] == 0
  is.case <- caco.df$Affection_status == "1"
  is.ctrl <- caco.df$Affection_status == "0"
  CaseDisease <- length(unique(caco.df[has.pheno & is.case, "Patient.Id"]))
  CaseNoDisease <- length(unique(caco.df[no.pheno & is.case, "Patient.Id"]))
  CtrlDisease <- length(unique(caco.df[has.pheno & is.ctrl, "Patient.Id"]))
  CtrlNoDisease <- length(unique(caco.df[no.pheno & is.ctrl, "Patient.Id"]))

  # c() coerces everything to character, so numeric values are stored as text
  newRow <- c(
    pheno.names[i],
    round(estimate, 2),
    round(exp(estimate), 2),
    paste0("[", round(ci[1], 3), ", ", round(ci[2], 3), "]"),
    p.value,
    paste0(CaseDisease + CtrlDisease, "(", CaseDisease, "/", CtrlDisease, ")"),
    paste0(
      CaseNoDisease + CtrlNoDisease,
      "(", CaseNoDisease, "/", CtrlNoDisease, ")"
    )
  )

  # New rows go on top, so the all-NA placeholder row stays last.
  phewasOutput.df <- rbind(newRow, phewasOutput.df)
}


In [None]:
# Give the seven result columns descriptive names, in the order the values
# were written into each row.
names(phewasOutput.df) <- c(
  "Phenotype",
  "Coefficient",
  "OR",
  "Confidence_interval",
  "Pvalue",
  "Phenotype_present",
  "Phenotype_absent"
)

Multiple testing correction (bonferroni)

In [None]:
# Bonferroni-adjust the p-values. Pvalue is converted with as.numeric()
# before being passed to p.adjust(); the result is stored in adjPvalue.
phewasOutput.df$adjPvalue <- p.adjust(
  as.numeric(phewasOutput.df$Pvalue),
  method = "bonferroni"
)

Remove every row that contains an NA (this drops the all-NA placeholder row used to initialise the results table)

In [None]:
# Keep only the rows in which every column is non-missing.
phewasOutput.df <- phewasOutput.df[
  complete.cases(phewasOutput.df),
]

Clean up representation of names of clinical variables

In [None]:
# Replace every underscore in the phenotype names with a space. The pattern
# is a literal character, so it is matched with fixed = TRUE.
phewasOutput.df$Phenotype <- gsub(
  "_", " ", phewasOutput.df$Phenotype,
  fixed = TRUE
)

Significant phenotypes based on the adjusted p-value (< 0.05), and the subset of those with an odds ratio above 2


In [None]:
# Phenotypes whose adjusted p-value is below 0.05 ...
adjPvalue_Sign.df <- phewasOutput.df[
  as.numeric(phewasOutput.df$adjPvalue) < 0.05,
]
# ... and, among those, the ones with an odds ratio above 2. OR is converted
# with as.numeric() before the comparison.
OR_Sign.df <- adjPvalue_Sign.df[
  as.numeric(adjPvalue_Sign.df$OR) > 2,
]