In [None]:
library(dplyr)
library(data.table)
library(tidyr)
library(Hmisc)

# Import data

To obtain a token, log in at https://copdgene.hms.harvard.edu/transmart/login/auth, then go to utilities, user profile, and copy the IRCT Token.

In [None]:
# Load the project helper functions (expected to provide reloadDfFormFile()).
source("src/functions.R")

# Connection settings passed to reloadDfFormFile().
# "token" is a placeholder: paste your personal IRCT token here before running
# and do not commit the real value.
IRCT_REST_BASE_URL <- "https://copdgene.hms.harvard.edu"
token <- "token"


Phenotypes of interest for all patients in freeze 5 are loaded with reloadDfFormFile() from src/functions.R


In [None]:
# Build the phenotype table with the project helper reloadDfFormFile().
# NOTE(review): the helper is defined outside this notebook; presumably
# data/ids.txt lists the variables to fetch - confirm in src/functions.R.
pheno.df <- reloadDfFormFile(
  "data/ids.txt",
  token,
  IRCT_REST_BASE_URL
)

Select only cases

In [None]:
# Keep only the subjects recorded as "Case".
# which() discards rows whose Affection_status is NA; indexing a data frame
# with a logical vector that contains NA would otherwise insert all-NA rows.
pheno.df <- pheno.df[which(pheno.df$Affection_status == "Case"), ]

Show which factors appear within Affection_status

In [None]:
# Count how many subjects fall into each Affection_status level.
summary(as.factor(pheno.df[["Affection_status"]]))

Remove phenotypes that are unspecific, that are inherent to COPD (because all subjects are COPD cases) and lung cancer

In [None]:
# Drop the phenotype columns that are excluded from the analysis.
# NOTE(review): the accompanying text mentions lung cancer, but the column
# removed here is Prostate_cancer - confirm which one is intended.
pheno.df <- subset(
  pheno.df,
  select = -c(
    COPD,
    Affection_status,
    Chest_injuries,
    Chest_operations,
    Other_chest_illnesses,
    Prostate_cancer,
    High_cholesterol,
    High_blood_pressure
  )
)

GT.df has genotype calls of selected variants for samples of interest: freeze 4 (n = 2000)

In [None]:
# Read the genotype calls. "path_to_file.csv" is a placeholder path.
# header = TRUE and sep = "," are the read.csv() defaults, so they are omitted.
GT.df <- read.csv("path_to_file.csv", stringsAsFactors = FALSE)

Assign 0 if the GT call is [0, 0] and 1 for any other call (e.g. [1, 0] or [0, 1])

In [None]:
# Recode genotype calls as carrier status, kept as character:
# "0" for the call "[0, 0]", "1" for every other call (NA stays NA).
GT.df$GT.alleles <- ifelse(GT.df$GT.alleles == "[0, 0]", "0", "1")

# Locus identifier in the form "<contig>:<position>".
GT.df$locus <- paste0(GT.df$locus.contig, ":", GT.df$locus.position)

Only keep genotype calls of cases as was already selected for in pheno.df

In [None]:
# Restrict the genotype calls to subjects that are still present in pheno.df.
GT.df <- subset(GT.df, dbGaP_Subject_ID %in% pheno.df$Patient.Id)

Subset the samples in pheno.df (freeze 5) based on samples in GT.df (freeze 4)

In [None]:
# Restrict the phenotype table to subjects that have genotype calls in GT.df.
pheno.df <- subset(pheno.df, Patient.Id %in% GT.df$dbGaP_Subject_ID)

# Filter

Create new dataframe without patientID such that you have only numerical data.

In [None]:
# Copy of the phenotype table without the subject identifier column.
pheno.df1 <- subset(pheno.df, select = -Patient.Id)

#### 1. Remove clinical variable when missingness above 5%

In [None]:
# Keep the columns whose sum (NAs ignored) exceeds 5% of the number of
# subjects. The threshold uses nrow(): dim() returns c(rows, columns) and would
# be recycled, comparing every second column against 5% of the column count.
# NOTE(review): the heading above speaks of missingness, but this filter acts
# on column sums (the number of positives if phenotypes are coded 0/1). If a
# missingness filter is intended, use colMeans(is.na(pheno.df1)) <= 0.05.
min.count <- 0.05 * nrow(pheno.df1)
pheno.df1 <- pheno.df1[, colSums(pheno.df1, na.rm = TRUE) > min.count, drop = FALSE]

# Show the column sums of the variables that were kept.
colSums(pheno.df1, na.rm = TRUE)

#### 2. Check correlation between clinical variables using Pearson correlation

In [None]:
# Pairwise Pearson correlations between all remaining clinical variables.
# The $r and $P components of the result are used further below.
correlation <- Hmisc::rcorr(
  as.matrix(pheno.df1),
  type = "pearson"
)

Represent data in a clearer way

In [None]:
# Flatten the upper triangle of a correlation matrix into a long table.
#
# cormat: square matrix of correlation coefficients with row names.
# pmat:   matrix of p-values with the same layout as cormat.
#
# Returns a data frame with one row per cell above the diagonal and the
# columns row, column (both taken from the row names of cormat), cor and p.
flattenCorrMatrix <- function(cormat, pmat) {
  above.diag <- upper.tri(cormat)
  labels <- rownames(cormat)
  row.idx <- row(cormat)[above.diag]
  col.idx <- col(cormat)[above.diag]
  data.frame(
    row = labels[row.idx],
    column = labels[col.idx],
    cor = cormat[above.diag],
    p = pmat[above.diag]
  )
}

# Long-format table of all pairwise correlations and their p-values.
corflat <- flattenCorrMatrix(cormat = correlation$r, pmat = correlation$P)

Display which combinations have high correlation above 0.3

In [None]:
# Variable pairs with a correlation coefficient above 0.3; na.omit() removes
# the all-NA rows produced where the comparison itself is NA.
na.omit(corflat[corflat[["cor"]] > 0.3, ])

Remove highly correlated clinical variables from data

In [None]:
# Remove the clinical variables judged to be too strongly correlated.
pheno.df1 <- subset(
  pheno.df1,
  select = -c(
    Bronchitis,
    Heart_attack,
    Angina
  )
)

Add the non-numerical patientID

In [None]:
# Re-attach the subject identifier. pheno.df1 was derived from pheno.df by
# dropping columns only, so the rows are still in the same order.
# The identifier column in pheno.df is Patient.Id; the previous reference to
# pheno.df$patientID evaluated to NULL and silently added no column.
pheno.df1$Patient.Id <- pheno.df$Patient.Id

# PheWAS per variant


Collect all phenotypes and loci in objects

In [None]:
# Working copy of the filtered phenotype table.
final.df <- pheno.df1

# All phenotype column names. The identifier column is excluded by name rather
# than by position, so no phenotype is lost if Patient.Id is not the last column.
pheno.names <- setdiff(names(final.df), "Patient.Id")

# All distinct loci present in the genotype table.
locus.names <- unique(GT.df$locus)

Bonferroni correction by number of variants (va), by number of phenotypes (ph) and by both (vaph), for an alpha significance level of 0.05

In [None]:
# Nominal significance level.
signlv <- 0.05

# Bonferroni-corrected thresholds.
va <- signlv / length(locus.names)   # per variant
ph <- signlv / length(pheno.names)   # per phenotype
vaph <- va / length(pheno.names)     # per variant and phenotype

# Report the thresholds (numbers are coerced to character by c()).
print(c(
  "corrected by variants = ", round(va, digits = 4),
  "corrected by phenotypes = ", round(ph, digits = 4),
  "corrected by variants & phenotypes = ", round(vaph, digits = 8)
))

# From here on ph holds the phenotype threshold rounded to three decimals.
ph <- round(ph, digits = 3)

#### Logistic regression (binomial generalized linear model) of each clinical variable on each variant.
- Variants for which every genotype call is homozygous reference (0) are skipped.
- Only associations with p-value < 0.05 are displayed.

In [None]:
# Logistic regression (binomial glm) of every phenotype on carrier status,
# one variant at a time. Prints locus, phenotype, p-value and odds ratio for
# every association with a nominal p-value below 0.05.
for (l in seq_along(locus.names)) {
  # genotype calls of the current variant
  var.df <- GT.df[GT.df$locus == locus.names[l], ]

  # skip variants for which no subject carries a non-reference call
  if (sum(as.numeric(var.df$GT.alleles), na.rm = TRUE) == 0) next

  # Attach the calls by subject ID instead of by row position, so that a
  # different row order or a missing subject in var.df cannot mis-assign
  # genotypes. final.df has the same rows, in the same order, as pheno.df.
  final.df$GTalleles <- as.character(
    var.df$GT.alleles[match(pheno.df$Patient.Id, var.df$dbGaP_Subject_ID)]
  )

  for (i in seq_along(pheno.names)) {
    # glm() cannot estimate a genotype effect unless both genotype classes
    # occur among the subjects with a non-missing phenotype
    usable <- !is.na(final.df[[pheno.names[i]]]) & !is.na(final.df$GTalleles)
    if (length(unique(final.df$GTalleles[usable])) < 2) next

    # logistic regression on phenotype of interest (POI)
    POI.glm <- glm(formula = final.df[, pheno.names[i]] ~ GTalleles,
                   family = binomial(),
                   data = final.df,
                   na.action = na.omit)

    # coefficient table: row 2 is the genotype term, column 4 its p-value
    coefs <- summary(POI.glm)$coefficients
    if (nrow(coefs) < 2) next
    p.value <- coefs[2, 4]

    if (!is.na(p.value) && p.value < 0.05) {
      print(c(locus.names[l], pheno.names[i], p.value, round(exp(coefs[2, 1]), 3)))
    }
  }
}

Logistic regression (binomial generalized linear model) on a SINGLE variant (l)

Specify the index of the locus in locus.names as l

In [None]:
# Index into locus.names of the variant to analyse.
l <- 61

Select only one specific variant in the genotype dataframe

In [None]:
# Genotype calls of the selected variant only.
var.df <- GT.df[GT.df[["locus"]] == locus.names[l], ]

Add the genotype calls from var.df that were just selected to final.df

In [None]:
# Attach the genotype calls of the selected variant by subject ID rather than
# by row position, so that a different row order or a missing subject in
# var.df cannot mis-assign genotypes (unmatched subjects get NA).
# final.df has the same rows, in the same order, as pheno.df.
final.df$GTalleles <- var.df$GT.alleles[
  match(pheno.df$Patient.Id, var.df$dbGaP_Subject_ID)
]

Create dataframe to store output in


In [None]:
# Output table with seven columns (V1..V7), seeded with a single all-NA
# placeholder row; result rows are bound on top of it in the loop below.
phewasOutput.df <- as.data.frame(matrix(NA, nrow = 1, ncol = 7))


In [None]:
# Logistic regression of every phenotype on the genotype of the selected
# variant. For each phenotype one row is prepended to phewasOutput.df holding:
# name, coefficient, odds ratio, 95% confidence interval, p-value and the
# subject counts with / without the phenotype, split by carrier status.
for (i in seq_along(pheno.names)) {

  # glm() cannot estimate a genotype effect unless both genotype classes
  # occur among the subjects with a non-missing phenotype
  usable <- !is.na(final.df[[pheno.names[i]]]) & !is.na(final.df$GTalleles)
  if (length(unique(final.df$GTalleles[usable])) < 2) next

  # logistic regression on phenotype of interest (POI)
  POI.glm <- glm(formula = final.df[, pheno.names[i]] ~ GTalleles,
                 family = binomial(),
                 data = final.df,
                 na.action = na.omit)

  # Coefficient table; row 2 is the genotype term. It is addressed by
  # position throughout, because the earlier mix of the row names
  # "GTalleles1" and "GTalleles" failed on the name that does not exist.
  coefs <- summary(POI.glm)$coefficients
  if (nrow(coefs) < 2) next
  estimate <- coefs[2, 1]
  std.error <- coefs[2, 2]
  p.value <- coefs[2, 4]

  # 95% Wald confidence interval on the odds-ratio scale
  ci <- exp(estimate + qnorm(c(0.025, 0.975)) * std.error)

  # caco.df: per subject the phenotype (1 = present, 0 = absent), ID and genotype
  caco.df <- final.df[, c(pheno.names[i], "Patient.Id", "GTalleles")]
  caco.df <- na.omit(caco.df)

  # Number of distinct subjects in each phenotype x carrier cell
  # (these previously indexed an undefined object, caco.df1)
  CaseDisease   <- length(unique(caco.df[caco.df[, 1] == 1 & caco.df$GTalleles == "1", "Patient.Id"])) # pheno yes; variant yes
  CaseNoDisease <- length(unique(caco.df[caco.df[, 1] == 0 & caco.df$GTalleles == "1", "Patient.Id"])) # pheno no; variant yes
  CtrlDisease   <- length(unique(caco.df[caco.df[, 1] == 1 & caco.df$GTalleles == "0", "Patient.Id"])) # pheno yes; variant no
  CtrlNoDisease <- length(unique(caco.df[caco.df[, 1] == 0 & caco.df$GTalleles == "0", "Patient.Id"])) # pheno no; variant no

  # Add values to dataframe
  newRow <- c(pheno.names[i],
              round(estimate, 3),
              round(exp(estimate), 3),
              paste0("[", round(ci[1], 3), ", ", round(ci[2], 3), "]"),
              round(p.value, 4),
              paste0(CaseDisease   + CtrlDisease,   "(", CaseDisease,   "/", CtrlDisease,   ")"),
              paste0(CaseNoDisease + CtrlNoDisease, "(", CaseNoDisease, "/", CtrlNoDisease, ")"))
  phewasOutput.df <- rbind(newRow, phewasOutput.df)
}

Add colnames 

In [None]:
# Give the seven output columns descriptive names.
names(phewasOutput.df) <- c(
  "Phenotype",
  "Coefficient",
  "OR",
  "Confidence_interval",
  "Pvalue",
  "Pheno_present(variant/novariant)",
  "Pheno_absent(variant/novariant)"
)

Remove all rows that contain NAs (this drops the empty placeholder row the output dataframe was created with)

In [None]:
# Keep only the rows without any NA value.
phewasOutput.df <- phewasOutput.df[complete.cases(phewasOutput.df), , drop = FALSE]

Replace "_" in the names of the clinical variables with " "

In [None]:
# Turn underscores in the phenotype names into spaces for display.
phewasOutput.df$Phenotype <- gsub("_", " ", phewasOutput.df$Phenotype, fixed = TRUE)

Perform multiple-testing correction (Benjamini & Hochberg)

In [None]:
# Benjamini-Hochberg adjusted p-values, stored in the column adjustPvalue.
# Pvalue is stored as text, hence the numeric conversion.
phewasOutput.df[["adjustPvalue"]] <- p.adjust(
  as.numeric(phewasOutput.df[["Pvalue"]]),
  method = "BH"
)

Significant phenotypes based on adjusted P-values

In [None]:
# Phenotypes whose adjusted p-value is below 0.05. The adjusted values are
# stored in the column adjustPvalue; the previous reference to adjPvalue
# matched no column, so the filter always returned zero rows.
# which() ignores NA comparisons instead of turning them into all-NA rows.
adjPvalue_Sign.df <- phewasOutput.df[
  which(as.numeric(phewasOutput.df$adjustPvalue) < 0.05),
]

# Of those, the phenotypes with an odds ratio above 2.
OR_Sign.df <- adjPvalue_Sign.df[
  which(as.numeric(adjPvalue_Sign.df$OR) > 2),
]