# Targeted SNP analyses + Score analysis for ASYN

In [1]:
import glob
import os
import subprocess
import pandas as pd
import subprocess

In [2]:
# Set up the targeted-SNP section: scratch/output locations, the SNP list for
# plink --extract, and the first lines of the bash driver script.
# T is built from SLURM_JOB_ID, so this cell raises KeyError outside a SLURM job.
T="/lscratch/"+os.environ['SLURM_JOB_ID']+"/"
scriptfile='script_target.txt'   # bash script assembled by the following cells
output='target/'                 # directory receiving the analysis results
os.makedirs(output, exist_ok=True)

# Prepare input file: the SNP IDs (index of Meta5.tab) written one per line
Meta5=pd.read_table('data/Meta5.tab', index_col='SNP') # SNP as index
SNP=Meta5.index.to_series()
# header=True keeps the column name "SNP" as the first line of the list
SNP.to_csv(T+'extract', header =True, index=False)

BFILE='../../PPMI_WGS/july_2018/PPMI_july2018'
IDLIST='../dataset/PPMI/WGS/clean_NONGENT/3armOUTR.fam'

# Create plink script: restrict to the listed SNPs and the individuals in
# IDLIST, and export additive genotype coding (--recodeA) into the scratch dir
plink_extract ='plink --bfile {bfile} --extract {extract} --keep {keep} --recodeA include-alt --out {out} \n'
script = plink_extract.format(bfile=BFILE, extract=T+'extract', keep=IDLIST, out=T+"extract")

# write bash file
# NOTE: the previous version started the script with the literal command
# `rm -rf T/*`. The Python variable T was never interpolated, so it targeted a
# relative directory named "T" rather than the scratch directory; and wiping
# the scratch directory from the script would delete the extract list written
# above before plink reads it. The command is therefore dropped.
# Mode 'w' truncates any script left over from a previous run.
with open(scriptfile, 'w') as f:
    f.write('module load plink\n')
    f.write(script)

In [3]:
# R source for the study-level (single cohort) analysis, saved as Rscript1.R.
# Command-line arguments, in order: phenotype file, covariates joined by "+",
# principal-component file, genotype/score file, cohort label (matched against
# DX with grepl), and output directory.
# For every column of the genotype file other than FID/IID it fits
#   OUTCOME ~ <column> + <covariates> + (1|IID)
# with lmer and writes one row per column to <output><cohort>.txt
# (tab-separated; columns POS_A1_A2, OBS_N, Tvalue, BETA, SE, P, cohort).
# Fix: fread() returns a data.table, for which `cohort[COVs]` with a character
# vector is a keyed join rather than column selection; the cohort table is now
# converted to a plain data.frame before the covariates are scaled.
t = """ # Study level analysis
library(tidyr);library(data.table);library(dplyr);library(lme4);library(lmerTest)
t=commandArgs(trailingOnly = TRUE)
# "dataset/PPMI/pheno/all.txt;HEMO+FEMALE+AAO+AGE+PC1+PC2+PC3;/WGS/clean_NONGENT/3a.eigenvec;test.csv;PPMIPD;target/"
DATASET=t[1];COVPC=t[2];PCS=t[3];GEN=t[4];MEM=t[5];OUTPUT=t[6]
COVs=strsplit(COVPC, "\\\\+")[[1]]

# Read data and put together
cohort = fread(DATASET) %>% mutate(OUTCOME=logASYN) %>% 
  mutate(FID=paste("PPMISI",IID,sep=""), IID=paste("PPMISI",IID,sep="")) %>% 
  filter(grepl(MEM, DX)) %>% 
  inner_join(., fread(PCS), by = c("FID", "IID")) %>% 
  select("FID", "IID", "OUTCOME", COVs) %>% filter(complete.cases(.)) %>% as.data.frame
# cohort is a plain data.frame here, so cohort[COVs] selects the covariate columns
cohort[COVs] = as.data.frame(scale(cohort[COVs])) 
gen=fread(GEN)
df=inner_join(cohort, gen, by =c("FID", "IID"))
e=setdiff(names(gen), c("FID", "IID"))
# ANALYSIS
test.listfunc = function(x){
  MODEL = paste("OUTCOME~", "`", e[x], "`+", COVPC, "+(1|IID)", sep = "")
  testLmer = try(lmer(eval(parse(text = MODEL)), data = df),silent = T)
  if(class(testLmer)[1]=="try-error"){
    sumstat=c(e[x], "NoConverge", rep(NA,4))
  }else{
    temp = summary(testLmer)
    temp1 = temp$coefficients
    if(grep(substr(e[x],1,4), rownames(temp1)) %>% length == 0){
      sumstat=c(e[x], "RankDeficient", rep(NA,4))
    }else{
      RES = temp1[2,] # The first row is intercept
      OBS_N = paste(length(temp$residuals), "_", temp$ngrps, sep="")
      sumstat <- c(e[x], OBS_N, RES[4], RES[1], RES[2], RES[5])
    }
  }
  return(sumstat)
}
temp = lapply(1:length(e), test.listfunc)
temp2 = do.call(rbind, temp) %>% data.frame
names(temp2)=c("POS_A1_A2", "OBS_N", "Tvalue", "BETA", "SE", "P")
temp2$cohort=MEM
write.table(temp2, paste(OUTPUT, MEM, ".txt", sep=""), row.names = F, quote = F, sep = "\t")
"""
Rscript1 = "Rscript1.R"
with open(Rscript1, 'w') as f:
    f.write(t)

In [4]:
# Append the two study-level R runs (one per cohort label) to the bash script.
Rheader = "Rscript --vanilla "
# Command template: positional arguments in the order Rscript1.R reads them.
Ranalysis = "".join([Rheader, Rscript1, " {phenofile} {covpc} {pcsfile} {genefile} {member} {out}\n"])

# NOTE(review): PHENOFILE has no leading '../' while PCSFILE does — confirm both
# resolve from the directory the bash script is run in.
PHENOFILE = 'dataset/PPMI/pheno/all.txt'
PCSFILE = '../dataset/PPMI/WGS/clean_NONGENT/3a.eigenvec'
GENEFILE = T + 'extract.raw'

# Arguments shared by both cohort runs; only covariates and cohort label differ.
shared_args = dict(phenofile=PHENOFILE, pcsfile=PCSFILE, genefile=GENEFILE, out=output)
script2 = Ranalysis.format(covpc='HEMO+FEMALE+AAO+AGE+PC1+PC2+PC3', member="PPMIPD", **shared_args)
script3 = Ranalysis.format(covpc='HEMO+FEMALE+AGE+PC1+PC2+PC3', member="PPMIHC", **shared_args)

with open(scriptfile, 'a') as f:
    f.writelines(['module load R\n', script2, script3])

In [5]:
# R source for the meta-analysis, saved as Rscript2.R.
# Every command-line argument is a per-cohort result file; the files are
# stacked, and for each distinct value of the first column a fixed-effect
# (method "FE") and a REML model are fitted with metafor::rma on BETA/SE.
# The META rows (BETA/SE/P from the FE fit, QEp/I2 from the REML fit) are
# appended to the per-cohort rows and written to meta.csv in the working dir.
t = """ # Meta analysis
library(data.table);library(dplyr);library(metafor)
time=gsub(" ", "_",Sys.time()) %>% gsub("-", "_", .) %>% gsub(":", "_",.) 
t=commandArgs(trailingOnly = TRUE)
df = bind_rows(lapply(t, fread)) %>% data.frame
V = unique(df[,1])
analysis=function(i){
  d=df[df[,1]==as.character(V[i]), ]
  d=d[complete.cases(d),]
  if(nrow(d)==0){res = c(V[i], 0, rep(NA,5))}else{
    res_fe = try(rma(yi=BETA, sei = SE, method = "FE", data = d), silent = T)
    if(class(res_fe)[1]=='try-error'){
      res = c(V[i], nrow(d), rep(NA,5))
    }else{
      res_me = try(rma(yi=BETA, sei = SE, method = "REML", data = d), silent = T)
      if(class(res_me)[1]=='try-error'){
        res = c(V[i], nrow(d), rep(NA,5))
      }else{
        res = c(V[i], nrow(d), res_fe$beta, res_fe$se, res_fe$pval, res_me$QEp, res_me$I2)
      }
    }
  }
  return(res)
}
temp=lapply(1:length(V), analysis)
temp2 = do.call(rbind, temp) %>% data.frame
names(temp2)=c("POS_A1_A2", "N_study", "BETA", "SE", "P", "QEp", "I2")
temp2$cohort="META"
temp3 = temp2 %>% 
  mutate_at(vars("BETA", "SE", "P"), as.character) %>% 
  mutate_at(vars("BETA", "SE", "P"), as.numeric) %>% 
  bind_rows(df,.) %>% filter(!POS_A1_A2%in% c("PAT", "MAT","SEX","PHENOTYPE", "PHENO", "CNT", "CNT2"))
write.csv(temp3, "meta.csv", row.names = F)
"""
Rscript2 = "Rscript2.R"
with open(Rscript2, 'w') as f:
    f.write(t)

# The per-cohort *.txt files are produced only when the bash script runs, so
# they must not be listed with glob.glob() here: on a fresh run the directory
# is still empty and the meta-analysis would be invoked without any input.
# The pattern is written into the script instead and expanded by bash at run
# time, after the study-level analyses have written their results.
script4 = Rheader + Rscript2 + " " + output + "*.txt\n"
with open(scriptfile, 'a') as f:
    f.write(script4)
    f.write("mv meta.csv "+output+"\n")

In [6]:
# R source for the result plot, saved as Rscript3.R.
# Argument: path to meta.csv. POS_A1_A2 is split into SNP/A1/A2, gene names
# are joined from data/Meta5.tab, and -log10(P) is plotted per SNP, coloured by
# cohort, with dashed reference lines at P=0.05 (black) and P=0.05/92 (red).
# Fix: the figure is written to meta.jpg, so the jpeg() device is used; the
# previous png() call produced PNG data under a .jpg file name.
t = """ # Visualize results (extract)
t=commandArgs(trailingOnly = TRUE)
library(tidyr);library(dplyr);library(data.table);library(ggplot2)
temp = fread(t) %>%  mutate(POS_A1_A2=sub("\\\\)", "", sub("\\\\(/", "_", POS_A1_A2))) %>%
  separate(POS_A1_A2, c("SNP", "A1", "A2"), sep="_")
gene = fread("data/Meta5.tab") %>% select(SNP, name)
df = left_join(temp, gene, by = "SNP") %>% mutate(SNP_GENE = paste(SNP, name, sep="_"))
jpeg('meta.jpg', width=600, height=2000, pointsize=18)
  df %>% ggplot(aes(x=SNP_GENE, y=-log10(P), color=cohort)) + geom_point() +
    geom_hline(yintercept=-log10(0.05), linetype="dashed", color = "black", size=1) +
    geom_hline(yintercept=-log10(0.05/92), linetype="dashed", color = "red", size=1) +
    coord_flip()
dev.off()
"""
Rscript3 = "Rscript3.R"
metafile = output+"meta.csv"
with open(Rscript3, 'w') as f:
    f.write(t)
# Plot from the meta.csv that the previous step moved into the output
# directory, then move the figure there as well.
with open(scriptfile, 'a') as f:
    f.write( Rheader + Rscript3 + " " + metafile+ "\n")
    f.write("mv meta.jpg "+output+"\n")

In [7]:
# Run the assembled bash script. check=True raises CalledProcessError on a
# non-zero exit status instead of silently returning it. Note that bash reports
# the status of the last command only, so earlier failures in the script are
# still not detected by this check.
subprocess.run(["bash", scriptfile], check=True)

CompletedProcess(args=['bash', 'script_target.txt'], returncode=0)

# Score analysis

In [8]:
# Set up the score section: scratch/output locations, the plink --score input
# file, and the first lines of the bash driver script.
# T is built from SLURM_JOB_ID, so this cell raises KeyError outside a SLURM job.
T="/lscratch/"+os.environ['SLURM_JOB_ID']+"/"
scriptfile='script_score.txt'
output='meta5prs/'
# Prepare input file: tab-separated columns SNP, Beta_all, A1 (upper-cased allele)
SCORE=pd.read_table('data/Meta5.tab', index_col='SNP') # SNP as index
PRS=SCORE[['Beta_all']]
# Take the allele from the table loaded in this cell rather than from the
# Meta5 variable of the targeted-SNP section, so this section does not depend
# on state left behind by earlier cells.
PRS=PRS.assign(A1=SCORE['Allele1_all'].str.upper())
PRS.to_csv(T+'score', sep='\t')

BFILE='../../PPMI_WGS/july_2018/PPMI_july2018'
IDLIST='../dataset/PPMI/WGS/clean_NONGENT/3armOUTR.fam'

# Create plink script; '1 3 2' = column numbers of variant ID, allele and
# score in the file written above
plink_score ='plink --bfile {bfile} --score {score} {id_a1_beta} --keep {keep} --out {out} \n'
script1= plink_score.format(bfile=BFILE, score=T+'score', id_a1_beta='1 3 2', keep=IDLIST, out=T+'score')

# write bash file
# NOTE: the previous version started the script with the literal command
# `rm -rf T/*`. The Python variable T was never interpolated, so it targeted a
# relative directory named "T" rather than the scratch directory; and wiping
# the scratch directory from the script would delete the score file written
# above before plink reads it. The command is therefore dropped.
# Mode 'w' truncates any script left over from a previous run.
with open(scriptfile, 'w') as f:
    f.write('module load plink\n')
    f.write('mkdir -p ' + output + "\n")
    f.write(script1)

In [9]:
# R source saved as Rscript4.R: reads the table given as first argument,
# replaces its SCORE column by (SCORE - mean) / sd, and writes the table
# tab-separated to the path given as second argument.
t = """ # Standardize score
t=commandArgs(trailingOnly = TRUE)
INFILE = t[1];OUTFILE = t[2]
df = read.table(INFILE, header = T)
mean=mean(df$SCORE)
sd=sd(df$SCORE)
df$SCORE=(df$SCORE-mean)/sd
write.table(df, OUTFILE, row.names = F, quote = F, sep = "\t")
"""
Rscript4 = "Rscript4.R"
with open(Rscript4, 'w') as f:
    f.write(t)

# Input is the .profile file named after the plink --out prefix; the
# standardized table is written next to it with 'profile' replaced by 'txt'.
SCORE = T + 'score.profile'
GENEFILE = SCORE.replace('profile', 'txt')
standardize_cmd = Rheader + " ".join([Rscript4, SCORE, GENEFILE]) + "\n"
with open(scriptfile, 'a') as f:
    f.writelines(['module load R\n', standardize_cmd])

In [10]:
# Append the two study-level R runs for the standardized score table
# (GENEFILE) to the bash script, reusing the Ranalysis command template.
PHENOFILE = 'dataset/PPMI/pheno/all.txt'
PCSFILE = '../dataset/PPMI/WGS/clean_NONGENT/3a.eigenvec'

# (cohort label, covariate formula) for each run
cohort_covariates = (
    ("PPMIPD", 'HEMO+FEMALE+AAO+AGE+PC1+PC2+PC3'),
    ("PPMIHC", 'HEMO+FEMALE+AGE+PC1+PC2+PC3'),
)
script2, script3 = (
    Ranalysis.format(phenofile=PHENOFILE, covpc=covariates, pcsfile=PCSFILE,
                     genefile=GENEFILE, member=member, out=output)
    for member, covariates in cohort_covariates
)
with open(scriptfile, 'a') as f:
    f.writelines([script2, script3])

In [11]:
# Append the meta-analysis of the per-cohort score results to the bash script.
# The per-cohort *.txt files (and the output directory itself, created by
# `mkdir -p` in the script) exist only once the bash script runs, so listing
# them with glob.glob() here yields nothing on a fresh run. The pattern is
# written into the script instead and expanded by bash at run time.
script4 = Rheader + Rscript2 + " " + output + "*.txt\n"
with open(scriptfile, 'a') as f:
    f.write(script4)
    f.write("mv meta.csv "+output+"\n")

In [12]:
# Run the assembled bash script. check=True raises CalledProcessError on a
# non-zero exit status instead of silently returning it. Note that bash reports
# the status of the last command only, so earlier failures in the script are
# still not detected by this check.
subprocess.run(["bash", scriptfile], check=True)

CompletedProcess(args=['bash', 'script_score.txt'], returncode=0)