In [4]:
%%bash
# Working folders used by the rest of the notebook:
#   script/ - generated R scripts, t/ - per-sample tables, result/ - merged tables
workdirs=(script t result)
mkdir -p "${workdirs[@]}"

In [4]:
%%bash
# Peek at the first ten entries under download/
ls download/* | head -n 10

download/PPMISI10874.diploidSV.vcf.gz
download/PPMISI12224.diploidSV.vcf.gz
download/PPMISI12499.diploidSV.vcf.gz
download/PPMISI12593.diploidSV.vcf.gz
download/PPMISI13039.diploidSV.vcf.gz
download/PPMISI13424.diploidSV.vcf.gz
download/PPMISI14281.diploidSV.vcf.gz
download/PPMISI14331.diploidSV.vcf.gz
download/PPMISI14426.diploidSV.vcf.gz
download/PPMISI15761.diploidSV.vcf.gz


In [9]:
# Derive per-sample SV tables from the individual Manta diploid VCFs.
# NOTE(review): the listing cell above shows the VCFs under download/, whereas the
# R script below reads manta/ -- confirm both refer to the same files.
# The output name 'deriveMantaOutpus.R' (sic) is kept so anything that already
# calls the script by that name keeps working.
# Raw string: backslashes reach the R file exactly as typed below.
t = r"""
library(data.table);library(dplyr)
# Parameters
MANTA_DIR = "manta"
# Flanking margin (bp) added on both sides of each gene, keyed by output-file suffix
MARGINS = c("10K" = 10000, "1K" = 1000)
# Only the VCFs themselves; anything else in the folder (e.g. index files) is skipped
fnames = list.files(MANTA_DIR, pattern = "\\.vcf\\.gz$")
# e.g. filename = "manta/PPMISI10874.diploidSV.vcf.gz"
pdgenes = fread('../../../tool/PDgenes/PD_GENES_2017_v1.csv', skip = 1) %>% filter(complete.cases(.)) %>% 
  rename(GENE = V8) %>% 
  mutate(chr = paste("chr", chromosome, sep= "")) %>% 
  select(GENE, chr, start, end)

# seq_along() gives an empty loop when no file is found; 1:length() would iterate over c(1, 0)
for (i in seq_along(fnames)){
  filename = file.path(MANTA_DIR, fnames[i])
  # Sample number = text between "PPMISI" and the first "."
  PATNO = tstrsplit(filename, "PPMISI") %>% .[[2]] %>% tstrsplit(., "\\.") %>% .[[1]]
  output = paste("t/", "PPMISI", PATNO, sep = "")
  
  # Read diploid model outputs.
  # gzip -cd is a plain decompressor (zless is a pager); '^##' removes only the
  # meta lines at the top and keeps the '#CHROM' header line.
  dip = fread(cmd = paste("gzip -cd", shQuote(filename), "| grep -v '^##'"))
  dip = dip %>% filter(FILTER=="PASS")
  dip = dip %>% filter(`#CHROM` %in% paste("chr", c(1:22, "X"), sep = "")) %>%
    rename(chr = `#CHROM`)
  
  # Pair every SV with every PD gene on the same chromosome
  j = inner_join(dip, pdgenes, by = "chr") %>% 
    mutate(PATNO = as.numeric(PATNO))
  
  # Filter SVs by their positions: one output file per margin
  # (t/PPMISI<PATNO>_10K.txt and t/PPMISI<PATNO>_1K.txt)
  for (suffix in names(MARGINS)){
    bp = MARGINS[[suffix]]
    j %>% 
      filter(POS > start - bp & POS < end + bp) %>% 
      fwrite(., paste(output, paste0(suffix, ".txt"), sep = "_"))
  }
}
"""
with open('script/deriveMantaOutpus.R', 'w') as f:
    f.write(t)

In [5]:
# Populate derived file: merge the per-sample tables in t/ and annotate every SV
# with diagnosis, dosage, SV type and SV length.
# Raw string: backslashes reach the R file exactly as typed below.
t = r"""
library(data.table);library(dplyr)
KEY = "t/*_1K.txt"
# Merge all files matching the shell glob K, annotate them and write the result to
# result/SV_<margin>.csv (name derived from K at the end of the function).
tempfunc = function(K){
  # Concatenate the files, keeping the header line of the first file only
  sv = fread(cmd = paste("awk 'NR==1{print;next}FNR>1{print}'", K))
  names(sv)[10]="GENOTYPE" # assumes column 10 is the per-sample genotype field -- TODO confirm
  dx = fread("../../../PDcohorts/PPMI/out181018/DEMOG_DIAG.csv")
  # Rows of dx with a matching PATNO in the merged table, counted per RECRUIT group
  # (joined below as column Freq)
  tab = semi_join(dx, sv, by = "PATNO") %>% with(table(RECRUIT)) %>% data.frame(.) 
  j = inner_join(sv, dx, by = "PATNO") %>% inner_join(., tab, by="RECRUIT")
  # Nothing is auto-printed inside a function, so print the overview explicitly
  j %>% with(table(GENE, RECRUIT)) %>% print
  dip = j %>% 
    # Keep only the first ':'-separated piece of the genotype field
    tidyr::separate("GENOTYPE", c("GENOTYPE", rep(NA, 5)), sep=":") %>% 
    # DOSE = number of alternate alleles; 99 flags any other genotype string.
    # 99 is a code, not a dose: recode it before summing or modelling DOSE.
    mutate(DOSE = case_when(
      GENOTYPE=="1/1" ~ 2, 
      GENOTYPE=="1/0" ~ 1,
      GENOTYPE=="0/1" ~ 1,
      GENOTYPE=="0/0" ~ 0,
      TRUE ~ 99))
  
  # SV type: first ID field, plus the second one when that is not a number
  id_parts = dip %>% select(ID) %>%
    tidyr::separate(ID, paste("V", 1:8, sep=""), sep=":", fill = 'right')
  second_is_text = is.na(suppressWarnings(as.numeric(id_parts$V2)))
  dip$TYPE = ifelse(second_is_text, paste(id_parts$V1, id_parts$V2, sep = ":"), id_parts$V1)
  
  # Get the SV length: value of the SVLEN= entry of INFO, NA when there is none.
  # Done on the whole column at once, so it also works for an empty table and
  # does not depend on how many ';'-separated entries INFO has.
  has_len = grepl("(^|;)SVLEN=", dip$INFO)
  svlen = rep(NA_real_, nrow(dip))
  svlen[has_len] = suppressWarnings(as.numeric(
    sub("^(.*;)?SVLEN=([^;]*).*$", "\\2", dip$INFO[has_len])))
  dip$SVLEN = svlen
  
  out = sub("t/\\*", "result/SV", K) %>% sub("txt", "csv", .) # Name for the output file
  fwrite(dip, out, row.names = F)
}
tempfunc(KEY)
tempfunc("t/*_10K.txt")
"""
with open("script/populateDerivedSVs.R", "w") as f:
    f.write(t)

In [1]:
# Build the burden tables, run the burden tests and explore SNCA
t = """
library(data.table);library(dplyr)
library(tableone) # provides CreateContTable()
fname = "result/SV_1K.csv"
PCS = paste("PC", 1:10, sep = "", collapse = "+") # "PC1+PC2+...+PC10"
dip = fread(fname)
# Diagnosis table (the same file populateDerivedSVs.R joins in); the burden tables
# below are rebuilt from PATNO/GENE/DOSE only and need it to get RECRUIT back
dx = fread("../../../PDcohorts/PPMI/out181018/DEMOG_DIAG.csv")
pca = fread("../WGS/clean_NONGENT/3a.eigenvec") %>% mutate(PATNO = sub("PPMISI","", IID) %>% as.numeric)

# Doses (shown before recoding, so a 99 column reveals unrecognised genotypes)
dip %>% with(table(GENE, DOSE)) %>% addmargins(., 2)
# DOSE 99 is the code populateDerivedSVs.R writes for any other genotype string;
# turn it into NA so it is never added to a burden count or used as a dose
dip = dip %>% mutate(DOSE = ifelse(DOSE == 99, NA, DOSE))

# Sum DOSE per person and gene, spread to one column per gene (0 when a person has
# no SV in a gene) and attach dx.
# Returns list(data = wide table, genes = names of the gene columns).
burden_table = function(sv){
  wide = sv %>% 
    group_by(PATNO, GENE) %>% summarise(DOSE = sum(DOSE, na.rm = T)) %>% ungroup() %>% 
    tidyr::spread(key = GENE, value = DOSE)
  wide[is.na(wide)]=0
  # Gene columns are taken from the table itself instead of a fixed column range
  genes = setdiff(names(wide), "PATNO")
  list(data = inner_join(wide, dx, by = "PATNO"), genes = genes)
}

# HC vs PD burden test: prints and returns the CreateContTable summary (first three
# printed columns) with an extra column of PC-adjusted p-values.
burden_test = function(bt){
  hcpd = bt$data %>% filter(RECRUIT %in% c("HC", "PD"))
  ## raw p
  rp = hcpd %>% 
    CreateContTable(vars = bt$genes, strata = "RECRUIT", data = .) %>% print(., explain = F) %>% 
    data.frame() %>% .[,1:3]
  ## adj-p
  dfts = inner_join(hcpd, pca, by ="PATNO") 
  rp$`adj-p`=NA
  # Row 1 is skipped (presumably the group-size row of the printed table -- confirm);
  # seq_len()[-1] is empty when there is no further row, unlike 2:nrow()
  for (i in seq_len(nrow(rp))[-1]){
    gene = rownames(rp)[i]
    # NOTE(review): linear model on a TRUE/FALSE outcome here, logistic models in the
    # SNCA section below -- confirm this is intended
    model = as.formula(paste("I(RECRUIT=='PD') ~", gene, "+", PCS))
    tdf = lm(model, data = dfts) %>% summary %>% coef
    # No coefficient row for the gene (e.g. it was dropped from the fit) -> keep NA
    ans = try(tdf[gene, 4], silent = T)
    if(!inherits(ans, "try-error")){rp$`adj-p`[i] = round(ans, 3)}
  }
  print(rp)
  invisible(rp)
}

## Burden table (all SV types), shown for every RECRUIT group
bt = burden_table(dip)
bt$data %>% CreateContTable(vars = bt$genes, strata = "RECRUIT", data = .) %>% print(., explain = F, test = F)
# Burden test
burden_test(bt)

## Test without MantaBND
burden_test(burden_table(dip %>% filter(TYPE!="MantaBND")))


## SNCA
SNCA_POS = c(89821199, 89821299) # the two SNCA positions examined below
snca = dip %>% filter(RECRUIT %in% c("PD", "HC")) %>% 
  filter(GENE=="SNCA") 
# Number of rows per variant, group and dose
snca_counts = snca %>% 
  group_by(POS, REF, ALT, TYPE, SVLEN, RECRUIT, DOSE) %>% mutate(COUNT = n()) %>% ungroup() %>% 
  distinct(POS, REF, ALT, TYPE, SVLEN, RECRUIT, DOSE, COUNT)
snca_counts %>% filter(POS %in% SNCA_POS) %>% arrange(POS, ALT, desc(DOSE))

snca %>% group_by(POS, PATNO) %>% filter(n()>1) # no variation for SVs for the same position in the same person

# One row per person with the dose at each of the two positions
snca_filt = snca %>% filter(POS %in% SNCA_POS) %>% 
  mutate(N_ALT_POS = paste("N_ALT", POS, sep="_")) %>% 
  tidyr::spread(key=N_ALT_POS, value=DOSE, fill = 0) %>% 
  group_by(PATNO) %>% 
  mutate(N_ALT_89821199=max(N_ALT_89821199),
         N_ALT_89821299=max(N_ALT_89821299)) %>% ungroup() %>% 
  distinct(PATNO, .keep_all = T)

# Create Analysis set: every PD/HC person in dip, 0 where no SNCA SV was found
full = dip %>% distinct(PATNO, .keep_all = T) %>% select(PATNO, RECRUIT) %>% 
  filter(RECRUIT %in% c("PD", "HC"))

df = snca_filt %>% select(-RECRUIT) %>% 
  left_join(full, ., by = "PATNO")
df[is.na(df)]=0
df %>% with(table(N_ALT_89821199, N_ALT_89821299))

df %>% with(table(RECRUIT, N_ALT_89821199)) %>% chisq.test()
df %>% with(table(RECRUIT, N_ALT_89821299)) %>% chisq.test()
df %>% with(table(RECRUIT, I((N_ALT_89821199+(N_ALT_89821299))))) %>% chisq.test()

# PC-adjusted logistic models (pca was read once at the top)
df = inner_join(df, pca, by = "PATNO")
glm(RECRUIT=="PD" ~ N_ALT_89821199+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = df, family = binomial()) %>% summary
glm(RECRUIT=="PD" ~ I(N_ALT_89821299)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = df, family = binomial()) %>% summary
glm(RECRUIT=="PD" ~ I((N_ALT_89821199+(N_ALT_89821299>0))>0)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = df, family = binomial()) %>% summary
glm(RECRUIT=="PD" ~ I(N_ALT_89821199+N_ALT_89821299)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = df, family = binomial()) %>% summary
"""
with open("script/exploreSNCA.R", "w") as f:
    f.write(t)