In [None]:
import os
import numpy as np
import pandas as pd


In [None]:
!pip install --upgrade dsub


In [None]:
# Workspace bucket URI, used as the prefix of the input/output paths built in this notebook.
# os.environ[...] raises KeyError right here if WORKSPACE_BUCKET is unset, instead of
# letting a None value silently turn into "None/..." paths further down.
bucket = os.environ["WORKSPACE_BUCKET"]
bucket

In [None]:
%%writefile ~/aou_dsub.bash
#!/bin/bash

# aou_dsub: thin wrapper around `dsub` that pre-fills the provider, project,
# network, service account, user, region, boot disk size and logging path.
#
# Usage:   aou_dsub [dsub options...]
# Reads:   OWNER_EMAIL, GOOGLE_PROJECT, WORKSPACE_BUCKET (environment variables)
# Note:    all arguments given to aou_dsub are appended after the defaults ("$@").
function aou_dsub () {

  # Get a shorter username to leave more characters for the job name.
  local DSUB_USER_NAME="$(echo "${OWNER_EMAIL}" | cut -d@ -f1)"

  # For AoU RWB projects network name is "network".
  local AOU_NETWORK=network
  local AOU_SUBNETWORK=subnetwork

  dsub \
      --provider google-cls-v2 \
      --user-project "${GOOGLE_PROJECT}" \
      --project "${GOOGLE_PROJECT}" \
      --network "${AOU_NETWORK}" \
      --subnetwork "${AOU_SUBNETWORK}" \
      --service-account "$(gcloud config get-value account)" \
      --user "${DSUB_USER_NAME}" \
      --regions us-central1 \
      --boot-disk-size 55 \
      --logging "${WORKSPACE_BUCKET}/dsub/logs/{job-name}/{user-id}/$(date +'%Y%m%d/%H%M%S')/{job-id}-{task-id}-{task-attempt}.log" \
      "$@"
}

In [None]:

%%bash
chmod +x ~/aou_dsub.bash

# Register the wrapper in ~/.bashrc only once: re-running this cell must not
# keep appending duplicate `source` lines.
source_line="source ${HOME}/aou_dsub.bash"
grep -qxF "${source_line}" ~/.bashrc 2>/dev/null || echo "${source_line}" >> ~/.bashrc

In [None]:
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')

# Save this Python variable as an environment variable so that its easier to use within %%bash cells.
%env USER_NAME={USER_NAME}

# PRS-CSx

In [None]:
%%writefile merge_scoreFiles_PRS_CSx.R
#!/usr/bin/env Rscript

# Sum per-chromosome scores into one score per individual.
#
# For chromosomes 1-22 this reads
#   <dirname(PRS_DIR)>/<PHENO>_chr<N>_posterior<POSTERIOR>.sscore,
# keeps the IID and SCORE1_SUM columns, joins the 22 tables on IID, adds the
# per-chromosome values row-wise and writes IID + SCORE1_SUM (tab-separated) to
#   <OUT_DIR>/<PHENO>_posterior<PCOUNT>.sscore
#
# Inputs are taken from the environment: PRS_DIR, PHENO, OUT_DIR, POSTERIOR, PCOUNT.

library(data.table)

# NOTE(review): the dsub submissions in this notebook request --min-cores 16;
# confirm that asking data.table for 32 threads is intended.
setDTthreads(32)

# Only the directory part of PRS_DIR is used; file names are rebuilt below.
score_dir <- dirname(Sys.getenv("PRS_DIR"))
out_dir <- Sys.getenv("OUT_DIR")
pheno <- Sys.getenv("PHENO")
posterior_in <- Sys.getenv("POSTERIOR")
posterior_out <- Sys.getenv("PCOUNT")

print(paste0("PRS_DIR is ", score_dir))
print(paste0("OUT_DIR is ", out_dir))

keep_cols <- c("IID", "SCORE1_SUM")

# Read one chromosome's score file and rename its score column to "chr<N>".
read_chrom_scores <- function(chrom) {
    score_path <- paste0(score_dir, "/", pheno, "_chr", chrom, "_posterior", posterior_in, ".sscore")
    chrom_scores <- fread(score_path, stringsAsFactors = FALSE)[, keep_cols, with = FALSE]
    setnames(chrom_scores, "SCORE1_SUM", paste0("chr", chrom))
    chrom_scores
}

per_chrom <- lapply(1:22, read_chrom_scores)

# Join all chromosomes on IID (rows must be present in every table to survive the merge).
merged <- Reduce(function(left, right) merge(left, right, by = "IID"), per_chrom)

# Total score = row-wise sum over every column except IID.
chrom_cols <- setdiff(names(merged), "IID")
merged[, SCORE1_SUM := rowSums(as.matrix(.SD)), .SDcols = chrom_cols]

prs <- merged[, .(IID, SCORE1_SUM)]

out_path <- paste0(out_dir, "/", pheno, "_posterior", posterior_out, ".sscore")
fwrite(prs, file = out_path, col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t")


In [None]:
files = {'--env PHENO':[],'--input PRS_DIR':[],'--output-recursive OUT_DIR':[],'--env POSTERIOR':[],'--env PCOUNT':[]}

phenos = []
with open('phenos.list','rt') as inpu:
    for i in inpu:
        if i =="\n":
            continue
        phenos.append(i.replace('\n',''))
            

for pheno in phenos:
    for pos in range(100):
        files['--env PHENO'].append(pheno)
        files['--input PRS_DIR'].append(f'{bucket}/individualPRS/indiv_result/{pheno}_chr*_posterior{pos+6}.sscore')
        files['--output-recursive OUT_DIR'].append(f'{bucket}/individualPRS/merged_result/')
        files['--env POSTERIOR'].append(pos+6)
        files['--env PCOUNT'].append(pos+1)

files = pd.DataFrame(files)

PARAMETER_FILENAME = 'merge_prs.tsv'
TEST_FILENAME = 'merge_prs_test.tsv'
# Save this Python variable value an environment variable so that its easier to use within %%bash cells.
%env PARAMETER_FILENAME={PARAMETER_FILENAME}
%env TEST_FILENAME={TEST_FILENAME}

files.to_csv(PARAMETER_FILENAME, sep='\t', index=False)
!head -n 1 {PARAMETER_FILENAME} > {TEST_FILENAME}
!tail -n 1 {PARAMETER_FILENAME} >> {TEST_FILENAME}



In [None]:
# Print the test task file to check its contents before submitting it.
!cat {TEST_FILENAME}

In [None]:
# Use hyphens, not whitespace since it will become part of the bucket path.
# Job name for the test submission: the test task file's name without '.tsv'.
# Use hyphens, not whitespace, since the name will become part of the bucket path.
job = TEST_FILENAME.replace('.tsv','')

# Save this Python variable as an environment variable so that it's easier to use within %%bash cells.
%env JOB_NAME={job}

In [None]:
%%bash --out merge_prs_test

# Submit the test task file through the aou_dsub wrapper.
# The cell's stdout is captured in the Python variable `merge_prs_test`.
source ~/aou_dsub.bash

aou_dsub \
  --name "${JOB_NAME}" \
  --image "gcr.io/ukbb-diversepops-neale/yw-prs-r:test" \
  --script 'merge_scoreFiles_PRS_CSx.R' \
  --tasks "${TEST_FILENAME}" \
  --min-cores 16 \
  --min-ram 20 \
  --disk-size 10 \
  --boot-disk-size 10 \
  --preemptible \
  --logging "${WORKSPACE_BUCKET}/panukbb/individualPRS/indiv_result/logging"


In [None]:
# Query the status of a submitted job with dstat.
# NOTE(review): the project, job ID and user are hard-coded from one particular
# submission (presumably the test job) -- update them to match the job you
# actually launched before re-running this cell.
!dstat --provider google-cls-v2 --project terra-vpc-sc-da94e041 --location us-central1 --jobs 'merge-prs---zhuozshi--240508-173917-74' --users 'zhuozshi' --status '*'




In [None]:
# Use hyphens, not whitespace since it will become part of the bucket path.
# Job name for the full submission: the full task file's name without '.tsv'.
# Use hyphens, not whitespace, since the name will become part of the bucket path.
job = PARAMETER_FILENAME.replace('.tsv','')

# Save this Python variable as an environment variable so that it's easier to use within %%bash cells.
%env JOB_NAME={job}

In [None]:
%%bash --out merge_prs_test

# Submit the full task file through the aou_dsub wrapper.
# NOTE(review): stdout is captured in `merge_prs_test`, the same Python variable
# the test-submission cell uses, so the test output is overwritten -- confirm
# this is intended or pick a distinct name.
source ~/aou_dsub.bash

aou_dsub \
  --name "${JOB_NAME}" \
  --image "gcr.io/ukbb-diversepops-neale/yw-prs-r:test" \
  --script 'merge_scoreFiles_PRS_CSx.R' \
  --tasks "${PARAMETER_FILENAME}" \
  --min-cores 16 \
  --min-ram 20 \
  --disk-size 10 \
  --boot-disk-size 10 \
  --preemptible \
  --logging "${WORKSPACE_BUCKET}/panukbb/individualPRS/indiv_result/logging"


In [None]:
# Query the status of a submitted job with dstat.
# NOTE(review): the project, job ID and user are hard-coded from one particular
# submission (presumably the full run) -- update them to match the job you
# actually launched before re-running this cell.
!dstat --provider google-cls-v2 --project terra-vpc-sc-da94e041 --location us-central1 --jobs 'merge-prs--zhuozshi--240508-174240-96' --users 'zhuozshi' --status '*'


