# PRS Project

### Create a <u>phased</u> vcf.gz file from haps/sample

#### convert shapeit's haps/sample files to vcf

In [None]:
# Convert SHAPEIT haps/sample files to one phased VCF per autosome.
# Every chromosome is converted in its own background job.
p="ceu"
ds1_name="1kg"
ds1_pop="kdv" # eur-minus-${p}-aj-snps
panel_dir="$PRS_DATASETS/${ds1_name}/imputation_panels/${ds1_pop}"

for a in {1..22}; do
  shapeit -convert \
    --input-haps "${panel_dir}/chr${a}.phased" \
    --output-vcf "${panel_dir}/chr${a}.phased.vcf" \
    && echo "done chr${a}" &
done
# Barrier: do not hand control back until every conversion has finished.
wait

#### Export BCFTOOLS_PLUGINS (if not already defined):

In [None]:
# Tell bcftools where its plugins live. A value that is already set in the
# environment is kept; the hard-coded path is only the fallback.
export BCFTOOLS_PLUGINS="${BCFTOOLS_PLUGINS:-/specific/elkon/tools/bcftools-git/bcftools/plugins}"

#### Create a compressed (bgzip) and indexed (tabix) file

In [None]:
# Compress each phased VCF with bgzip and index the result with tabix,
# one background job per chromosome.
p="ceu"
ds1_name="1kg"
ds1_pop="eur" # eur-minus-${p}-aj-snps
panel_dir="$PRS_DATASETS/${ds1_name}/imputation_panels/${ds1_pop}"

declare -a chrs=({1..22})
for a in "${chrs[@]}"; do
  bgzip "${panel_dir}/chr${a}.phased.vcf" \
    && tabix -f "${panel_dir}/chr${a}.phased.vcf.gz" \
    && echo "done chr${a}" &
done
# Barrier: wait for all compression/indexing jobs.
wait

#### Flip strands in **phased** data (using bcftools + +fixref plugin)

In [None]:
# Run the bcftools +fixref plugin on every phased chromosome file and write
# the result next to the input as chr<N>.phased.flipped.vcf.gz.
p="ceu"
ds1_name="1kg"
ds1_pop="eur" # eur-minus-${p}-aj-snps
panel_dir="$PRS_DATASETS/${ds1_name}/imputation_panels/${ds1_pop}"

for a in {1..22}; do
  # Everything after "--" is passed to the plugin itself.
  bcftools +fixref "${panel_dir}/chr${a}.phased.vcf.gz" \
    -Oz -o "${panel_dir}/chr${a}.phased.flipped.vcf.gz" \
    -- -d \
    -f "$HOME_ELKON/data-scratch/human_g1k_v37.fasta" \
    -i "$HOME_ELKON/data-scratch/1000G_ALL/chr${a}.vcf.gz" \
    && echo "done chr${a}" &
done
# Barrier: wait for all fixref jobs.
wait

#### sort position in (unsorted) vcf files

In [None]:
# Position-sort the flipped VCFs of the listed chromosomes. The original file
# is first renamed to *.unsorted.vcf.gz and the sorted output takes its place.
p="ceu"
ds1_name="1kg"
ds1_pop="eur" # eur-minus-${p}-aj-snps
declare -a chrs=(8 15)
panel_dir="$PRS_DATASETS/${ds1_name}/imputation_panels/${ds1_pop}"

for a in "${chrs[@]}"; do
  sorted_vcf="${panel_dir}/chr${a}.phased.flipped.vcf.gz"
  unsorted_vcf="${panel_dir}/chr${a}.phased.flipped.unsorted.vcf.gz"
  # Only sort when the rename succeeded, and write back into the same panel
  # directory (built from ${ds1_name}, not a hard-coded dataset name).
  mv "${sorted_vcf}" "${unsorted_vcf}" \
    && bcftools sort "${unsorted_vcf}" -Oz -o "${sorted_vcf}" \
    && echo "done chr${a}"
done

#### Index a vcf.gz (bgz) file (create a tbi file):

In [None]:
# Build a tabix (.tbi) index for every flipped chromosome VCF.
p="ceu"
ds1_name="1kg"
ds1_pop="eur" # eur-minus-${p}-aj-snps
panel_dir="$PRS_DATASETS/${ds1_name}/imputation_panels/${ds1_pop}"

declare -a chrs=({1..22})
for a in "${chrs[@]}"; do
  tabix -f "${panel_dir}/chr${a}.phased.flipped.vcf.gz" \
    && echo "done chr${a}" &
done
# Barrier: wait for all indexing jobs.
wait

#### Merge files

In [None]:
# Merge the flipped VCFs of two panels chromosome by chromosome into
# <ds1>/imputation_panels/<ds1_pop>-<ds2_name>-<ds2_pop>-merged/.
p="ceu"
ds1_name="1kg"
ds1_pop="eur"
ds2_name="ajkg14"
ds2_pop="t101"
merged_panel=${ds1_pop}-${ds2_name}-${ds2_pop}-merged
panels_root="$PRS_DATASETS/${ds1_name}/imputation_panels"

mkdir -p "${panels_root}/${merged_panel}" || true
declare -a chrs=({1..22})
for a in "${chrs[@]}"; do
  bcftools merge -Oz \
    -o "${panels_root}/${merged_panel}/chr${a}.vcf.gz" \
    "${panels_root}/${ds1_pop}/chr${a}.phased.flipped.vcf.gz" \
    "$PRS_DATASETS/${ds2_name}/imputation_panels/${ds2_pop}/phased/chr${a}.phased.flipped.vcf.gz" \
    && echo "done chr${a}" &
done
# Barrier: wait for all merge jobs.
wait

#### Make haps/sample from merged files

In [None]:
# Export each merged chromosome VCF to haps/sample format with plink2.
p="ceu"
ds1_name="1kg"
ds1_pop="eur" # eur-minus-${p}-aj-snps
ds2_name="ajkg14"
ds2_pop="t101"
merged_panel=${ds1_pop}-${ds2_name}-${ds2_pop}-merged
# Input and output share one directory, built from ${ds1_name} on both sides.
merged_dir="$PRS_DATASETS/${ds1_name}/imputation_panels/${merged_panel}"

declare -a chrs=({1..22})
for a in "${chrs[@]}"; do
  plink2 --vcf "${merged_dir}/chr${a}.vcf.gz" \
    --export haps \
    --out "${merged_dir}/chr${a}" \
    --geno 0 --min-alleles 2 --max-alleles 2 \
    --threads 50 &
done
# Barrier: wait for all export jobs.
wait


#### Generate imputation panels

In [None]:
# Convert the merged haps/sample files into reference-panel files
# (.ref.hap / .ref.legend.gz / .ref.sample) with shapeit -convert.
p="ceu"
ds1_name="1kg"
ds1_pop="eur" # eur-minus-${p}-aj-snps
ds2_name="ajkg14"
ds2_pop="t101"
merged_panel=${ds1_pop}-${ds2_name}-${ds2_pop}-merged
merged_dir="$PRS_DATASETS/${ds1_name}/imputation_panels/${merged_panel}"

declare -a chrs=({1..22})
for a in "${chrs[@]}"; do
  # All three --output-ref files go into the merged panel directory; the hap
  # file used to be routed through an undefined ${pop} variable.
  shapeit -convert \
    --input-haps "${merged_dir}/chr${a}" \
    --output-log "${merged_dir}/chr${a}.log" \
    --output-ref "${merged_dir}/chr${a}.ref.hap" \
    "${merged_dir}/chr${a}.ref.legend.gz" \
    "${merged_dir}/chr${a}.ref.sample" \
    --thread 50 &
done
# Barrier: wait for all conversions.
wait

#### Run impute2 using run_by_capacity.sh script

In [None]:
# Hand the per-panel command file to run_by_capacity.sh.
ds1_name='1kg'
ds1_pop='tsi2'   # alternatives: "eur-minus-${p}-aj-snps", "eur"
ds2_name='ajkg14'
ds2_pop='t101'
# Single-population panel; a merged panel would be named
# ${ds1_pop}-${ds2_name}-${ds2_pop}-merged.
merged_panel="${ds1_pop}"

bash run_by_capacity.sh \
  --cmd_fn "impute_${merged_panel}_scz" \
  --input_fn chrs

#### Convert chrs in shamir to bed in elkon

In [None]:
# Convert per-chromosome IMPUTE2 output (read from the netapp5 path) into
# PLINK bed/bim/fam files written under the elkon path.
target="ukbb_sas"
imp="impute2_1kg_eur100"
target_path="/specific/netapp5/gaga/gaga-pd/prs_data/datasets/dec/${target}/${imp}/"
target_path_elkon="/specific/elkon/hagailevi/PRS/datasets/dec/${target}/${imp}/"

mkdir -p "${target_path_elkon}/raw/impute2/bed" || echo ""
for a in {1..22}; do
  plink \
    --gen "${target_path}/raw/impute2/chrs/chr${a}.impute2" \
    --sample "$PRS_DATASETS/${target}/original/raw/phased/chr${a}.phased.sample" \
    --oxford-single-chr "${a}" \
    --make-bed \
    --threads 50 \
    --out "${target_path_elkon}/raw/impute2/bed/ds${a}"
done


#### merge impute2 files

In [None]:
# Run both chromosome-merge scripts for one target / imputation panel.
target='dbg-scz19'
ds1_name='1kg'
ds1_pop='eur100-2'   # alternatives: "eur-minus-${p}-aj-snps", "eur"
# ds2_name="ajkg14"
# ds2_pop="t101"
# For a merged panel append -${ds2_name}-${ds2_pop}-merged.
ds_merged_imputed="impute2_${ds1_name}_${ds1_pop}"

for merge_script in merge_chrs_impute2_all_seq.sh merge_chrs_impute2.sh; do
  bash "${merge_script}" --target "${target}" --imp "${ds_merged_imputed}"
done



#### move after merge (Shamir)

In [None]:
# Move the merged ds.all.{bim,bed,fam} out of raw/impute2/bed and rename them
# to ds.{bim,bed,fam} at the top of the imputation directory.
# p="fin"
target="dbg-scz19"
ds1_name="1kg"
ds1_pop="eur100-2"  # "eur-minus-${p}-aj-snps"
# ds2_name="ajkg14"
# ds2_pop="t101"

ds_merged_imputed=impute2_${ds1_name}_${ds1_pop} # impute2_${ds1_name}_${ds1_pop}-${ds2_name}-${ds2_pop}impute2_
imp_dir="$PRS_DATASETS/${target}/${ds_merged_imputed}"

for a in bim bed fam; do
  mv "${imp_dir}/raw/impute2/bed/ds.all.${a}" "${imp_dir}/ds.${a}" &
done
# Barrier: the three moves run in parallel; wait for all of them.
wait

#### move after merge (Elkon)

In [None]:
# Move the merged ds.all.{bim,bed,fam} from the elkon dataset tree into
# $PRS_DATASETS, renaming them to ds.{bim,bed,fam}.
# p="fin"
target="ukbb_afr"
ds1_name="1kg"
ds1_pop="eur-minus-gbr"  # "eur-minus-${p}-aj-snps"
# ds2_name="ajkg14"
# ds2_pop="t101"

PRS_ELKON_DATASETS="/specific/elkon/hagailevi/PRS/datasets/dec"

ds_merged_imputed=impute2_${ds1_name}_${ds1_pop} # impute2_${ds1_name}_${ds1_pop}-${ds2_name}-${ds2_pop}impute2_

for a in bim bed fam; do
  mv "$PRS_ELKON_DATASETS/${target}/${ds_merged_imputed}/raw/impute2/bed/ds.all.${a}" \
    "$PRS_DATASETS/${target}/${ds_merged_imputed}/ds.${a}" &
done
# Barrier: the three moves run in parallel; wait for all of them.
wait

#### Run qc_target and calc_prs 

In [None]:
# Run target-data QC for one target / imputation panel.
# p="ceu"
discovery='D2_hght_yengo_2018'   # alternatives: "UKB_ht_EUR_1", "D_t2d_mahajan_2018"
target='ukbb_sas'                # alternative: "ukbb_afr"
ds1_name='1kg'
ds1_pop='eur100-minus-gbr'       # alternatives: "eur-minus-${p}-aj-snps", "eur"
pheno=''                         # alternative: "ht"
# ds2_name="ajkg14"
# ds2_pop="t101"
# For a merged panel append -${ds2_name}-${ds2_pop}-merged.
ds_merged_imputed="impute2_${ds1_name}_${ds1_pop}"

bash qc_target_data.sh --target "${target}" --imp "${ds_merged_imputed}"
# Optional follow-up (disabled):
#   && bash calc_prs.sh --target ${target} --imp ${ds_merged_imputed} --discovery ${discovery} --stage 0 --pheno ${pheno} --pval_th 0.00000005,0.001,0.005,0.01,0.05,0.1,0.2,0.3,0.4,0.5

In [None]:
# Run calc_prs.sh (stage 0) for one discovery / target / panel combination.
# p="ceu"
target='ukbb_afr'
ds1_name='1kg'
ds1_pop='afr'   # alternatives: "eur-minus-${p}-aj-snps", "eur"
discovery='ukbb_utfi_morton_2019'
pheno='utfi'
# ds2_name="ajkg14"
# ds2_pop="t101"
# For a merged panel append -${ds2_name}-${ds2_pop}-merged.
ds_merged_imputed="impute2_${ds1_name}_${ds1_pop}"

bash calc_prs.sh \
  --target "${target}" \
  --imp "${ds_merged_imputed}" \
  --discovery "${discovery}" \
  --pheno "${pheno}" \
  --stage 0 \
  --pval_th 0.00000005,0.001,0.005,0.01,0.05,0.1,0.2,0.3,0.4,0.5

## Check that part files are not corrupted:

In [None]:
# Run remove_inconsistent_part_files.sh for chromosomes 3-22 of one
# target / imputation panel.
## Default params
# panel="1kg"
# start_pos=0
# end_pos=500

imp='impute2_1kg_sas'
target='ukbb_sas'

bash remove_inconsistent_part_files.sh \
  --target "${target}" \
  --imp "${imp}" \
  --chrs 3-22

## Remove a corrupted file by the number of the corrupted line

In [None]:
# Walk the legend part files of one chromosome in order, accumulating their
# line counts, and report the first part at which the running total exceeds
# line_number.
sum=0
chr=8
line_number=2160093

datasets_folder=$PRS_DATASETS
for a in {1..500}; do
  echo "start $a"
  part_lines=$(cat ${datasets_folder}/ukbb_sas/impute2_1kg_sas/raw/impute2/parts/chr${chr}.${a}.legend | wc -l)
  sum=$((sum + part_lines))
  echo "end $a: $sum"
  if ((sum > line_number)); then
    echo "here $a"
    break
  fi
done

## zip part files:

In [None]:
# Run zip_part_files.sh, up to 10 at a time, for every impute2* directory of
# the target dataset.
## Default params
# panel="1kg"
# start_pos=0
# end_pos=500

# Single imp
# imp="impute2_1kg_eur2"

target="dbg-scz19"
# Imputation directories live under $PRS_DATASETS/<target>/, so enumerate
# them there; a glob avoids parsing `ls` output. With -I the command already
# runs once per input item, so no -n is needed.
for imp_dir in "$PRS_DATASETS/${target}"/*impute2*; do
  [[ -e "${imp_dir}" ]] || continue
  printf '%s\0' "${imp_dir##*/}"
done | xargs -0 -I {} -P 10 bash zip_part_files.sh --target "${target}" --imp {}



## zip chr files:

In [None]:
# Run zip_chr_files.sh, up to 10 at a time, for every impute2* directory of
# the target dataset.
## Default params
# panel="1kg"

# Single imp
# imp="impute2_1kg_eur2"

target="dbg-scz19"
# Imputation directories live under $PRS_DATASETS/<target>/, so enumerate
# them there; a glob avoids parsing `ls` output. With -I the command already
# runs once per input item, so no -n is needed.
for imp_dir in "$PRS_DATASETS/${target}"/*impute2*; do
  [[ -e "${imp_dir}" ]] || continue
  printf '%s\0' "${imp_dir##*/}"
done | xargs -0 -I {} -P 10 bash zip_chr_files.sh --target "${target}" --imp {}



## Preparing new pheno files from the UKB

In [None]:
# Build one phenotype file per (population, phenotype) pair, then replace its
# header line with "FID IID label" (tab separated). When the population has
# its own dataset directory, the file is also copied there.
# Each pair runs in its own background subshell.
for pop in "gbr" "sas" "afr"; do
  for pheno in "t2d" "osar" "hfvr" "chol" "ht" "ast"; do
    (
      pheno_file="$PRS_DATASETS/ukbb/pheno_${pheno}_${pop}"
      # Stop this pair if generation fails, so a stale or missing file is not
      # re-headered and reported as done.
      python prepare_pheno_file.py \
        --raw_phenotype_file "$PRS_DATASETS/ukbb/ukb_code6.csv" \
        --pop_file "$PRS_DATASETS/ukbb/pop.panel.${pop}" \
        --pheno_file "${pheno_file}" \
        --conditions_file "$PRS_DATASETS/ukbb/conditions_${pheno}.json" || exit 1
      echo -e "FID\tIID\tlabel" > "${pheno_file}.tmp"
      tail -n +2 "${pheno_file}" >> "${pheno_file}.tmp"
      mv "${pheno_file}.tmp" "${pheno_file}"
      if [[ -d "$PRS_DATASETS/ukbb_${pop}" ]]; then
        cp "${pheno_file}" "$PRS_DATASETS/ukbb_${pop}/pheno_${pheno}_"
      fi
      echo "done ${pop} ${pheno}!"
    ) &
  done
done
# Barrier: wait for every (population, phenotype) job.
wait

In [None]:
# Run generate_gwas.sh (stage 2) on the full ukbb target, once per population.
# Other phenotypes: "t2d" "osar" "hfvr" "chol" "ht" "ast"
#                   "hyty" "gerx" "madd" "utfi" "angna" "ctrt"
pheno='ctrt'
pops=('gbr' 'eur')
for pop in "${pops[@]}"; do
  bash generate_gwas.sh \
    --target ukbb \
    --imp imputeX \
    --pop "${pop}" \
    --pheno "${pheno}" \
    --discovery "UKB_${pheno}_${pop}" \
    --stage 2
done


#### Generate GWAS directly from the population-specific target set

In [None]:
# Run generate_gwas.sh (stage 2) on the population-specific ukbb_<pop> target.
# Other phenotypes: "t2d" "osar" "hfvr" "chol" "ht" "ast"
pheno='ctrt'
pop='eur'
bash generate_gwas.sh \
  --target "ukbb_${pop}" \
  --imp imputeX_new \
  --pheno "${pheno}" \
  --discovery "UKB_${pheno}_${pop}" \
  --stage 2

## Calc PRS for UKB GWASs

In [None]:
# available imps: "impute2_1kg_eur" "impute2_1kg_sas" "impute2_1kg_afr" "imputeX_new" "imputeX_ibs"
# phenos of UKB: "t2d" "osar" "hfvr" "chol" "ht" "ast"
# targets : "ukbb_sas" "ukbb_afr"


# Run calc_prs.sh (stage 5) for every discovery population x phenotype x
# target x imputation panel whose dataset directory exists.
discovery_pops=('eur' 'gbr')
# Other phenotype sets: ("height") / ("ast" "t2d" "osar" "hfvr" "chol" "ht" "height")
declare -a phenos=('hyty' 'gerx' 'madd' 'angna' 'utfi' 'ctrt')
declare -a targets=('ukbb_afr' 'ukbb_sas')
# Other panels: "impute2_1kg_eur100" "imputeX_new" "impute2_1kg_ibs" "impute2_1kg_eur-minus-gbr"
declare -a imps=('impute2_1kg_eur' 'impute2_1kg_sas' 'impute2_1kg_afr')
# Phenotype -> value passed to --continuous.
# Other sets: (['height']='true') / (['ast']='false' ['t2d']='false' ['osar']='false' ['hfvr']='false' ['chol']='false' ['ht']='false' ['height']='true')
declare -A continuouss=(
  ['utfi']='false'
  ['ctrt']='false'
  ['angna']='false'
  ['gerx']='false'
  ['madd']='false'
  ['hyty']='false'
)

for discovery_pop in "${discovery_pops[@]}"; do
  for pheno in "${phenos[@]}"; do
    discovery="UKB_${pheno}_${discovery_pop}"
    continuous=${continuouss[${pheno}]}
    for target in "${targets[@]}"; do
      for imp in "${imps[@]}"; do
        echo $PRS_DATASETS/${target}/${imp}
        # Skip target/panel combinations that were never generated.
        [[ -d $PRS_DATASETS/${target}/${imp} ]] || continue
        bash calc_prs.sh \
          --target "${target}" \
          --imp "${imp}" \
          --discovery "${discovery}" \
          --stage 5 \
          --pheno "${pheno}" \
          --continuous "${continuous}" \
          --pval_th 0.001,0.005,0.01,0.05,0.1,0.2,0.3,0.4,0.5
      done
    done
  done
done


## Calc PRS for Public GWASs

In [None]:
# available imps: "impute2_1kg_eur" "impute2_1kg_sas" "impute2_1kg_afr" "imputeX_new" "imputeX_ibs"
# phenos of UKB: "t2d" "osar" "hfvr" "chol" "ht" "ast"
# public GWASs corresponding to UKB phenos: "D2_ldlp_willer_2013" "D2_chol_willer_2013" "D2_hdlp_willer_2013" "D2_sysp_evangelou_2018" "D2_dias_evangelou_2018" "D2_asth_zhu_2019" "D_t2d_mahajan_2018" "D2_t2di_mahajan_2018"

# Run calc_prs.sh (stage 0) for every public GWAS x target x imputation panel
# whose dataset directory exists.
declare -a targets=("ukbb_sas" "ukbb_afr")
# Other panels: ("impute2_1kg_eur" "impute2_1kg_sas" "impute2_1kg_afr" "imputeX_new" "impute2_1kg_ibs" "impute2_1kg_eur-minus-gbr" "impute2_1kg_eur100")
declare -a imps=("impute2_1kg_eur100-minus-gbr")
# GWAS id -> phenotype name passed to --pheno.
declare -A gwas_to_pheno=(['D2_hght_yengo_2018']='height' ['D2_ldlp_willer_2013']='chol' ['D2_chol_willer_2013']='chol'  ['D2_sysp_evangelou_2018']='ht' ['D2_dias_evangelou_2018']='ht' ['D2_asth_zhu_2019']='ast' ['D2_t2di_mahajan_2018']='t2d') # ['D2_hdlp_willer_2013']='chol' ['D_t2d_mahajan_2018']='t2d'
# GWAS id -> value passed to --continuous.
declare -A continuouss=(['D2_hght_yengo_2018']='true' ['D2_ldlp_willer_2013']='false' ['D2_chol_willer_2013']='false'  ['D2_sysp_evangelou_2018']='false' ['D2_dias_evangelou_2018']='false' ['D2_asth_zhu_2019']='false' ['D2_t2di_mahajan_2018']='false') # ['D2_hdlp_willer_2013']='false' ['D_t2d_mahajan_2018']='false'

for discovery in "${!gwas_to_pheno[@]}"; do
  pheno=${gwas_to_pheno[${discovery}]}
  continuous=${continuouss[${discovery}]}
  for target in "${targets[@]}"; do
    for imp in "${imps[@]}"; do
      if [[ -d "$PRS_DATASETS/${target}/${imp}" ]]; then
        # Build the argument list once so the logged command is exactly the
        # one that runs (including --continuous).
        calc_prs_args=(
          --target "${target}"
          --imp "${imp}"
          --discovery "${discovery}"
          --stage 0
          --pheno "${pheno}"
          --continuous "${continuous}"
          --pval_th 0.001,0.005,0.01,0.05,0.1,0.2,0.3,0.4,0.5
        )
        echo "bash calc_prs.sh ${calc_prs_args[*]}"
        bash calc_prs.sh "${calc_prs_args[@]}"
      fi
    done
  done
done

## merge UKB chrs files

In [None]:
# Merge the per-chromosome UKB bed files of one population into ds_<pop>.all.
# If the merge fails and plink left a *.missnp list, drop those variants from
# every chromosome file and retry the merge.
pop="eur"
ukbb_chr_path=$PRS_DATASETS/ukbb/raw/chrs
mergelist="${ukbb_chr_path}/mergelist.txt"
missnp="${ukbb_chr_path}/ds_${pop}.all-merge.missnp"

# Rebuild the merge list from scratch.
rm -f "${mergelist}"
for i in {1..22}; do
  echo "${ukbb_chr_path}/chr${i}_${pop}" >> "${mergelist}"
done

# Start from 0 so a successful merge is not judged by a stale value of ret.
ret=0
plink --merge-list "${mergelist}" --make-bed --out "${ukbb_chr_path}/ds_${pop}.all" || ret=$?
echo "$ret"
if [[ -f "${missnp}" ]] && ((ret != 0)); then
  echo 'fix bed files'
  for i in {1..22}; do
    chr_prefix="${ukbb_chr_path}/chr${i}_${pop}"
    # Replace the originals only when plink actually produced the fixed set.
    if plink --bfile "${chr_prefix}" --exclude "${missnp}" --make-bed --out "${chr_prefix}.fixed"; then
      mv "${chr_prefix}.fixed.fam" "${chr_prefix}.fam"
      mv "${chr_prefix}.fixed.bim" "${chr_prefix}.bim"
      mv "${chr_prefix}.fixed.bed" "${chr_prefix}.bed"
    else
      echo ""
    fi
  done

  echo "retry to merge (fixed) chrs files"
  plink --exclude "${missnp}" --merge-list "${mergelist}" --make-bed --out "${ukbb_chr_path}/ds_${pop}.all"
fi


## Subsets of 1kg

In [None]:
# For each 1kg population code, write column 1 of the matching pop.panel
# lines to pop.<code in lower case>.panel.
for a in ACB ASW BEB GBR CDX CLM ESN GWD GIH CHB CHS ITU JPT KHV LWK MSL MXL PEL PUR PJL STU YRI FIN TSI IBS CEU; do
  grep "${a}" pop.panel | cut -f 1 > "pop.${a,,}.panel"
done

In [None]:
# Keep the first 20 lines of each population panel as pop.<code>20.panel.
for a in ACB ASW BEB GBR CDX CLM ESN GWD GIH CHB CHS ITU JPT KHV LWK MSL MXL PEL PUR PJL STU YRI CEU IBS TSI FIN; do
  code=${a,,}
  head -n 20 "pop.${code}.panel" > "pop.${code}20.panel"
done

In [None]:
# Keep lines 21-90 of each population panel (the 70 lines following the
# first 20) as pop.<code>70.panel.
for a in ACB ASW BEB GBR CDX CLM ESN GWD GIH CHB CHS ITU JPT KHV LWK MSL MXL PEL PUR PJL STU YRI CEU IBS TSI FIN; do
  code=${a,,}
  tail -n +21 "pop.${code}.panel" | head -n 70 > "pop.${code}70.panel"
done

In [None]:
#######################################
# Concatenate per-population panels into one super-population panel.
# Arguments: $1 - super-population code (e.g. eur)
#            $2 - subset size suffix (e.g. 20)
#            $3.. - population codes whose pop.<code><size>.panel are joined
# Outputs:   (re)writes pop.<super><size>.panel in the current directory
#######################################
build_super_pop_panel() {
  local sp=$1 b=$2
  shift 2
  local -a parts=()
  local a
  for a in "$@"; do
    parts+=("pop.${a}${b}.panel")
  done
  # ">" truncates any previous result, so no separate rm is needed and a
  # first run does not fail on a missing file.
  cat -- "${parts[@]}" > "pop.${sp}${b}.panel"
}

for size in 20 70; do
  build_super_pop_panel eur "${size}" gbr fin tsi ibs ceu
  build_super_pop_panel afr "${size}" acb esn gwd lwk yri # asw msl
  build_super_pop_panel sas "${size}" beb gih itu pjl stu
  build_super_pop_panel eas "${size}" cdx chb chs jpt khv
  build_super_pop_panel amr "${size}" clm mxl pel pur
done



## Aggregate impute 1kg results

In [None]:
# Recode sample ids (new FID = 0, IID unchanged), then merge the imputed set
# with ../true/ds. Variants listed in the first merge's .missnp file are
# excluded before the merge is repeated.
awk 'BEGIN { OFS = "\t" } { print $1, $2, 0, $2 }' ds.fam > recoded.txt
plink --bfile ds --update-ids recoded.txt --make-bed --out ds2
plink --bfile ds2 --bmerge ../true/ds --make-bed --out ds_merged
plink --bfile ds2 \
  --exclude ds_merged-merge.missnp \
  --make-bed --out ds2_excluded
plink --bfile ds2_excluded --bmerge ../true/ds --make-bed --out ds_merged

In [None]:
# Restrict the merged set to the regions listed in the strict_mask file.
plink \
  --bfile ds_merged \
  --extract $ELKON_SCRATCH/1000G_ALL/strict_mask \
  --range \
  --make-bed \
  --out ds_merged_strict

In [None]:
# Apply genotype-missingness / MAF / HWE filters to the merged set and write
# one VCF per chromosome.
pop='1kg_sas70' # alternative: "1kg_afr2"

for a in {1..22}; do
  plink \
    --bfile "impute2_${pop}/ds_merged" \
    --chr "${a}" \
    --geno 0.1 --maf 0.01 --hwe 1e-06 \
    --recode vcf \
    --out "impute2_${pop}/ds.QC_merged_${a}"
done

In [None]:
# Re-emit each per-chromosome VCF with its samples in the order given by
# reordered_samples<idx>.
pop='afr'
idx=3

for chr in {1..22}; do
  echo "start $chr"
  vcf_dir="impute2_1kg_${pop}70"
  bcftools view -S "reordered_samples${idx}" \
    "${vcf_dir}/ds.QC_merged_${chr}.vcf" \
    > "${vcf_dir}/ds.QC_merged_reordered_${chr}.vcf"
done

In [None]:
# Per-sample concordance between original and imputed genotypes.
# For sample k (0..545) the original genotype is VCF column 10+k and the
# imputed one is column 556+k. Lines up to header_idx are skipped.
# Output: one line per sample, "matches <TAB> non-missing calls <TAB> ratio",
# where a call is non-missing when the imputed field contains no ".".
pop=eas
header_idx=12
for chr in {1..2}; do
  echo "start chr $chr"
  # One pass over the VCF scores all samples at once instead of re-reading
  # the file for every sample. Values are passed with -v rather than spliced
  # into the awk program text.
  awk -v hdr="${header_idx}" -v n_samples=546 -v first_org=10 -v first_imp=556 '
    NR > hdr {
      for (i = 0; i < n_samples; i++) {
        if (index($(first_imp + i), ".") == 0) {
          count[i]++
          if ($(first_org + i) == $(first_imp + i)) sum[i]++
        }
      }
    }
    END {
      # A sample with no usable calls still gets its own line (ratio "NA"),
      # so line k always belongs to sample k.
      for (i = 0; i < n_samples; i++)
        print (sum[i] + 0) "\t" (count[i] + 0) "\t" (count[i] ? sum[i] / count[i] : "NA")
    }
  ' "impute2_1kg_${pop}70/ds.QC_merged_reordered_${chr}.vcf" \
    > "impute2_1kg_${pop}70/chr${chr}_qc_merged_reordered_imputation_score"
done

In [None]:
# Average the per-sample imputation scores over fixed line ranges of each
# score file. Each associative array names a range: lines min..max, labelled
# by name. Output per (range, chromosome, panel):
#   name, chr, panel, mean of column 2, mean of (column 2 - column 1),
#   mean of column 3.
declare -A eur_ar=([min]=1 [max]=100 [name]=EUR)
declare -A eas_ar=([min]=101 [max]=200 [name]=EAS)
declare -A sas_ar=([min]=201 [max]=300 [name]=SAS)
declare -A afr_ar=([min]=301 [max]=400 [name]=AFR)
declare -A afr_ar2=([min]=401 [max]=600 [name]=AFR2)
chrs=(22 21)
arr=(eur_ar eas_ar sas_ar afr_ar afr_ar2)

for chr in {1..22}; do
  for idx in "${arr[@]}"; do
    # cur_ar becomes an alias (nameref) of the range array named by idx.
    declare -n cur_ar="$idx"
    for pop in eur eas afr sas; do
      score_file="impute2_1kg_${pop}70/chr${chr}_qc_merged_reordered_imputation_score"
      printf '%s\t%s\t%s\t' "${cur_ar[name]}" "$chr" "$pop"
      head -n "${cur_ar[max]}" "${score_file}" \
        | tail -n "+${cur_ar[min]}" \
        | awk '{sum0+=$2; sum+=($2-$1); sum1+=$3; count+=1;}; END{print sum0/count"\t"sum/count"\t"sum1/count}'
    done
    echo ""
  done
done > res_reordered

## Aggregate impute ajkg14-27 results

In [None]:
# Recode sample ids (new FID = 0, IID unchanged), then merge the imputed set
# with ../true/ds. Variants listed in the first merge's .missnp file are
# excluded before the merge is repeated.
awk 'BEGIN { OFS = "\t" } { print $1, $2, 0, $2 }' ds.fam > recoded.txt
plink --bfile ds --update-ids recoded.txt --make-bed --out ds2
plink --bfile ds2 --bmerge ../true/ds --make-bed --out ds_merged
plink --bfile ds2 \
  --exclude ds_merged-merge.missnp \
  --make-bed --out ds2_excluded
plink --bfile ds2_excluded --bmerge ../true/ds --make-bed --out ds_merged



In [None]:
# Apply genotype-missingness / MAF / HWE filters to the merged set and write
# one VCF per chromosome.
pop='1kg_eur-ajkg14-t101-merged' # alternative: "1kg_afr2"

for a in {1..22}; do
  plink \
    --bfile "impute2_${pop}/ds_merged" \
    --chr "${a}" \
    --geno 0.1 --maf 0.01 --hwe 1e-06 \
    --recode vcf \
    --out "impute2_${pop}/ds.QC_merged_${a}"
done

In [None]:
# Per-sample concordance between original and imputed genotypes.
# For sample k (0..26) the original genotype is VCF column 10+k and the
# imputed one is column 37+k. The first 7 lines are skipped.
# Output: one line per sample, "matches <TAB> non-missing calls <TAB> ratio",
# where a call is non-missing when the imputed field contains no ".".
pop="1kg_eur-ajkg14-t101-merged" # "ajkg14_t101" # "1kg_afr2"

for chr in {22..1}; do
  echo "start chr $chr"
  # One pass over the VCF scores all samples at once instead of re-reading
  # the file for every sample. Values are passed with -v rather than spliced
  # into the awk program text.
  awk -v hdr=7 -v n_samples=27 -v first_org=10 -v first_imp=37 '
    NR > hdr {
      for (i = 0; i < n_samples; i++) {
        if (index($(first_imp + i), ".") == 0) {
          count[i]++
          if ($(first_org + i) == $(first_imp + i)) sum[i]++
        }
      }
    }
    END {
      # A sample with no usable calls still gets its own line (ratio "NA"),
      # so line k always belongs to sample k.
      for (i = 0; i < n_samples; i++)
        print (sum[i] + 0) "\t" (count[i] + 0) "\t" (count[i] ? sum[i] / count[i] : "NA")
    }
  ' "impute2_${pop}/ds.QC_merged_${chr}.vcf" \
    > "impute2_${pop}/chr${chr}_qc_imputation_score1"
done

In [None]:
# Average the per-sample imputation scores of every panel, per chromosome.
# Output per (chromosome, panel): chr, panel, mean of column 2,
# mean of (column 2 - column 1), mean of column 3.
chrs=(1 20 22)

for chr in {1..22}; do
  for pop in ajkg14_t101 1kg_eur2 1kg_eas2 1kg_afr2 1kg_gbr2 1kg_eur-ajkg14-t101-merged; do
    printf '%s\t%s\t' "$chr" "$pop"
    awk '{sum0+=$2; sum+=($2-$1); sum1+=$3; count+=1;}; END{print sum0/count"\t"sum/count"\t"sum1/count}' \
      "impute2_${pop}/chr${chr}_qc_imputation_score1"
  done
  echo ""
done > res_reordered

## BCAC 

### generate imputed version from oxford files

In [None]:
# Build an Oxford .sample file from the sample-order list: two header lines,
# then one row per sample using column 1 as both ids, missing=0 and sex=2.
eth='euro'
array='icogs'
sample_stem="elkon_723_${eth}_${array}_sample_order"
{
  echo "ID_1 ID_2 missing sex"
  echo "0 0 0 D"
  awk '{print $1" "$1" 0 2"}' "${sample_stem}.txt"
} > "${sample_stem}.sample"

#### For Hispanic only use this block:

In [None]:
# Build an Oxford .sample file from the sample-order list: two header lines,
# then one row per sample using column 2 as both ids, missing=0 and sex=2.
eth='hispanic'
sample_stem="elkon_723_${eth}_sample_order"
{
  echo "ID_1 ID_2 missing sex"
  echo "0 0 0 D"
  awk '{print $2" "$2" 0 2"}' "${sample_stem}.txt"
} > "${sample_stem}.sample"

In [None]:
# Convert imputed Oxford-format probabilities to PLINK bed files, keeping
# only the samples listed in pop.il.panel.
target='bcac_onco_aj'
eth='euro'
array='icogs'
imp='impX'
target_path_elkon="/specific/elkon/hagailevi/PRS/datasets/dec/${target}/${imp}/"
mkdir -p "${target_path_elkon}/raw/ox_to_bed" || echo ""
for a in {22..22}; do
  plink \
    --gen "elkon_723_${eth}_${array}_imputed_probs_chr${a}.txt.gz" \
    --sample "elkon_723_${eth}_${array}_sample_order.sample" \
    --oxford-single-chr "${a}" \
    --keep pop.il.panel \
    --make-bed \
    --threads 50 \
    --out "${target_path_elkon}/raw/ox_to_bed/ds${a}"
done


In [None]:
# Convert the fixed (".fixed2") imputed Oxford-format files of chromosomes
# 16-22 to PLINK bed files.
target='bcac_icogs_euro'
eth='euro'
array='icogs'
imp='impX'
target_path_elkon="/specific/elkon/hagailevi/PRS/datasets/dec/${target}/${imp}/"
mkdir -p "${target_path_elkon}/raw/ox_to_bed" || echo ""
for a in {16..22}; do
  plink \
    --gen "elkon_723_${eth}_${array}_imputed_probs_chr${a}.fixed2.txt.gz" \
    --sample "elkon_723_${eth}_${array}_sample_order.sample" \
    --oxford-single-chr "${a}" \
    --make-bed \
    --threads 50 \
    --out "${target_path_elkon}/raw/ox_to_bed/ds${a}"
done


#### fix icogs files

In [None]:
# Prefix every line of the chr2 probabilities file with "--- " and
# re-compress the result as the ".fixed2" file.
gzip -dc elkon_723_euro_icogs_imputed_probs_chr2.txt.gz \
  | sed -e 's/^/--- /' \
  | gzip > elkon_723_euro_icogs_imputed_probs_chr2.fixed2.txt.gz

In [None]:
 # Convert the fixed Oxford-format files of chromosomes 10-15 to bed files.
 # Relies on eth, array and target_path_elkon set by an earlier cell.
 for a in {10..15}; do
   plink \
     --gen elkon_723_${eth}_${array}_imputed_probs_chr${a}.fixed2.txt.gz \
     --sample elkon_723_${eth}_${array}_sample_order.sample \
     --oxford-single-chr ${a} \
     --make-bed \
     --threads 50 \
     --out ${target_path_elkon}/raw/ox_to_bed/ds${a}
 done

#### Generate GWAS using plink2 with PCs (TBD!)

In [None]:
# Run a plink2 --glm association with the first five PCs as covariates and
# write the result under $PRS_GWASS/<target>/. Work in progress: the covariate
# file name is still a placeholder.
eth=pl
target="bcac_onco_eur-minus-${eth}"
imp="impX_new"

# Create the output directory on first use.
if [[ ! -d "$PRS_GWASS/${target}" ]]; then
    echo "Creating dir: $PRS_GWASS/${target}"
    mkdir -p "$PRS_GWASS/${target}"
fi

# TODO: replace the "ds...TBD!" placeholder with the real covariate file.
plink2 --bfile ds.QC --glm --pheno "$PRS_DATASETS/${target}/${imp}/pheno" --allow-no-sex --covar ds...TBD! --covar-name PC1,PC2,PC3,PC4,PC5 --out "$PRS_GWASS/${target}/gwas"
#plink2 --bfile ds.QC --glm --pheno $PRS_DATASETS/${target}/${imp}/pheno --allow-no-sex --covar covar_reformatted --covar-name PC1,PC2,PC3,PC4,PC5,country,history,age,er,brca12 --out gwas_covar

#### Modify BCAC's SNP id file to correct format (using vim)

In [None]:
" Ex commands to be run inside vim on the SNP id file (not shell).
" On every line, replace the first span that starts at a colon and runs up to
" the next tab (matched non-greedily) with a single tab.
:   1,$s/\:*\:.\{-}\t/\t
" Delete every line that does not contain "rs".
:   g!/rs/d

## CV

#### Create CV

In [None]:
# Cross-validation driver: for every repetition run the per-fold PRS script,
# aggregate that repetition's metrics, and finally plot over all repetitions.
folds=5
rep_start=1
rep_end=1

discovery='bcac_onco_eur-5pcs'
target='bcac_onco_aj'
imp='impX_new'
method='ls'
hp='0.2,0.5,0.9,1'
# hp="0.00000005,0.001,0.005,0.01,0.05,0.1,0.2,0.3,0.4,0.5"
stage=1

for rep in $(seq "${rep_start}" "${rep_end}"); do

  echo "start rep ${rep}..."

  # Create cv
  #python create_cv.py --rep ${rep}

  # Estimate hyperparameters on validation and test sets
  for fold in $(seq 1 "${folds}"); do
    bash "calc_prs_cv_${method}.sh" \
      --discovery "${discovery}" \
      --target "${target}" \
      --imp "${imp}" \
      --cv "${fold}_${folds}" \
      --stage "${stage}" \
      --hp "${hp}" \
      --rep "${rep}"
  done

  # Aggregate metrics
  python "aggregate_prs_statistics_cv_${method}_bcac_aj.py" --imp "${imp}" --rep "${rep}"

done

python "plot_metrics_boxplots_cv_${method}_bcac_aj.py" \
  --rep_start "${rep_start}" \
  --rep_end "${rep_end}"

#### create "both" set (train + validation)

In [None]:
# For repetitions 1-10, build the "both" phenotype file: the whole train file
# followed by the validation file without its first (header) line.
for a in {1..10}; do
  rep_dir="$PRS_DATASETS/bcac_onco_aj/rep_${a}"
  cat "${rep_dir}/pheno___1_5_train" > "${rep_dir}/pheno___5_both"
  tail -n +2 "${rep_dir}/pheno___1_5_validation" >> "${rep_dir}/pheno___5_both"
done

In [None]:

# LD-clump ds.QC against the discovery GWAS and keep the index SNP ids.
discovery_path=$PRS_GWASS/bcac_onco_eur-5pcs/

# --exclude is always passed below, so make sure the file exists.
[[ -f ds.dupvar ]] || touch ds.dupvar

echo clumping
plink \
  --bfile ds.QC \
  --clump-p1 1 \
  --clump-p2 1 \
  --clump-r2 0.2 \
  --clump-kb 500 \
  --clump "${discovery_path}gwas.QC.Transformed" \
  --clump-snp-field SNP \
  --clump-field P \
  --out eur \
  --exclude ds.dupvar

echo filter by clumping
# Column 3 of every line but the first of eur.clumped.
awk 'NR!=1{print $3}' eur.clumped > eur.valid.snp

#### create "both" datasets

In [None]:
# For repetitions 71-110, build the "both" phenotype file: the whole train
# file followed by the validation file without its first (header) line.
for a in {71..110}; do
  both_file="rep_${a}/pheno___5_both"
  cat "rep_${a}/pheno___1_5_train" > "${both_file}"
  tail -n +2 "rep_${a}/pheno___1_5_validation" >> "${both_file}"
done

In [None]:
# Prepare CV datasets for repetitions 71-110: the five per-fold splits
# (--cv <fold>_5) followed by the "--cv 5" set.
# A single contiguous range processes each repetition exactly once; the
# earlier batch ranges overlapped at 100 and 105.
for b in {71..110}; do
  for a in {1..5}; do
    bash prepare_cv_datasets.sh --discovery test --target bcac_onco_aj --imp impX_new --cv "${a}_5" --rep "${b}"
  done
  bash prepare_cv_datasets.sh --discovery test --target bcac_onco_aj --imp impX_new --cv 5 --rep "${b}"
done


In [None]:
# Run the CV and CV-test scripts per repetition: repetition 72 with method
# "ls", repetitions 76-110 with method "pt".
# The "pt" range is contiguous so each repetition runs exactly once; the
# earlier batch ranges overlapped at 100 and 105.
for b in {72..72}; do
  bash run_cv_single_rep_bcac_aj.sh --rep "${b}" --method ls
  bash run_cv_single_rep_test_bcac_aj.sh --rep "${b}" --method ls --stage 1
done
for b in {76..110}; do
  bash run_cv_single_rep_bcac_aj.sh --rep "${b}" --method pt
  bash run_cv_single_rep_test_bcac_aj.sh --rep "${b}" --method pt --stage 1
done


In [None]:
# Keep only the variants listed in 313.valid.snp and set A1 alleles from
# 313_rsids.tsv (allele in column 5, variant id in column 3).
plink \
  --bfile ds0 \
  --extract $PRS_GWASS/bca_313/313.valid.snp \
  --a1-allele $PRS_GWASS/bca_313/313_rsids.tsv 5 3 \
  --make-bed \
  --out ds

## liftover

In [None]:
# Lift the PLINK map over using the hg38ToHg19 chain file.
# NOTE(review): --map reads from the "humc" dataset while --out writes under
# "hadassa", and every later command uses bare relative names (lifted.map,
# lifted.bed.unlifted, ds38) - confirm the intended dataset and working directory.
python $PRS_TOOLS/liftOverPlink/liftOverPlink.py --map $PRS_DATASETS/humc/original/raw/liftover/ds38.map --out $PRS_DATASETS/hadassa/original/raw/liftover/lifted --chain $PRS_TOOLS/liftOverPlink/hg38ToHg19.over.chain
# Run rmBadLifts on the lifted map; it is given good_lifted.map as output and bad_lifted.dat as log.
python $PRS_TOOLS/liftOverPlink/rmBadLifts.py --map lifted.map --out good_lifted.map --log bad_lifted.dat
# Collect ids to exclude: column 2 of bad_lifted.dat plus column 4 of the
# non-comment lines of lifted.bed.unlifted.
cut -f 2 bad_lifted.dat > to_exclude.dat
cut -f 4 lifted.bed.unlifted | sed "/^#/d" >> to_exclude.dat
# Re-export ds38 without the excluded variants, then pair the filtered .ped
# with the lifted map to produce the final file set.
plink --file ds38 --recode --out lifted_filtered --exclude to_exclude.dat --allow-extra-chr
plink --ped lifted_filtered.ped --map good_lifted.map --recode --out final


## convert vcf to hg19 using annotations

#### copy headers

In [None]:
# Retrieve first data lines
a=$(awk '{if(substr($0,1,1)!="#"){print NR; exit;}}' humc19.vcf)
# Copy headers
head -n $(($a-1)) humc19.vcf > humc19_reformatted.vcf

####  reformat data

In [None]:
# Rewrite every data record with its hg19 coordinates: columns 1 and 2 are
# replaced by the hg19_chr / hg19_position values found in the INFO field
# (column 8); all remaining columns are copied unchanged. Records lacking
# either annotation are dropped. Output is appended after the header that is
# already in humc19_reformatted.vcf.
awk '
  # Header lines are skipped by their "#" prefix, not by a fixed line count.
  /^#/ { next }
  {
    n = split($8, info, ";")
    # Reset per record so a value from an earlier line is never reused.
    hg19_chr = ""; hg19_pos = ""; has_chr = 0; has_pos = 0
    for (i = 1; i <= n; i++) {
      if (info[i] ~ /hg19_chr/) { split(info[i], kv, "="); hg19_chr = kv[2]; has_chr = 1 }
      if (info[i] ~ /hg19_position/) { split(info[i], kv, "="); hg19_pos = kv[2]; has_pos = 1 }
    }
    if (has_chr && has_pos) {
      # Data always goes through "%s": a literal "%" in a field must not be
      # interpreted as a printf format directive.
      printf "%s\t%s", hg19_chr, hg19_pos
      # Copy columns 3..NF, i.e. however many sample columns the file has.
      for (i = 3; i <= NF; i++) printf "\t%s", $i
      printf "\n"
    }
  }
' humc19.vcf >> humc19_reformatted.vcf

#### Convert to bed files

In [None]:
# Import the reformatted VCF into PLINK bed format. Spaces in sample ids are
# converted to "$", and each sample id is used as both FID and IID.
plink \
  --vcf humc19_reformatted.vcf \
  --vcf-idspace-to '$' \
  --double-id \
  --make-bed \
  --allow-extra-chr \
  --out ../ds

In [None]:
# For every method and sub-repetition <base_rep>_<1..6>: aggregate the PRS
# statistics, then draw the box plots (default metric, then or_99). Each
# three-step chain runs as one background job.
discoveries='bcac_onco_eur-5pcs'
targets='bcac_onco_aj'
imps='impX_new'
methods='pt3,pt2,ls,ld'
methods=${methods//,/ } # comma list -> space list, split by the loop below
base_rep=105
suffix='bcac_aj2'

## aggregate_prs_statistics
for method in ${methods}; do
  for cur_rep in {1..6}; do
    rep_id="${base_rep}_${cur_rep}"
    echo "start aggreagating method ${method} rep ${rep_id}"
    python "aggregate_prs_statistics_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep "${rep_id}" --suffix "${suffix}" \
      && python "plot_metrics_boxplots_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep_start "${rep_id}" --rep_end "${rep_id}" --suffix "${suffix}" \
      && python "plot_metrics_boxplots_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep_start "${rep_id}" --rep_end "${rep_id}" --metric or_99 --suffix "${suffix}" &
  done
done


# ## for cur_rep in {1..6}; do 
#     python aggregate_prs_statistics_cv_${method}.py --discoveries PGC2_noAJ --targets dbg-scz19 --imps impute2_ajkg14_t101,impute2_1kg_eur2,impute2_1kg_eas2,impute2_1kg_afr2,impute2_1kg_gbr2,impute2_1kg_ceu2,impute2_1kg_tsi2,impute2_1kg_ibs2,impute2_1kg_fin2 --rep ${base_rep}_${cur_rep};
# done;
 

In [None]:
# For every method and sub-repetition <base_rep>_<1..5>: aggregate the PRS
# statistics, then draw the box plots (default metric, then or_99). Each
# three-step chain runs as one background job.
discoveries='bcac_onco_eur-minus-uk-5pcs'
targets='bcac_onco_uk_6000'
imps='impX_new'
methods='ls,pt'
methods=${methods//,/ } # comma list -> space list, split by the loop below
base_rep=105
suffix='bcac_minus_uk_uk'

## aggregate_prs_statistics
for method in ${methods}; do
  for cur_rep in {1..5}; do
    rep_id="${base_rep}_${cur_rep}"
    echo "start aggreagating method ${method} rep ${rep_id}"
    python "aggregate_prs_statistics_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep "${rep_id}" --suffix "${suffix}" \
      && python "plot_metrics_boxplots_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep_start "${rep_id}" --rep_end "${rep_id}" --suffix "${suffix}" \
      && python "plot_metrics_boxplots_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep_start "${rep_id}" --rep_end "${rep_id}" --metric or_99 --suffix "${suffix}" &
  done
done


# ## for cur_rep in {1..6}; do 
#     python aggregate_prs_statistics_cv_${method}.py --discoveries PGC2_noAJ --targets dbg-scz19 --imps impute2_ajkg14_t101,impute2_1kg_eur2,impute2_1kg_eas2,impute2_1kg_afr2,impute2_1kg_gbr2,impute2_1kg_ceu2,impute2_1kg_tsi2,impute2_1kg_ibs2,impute2_1kg_fin2 --rep ${base_rep}_${cur_rep};
# done;
 

In [None]:
# Aggregate the CV PRS statistics and draw the metric boxplots for every
# (method, repetition) pair of one discovery/target/imputation combination.
discoveries="bcac_onco_eur-minus-uk-5pcs"
targets="bcac_onco_aj"
imps="impX_new"
methods="pt3,ls"
# Split the comma-separated list into a real array so the loop below does not
# depend on unquoted word splitting of a scalar.
IFS=',' read -r -a methods <<< "${methods}"
base_rep=105
suffix="bcac_minus_uk_aj"

## aggregate_prs_statistics
for method in "${methods[@]}"; do
    for cur_rep in {1..6}; do
        rep="${base_rep}_${cur_rep}"
        echo "start aggreagating method ${method} rep ${rep}"
        # One background job per (method, rep): the two plotting calls run only
        # if the preceding step succeeded. Jobs are not waited for here.
        python "aggregate_prs_statistics_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep "${rep}" --suffix "${suffix}" &&
            python "plot_metrics_boxplots_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep_start "${rep}" --rep_end "${rep}" --suffix "${suffix}" &&
            python "plot_metrics_boxplots_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep_start "${rep}" --rep_end "${rep}" --metric or_99 --suffix "${suffix}" &
    done
done


# ## for cur_rep in {1..6}; do 
#     python aggregate_prs_statistics_cv_${method}.py --discoveries PGC2_noAJ --targets dbg-scz19 --imps impute2_ajkg14_t101,impute2_1kg_eur2,impute2_1kg_eas2,impute2_1kg_afr2,impute2_1kg_gbr2,impute2_1kg_ceu2,impute2_1kg_tsi2,impute2_1kg_ibs2,impute2_1kg_fin2 --rep ${base_rep}_${cur_rep};
# done;
 

#### aggregate bcac_onco_eur-minus-pl

In [None]:
# Aggregate the CV PRS statistics and draw the metric boxplots for every
# (method, repetition) pair of one discovery/target/imputation combination.
discoveries="bcac_onco_eur-minus-pl"
targets="bcac_onco_aj"
imps="impX_new"
methods="pt2,pt3,ls,ld"
# Split the comma-separated list into a real array so the loop below does not
# depend on unquoted word splitting of a scalar.
IFS=',' read -r -a methods <<< "${methods}"
base_rep=105
suffix="bcac_minus_pl_aj"

## aggregate_prs_statistics
for method in "${methods[@]}"; do
    for cur_rep in {1..6}; do
        rep="${base_rep}_${cur_rep}"
        echo "start aggreagating method ${method} rep ${rep}"
        # One background job per (method, rep): the two plotting calls run only
        # if the preceding step succeeded. Jobs are not waited for here.
        python "aggregate_prs_statistics_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep "${rep}" --suffix "${suffix}" &&
            python "plot_metrics_boxplots_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep_start "${rep}" --rep_end "${rep}" --suffix "${suffix}" &&
            python "plot_metrics_boxplots_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep_start "${rep}" --rep_end "${rep}" --metric or_99 --suffix "${suffix}" &
    done
done


# ## for cur_rep in {1..6}; do 
#     python aggregate_prs_statistics_cv_${method}.py --discoveries PGC2_noAJ --targets dbg-scz19 --imps impute2_ajkg14_t101,impute2_1kg_eur2,impute2_1kg_eas2,impute2_1kg_afr2,impute2_1kg_gbr2,impute2_1kg_ceu2,impute2_1kg_tsi2,impute2_1kg_ibs2,impute2_1kg_fin2 --rep ${base_rep}_${cur_rep};
# done;
 

In [None]:
# Aggregate the CV PRS statistics and draw the metric boxplots for every
# (method, repetition) pair of one discovery/target/imputation combination.
discoveries="bcac_onco_eur-minus-pl"
targets="bcac_onco_aj"
imps="impX_new"
methods="pt3,pt2,ls"
# Split the comma-separated list into a real array so the loop below does not
# depend on unquoted word splitting of a scalar.
IFS=',' read -r -a methods <<< "${methods}"
base_rep=105
suffix="bcac_minus_pl_aj"

## aggregate_prs_statistics
for method in "${methods[@]}"; do
    for cur_rep in {1..6}; do
        rep="${base_rep}_${cur_rep}"
        echo "start aggreagating method ${method} rep ${rep}"
        # One background job per (method, rep): the two plotting calls run only
        # if the preceding step succeeded. Jobs are not waited for here.
        python "aggregate_prs_statistics_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep "${rep}" --suffix "${suffix}" &&
            python "plot_metrics_boxplots_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep_start "${rep}" --rep_end "${rep}" --suffix "${suffix}" &&
            python "plot_metrics_boxplots_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep_start "${rep}" --rep_end "${rep}" --metric or_99 --suffix "${suffix}" &
    done
done


# ## for cur_rep in {1..6}; do 
#     python aggregate_prs_statistics_cv_${method}.py --discoveries PGC2_noAJ --targets dbg-scz19 --imps impute2_ajkg14_t101,impute2_1kg_eur2,impute2_1kg_eas2,impute2_1kg_afr2,impute2_1kg_gbr2,impute2_1kg_ceu2,impute2_1kg_tsi2,impute2_1kg_ibs2,impute2_1kg_fin2 --rep ${base_rep}_${cur_rep};
# done;
 

# Imputation: revised

### prepare datasets

#### single rep

In [None]:
# Prepare the cross-validation datasets of a single repetition: one
# prepare_cv_datasets.sh run per fold ("--cv <fold>_<n_folds>"), followed by
# one run with "--cv <n_folds>".
discovery="PGC2_noAJ"
target="dbg-scz19"
imp="impute2_ajkg14_t101"
base_rep="105"
cur_rep="2"
rep="${base_rep}_${cur_rep}"
n_folds=5
for ((fold = 1; fold <= n_folds; fold++)); do
    bash prepare_cv_datasets.sh --discovery "${discovery}" --target "${target}" --imp "${imp}" --rep "${rep}" --cv "${fold}_${n_folds}"
done
# Pass n_folds explicitly instead of whatever value the loop variable happens
# to hold once the loop has finished.
bash prepare_cv_datasets.sh --discovery "${discovery}" --target "${target}" --imp "${imp}" --rep "${rep}" --cv "${n_folds}"

#### multiple reps

In [None]:
# Prepare the cross-validation datasets of repetitions 1..6: first every
# per-fold dataset ("--cv <fold>_<n_folds>") of every repetition, then one
# "--cv <n_folds>" run per repetition.
discovery="PGC2_noAJ"
target="dbg-scz19"
imp="impute2_1kg_afr2"
base_rep="105"
n_folds=5
# Each "for" needs its own "do"; without it bash rejects the nested loop.
for cur_rep in {1..6}; do
    for ((cur_fold = 1; cur_fold <= n_folds; cur_fold++)); do
        bash prepare_cv_datasets.sh --discovery "${discovery}" --target "${target}" --imp "${imp}" --rep "${base_rep}_${cur_rep}" --cv "${cur_fold}_${n_folds}"
    done
done
for cur_rep in {1..6}; do
    bash prepare_cv_datasets.sh --discovery "${discovery}" --target "${target}" --imp "${imp}" --rep "${base_rep}_${cur_rep}" --cv "${n_folds}"
done

#### multiple imps (UKB)

In [None]:
# Prepare the cross-validation datasets (phenotype "ht") for several
# imputation panels: for each population in imp_pops, every per-fold dataset
# of repetitions 1..6, then one "--cv <n_folds>" run per repetition.
discovery="PGC2_noAJ"
target="dbg-scz19"
imp_pops="eur,sas,gbr"
IFS=',' read -r -a imp_pops <<< "${imp_pops}"
base_rep="105"
n_folds=5

# Iterate over the array elements; a bare "imp_pops" would loop once over the
# literal word and request the panel "impute2_1kg_imp_pops".
for imp_pop in "${imp_pops[@]}"; do
    for cur_rep in {1..6}; do
        for ((cur_fold = 1; cur_fold <= n_folds; cur_fold++)); do
            bash prepare_cv_datasets.sh --discovery "${discovery}" --target "${target}" --imp "impute2_1kg_${imp_pop}" --rep "${base_rep}_${cur_rep}" --cv "${cur_fold}_${n_folds}" --pheno ht
        done
    done
done
for imp_pop in "${imp_pops[@]}"; do
    for cur_rep in {1..6}; do
        bash prepare_cv_datasets.sh --discovery "${discovery}" --target "${target}" --imp "impute2_1kg_${imp_pop}" --rep "${base_rep}_${cur_rep}" --cv "${n_folds}" --pheno ht
    done
done


### run PRS method on rep

In [None]:
# For repetitions 1..6: run run_cv_single_rep.sh starting at fold 2 and, only
# when it succeeds, run_cv_test_single_rep.sh for the same repetition.
discovery="PGC2_noAJ"
target="dbg-scz19"
imp="impute2_ajkg14_t101" # "impute2_1kg_ceu2"
method="ld"
base_rep=105

for cur_rep in {1..6}; do
    rep="${base_rep}_${cur_rep}"
    bash run_cv_single_rep.sh \
        --discovery "${discovery}" --target "${target}" --imp "${imp}" \
        --method "${method}" --rep "${rep}" --start_fold 2 \
        && bash run_cv_test_single_rep.sh \
            --discovery "${discovery}" --target "${target}" --imp "${imp}" \
            --method "${method}" --rep "${rep}"
done

## decomposed into separated commands:
# for cur_rep in {1..6}; do bash run_cv_single_rep.sh --discovery ${discovery} --target ${target} --imp ${imp} --method pt2 --rep ${base_rep}_${cur_rep};  done
# for cur_rep in {1..6}; do bash run_cv_test_single_rep.sh --discovery ${discovery} --target ${target} --imp ${imp} --method pt2 --rep ${base_rep}_${cur_rep}; done


In [None]:
# Re-run calc_prs_cv_${method}.sh for every fold (and for the test set) of
# repetitions 1..6 whose odds-ratio summary files are missing.
discovery="PGC2_noAJ"
target="dbg-scz19"
imp="impute2_1kg_afr2" # "impute2_1kg_ceu2"
method="ld"
base_rep=105
stage=1
folds=5


for cur_rep in {1..6}; do
    rep="${base_rep}_${cur_rep}"
    # Build the result directory from the variables above instead of repeating
    # the discovery/target/method literals in every path.
    rep_dir="/specific/netapp5/gaga/gaga-pd/prs_data/PRSs//${discovery}_${target}/${imp}/rep_${rep}"
    for ((fold = 1; fold <= folds; fold++)); do
        # ls fails when nothing matches; its error is discarded on purpose so
        # that the count is simply 0.
        res=$(ls -1 "${rep_dir}/prs.cv.${method}___${fold}_${folds}_validation.or.summary."*".tsv" 2>/dev/null | wc -l)
        echo "res validation: ${res}"
        if [[ ${res} -eq 0 ]]; then
            bash "calc_prs_cv_${method}.sh" --discovery "${discovery}" --target "${target}" --imp "${imp}" --cv "${fold}_${folds}" --stage "${stage}" --rep "${rep}"
        fi
    done
    res=$(ls -1 "${rep_dir}/prs.cv.${method}___${folds}_test.or.summary."*".tsv" 2>/dev/null | wc -l)
    echo "res test: ${res}"
    if [[ ${res} -eq 0 ]]; then
        bash "calc_prs_cv_${method}.sh" --discovery "${discovery}" --target "${target}" --imp "${imp}" --cv "${folds}" --stage "${stage}" --rep "${rep}"
    fi
done



### Aggregate results (SCZ)

In [None]:
# Draw the or_99 boxplots for every (method, repetition) pair of the SCZ
# analysis, one after the other. The aggregation and the plot without
# --metric are currently commented out.
discoveries="PGC2_noAJ"
targets="dbg-scz19"
imps="impute2_1kg_eur-ajkg14-t101-merged,impute2_ajkg14_t101,impute2_1kg_eur2,impute2_1kg_eas2,impute2_1kg_afr2,impute2_1kg_gbr2,impute2_1kg_ceu2,impute2_1kg_tsi2,impute2_1kg_ibs2,impute2_1kg_fin2"
methods="pt3,pt2,ls,ld"
# Split the comma-separated list into a real array so the loop below does not
# depend on unquoted word splitting of a scalar.
IFS=',' read -r -a methods <<< "${methods}"
base_rep=105
suffix="scz"

## aggregate_prs_statistics
for method in "${methods[@]}"; do
    for cur_rep in {1..6}; do
        rep="${base_rep}_${cur_rep}"
        echo "start aggreagating method ${method} rep ${rep}"
        # python aggregate_prs_statistics_cv_${method}.py --discoveries ${discoveries} --targets ${targets} --imps ${imps} --rep ${base_rep}_${cur_rep} --suffix ${suffix} && \
        # python plot_metrics_boxplots_cv_${method}.py --discoveries ${discoveries} --targets ${targets} --imps ${imps} --rep_start ${base_rep}_${cur_rep} --rep_end ${base_rep}_${cur_rep} --suffix ${suffix} & # && \
        python "plot_metrics_boxplots_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep_start "${rep}" --rep_end "${rep}" --metric or_99 --suffix "${suffix}"
    done
done


# ## for cur_rep in {1..6}; do 
#     python aggregate_prs_statistics_cv_${method}.py --discoveries PGC2_noAJ --targets dbg-scz19 --imps impute2_ajkg14_t101,impute2_1kg_eur2,impute2_1kg_eas2,impute2_1kg_afr2,impute2_1kg_gbr2,impute2_1kg_ceu2,impute2_1kg_tsi2,impute2_1kg_ibs2,impute2_1kg_fin2 --rep ${base_rep}_${cur_rep};
# done;
 

In [None]:
# Draw, for repetitions 1..6, the boxplots of method "ls" twice: once without
# --metric and once with --metric or_99.
discoveries="PGC2_noAJ"
targets="dbg-scz19"
imps="impute2_ajkg14_t101,impute2_1kg_eur2,impute2_1kg_eas2,impute2_1kg_afr2,impute2_1kg_gbr2,impute2_1kg_tsi2,impute2_1kg_ibs2" # impute2_1kg_ceu2,impute2_1kg_fin2,
method="ls"
base_rep=105

for cur_rep in {1..6}; do
    rep="${base_rep}_${cur_rep}"
    # Arguments shared by both plotting calls of this repetition.
    plot_args=(--discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep_start "${rep}" --rep_end "${rep}")
    python "plot_metrics_boxplots_cv_${method}.py" "${plot_args[@]}"
    python "plot_metrics_boxplots_cv_${method}.py" "${plot_args[@]}" --metric or_99
    # python plot_metrics_boxplots_cv_${method}.py --discoveries ${discoveries} --targets ${targets} --imps ${imps} --rep_start ${base_rep}_${cur_rep} --rep_end ${base_rep}_${cur_rep} --metric or_99;
done


### Run all folds UKB (recovery from latest stage)

In [None]:
# Recover missing UKB cross-validation outputs: for every
# discovery/target/imputation/repetition, count the files each stage has
# produced for each fold and for the test set, and restart
# calc_prs_cv_${method}.sh from the earliest stage whose count is below its
# threshold. "counter" ends up as the number of folds/test sets found
# incomplete.
# discoveries="D2_sysp_evangelou_2018,D2_dias_evangelou_2018,D2_asth_zhu_2019,D2_chol_willer_2013,D2_ldlp_willer_2013,D2_t2di_mahajan_2018,D2_gerx_an_2019,D2_madd_howard_2019" # "D2_sysp_evangelou_2018" D2_hdlp_willer_2013
discoveries="UKB_ht_eur,UKB_chol_eur,UKB_hfvr_eur,UKB_hyty_eur,UKB_madd_eur,UKB_osar_eur,UKB_t2d_eur,UKB_utfi_eur,UKB_gerx_eur,UKB_angna_eur,UKB_ast_eur,UKB_ctrt_eur"
# The comma-separated lists are split into real arrays so the loops below do
# not depend on unquoted word splitting of scalars.
IFS=',' read -r -a discoveries <<< "${discoveries}"
targets="ukbb_afr"
IFS=',' read -r -a targets <<< "${targets}"
imps="original" # "impute2_1kg_gbr" # ,impute2_1kg_afr,impute2_1kg_gbr" # "impute2_1kg_eur,impute2_1kg_afr,impute2_1kg_gbr,impute2_1kg_sas" # "impute2_1kg_ceu2"
IFS=',' read -r -a imps <<< "${imps}"
method="ls"
base_rep=105
folds=5
# When this is not "true" nothing is re-run; incomplete items are only counted.
complete_missing=true


# declare -A gwas_to_pheno=(['D2_hght_yengo_2018']='height' ['D2_ldlp_willer_2013']='chol' ['D2_hdlp_willer_2013']='chol' ['D2_chol_willer_2013']='chol'  ['D2_sysp_evangelou_2018']='ht' ['D2_dias_evangelou_2018']='ht' ['D2_asth_zhu_2019']='ast' ['D2_t2di_mahajan_2018']='t2d' ['D2_madd_howard_2019']='madd' ['D2_gerx_an_2019']='gerx') # ['D2_hdlp_willer_2013']='chol' ['D_t2d_mahajan_2018']='t2d' 
declare -A gwas_to_pheno=(['UKB_ht_eur']='ht' ['UKB_chol_eur']='chol' ['UKB_hfvr_eur']='hfvr' ['UKB_hyty_eur']='hyty'  ['UKB_madd_eur']='madd' ['UKB_osar_eur']='osar' ['UKB_t2d_eur']='t2d' ['UKB_utfi_eur']='utfi' ['UKB_gerx_eur']='gerx' ['UKB_angna_eur']='angna' ['UKB_ast_eur']='ast' ['UKB_ctrt_eur']='ctrt')

counter=0
min_n_profiles=80
min_n_profiles_res3=10

for discovery in "${discoveries[@]}"; do
    pheno=${gwas_to_pheno[${discovery}]}
    for target in "${targets[@]}"; do
        for imp in "${imps[@]}"; do
            for cur_rep in {1..6}; do
                rep="${base_rep}_${cur_rep}"
                rep_dir="/specific/netapp5/gaga/gaga-pd/prs_data/PRSs//${discovery}_${target}/${imp}/rep_${rep}"

                # --- validation part of every fold ---
                for ((fold = 1; fold <= folds; fold++)); do
                    cv="${fold}_${folds}"
                    # ls fails when nothing matches; its error is discarded on
                    # purpose so that the count is simply 0.
                    res1=$(ls -1 "${rep_dir}/lasso/prs.cv.${method}___${cv}_train."*".weights" 2>/dev/null | wc -l)
                    res2=$(ls -1 "${rep_dir}/prs.cv.${method}_${pheno}__${cv}_validation."*".profile" 2>/dev/null | wc -l)
                    res3=$(ls -1 "${rep_dir}/prs.cv.${method}_${pheno}__${cv}_validation.or.summary.1-"*".tsv" 2>/dev/null | wc -l)
                    # Later checks overwrite earlier ones, so the lowest
                    # incomplete stage wins; -1 means nothing is missing.
                    stage=-1
                    if [[ ${res3} -lt ${min_n_profiles_res3} ]]; then
                        stage=3
                    fi
                    if [[ ${res2} -lt ${min_n_profiles} ]]; then
                        stage=2
                    fi
                    if [[ ${res1} -lt ${min_n_profiles} ]]; then
                        stage=1
                    fi
                    echo "Discovery: ${discovery}, Target: ${target}, imp: ${imp}, rep: ${cur_rep}, fold: ${fold}: res validation: stage 1 - ${res1}, stage 2 - ${res2}, stage 3 - ${res3}"
                    if [[ ${stage} -ne -1 ]]; then
                        if [[ ${complete_missing} = true ]]; then
                            bash "calc_prs_cv_${method}.sh" --discovery "${discovery}" --target "${target}" --imp "${imp}" --cv "${cv}" --rep "${rep}" --pheno "${pheno}" --stage "${stage}"
                        fi
                        counter=$((counter + 1))
                    fi
                done

                # --- test set of the repetition ---
                res1=$(ls -1 "${rep_dir}/lasso/prs.cv.${method}___${folds}_both."*".weights" 2>/dev/null | wc -l)
                res2=$(ls -1 "${rep_dir}/prs.cv.${method}_${pheno}__${folds}_test."*".profile" 2>/dev/null | wc -l)
                res3=$(ls -1 "${rep_dir}/prs.cv.${method}_${pheno}__${folds}_test.or.summary.1-"*".tsv" 2>/dev/null | wc -l)
                # Report the fold count explicitly rather than the value the
                # fold loop variable holds after the loop.
                echo "Discovery: ${discovery}, Target: ${target}, imp: ${imp}, rep: ${cur_rep}, fold: ${folds}: res test: stage 1 - ${res1}, stage 2 - ${res2}, stage 3 - ${res3}"
                stage=-1
                if [[ ${res3} -lt ${min_n_profiles_res3} ]]; then
                    stage=3
                fi
                if [[ ${res2} -lt ${min_n_profiles} ]]; then
                    stage=2
                fi
                if [[ ${res1} -lt ${min_n_profiles} ]]; then
                    stage=1
                fi
                if [[ ${stage} -ne -1 ]]; then
                    if [[ ${complete_missing} = true ]]; then
                        bash "calc_prs_cv_${method}.sh" --discovery "${discovery}" --target "${target}" --imp "${imp}" --cv "${folds}" --rep "${rep}" --pheno "${pheno}" --stage "${stage}"
                    fi
                    counter=$((counter + 1))
                fi
            done
        done
    done
done

echo "counter: ${counter}"


In [None]:
# Recover missing cross-validation outputs of method "pt2": for every
# discovery/target/imputation/repetition, count the profile and odds-ratio
# summary files of each fold and of the test set, and restart
# calc_prs_cv_${method}.sh at stage 1 (profiles missing) or stage 3 (only
# summaries missing). "counter" ends up as the number of folds/test sets found
# incomplete.
pop="sas"
discoveries="D2_sysp_evangelou_2018" # "D2_sysp_evangelou_2018,D2_dias_evangelou_2018,D2_asth_zhu_2019,D2_chol_willer_2013,D2_ldlp_willer_2013,D2_t2di_mahajan_2018,D2_gerx_an_2019,D2_madd_howard_2019" # "D2_sysp_evangelou_2018,D2_dias_evangelou_2018,D2_asth_zhu_2019,D2_chol_willer_2013,D2_hdlp_willer_2013,D2_ldlp_willer_2013,D2_t2di_mahajan_2018,D2_gerx_an_2019,D2_madd_howard_2019" # "D2_sysp_evangelou_2018" D2_hdlp_willer_2013
# discoveries="UKB_ht_eur,UKB_chol_eur,UKB_hfvr_eur,UKB_hyty_eur,UKB_madd_eur,UKB_osar_eur,UKB_t2d_eur,UKB_utfi_eur,UKB_gerx_eur,UKB_angna_eur,UKB_ast_eur,UKB_ctrt_eur"
# The comma-separated lists are split into real arrays so the loops below do
# not depend on unquoted word splitting of scalars.
IFS=',' read -r -a discoveries <<< "${discoveries}"
targets="ukbb_sas,ukbb_afr" # "ukbb_${pop}"
IFS=',' read -r -a targets <<< "${targets}"
imps="impX_new" # "impute2_1kg_gbr,impute2_1kg_eur,impute2_1kg_${pop}" # ,impute2_1kg_afr" # ,impute2_1kg_sas "impute2_1kg_ceu2"
IFS=',' read -r -a imps <<< "${imps}"
method="pt2"
base_rep=105
folds=5
# When this is not "true" nothing is re-run; incomplete items are only
# reported and counted.
complete_missing=true


declare -A gwas_to_pheno=(['D2_hght_yengo_2018']='height' ['D2_ldlp_willer_2013']='chol' ['D2_hdlp_willer_2013']='chol' ['D2_chol_willer_2013']='chol'  ['D2_sysp_evangelou_2018']='ht' ['D2_dias_evangelou_2018']='ht' ['D2_asth_zhu_2019']='ast' ['D2_t2di_mahajan_2018']='t2d' ['D2_madd_howard_2019']='madd' ['D2_gerx_an_2019']='gerx') # ['D2_hdlp_willer_2013']='chol' ['D_t2d_mahajan_2018']='t2d' 
# declare -A gwas_to_pheno=(['UKB_ht_eur']='ht' ['UKB_chol_eur']='chol' ['UKB_hfvr_eur']='hfvr' ['UKB_hyty_eur']='hyty'  ['UKB_madd_eur']='madd' ['UKB_osar_eur']='osar' ['UKB_t2d_eur']='t2d' ['UKB_utfi_eur']='utfi' ['UKB_gerx_eur']='gerx' ['UKB_angna_eur']='angna' ['UKB_ast_eur']='ast' ['UKB_ctrt_eur']='ctrt') 

counter=0
min_n_profiles=9

for discovery in "${discoveries[@]}"; do
    pheno=${gwas_to_pheno[${discovery}]}
    for target in "${targets[@]}"; do
        for imp in "${imps[@]}"; do
            for cur_rep in {1..6}; do
                rep="${base_rep}_${cur_rep}"
                rep_dir="/specific/netapp5/gaga/gaga-pd/prs_data/PRSs//${discovery}_${target}/${imp}/rep_${rep}"

                # --- validation part of every fold ---
                for ((fold = 1; fold <= folds; fold++)); do
                    cv="${fold}_${folds}"
                    # ls fails when nothing matches; its error is discarded on
                    # purpose so that the count is simply 0.
                    res1=$(ls -1 "${rep_dir}/prs.cv.${method}_${pheno}__${cv}_validation."*".profile" 2>/dev/null | wc -l)
                    res3=$(ls -1 "${rep_dir}/prs.cv.${method}_${pheno}__${cv}_validation.or.summary."*".tsv" 2>/dev/null | wc -l)
                    # The stage-1 check overrides the stage-3 one; -1 means
                    # nothing is missing.
                    stage=-1
                    if [[ ${res3} -lt ${min_n_profiles} ]]; then
                        stage=3
                    fi
                    if [[ ${res1} -lt ${min_n_profiles} ]]; then
                        stage=1
                    fi
                    # echo "Discovery: ${discovery}, Target: ${target}, imp: ${imp}, rep: ${cur_rep}, fold: ${fold}: res validation: stage 1 - ${res1}, stage 3 - ${res3}" 
                    if [[ ${stage} -ne -1 ]]; then
                        if [[ ${complete_missing} = true ]]; then
                            bash "calc_prs_cv_${method}.sh" --discovery "${discovery}" --target "${target}" --imp "${imp}" --cv "${cv}" --rep "${rep}" --pheno "${pheno}" --stage "${stage}"
                        fi
                        echo "Discovery: ${discovery}, Target: ${target}, imp: ${imp}, rep: ${cur_rep}, fold: ${fold}: res validation: stage 1 - ${res1}, stage 3 - ${res3}"
                        counter=$((counter + 1))
                    fi
                done

                # --- test set of the repetition ---
                # The test files are keyed by the fold count; use ${folds}
                # rather than the value the fold loop variable holds after the
                # loop.
                res1=$(ls -1 "${rep_dir}/prs.cv.${method}_${pheno}__${folds}_test."*".profile" 2>/dev/null | wc -l)
                res3=$(ls -1 "${rep_dir}/prs.cv.${method}_${pheno}__${folds}_test.or.summary."*".tsv" 2>/dev/null | wc -l)
                # echo "Discovery: ${discovery}, Target: ${target}, imp: ${imp}, rep: ${cur_rep}, fold: ${fold}: res test: stage 1 - ${res1}, stage 3 - ${res3}"
                stage=-1
                if [[ ${res3} -lt ${min_n_profiles} ]]; then
                    stage=3
                fi
                if [[ ${res1} -lt ${min_n_profiles} ]]; then
                    stage=1
                fi
                if [[ ${stage} -ne -1 ]]; then
                    if [[ ${complete_missing} = true ]]; then
                        bash "calc_prs_cv_${method}.sh" --discovery "${discovery}" --target "${target}" --imp "${imp}" --cv "${folds}" --rep "${rep}" --pheno "${pheno}" --stage "${stage}"
                    fi
                    echo "Discovery: ${discovery}, Target: ${target}, imp: ${imp}, rep: ${cur_rep}, fold: ${folds}: res test: stage 1 - ${res1}, stage 3 - ${res3}"
                    counter=$((counter + 1))
                fi
            done
        done
    done
done

echo "counter: ${counter}"



### Aggregate results (UKB)

In [None]:
# Aggregate the CV PRS statistics of method "pt3" for the UKB discoveries and
# draw the boxplots (without --metric and with --metric or_99) for
# repetitions 1..6, one repetition after the other.
# discoveries="D2_sysp_evangelou_2018,D2_chol_willer_2013,,D2_t2di_mahajan_2018,D2_gerx_an_2019,D2_madd_howard_2019" # D2_ldlp_willer_2013 # "D2_sysp_evangelou_2018,D2_dias_evangelou_2018,D2_asth_zhu_2019,D2_chol_willer_2013,D2_hdlp_willer_2013,,D2_t2di_mahajan_2018,D2_gerx_an_2019,D2_madd_howard_2019" # D2_ldlp_willer_2013
discoveries="UKB_ht_eur,UKB_chol_eur,UKB_hfvr_eur,UKB_hyty_eur,UKB_madd_eur,UKB_osar_eur,UKB_t2d_eur,UKB_utfi_eur,UKB_gerx_eur,UKB_angna_eur,UKB_ast_eur,UKB_ctrt_eur"
suffix="ukb_gwas"
method="pt3"
base_rep=105
stage=1
folds=5


targets="ukbb_afr,ukbb_sas" # ${pop}" # ,ukbb_sas
imps="original,impute2_1kg_sas,impute2_1kg_afr,impute2_1kg_eur" # ,impute2_1kg_gbr,imputeX_new" # "impute2_1kg_sas,impute2_1kg_afr,impute2_1kg_eur,impute2_1kg_gbr" # impute2_1kg_${pop} ,impute2_1kg_sas" 

# Discovery GWAS name -> phenotype code (not referenced by the commands below).
declare -A gwas_to_pheno=(
    ['D2_hght_yengo_2018']='height'
    ['D2_ldlp_willer_2013']='chol'
    ['D2_hdlp_willer_2013']='chol'
    ['D2_chol_willer_2013']='chol'
    ['D2_sysp_evangelou_2018']='ht'
    ['D2_dias_evangelou_2018']='ht'
    ['D2_asth_zhu_2019']='ast'
    ['D2_t2di_mahajan_2018']='t2d'
    ['D2_madd_howard_2019']='madd'
    ['D2_gerx_an_2019']='gerx'
    ['UKB_ht_eur']='ht'
    ['UKB_chol_eur']='chol'
    ['UKB_hfvr_eur']='hfvr'
    ['UKB_hyty_eur']='hyty'
    ['UKB_madd_eur']='madd'
    ['UKB_osar_eur']='osar'
    ['UKB_t2d_eur']='t2d'
    ['UKB_utfi_eur']='utfi'
    ['UKB_gerx_eur']='gerx'
    ['UKB_angna_eur']='angna'
    ['UKB_ast_eur']='ast'
    ['UKB_ctrt_eur']='ctrt'
)

## aggregate_prs_statistics
for cur_rep in {1..6}; do
    rep="${base_rep}_${cur_rep}"
    # Each step runs only if the previous one succeeded.
    python "aggregate_prs_statistics_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep "${rep}" --suffix "${suffix}" \
        && python "plot_metrics_boxplots_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep_start "${rep}" --rep_end "${rep}" --suffix "${suffix}" \
        && python "plot_metrics_boxplots_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep_start "${rep}" --rep_end "${rep}" --suffix "${suffix}" --metric or_99
done



In [None]:
# Draw, for repetitions 1..6, the boxplots of method "pt2" twice: once without
# --metric and once with --metric or_99. The targets string is also passed as
# the --suffix value.
discoveries="D2_sysp_evangelou_2018,D2_dias_evangelou_2018,D2_asth_zhu_2019,D2_chol_willer_2013,D2_hdlp_willer_2013,D2_ldlp_willer_2013,D2_t2di_mahajan_2018,D2_gerx_an_2019,D2_madd_howard_2019"
# discoveries="UKB_ht_eur,UKB_chol_eur,UKB_hfvr_eur,UKB_hyty_eur,UKB_madd_eur,UKB_osar_eur,UKB_t2d_eur,UKB_utfi_eur,UKB_gerx_eur,UKB_angna_eur,UKB_ast_eur,UKB_ctrt_eur"
method="pt2"
base_rep=105
stage=1
folds=5
# Discovery GWAS name -> phenotype code (not referenced by the commands below).
declare -A gwas_to_pheno=(
    ['D2_hght_yengo_2018']='height'
    ['D2_ldlp_willer_2013']='chol'
    ['D2_hdlp_willer_2013']='chol'
    ['D2_chol_willer_2013']='chol'
    ['D2_sysp_evangelou_2018']='ht'
    ['D2_dias_evangelou_2018']='ht'
    ['D2_asth_zhu_2019']='ast'
    ['D2_t2di_mahajan_2018']='t2d'
) # ['D2_hdlp_willer_2013']='chol' ['D_t2d_mahajan_2018']='t2d'
# declare -A gwas_to_pheno=(['UKB_ht_eur']='ht' ['UKB_chol_eur']='chol' ['UKB_hfvr_eur']='hfvr' ['UKB_hyty_eur']='hyty'  ['UKB_madd_eur']='madd' ['UKB_osar_eur']='osar' ['UKB_t2d_eur']='t2d' ['UKB_utfi_eur']='utfi' ['UKB_gerx_eur']='gerx' ['UKB_angna_eur']='angna' ['UKB_ast_eur']='ast' ['UKB_ctrt_eur']='ctrt') 

targets="ukbb_afr,ukbb_sas" # "ukbb_sas,ukbb_afr" # ,ukbb_sas"
imps="original,impute2_1kg_sas,impute2_1kg_afr,impute2_1kg_eur,impute2_1kg_gbr" # "impute2_1kg_ceu2"


for cur_rep in {1..6}; do
    rep="${base_rep}_${cur_rep}"
    # Arguments shared by both plotting calls of this repetition.
    plot_args=(--discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep_start "${rep}" --rep_end "${rep}" --suffix "${targets}")
    python "plot_metrics_boxplots_cv_${method}.py" "${plot_args[@]}"
    python "plot_metrics_boxplots_cv_${method}.py" "${plot_args[@]}" --metric or_99
    # python plot_metrics_boxplots_cv_${method}.py --discoveries ${discoveries} --targets ${targets} --imps ${imps} --rep_start ${base_rep}_${cur_rep} --rep_end ${base_rep}_${cur_rep} --metric or_99;
done


### Add phenotypes with the same sets as an existing one (\_ht\_; hypertension)

In [None]:

# For every repetition directory rep_105_<N>, derive per-phenotype files from
# the existing "_ht_" ones: each pheno_ht_* file gets a sibling whose name has
# "_ht_" replaced by "_<pheno>_" and which holds the rows of ../pheno_<pheno>_
# whose first column also appears in that pheno_ht_* file.
for cur_rep in {1..6}; do
    # Skip a repetition whose directory is missing; otherwise the commands
    # below would run in the wrong place and the trailing "cd .." would walk
    # out of the working directory.
    cd "rep_105_${cur_rep}" || continue
    # Each phenotype is listed once.
    for pheno in chol t2d gerx madd angna ast ctrt height hfvr hyty osar utfi; do
        echo "rep_105_${cur_rep} ${pheno}"
        # xargs substitutes every "A" with one pheno_ht_* file name; ${pheno}
        # is expanded by the outer shell before the inner bash -c script runs.
        ls -1 pheno_ht_* | xargs -I A bash -c 'fl=$(echo A | awk '"'"'{ gsub(/_ht_/, "_'${pheno}'_"); print $0}'"'"'); awk '"'"'{if(NR==FNR){c[$1]++; next}; if (c[$1]>0){print $0}}'"'"' <(cat A) <(cat ../pheno_'${pheno}'_) > ${fl}'
    done
    cd ..
done

### liftOver

In [None]:
# Lift the chromosome 1-22 variants of a VCF over with liftOverPlink (chain
# file hg38ToHg19.over.chain), drop the variants listed as bad or unlifted,
# and write the result as the PLINK binary dataset "ds_lifted".
vcf_file="..."
lift_dir="$PRS_TOOLS/liftOverPlink"

# VCF -> text PLINK fileset "ds0".
plink --vcf "${vcf_file}.vcf" --allow-extra-chr --chr 1-22 --recode --out ds0

# Lift the map file, then separate good from bad lifts.
python "${lift_dir}/liftOverPlink.py" \
    --map ds0.map \
    --out lifted \
    --chain "${lift_dir}/hg38ToHg19.over.chain"
python "${lift_dir}/rmBadLifts.py" \
    --map lifted.map \
    --out good_lifted.map \
    --log bad_lifted.dat

# Exclusion list: column 2 of the bad-lift log plus column 4 of the
# non-comment lines of lifted.bed.unlifted.
cut -f 2 bad_lifted.dat > to_exclude.dat
cut -f 4 lifted.bed.unlifted | sed "/^#/d" >> to_exclude.dat

# Drop the excluded variants, then pair the filtered .ped with the lifted map.
plink --file ds0 --recode --out lifted --exclude to_exclude.dat
plink --ped lifted.ped --map good_lifted.map --make-bed --out ds_lifted

### Clean lasso files

In [None]:
# Rewrite in place, up to 80 files at a time, every file matching "<prefix>*"
# for the first 5 prefixes listed in $PRS_CODEBASE/lasso_folders, keeping only
# the rows whose 6th column is non-zero.
# - head reads the list directly (no cat needed).
# - "-n 1" is dropped: it is mutually exclusive with -I, which already runs
#   one command per input line.
# - the filtered copy "<file>_" replaces the original only if awk succeeded.
head -n 5 "${PRS_CODEBASE}/lasso_folders" \
  | xargs -I {} bash -c 'ls -d {}*' \
  | xargs -I A -P 80 bash -c 'echo "start A" && awk '"'"'$6 != 0'"'"' A > "A_" && mv A_ A && echo "done A"'

### prepare cv files for another imputation strategy

In [None]:
# pheno="ctrt"; 
# for b in {1..6}; do for a in {1..5}; do bash prepare_cv_datasets.sh --discovery test --target ukbb_sas --imp original --cv ${a}_5 --rep 105_${b} --pheno ${pheno}; done; bash prepare_cv_datasets.sh --discovery test --target ukbb_sas --imp original --cv 5 --rep 105_${b} --pheno ${pheno}; done

 # Prepare the per-fold CV datasets (folds 1..5 of 5) of repetitions 105_1..105_6
 # for the "original" imputation of one target/phenotype pair.
 target="ukbb_sas"
 pheno="ht"
 for b in {1..6}; do
     for a in {1..5}; do
         bash prepare_cv_datasets.sh \
             --discovery test \
             --target "${target}" \
             --imp original \
             --cv "${a}_5" \
             --rep "105_${b}" \
             --pheno "${pheno}"
     done
 done

### Aggregate GAIN results

In [None]:
# Aggregate the CV PRS statistics and draw the metric boxplots for every
# (method, repetition) pair of one discovery/target/imputation combination.
discoveries="PGC2_noAJ"
targets="gain_afr"
imps="impute2_1kg_eur,impute2_1kg_afr,impute2_1kg_eas"
methods="pt3" # "pt,pt2,ls"
# Split the comma-separated list into a real array so the loop below does not
# depend on unquoted word splitting of a scalar.
IFS=',' read -r -a methods <<< "${methods}"
base_rep=105
suffix="scz_gain"

## aggregate_prs_statistics
for method in "${methods[@]}"; do
    for cur_rep in {1..6}; do
        rep="${base_rep}_${cur_rep}"
        echo "start aggreagating method ${method} rep ${rep}"
        # One background job per (method, rep): the two plotting calls run only
        # if the preceding step succeeded. Jobs are not waited for here.
        python "aggregate_prs_statistics_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep "${rep}" --suffix "${suffix}" &&
            python "plot_metrics_boxplots_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep_start "${rep}" --rep_end "${rep}" --suffix "${suffix}" &&
            python "plot_metrics_boxplots_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep_start "${rep}" --rep_end "${rep}" --metric or_99 --suffix "${suffix}" &
    done
done


# ## for cur_rep in {1..6}; do 
#     python aggregate_prs_statistics_cv_${method}.py --discoveries PGC2_noAJ --targets dbg-scz19 --imps impute2_ajkg14_t101,impute2_1kg_eur2,impute2_1kg_eas2,impute2_1kg_afr2,impute2_1kg_gbr2,impute2_1kg_ceu2,impute2_1kg_tsi2,impute2_1kg_ibs2,impute2_1kg_fin2 --rep ${base_rep}_${cur_rep};
# done;
 

### Aggregate bcac AJ results

In [None]:
# Aggregate the CV PRS statistics and draw the metric boxplots for every
# (method, repetition) pair of one discovery/target/imputation combination.
discoveries="bcac_onco_eur-5pcs"
targets="bcac_onco_aj"
imps="impute2_1kg_eur2,impute2_1kg_afr2,impute2_1kg_eas2,impute2_ajkg14_t101"
methods="pt3" # "pt,pt2,ls"
# Split the comma-separated list into a real array so the loop below does not
# depend on unquoted word splitting of a scalar.
IFS=',' read -r -a methods <<< "${methods}"
base_rep=105
suffix="bc_aj"

## aggregate_prs_statistics
for method in "${methods[@]}"; do
    for cur_rep in {1..6}; do
        rep="${base_rep}_${cur_rep}"
        echo "start aggreagating method ${method} rep ${rep}"
        # One background job per (method, rep): the two plotting calls run only
        # if the preceding step succeeded. Jobs are not waited for here.
        python "aggregate_prs_statistics_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep "${rep}" --suffix "${suffix}" &&
            python "plot_metrics_boxplots_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep_start "${rep}" --rep_end "${rep}" --suffix "${suffix}" &&
            python "plot_metrics_boxplots_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep_start "${rep}" --rep_end "${rep}" --metric or_99 --suffix "${suffix}" &
    done
done


# ## for cur_rep in {1..6}; do 
#     python aggregate_prs_statistics_cv_${method}.py --discoveries PGC2_noAJ --targets dbg-scz19 --imps impute2_ajkg14_t101,impute2_1kg_eur2,impute2_1kg_eas2,impute2_1kg_afr2,impute2_1kg_gbr2,impute2_1kg_ceu2,impute2_1kg_tsi2,impute2_1kg_ibs2,impute2_1kg_fin2 --rep ${base_rep}_${cur_rep};
# done;
 

### Aggregate SCZ EAS dbg-scz19 AJ results

In [None]:
# Aggregate the CV PRS statistics and draw the metric boxplots for every
# (method, repetition) pair of one discovery/target/imputation combination.
discoveries="LH_PGC-SCZ-EAS,LH_PGC-SCZ-EAS_LS"
targets="dbg-scz19"
# No trailing blank inside the list: the value is passed quoted below.
imps="impute2_1kg_eur2,impute2_1kg_afr2,impute2_1kg_eas2" # ",impute2_ajkg14_t101"
methods="pt3" # "pt,pt2,ls"
# Split the comma-separated list into a real array so the loop below does not
# depend on unquoted word splitting of a scalar.
IFS=',' read -r -a methods <<< "${methods}"
base_rep=105
suffix="scz_aj_eas"

## aggregate_prs_statistics
for method in "${methods[@]}"; do
    for cur_rep in {1..6}; do
        rep="${base_rep}_${cur_rep}"
        echo "start aggreagating method ${method} rep ${rep}"
        # One background job per (method, rep): the two plotting calls run only
        # if the preceding step succeeded. Jobs are not waited for here.
        python "aggregate_prs_statistics_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep "${rep}" --suffix "${suffix}" &&
            python "plot_metrics_boxplots_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep_start "${rep}" --rep_end "${rep}" --suffix "${suffix}" &&
            python "plot_metrics_boxplots_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep_start "${rep}" --rep_end "${rep}" --metric or_99 --suffix "${suffix}" &
    done
done


# ## for cur_rep in {1..6}; do 
#     python aggregate_prs_statistics_cv_${method}.py --discoveries PGC2_noAJ --targets dbg-scz19 --imps impute2_ajkg14_t101,impute2_1kg_eur2,impute2_1kg_eas2,impute2_1kg_afr2,impute2_1kg_gbr2,impute2_1kg_ceu2,impute2_1kg_tsi2,impute2_1kg_ibs2,impute2_1kg_fin2 --rep ${base_rep}_${cur_rep};
# done;


In [None]:
# Aggregate the CV PRS statistics (no plots) for every (method, repetition)
# pair, one after the other.
# NOTE(review): "vim" looks like a stray editor keystroke rather than a
# discovery name (the other SCZ EAS cells use "LH_PGC-SCZ-EAS") - confirm.
discoveries="vim,LH_PGC-SCZ-EAS_LS"
targets="dbg-scz19"
# No trailing blank inside the list: the value is passed quoted below.
imps="impute2_1kg_eur2,impute2_1kg_afr2,impute2_1kg_eas2" # ",impute2_ajkg14_t101"
methods="ls" # "pt,pt2,ls"
# Split the comma-separated list into a real array so the loop below does not
# depend on unquoted word splitting of a scalar.
IFS=',' read -r -a methods <<< "${methods}"
base_rep=105
suffix="scz_aj_eas"

## aggregate_prs_statistics
for method in "${methods[@]}"; do
    for cur_rep in {1..6}; do
        rep="${base_rep}_${cur_rep}"
        echo "start aggreagating method ${method} rep ${rep}"
        python "aggregate_prs_statistics_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep "${rep}" --suffix "${suffix}"
    done
done




### Aggregate SCZ EAS gain_afr results

In [None]:
# Aggregate the CV PRS statistics and draw the metric boxplots for every
# (method, repetition) pair of one discovery/target/imputation combination.
discoveries="LH_PGC-SCZ-EAS,LH_PGC-SCZ-EAS_LS"
targets="gain_afr"
imps="impute2_1kg_eur,impute2_1kg_afr,impute2_1kg_eas"
methods="pt3" # ,pt2,ls"
# Split the comma-separated list into a real array so the loop below does not
# depend on unquoted word splitting of a scalar.
IFS=',' read -r -a methods <<< "${methods}"
base_rep=105
suffix="scz_gain_afr_eas"

## aggregate_prs_statistics
for method in "${methods[@]}"; do
    for cur_rep in {1..6}; do
        rep="${base_rep}_${cur_rep}"
        echo "start aggreagating method ${method} rep ${rep}"
        # One background job per (method, rep): the two plotting calls run only
        # if the preceding step succeeded. Jobs are not waited for here.
        python "aggregate_prs_statistics_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep "${rep}" --suffix "${suffix}" &&
            python "plot_metrics_boxplots_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep_start "${rep}" --rep_end "${rep}" --suffix "${suffix}" &&
            python "plot_metrics_boxplots_cv_${method}.py" --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" --rep_start "${rep}" --rep_end "${rep}" --metric or_99 --suffix "${suffix}" &
    done
done


# ## for cur_rep in {1..6}; do 
#     python aggregate_prs_statistics_cv_${method}.py --discoveries PGC2_noAJ --targets dbg-scz19 --imps impute2_ajkg14_t101,impute2_1kg_eur2,impute2_1kg_eas2,impute2_1kg_afr2,impute2_1kg_gbr2,impute2_1kg_ceu2,impute2_1kg_tsi2,impute2_1kg_ibs2,impute2_1kg_fin2 --rep ${base_rep}_${cur_rep};
# done;
 

### Aggregate BC EAS aj results

In [None]:
# Aggregate and plot the cross-validation PRS results of the bcac_onco_aj
# target. For each method and each repetition ${base_rep}_1 .. ${base_rep}_6
# one background job is started that runs, stopping at the first failure:
#   1. aggregate_prs_statistics_cv_<method>.py
#   2. plot_metrics_boxplots_cv_<method>.py            (default metric)
#   3. plot_metrics_boxplots_cv_<method>.py --metric or_99
discoveries="GC_bc_sakaue_2020,GC_bc_sakaue_2020_LS"
targets="bcac_onco_aj"
imps="impute2_1kg_eur2,impute2_1kg_afr2,impute2_1kg_eas2,impute2_ajkg14_t101"
# The blank before '#' is required: '#' only starts a comment at the beginning
# of a word. Without it bash parses the line as "run the command `pt,pt2,ls`
# with methods=pt3# in its environment" and never sets methods in this shell.
methods="pt3" # "pt,pt2,ls"
methods=${methods//,/ } # comma list -> blank-separated list, split by the loop below
base_rep=105
suffix="bc_aj_eas"

## aggregate_prs_statistics
for method in ${methods}; do
    for cur_rep in {1..6}; do
        echo "start aggreagating method ${method} rep ${base_rep}_${cur_rep}"
        # The trailing '&' puts the whole &&-chain in the background, so the
        # repetitions run concurrently and the cell returns without waiting.
        python "aggregate_prs_statistics_cv_${method}.py" \
            --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" \
            --rep "${base_rep}_${cur_rep}" --suffix "${suffix}" \
        && python "plot_metrics_boxplots_cv_${method}.py" \
            --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" \
            --rep_start "${base_rep}_${cur_rep}" --rep_end "${base_rep}_${cur_rep}" \
            --suffix "${suffix}" \
        && python "plot_metrics_boxplots_cv_${method}.py" \
            --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" \
            --rep_start "${base_rep}_${cur_rep}" --rep_end "${base_rep}_${cur_rep}" \
            --metric or_99 --suffix "${suffix}" &
    done
done


# ## for cur_rep in {1..6}; do 
#     python aggregate_prs_statistics_cv_${method}.py --discoveries PGC2_noAJ --targets dbg-scz19 --imps impute2_ajkg14_t101,impute2_1kg_eur2,impute2_1kg_eas2,impute2_1kg_afr2,impute2_1kg_gbr2,impute2_1kg_ceu2,impute2_1kg_tsi2,impute2_1kg_ibs2,impute2_1kg_fin2 --rep ${base_rep}_${cur_rep};
# done;
 

In [None]:
# Tag the cross-validation pt3 PRS files of every discovery/target pair with
# the phenotype of the discovery GWAS: each "___" in a file name matching
# prs.cv.pt3___* becomes "_<pheno>__".
discoveries="D2_ldlp_willer_2013,D2_chol_willer_2013,D2_hdlp_willer_2013,D2_sysp_evangelou_2018,D2_dias_evangelou_2018,D2_asth_zhu_2019,D_t2d_mahajan_2018,D2_t2di_mahajan_2018"
discoveries=${discoveries//,/ }
targets=ukbb_sas,ukbb_afr
targets=${targets//,/ }

# Discovery GWAS name -> phenotype tag inserted into the file names.
# NOTE(review): D_t2d_mahajan_2018 is listed in discoveries but has no entry
# here (only D2_t2di_mahajan_2018 does); it is skipped below - confirm whether
# a mapping should be added.
declare -A gwas_to_pheno=(['D2_hght_yengo_2018']='height' ['D2_ldlp_willer_2013']='chol' ['D2_hdlp_willer_2013']='chol'
                          ['D2_chol_willer_2013']='chol'  ['D2_sysp_evangelou_2018']='ht' ['D2_dias_evangelou_2018']='ht'
                          ['D2_asth_zhu_2019']='ast' ['D2_t2di_mahajan_2018']='t2d'
                          ['D2_madd_howard_2019']='madd' ['D2_gerx_an_2019']='gerx'
                          ['UKB_ht_eur']='ht' ['UKB_chol_eur']='chol'
                          ['UKB_hfvr_eur']='hfvr' ['UKB_hyty_eur']='hyty'  ['UKB_madd_eur']='madd'
                          ['UKB_osar_eur']='osar' ['UKB_t2d_eur']='t2d' ['UKB_utfi_eur']='utfi'
                          ['UKB_gerx_eur']='gerx' ['UKB_angna_eur']='angna' ['UKB_ast_eur']='ast'
                          ['UKB_ctrt_eur']='ctrt')

prs_root=/specific/netapp5/gaga/gaga-pd/prs_data/PRSs

#######################################
# Rename <prs_dir>/*/*/prs.cv.pt3___* so that every "___" in the file name
# becomes "_<pheno>__". Directory components are left untouched.
# Arguments: $1 - PRS directory of one discovery/target pair
#            $2 - phenotype tag to insert
# Returns:   0 when nothing matched; otherwise the status of the last mv
#######################################
rename_pt3_prs_files() {
    local prs_dir=$1
    local pheno=$2
    local src dir base
    for src in "${prs_dir}"/*/*/prs.cv.pt3___*; do
        # An unmatched glob stays literal; skip it instead of calling mv.
        [[ -e "${src}" ]] || continue
        dir=${src%/*}
        base=${src##*/}
        mv -- "${src}" "${dir}/${base//___/_${pheno}__}"
    done
}

# discoveries / targets are blank-separated strings: unquoted on purpose.
for discovery in ${discoveries}; do
    pheno=${gwas_to_pheno[${discovery}]:-}
    if [[ -z "${pheno}" ]]; then
        # With an empty tag "___" would be replaced by itself (mv x x).
        echo "skipping ${discovery}: no phenotype defined in gwas_to_pheno" >&2
        continue
    fi
    for target in ${targets}; do
        echo "renaming ${prs_root}/${discovery}_${target}/*/*/prs.cv.pt3___* : ___ -> _${pheno}__"
        rename_pt3_prs_files "${prs_root}/${discovery}_${target}" "${pheno}"
    done
done




#### Generate folders for the eth datasets

In [None]:
# Prepare the directory layout of the ${eth} and eur-minus-${eth} datasets:
# the imputation directories are created under $PRS_DATASETS_ELKON and a
# symlink named ${imp} is placed in the matching $PRS_DATASETS dataset folder.
eth=pl
imp=impX_new

# Create directories. Both names use "bcac_onco_" (underscore) so that they
# match the dataset names used by the links below and by the later cells.
mkdir -p "$PRS_DATASETS_ELKON/bcac_onco_${eth}/${imp}"
mkdir -p "$PRS_DATASETS_ELKON/bcac_onco_eur-minus-${eth}/${imp}"

# The links live inside these dataset folders, so they have to exist first.
mkdir -p "$PRS_DATASETS/bcac_onco_${eth}"
mkdir -p "$PRS_DATASETS/bcac_onco_eur-minus-${eth}"

# link ELKON to GAGA
# -n: if the link already exists, fail instead of following it and creating a
#     stray link inside the directory it points to.
# NOTE(review): the link targets are the ELKON dataset folders, not their
# ${imp} sub-folders created above - confirm that this is intended.
ln -sn "$PRS_DATASETS_ELKON/bcac_onco_${eth}" "$PRS_DATASETS/bcac_onco_${eth}/${imp}"
ln -sn "$PRS_DATASETS_ELKON/bcac_onco_eur-minus-${eth}" "$PRS_DATASETS/bcac_onco_eur-minus-${eth}/${imp}"

#### Create pop.panel file

In [None]:
# Split the European pop.panel into the ${eth} panel (lines containing
# "Poland") and the eur-minus-${eth} panel (all remaining lines).
# ${eth} is not set in this cell; it must already be defined in the session.
# A single grep over the whole file keeps line 1 in whichever panel it
# belongs to without writing it twice.
# NOTE(review): if pop.panel starts with a header line that should also head
# the ${eth} panel, prepend it explicitly with `head -n 1`.
grep Poland "$PRS_DATASETS/bcac_onco_eur/pop.panel" > "$PRS_DATASETS/bcac_onco_${eth}/pop.panel"

# -v: the complement of the selection above, i.e. everything that is not Poland.
grep -v Poland "$PRS_DATASETS/bcac_onco_eur/pop.panel" > "$PRS_DATASETS/bcac_onco_eur-minus-${eth}/pop.panel"

#### Create pheno files

In [None]:
# Create pop


# Both commands keep the rows of the European pheno file whose first column
# also occurs as a first column of the given pop.panel. awk reads the panel
# first (NR == FNR) to collect the keys, then filters the pheno file.
# ${eth} is not set in this cell; it must already be defined in the session.
# NOTE(review): the outputs use an absolute path while the inputs use
# $PRS_DATASETS - confirm that both point to the same datasets folder.

# Create pheno to eth
awk 'NR == FNR { seen[$1]++; next } ($1 in seen)' \
    "$PRS_DATASETS/bcac_onco_${eth}/pop.panel" \
    "$PRS_DATASETS/bcac_onco_eur/pheno" \
    > "/specific/netapp5/gaga/gaga-pd/prs_data/datasets/dec/bcac_onco_${eth}/pheno"

# Create pheno to eur-minus-eth
awk 'NR == FNR { seen[$1]++; next } ($1 in seen)' \
    "$PRS_DATASETS/bcac_onco_eur-minus-${eth}/pop.panel" \
    "$PRS_DATASETS/bcac_onco_eur/pheno" \
    > "/specific/netapp5/gaga/gaga-pd/prs_data/datasets/dec/bcac_onco_eur-minus-${eth}/pheno"

#### Generate eth dataset

In [None]:
# Extract the ${eth} samples (those listed in the ${eth} pop.panel) into the
# ${eth} dataset as a new plink binary fileset.
eth=pl
imp=impX_new
# --out follows ${eth} like --keep does, so changing eth cannot overwrite the
# pl dataset with the samples of another population.
# NOTE(review): --bfile has no dataset folder between $PRS_DATASETS and ${imp}
# (the eur-minus-eth cell reads bcac_onco_eur/${imp}/ds) - confirm the source.
plink \
    --bfile "$PRS_DATASETS/${imp}/ds" \
    --keep "$PRS_DATASETS/bcac_onco_${eth}/pop.panel" \
    --make-bed \
    --out "$PRS_DATASETS/bcac_onco_${eth}/${imp}/ds"

#### QC eth set 

In [None]:
# Run the target QC script on the bcac_onco_pl dataset, imputation impX_new.
bash qc_target_data.sh \
    --target bcac_onco_pl \
    --imp impX_new

#### Create reps and folds 

In [None]:
# Create the cross-validation repetitions for the ${eth} dataset.
eth=pl
python create_cv_repetitions.py \
    --dataset "bcac_onco_${eth}"

#### Create plink files for each rep and each fold (Do it for rep 1..6)

In [None]:
# Prepare the plink files of one repetition (${base_rep}_${rep}) of the ${eth}
# dataset: once per --cv value 1_5 .. 5_5, then once more with --cv 5.
# Change rep and re-run the cell for the remaining repetitions.
base_rep=105
rep=1
eth=pl
imp=impX_new

for a in 1 2 3 4 5
do
    bash prepare_cv_datasets.sh \
        --discovery test \
        --target "bcac_onco_${eth}" \
        --imp "${imp}" \
        --cv "${a}_5" \
        --rep "${base_rep}_${rep}"
done

bash prepare_cv_datasets.sh \
    --discovery test \
    --target "bcac_onco_${eth}" \
    --imp "${imp}" \
    --cv 5 \
    --rep "${base_rep}_${rep}"

#### Generate eur-minus-eth

In [None]:
# Build the eur-minus-${eth} dataset: start from the European fileset, remove
# the samples listed in the ${eth} pop.panel and write the result as ds.QC
# using the --geno / --mind / --maf thresholds given below.
eth=pl
imp=impX_new
plink \
    --bfile "$PRS_DATASETS_ELKON/bcac_onco_eur/${imp}/ds" \
    --remove "$PRS_DATASETS_ELKON/bcac_onco_${eth}/pop.panel" \
    --make-bed \
    --geno 0.1 \
    --mind 0.1 \
    --maf 0.05 \
    --out "$PRS_DATASETS/bcac_onco_eur-minus-${eth}/${imp}/ds.QC"

In [None]:
# Run calc_pca2_full.sh (stage 2) on the eur-minus-${eth} dataset, imputation impX_new.
eth=pl
bash calc_pca2_full.sh \
    --target "bcac_onco_eur-minus-${eth}" \
    --imp impX_new \
    --stage 2