## BCAC CV

### Create an LD for a specific GWAS and dataset (legacy; use pt3.sh instead)

In [None]:

# Directory holding the discovery GWAS summary statistics (note the trailing slash,
# the file name is appended to it directly below).
discovery_path=$PRS_GWASS/bcac_onco_eur-5pcs/

# plink is always given --exclude ds.dupvar, so create an empty file when none exists.
[[ -f ds.dupvar ]] || touch ds.dupvar

echo clumping
# Clump ds.QC against the discovery summary statistics (SNP ids read from field SNP,
# p-values from field P); output files are written with the "eur" prefix.
plink --bfile ds.QC \
    --exclude ds.dupvar \
    --clump ${discovery_path}gwas.QC.Transformed \
    --clump-snp-field SNP \
    --clump-field P \
    --clump-p1 1 \
    --clump-p2 1 \
    --clump-r2 0.2 \
    --clump-kb 500 \
    --out eur

echo filter by clumping
# Drop the header row of the clump report and keep its third column
# (presumably the SNP id column of plink's .clumped format — confirm for your plink version).
awk 'NR > 1 { print $3 }' eur.clumped > eur.valid.snp

### Prepare splits

### Prepare datasets

#### multiple imps (UKB)

In [None]:
# Datasets and CV layout handed to prepare_cv_datasets.sh.
discovery="bcac_onco_eur-5pcs"
target="bcac_onco_aj"
imp=impX_new
base_rep="105"
n_repetitions=6
n_folds=5
# Change n_repetitions / n_folds above to adjust the CV scheme; repetitions are
# named ${base_rep}_<k> for k = 1..n_repetitions.

# Inner CV (train+validation): one call per (repetition, fold), --cv <fold>_<n_folds>.
for cur_rep in $(seq "${n_repetitions}"); do
    for cur_fold in $(seq "${n_folds}"); do
        bash prepare_cv_datasets.sh \
            --discovery "${discovery}" \
            --target "${target}" \
            --imp "${imp}" \
            --rep "${base_rep}_${cur_rep}" \
            --cv "${cur_fold}_${n_folds}"
    done
done

# Outer CV (both+test): one call per repetition, --cv <n_folds>.
for cur_rep in $(seq "${n_repetitions}"); do
    bash prepare_cv_datasets.sh \
        --discovery "${discovery}" \
        --target "${target}" \
        --imp "${imp}" \
        --rep "${base_rep}_${cur_rep}" \
        --cv "${n_folds}"
done

### Create CV

#### Generally, the best practice is to run the scripts ```run_folds/run_UKB_folds_<method>.sh```. They run only the parts that are missing from the CV scheme, which is generally faster and safer.

In [None]:
# Selects which runner script is invoked: run_folds_<method>.sh.
method=ls

# Values forwarded to the runner's --discoveries / --targets / --imps options.
discoveries="bcac_onco_eur-5pcs"
targets="bcac_onco_aj"
imps="impX_new"

bash "run_folds_${method}.sh" \
    --discoveries="${discoveries}" \
    --targets="${targets}" \
    --imps="${imps}"

#### To override existing results, use the scripts below.

#### Run PRS method across reps

In [None]:
discovery="bcac_onco_eur-5pcs"
target="bcac_onco_aj"
imp="impX_new" # "impute2_1kg_ceu2"
method="ls"
base_rep=105

# For repetitions ${base_rep}_1 .. ${base_rep}_6: run the CV folds and, only when
# that succeeds, the test run of the same repetition.
# NOTE(review): --start_fold 2 is passed here, while the two-step variant of this
# cell does not pass it — presumably a leftover from a partial rerun; confirm before reuse.
for cur_rep in $(seq 1 6); do
    if bash run_cv_single_rep.sh \
        --discovery "${discovery}" --target "${target}" --imp "${imp}" \
        --method "${method}" --rep "${base_rep}_${cur_rep}" --start_fold 2
    then
        bash run_cv_test_single_rep.sh \
            --discovery "${discovery}" --target "${target}" --imp "${imp}" \
            --method "${method}" --rep "${base_rep}_${cur_rep}"
    fi
done

#### Alternatively, it can be executed as two separate commands (useful for rerunning failed executions)

In [None]:
discovery="bcac_onco_eur-5pcs"
target="bcac_onco_aj"
imp="impX_new" # "impute2_1kg_ceu2"
method="ls"
base_rep=105

# Two independent passes over repetitions ${base_rep}_1 .. ${base_rep}_6:
# first every CV run, then every test run. A failure in one call does not stop
# the remaining calls.
for step_script in run_cv_single_rep.sh run_cv_test_single_rep.sh; do
    for cur_rep in $(seq 1 6); do
        bash "${step_script}" \
            --discovery "${discovery}" --target "${target}" --imp "${imp}" \
            --method "${method}" --rep "${base_rep}_${cur_rep}"
    done
done

### Remove zero-weight SNPs from lasso output weight files

In [None]:
# Rewrite lasso weight files in place, dropping rows whose 6th column equals 0
# (assumed to be the SNP weight column — confirm against the lasso output format).
#   1. Read the first 5 path prefixes listed in $PRS_CODEBASE/lasso_folders.
#   2. Expand each prefix with a trailing glob (ls -d <prefix>*) to get the files.
#   3. For each file, up to 80 at a time: filter into "<file>_", then move that
#      over the original; "start"/"done" lines are echoed around each file.
# Paths reach the inner shells as positional parameters ($1) instead of being
# spliced into the command text, so names containing spaces or shell
# metacharacters cannot break (or inject into) the command. The redundant
# `-n 1` (conflicts with -I), the extra `cat` and the `sleep 0 |` pipe are dropped.
head -n 5 "$PRS_CODEBASE/lasso_folders" | xargs -I {} bash -c 'ls -d -- "$1"*' _ {} | \
 xargs -I {} -P 80 bash -c 'echo "start $1" && awk '"'"'$6 != 0'"'"' "$1" > "${1}_" && mv -- "${1}_" "$1" && echo "done $1"' _ {}

### Run 6X5 nested CV

In [None]:
# Cohorts, imputation panel and PRS methods to aggregate.
discoveries="bcac_onco_eur-5pcs"
targets="bcac_onco_aj"
imps="impX_new"
methods="pt3,pt2,ls,ld"
methods="${methods//,/ }"   # commas -> spaces; the loop below relies on word splitting
base_rep=105
suffix="bcac_aj2"

# For every (method, repetition ${base_rep}_1..6): aggregate the PRS statistics, then
# plot the boxplots twice — once without --metric, once with --metric or_99. Each
# step runs only if the previous one succeeded.
# NOTE: every chain is launched in the background (trailing &) and nothing waits
# for them, so the cell returns while the jobs may still be running.
for method in ${methods[@]}; do
    for cur_rep in $(seq 1 6); do
        echo "start aggreagating method ${method} rep ${base_rep}_${cur_rep}"
        python "aggregate_prs_statistics_cv_${method}.py" \
            --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" \
            --rep "${base_rep}_${cur_rep}" --suffix "${suffix}" \
        && python "plot_metrics_boxplots_cv_${method}.py" \
            --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" \
            --rep_start "${base_rep}_${cur_rep}" --rep_end "${base_rep}_${cur_rep}" \
            --suffix "${suffix}" \
        && python "plot_metrics_boxplots_cv_${method}.py" \
            --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" \
            --rep_start "${base_rep}_${cur_rep}" --rep_end "${base_rep}_${cur_rep}" \
            --metric or_99 --suffix "${suffix}" &
    done
done


# ## for cur_rep in {1..6}; do 
#     python aggregate_prs_statistics_cv_${method}.py --discoveries PGC2_noAJ --targets dbg-scz19 --imps impute2_ajkg14_t101,impute2_1kg_eur2,impute2_1kg_eas2,impute2_1kg_afr2,impute2_1kg_gbr2,impute2_1kg_ceu2,impute2_1kg_tsi2,impute2_1kg_ibs2,impute2_1kg_fin2 --rep ${base_rep}_${cur_rep};
# done;
 

In [None]:
# Cohorts, imputation panel and PRS methods to aggregate.
discoveries="bcac_onco_eur-minus-uk-5pcs"
targets="bcac_onco_uk_6000"
imps="impX_new"
methods="ls,pt"
methods="${methods//,/ }"   # commas -> spaces; the loop below relies on word splitting
base_rep=105
suffix="bcac_minus_uk_uk"

# For every (method, repetition ${base_rep}_1..5): aggregate the PRS statistics, then
# plot the boxplots twice — once without --metric, once with --metric or_99. Each
# step runs only if the previous one succeeded.
# NOTE: every chain is launched in the background (trailing &) and nothing waits
# for them, so the cell returns while the jobs may still be running.
for method in ${methods[@]}; do
    for cur_rep in $(seq 1 5); do
        echo "start aggreagating method ${method} rep ${base_rep}_${cur_rep}"
        python "aggregate_prs_statistics_cv_${method}.py" \
            --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" \
            --rep "${base_rep}_${cur_rep}" --suffix "${suffix}" \
        && python "plot_metrics_boxplots_cv_${method}.py" \
            --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" \
            --rep_start "${base_rep}_${cur_rep}" --rep_end "${base_rep}_${cur_rep}" \
            --suffix "${suffix}" \
        && python "plot_metrics_boxplots_cv_${method}.py" \
            --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" \
            --rep_start "${base_rep}_${cur_rep}" --rep_end "${base_rep}_${cur_rep}" \
            --metric or_99 --suffix "${suffix}" &
    done
done


# ## for cur_rep in {1..6}; do 
#     python aggregate_prs_statistics_cv_${method}.py --discoveries PGC2_noAJ --targets dbg-scz19 --imps impute2_ajkg14_t101,impute2_1kg_eur2,impute2_1kg_eas2,impute2_1kg_afr2,impute2_1kg_gbr2,impute2_1kg_ceu2,impute2_1kg_tsi2,impute2_1kg_ibs2,impute2_1kg_fin2 --rep ${base_rep}_${cur_rep};
# done;
 

In [None]:
# Cohorts, imputation panel and PRS methods to aggregate.
discoveries="bcac_onco_eur-minus-uk-5pcs"
targets="bcac_onco_aj"
imps="impX_new"
methods="pt3,ls"
methods="${methods//,/ }"   # commas -> spaces; the loop below relies on word splitting
base_rep=105
suffix="bcac_minus_uk_aj"

# For every (method, repetition ${base_rep}_1..6): aggregate the PRS statistics, then
# plot the boxplots twice — once without --metric, once with --metric or_99. Each
# step runs only if the previous one succeeded.
# NOTE: every chain is launched in the background (trailing &) and nothing waits
# for them, so the cell returns while the jobs may still be running.
for method in ${methods[@]}; do
    for cur_rep in $(seq 1 6); do
        echo "start aggreagating method ${method} rep ${base_rep}_${cur_rep}"
        python "aggregate_prs_statistics_cv_${method}.py" \
            --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" \
            --rep "${base_rep}_${cur_rep}" --suffix "${suffix}" \
        && python "plot_metrics_boxplots_cv_${method}.py" \
            --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" \
            --rep_start "${base_rep}_${cur_rep}" --rep_end "${base_rep}_${cur_rep}" \
            --suffix "${suffix}" \
        && python "plot_metrics_boxplots_cv_${method}.py" \
            --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" \
            --rep_start "${base_rep}_${cur_rep}" --rep_end "${base_rep}_${cur_rep}" \
            --metric or_99 --suffix "${suffix}" &
    done
done


# ## for cur_rep in {1..6}; do 
#     python aggregate_prs_statistics_cv_${method}.py --discoveries PGC2_noAJ --targets dbg-scz19 --imps impute2_ajkg14_t101,impute2_1kg_eur2,impute2_1kg_eas2,impute2_1kg_afr2,impute2_1kg_gbr2,impute2_1kg_ceu2,impute2_1kg_tsi2,impute2_1kg_ibs2,impute2_1kg_fin2 --rep ${base_rep}_${cur_rep};
# done;
 

#### aggregate bcac_onco_eur-minus-pl

In [None]:
# Cohorts, imputation panel and PRS methods to aggregate.
discoveries="bcac_onco_eur-minus-pl"
targets="bcac_onco_aj"
imps="impX_new"
methods="pt2,pt3,ls,ld"
methods="${methods//,/ }"   # commas -> spaces; the loop below relies on word splitting
base_rep=105
suffix="bcac_minus_pl_aj"

# For every (method, repetition ${base_rep}_1..6): aggregate the PRS statistics, then
# plot the boxplots twice — once without --metric, once with --metric or_99. Each
# step runs only if the previous one succeeded.
# NOTE: every chain is launched in the background (trailing &) and nothing waits
# for them, so the cell returns while the jobs may still be running.
for method in ${methods[@]}; do
    for cur_rep in $(seq 1 6); do
        echo "start aggreagating method ${method} rep ${base_rep}_${cur_rep}"
        python "aggregate_prs_statistics_cv_${method}.py" \
            --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" \
            --rep "${base_rep}_${cur_rep}" --suffix "${suffix}" \
        && python "plot_metrics_boxplots_cv_${method}.py" \
            --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" \
            --rep_start "${base_rep}_${cur_rep}" --rep_end "${base_rep}_${cur_rep}" \
            --suffix "${suffix}" \
        && python "plot_metrics_boxplots_cv_${method}.py" \
            --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" \
            --rep_start "${base_rep}_${cur_rep}" --rep_end "${base_rep}_${cur_rep}" \
            --metric or_99 --suffix "${suffix}" &
    done
done


# ## for cur_rep in {1..6}; do 
#     python aggregate_prs_statistics_cv_${method}.py --discoveries PGC2_noAJ --targets dbg-scz19 --imps impute2_ajkg14_t101,impute2_1kg_eur2,impute2_1kg_eas2,impute2_1kg_afr2,impute2_1kg_gbr2,impute2_1kg_ceu2,impute2_1kg_tsi2,impute2_1kg_ibs2,impute2_1kg_fin2 --rep ${base_rep}_${cur_rep};
# done;
 

In [None]:
# Cohorts, imputation panel and PRS methods to aggregate.
discoveries="bcac_onco_eur-minus-pl"
targets="bcac_onco_aj"
imps="impX_new"
methods="pt3,pt2,ls"
methods="${methods//,/ }"   # commas -> spaces; the loop below relies on word splitting
base_rep=105
suffix="bcac_minus_pl_aj"

# For every (method, repetition ${base_rep}_1..6): aggregate the PRS statistics, then
# plot the boxplots twice — once without --metric, once with --metric or_99. Each
# step runs only if the previous one succeeded.
# NOTE: every chain is launched in the background (trailing &) and nothing waits
# for them, so the cell returns while the jobs may still be running.
for method in ${methods[@]}; do
    for cur_rep in $(seq 1 6); do
        echo "start aggreagating method ${method} rep ${base_rep}_${cur_rep}"
        python "aggregate_prs_statistics_cv_${method}.py" \
            --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" \
            --rep "${base_rep}_${cur_rep}" --suffix "${suffix}" \
        && python "plot_metrics_boxplots_cv_${method}.py" \
            --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" \
            --rep_start "${base_rep}_${cur_rep}" --rep_end "${base_rep}_${cur_rep}" \
            --suffix "${suffix}" \
        && python "plot_metrics_boxplots_cv_${method}.py" \
            --discoveries "${discoveries}" --targets "${targets}" --imps "${imps}" \
            --rep_start "${base_rep}_${cur_rep}" --rep_end "${base_rep}_${cur_rep}" \
            --metric or_99 --suffix "${suffix}" &
    done
done


# ## for cur_rep in {1..6}; do 
#     python aggregate_prs_statistics_cv_${method}.py --discoveries PGC2_noAJ --targets dbg-scz19 --imps impute2_ajkg14_t101,impute2_1kg_eur2,impute2_1kg_eas2,impute2_1kg_afr2,impute2_1kg_gbr2,impute2_1kg_ceu2,impute2_1kg_tsi2,impute2_1kg_ibs2,impute2_1kg_fin2 --rep ${base_rep}_${cur_rep};
# done;
 