## Preprocessing UKB data

### Merge UKB chromosome bed files into a single bed file

In [None]:
# Merge the per-chromosome PLINK filesets chr1..chr22 of one population into a
# single fileset ${ukbb_chr_path}/ds_${pop}.all.
# If the first merge fails and a ds_${pop}.all-merge.missnp file exists, the
# variants listed there are excluded from every chromosome fileset (the
# filesets are overwritten in place) and the merge is retried once.
pop="eur"
ukbb_chr_path="${PRS_DATASETS}/ukbb/raw/chrs"
merge_list="${ukbb_chr_path}/mergelist.txt"
merged_prefix="${ukbb_chr_path}/ds_${pop}.all"
missnp_file="${merged_prefix}-merge.missnp"

# Rebuild the merge list from scratch; -f makes a missing list a non-error.
rm -f -- "${merge_list}"
for i in {1..22}; do
    echo "${ukbb_chr_path}/chr${i}_${pop}" >> "${merge_list}"
done

# Reset the status before the merge: the shell session may still hold a
# non-zero value from a previous run of this cell, which would wrongly
# trigger the fix-up branch after a successful merge.
ret=0
plink --merge-list "${merge_list}" --make-bed --out "${merged_prefix}" || ret=$?
echo "${ret}"

if [[ -f "${missnp_file}" ]] && (( ret != 0 )); then
    echo 'fix bed files'
    for i in {1..22}; do
        chr_prefix="${ukbb_chr_path}/chr${i}_${pop}"
        # Only replace the original fileset when the filtered one was written.
        if plink --bfile "${chr_prefix}" --exclude "${missnp_file}" \
                --make-bed --out "${chr_prefix}.fixed"; then
            mv -- "${chr_prefix}.fixed.fam" "${chr_prefix}.fam"
            mv -- "${chr_prefix}.fixed.bim" "${chr_prefix}.bim"
            mv -- "${chr_prefix}.fixed.bed" "${chr_prefix}.bed"
        else
            echo "plink failed for ${chr_prefix}; original fileset kept" >&2
        fi
    done

    echo "Retry to merge (fixed) chrs files"
    plink --exclude "${missnp_file}" --merge-list "${merge_list}" \
        --make-bed --out "${merged_prefix}"
fi


### Prepare new pheno files for UKB datasets
- Before running the cell below, activate your Python virtual environment.

In [None]:
# For every population/phenotype pair, run prepare_pheno_file.py to build
# ${PRS_DATASETS}/ukbb/pheno_<pheno>_<pop>, then replace the first line of that
# file with the header "FID<TAB>IID<TAB>label". If the directory
# ${PRS_DATASETS}/ukbb_<pop> exists, the result is also copied there as
# pheno_<pheno>_ (the trailing underscore is part of the file name).
# Each pair runs as a background subshell; the cell waits for all of them
# before returning and reports how many failed.
ukbb_dir="${PRS_DATASETS}/ukbb"
pids=()
for pop in "gbr" "sas" "afr"; do
    for pheno in "t2d" "osar" "hfvr" "chol" "ht" "ast"; do
        (
            pheno_file="${ukbb_dir}/pheno_${pheno}_${pop}"

            # Stop this job if the phenotype file could not be generated;
            # otherwise the steps below would leave a header-only file behind.
            if ! python prepare_pheno_file.py \
                    --raw_phenotype_file "${ukbb_dir}/ukb_code6.csv" \
                    --pop_file "${ukbb_dir}/pop.panel.${pop}" \
                    --pheno_file "${pheno_file}" \
                    --conditions_file "${ukbb_dir}/conditions_${pheno}.json"; then
                echo "prepare_pheno_file.py failed for ${pop} ${pheno}" >&2
                exit 1
            fi

            # Swap the original header line for the fixed three-column header.
            printf 'FID\tIID\tlabel\n' > "${pheno_file}.tmp"
            tail -n +2 "${pheno_file}" >> "${pheno_file}.tmp" || exit 1
            mv -- "${pheno_file}.tmp" "${pheno_file}"

            if [[ -d "${PRS_DATASETS}/ukbb_${pop}" ]]; then
                cp -- "${pheno_file}" "${PRS_DATASETS}/ukbb_${pop}/pheno_${pheno}_"
            fi
            echo "done ${pop} ${pheno}!"
        ) &
        pids+=("$!")
    done
done

# Wait only for the jobs started above and count the ones that failed.
failed=0
for pid in "${pids[@]}"; do
    wait "${pid}" || failed=$((failed + 1))
done
if (( failed > 0 )); then
    echo "${failed} pheno job(s) failed" >&2
fi

In [None]:
### Create cross-validation (CV) datasets

In [None]:
# Call prepare_cv_datasets.sh for every phenotype in ${phenos}:
#   - inner CV (train+validation): once per repetition and fold,
#     with --cv <fold>_<n_folds>
#   - outer CV (both+test): once per repetition, with --cv <n_folds>
# Repetitions are named <base_rep>_<1..n_reps>. Adjust n_reps / n_folds below
# to change the number of repetitions and folds.

# phenos="angna ast chol ctrt gerx hfvr ht hyty madd osar t2d utfi"

phenos="ctrt"

target="ukbb_sas"
imp="impute2_1kg_eur100"
base_rep="105"
n_reps=6
n_folds=5

# ${phenos} is deliberately unquoted: it is a space-separated list.
for pheno in ${phenos}; do

    # pheno="angna"
    discovery="UKB_${pheno}_eur"

    # Prepare inner CV (train+validation)
    for (( cur_rep = 1; cur_rep <= n_reps; cur_rep++ )); do
        for (( cur_fold = 1; cur_fold <= n_folds; cur_fold++ )); do
            bash prepare_cv_datasets.sh --discovery "${discovery}" --target "${target}" \
                --imp "${imp}" --rep "${base_rep}_${cur_rep}" \
                --cv "${cur_fold}_${n_folds}" --pheno="${pheno}"
        done
    done

    # Prepare outer CV (both+test)
    for (( cur_rep = 1; cur_rep <= n_reps; cur_rep++ )); do
        bash prepare_cv_datasets.sh --discovery "${discovery}" --target "${target}" \
            --imp "${imp}" --rep "${base_rep}_${cur_rep}" \
            --cv "${n_folds}" --pheno="${pheno}"
    done

done

In [6]:
# Build per-phenotype CV split files from the "default" split files
# (pheno___*) found in each repetition directory.

#######################################
# Write a copy of a split file whose third column is looked up in a labels file.
# Arguments:
#   $1 - labels file; column 1 is the lookup key, column 3 the value
#   $2 - split file; columns 1 and 2 are copied to the output
#   $3 - output file (tab-separated: col1, col2, looked-up value; the value is
#        empty when the key is not present in the labels file)
# Returns:
#   1 without touching the output when either input is not readable,
#   otherwise the exit status of awk.
#######################################
relabel_cv_split() {
    local labels_file=$1 split_file=$2 out_file=$3
    if [[ ! -r "${labels_file}" || ! -r "${split_file}" ]]; then
        echo "skip ${out_file}: cannot read ${labels_file} or ${split_file}" >&2
        return 1
    fi
    awk '{if(NR==FNR){c[$1]=$3; next;} print $1"\t"$2"\t"c[$1]}' \
        "${labels_file}" "${split_file}" > "${out_file}"
}

phenos="ctrt chol hfvr hyty ht osar madd t2d ast utfi gerx angna"
target="ukbb_afr"
base_rep=102
n_reps=3
folds=2
# ${phenos} is deliberately unquoted: it is a space-separated list.
for pheno in $phenos; do
    for (( cur_rep = 1; cur_rep <= n_reps; cur_rep++ )); do
        # Work with explicit paths instead of cd, so the shell's working
        # directory is left unchanged and nothing is written elsewhere when a
        # repetition directory is missing.
        rep_dir="${PRS_DATASETS}/${target}/rep_${base_rep}_${cur_rep}"
        if [[ ! -d "${rep_dir}" ]]; then
            echo "skip missing directory: ${rep_dir}" >&2
            continue
        fi
        labels_file="${rep_dir}/../pheno_${pheno}_"

        for (( cur_fold = 1; cur_fold <= folds; cur_fold++ )); do
            echo "cur files (inner): pheno_${pheno}__${cur_fold}"
            relabel_cv_split "${labels_file}" \
                "${rep_dir}/pheno___${cur_fold}_${folds}_train" \
                "${rep_dir}/pheno_${pheno}__${cur_fold}_${folds}_train"
            relabel_cv_split "${labels_file}" \
                "${rep_dir}/pheno___${cur_fold}_${folds}_validation" \
                "${rep_dir}/pheno_${pheno}__${cur_fold}_${folds}_validation"
        done

        echo "cur files (outer): pheno_${pheno}"
        relabel_cv_split "${labels_file}" \
            "${rep_dir}/pheno___${folds}_both" \
            "${rep_dir}/pheno_${pheno}__${folds}_both"
        relabel_cv_split "${labels_file}" \
            "${rep_dir}/pheno___${folds}_test" \
            "${rep_dir}/pheno_${pheno}__${folds}_test"
    done
done
        

cur files (inner): pheno_ctrt__1
cur files (inner): pheno_ctrt__2
cur files (outer): pheno_ctrt
cur files (inner): pheno_ctrt__1
cur files (inner): pheno_ctrt__2
cur files (outer): pheno_ctrt
cur files (inner): pheno_ctrt__1
cur files (inner): pheno_ctrt__2
cur files (outer): pheno_ctrt
cur files (inner): pheno_chol__1
cur files (inner): pheno_chol__2
cur files (outer): pheno_chol
cur files (inner): pheno_chol__1
cur files (inner): pheno_chol__2
cur files (outer): pheno_chol
cur files (inner): pheno_chol__1
cur files (inner): pheno_chol__2
cur files (outer): pheno_chol
cur files (inner): pheno_hfvr__1
cur files (inner): pheno_hfvr__2
cur files (outer): pheno_hfvr
cur files (inner): pheno_hfvr__1
cur files (inner): pheno_hfvr__2
cur files (outer): pheno_hfvr
cur files (inner): pheno_hfvr__1
cur files (inner): pheno_hfvr__2
cur files (outer): pheno_hfvr
cur files (inner): pheno_hyty__1
cur files (inner): pheno_hyty__2
cur files (outer): pheno_hyty
cur files (inner): pheno_hyty__1
cur fil

#### Create "default" CV pheno files from the CV pheno files of a specific phenotype

In [7]:
# Copy the CV split files of one phenotype (pheno_<pheno>__*) over the
# "default" split files (pheno___*) in every repetition directory
# ${PRS_DATASETS}/<target>/rep_<base_rep>_<1..n_reps>.
pheno="ctrt"
target="ukbb_afr"
base_rep=105
n_reps=6
folds=5
for (( cur_rep = 1; cur_rep <= n_reps; cur_rep++ )); do
    # Use explicit paths instead of cd: the shell's working directory stays
    # unchanged, and a missing repetition directory is skipped rather than
    # letting cp run in whatever directory happens to be current.
    rep_dir="${PRS_DATASETS}/${target}/rep_${base_rep}_${cur_rep}"
    if [[ ! -d "${rep_dir}" ]]; then
        echo "skip missing directory: ${rep_dir}" >&2
        continue
    fi

    for (( cur_fold = 1; cur_fold <= folds; cur_fold++ )); do
        echo "cur files (inner): pheno_${pheno}_${cur_rep}_${cur_fold}"
        cp -- "${rep_dir}/pheno_${pheno}__${cur_fold}_${folds}_train" \
            "${rep_dir}/pheno___${cur_fold}_${folds}_train"
        cp -- "${rep_dir}/pheno_${pheno}__${cur_fold}_${folds}_validation" \
            "${rep_dir}/pheno___${cur_fold}_${folds}_validation"
    done

    echo "cur files (outer): pheno_${pheno}_${cur_rep}"
    cp -- "${rep_dir}/pheno_${pheno}__${folds}_both" "${rep_dir}/pheno___${folds}_both"
    cp -- "${rep_dir}/pheno_${pheno}__${folds}_test" "${rep_dir}/pheno___${folds}_test"
done


cur files (inner): pheno_ctrt_1_1
cur files (inner): pheno_ctrt_1_2
cur files (inner): pheno_ctrt_1_3
cur files (inner): pheno_ctrt_1_4
cur files (inner): pheno_ctrt_1_5
cur files (outer): pheno_ctrt_1
cur files (inner): pheno_ctrt_2_1
cur files (inner): pheno_ctrt_2_2
cur files (inner): pheno_ctrt_2_3
cur files (inner): pheno_ctrt_2_4
cur files (inner): pheno_ctrt_2_5
cur files (outer): pheno_ctrt_2
cur files (inner): pheno_ctrt_3_1
cur files (inner): pheno_ctrt_3_2
cur files (inner): pheno_ctrt_3_3
cur files (inner): pheno_ctrt_3_4
cur files (inner): pheno_ctrt_3_5
cur files (outer): pheno_ctrt_3
cur files (inner): pheno_ctrt_4_1
cur files (inner): pheno_ctrt_4_2
cur files (inner): pheno_ctrt_4_3
cur files (inner): pheno_ctrt_4_4
cur files (inner): pheno_ctrt_4_5
cur files (outer): pheno_ctrt_4
cur files (inner): pheno_ctrt_5_1
cur files (inner): pheno_ctrt_5_2
cur files (inner): pheno_ctrt_5_3
cur files (inner): pheno_ctrt_5_4
cur files (inner): pheno_ctrt_5_5
cur files (outer): phe