## Impute datasets

#### Run impute2 using run_by_capacity.sh script

In [None]:
# First reference panel: dataset name and population subset.
# Other ds1_pop values that have been used: "eur-minus-${p}-aj-snps", "eur"
ds1_name='1kg'
ds1_pop='tsi2'

# Second reference panel; only referenced by the merged-panel alternative below.
ds2_name='ajkg14'
ds2_pop='t101'

# Panel identifier embedded in the command-file name.
# Merged-panel alternative: ${ds1_pop}-${ds2_name}-${ds2_pop}-merged
merged_panel="${ds1_pop}"

# Hand the impute2 command file to run_by_capacity.sh, with "chrs" as its input file name.
bash run_by_capacity.sh \
    --cmd_fn "impute_${merged_panel}_scz" \
    --input_fn chrs

#### Check that part files are not corrupted (by the number of fields in each row)
After deleting the corrupted files, the impute2 scripts should be executed again to regenerate them.

In [None]:
## Default params
## (presumably the defaults of remove_inconsistent_part_files.sh -- confirm against the script)
# panel="1kg"
# start_pos=0
# end_pos=500

# Target dataset and imputation run whose part files are checked.
target='ukbb_sas'
imp='impute2_1kg_sas'

# Only chromosomes 3-22 are passed to the script here.
bash remove_inconsistent_part_files.sh \
    --target "${target}" \
    --imp "${imp}" \
    --chrs 3-22

#### Merge impute2 segments into a single bed file
1. For each chromosome, merge the chromosome segments (part files) into a single (gen/impute2) file.
2. Convert each chromosome file into a bed file.
3. Merge all chromosome bed files into a single bed file.

In [None]:
# Merge the impute2 part files of one imputation run into a single bed file,
# using the three ip_* scripts in order.
target='dbg-scz19'
ds1_name="1kg"
# Other ds1_pop values that have been used: "eur-minus-${p}-aj-snps", "eur"
# For a merged panel, also set: ds2_name="ajkg14" and ds2_pop="t101"
ds1_pop="eur100-2"
# Merged-panel alternative: impute2_${ds1_name}_${ds1_pop}-${ds2_name}-${ds2_pop}-merged
ds_merged_imputed="impute2_${ds1_name}_${ds1_pop}"

# Each stage works on the output of the previous one, so the stages are chained
# with && : a failing stage stops the chain instead of letting later stages run
# on missing or partial files.
bash ip_1_merge_to_impute2.sh --target "${target}" --imp "${ds_merged_imputed}" &&
    bash ip_2_convert_to_bed.sh --target "${target}" --imp "${ds_merged_imputed}" &&
    bash ip_3_merge_bed_files.sh --target "${target}" --imp "${ds_merged_imputed}"




#### Identify missed corrupted part files
In case the previous step failed due to missed corrupted part files, these files can be identified by the position of the problematic line in the aggregated chromosome file. After deleting these files, the impute2 scripts should be executed again to regenerate them.

In [None]:
# Find which part file contains a given line of the aggregated chromosome file:
# accumulate the line counts of the part files in index order and stop at the
# first part whose cumulative count reaches the line.
chr=8
# Line number of the problematic line in the aggregated file.
# NOTE(review): treated as 1-based (as reported by most tools) -- confirm.
line_number=2160093
sum=0
found=0

datasets_folder=$PRS_DATASETS
parts_dir="${datasets_folder}/ukbb_sas/impute2_1kg_sas/raw/impute2/parts"

for a in {1..500}; do
    echo "cur part file index: $a"
    part_file="${parts_dir}/chr${chr}.${a}.legend"
    # A missing part contributes no lines; say so explicitly rather than
    # relying on an error message from the line-count pipeline.
    if [[ ! -f "${part_file}" ]]; then
        echo "Part file not found, skipping: ${part_file}" >&2
        continue
    fi
    sum=$((sum + $(wc -l < "${part_file}")))
    echo "Finished passing on part file index $a (Total no. of lines passed: $sum)"
    # -ge rather than -gt: when the cumulative count equals line_number, the
    # line is the last line of THIS part, not the first line of the next one.
    if [[ ${sum} -ge ${line_number} ]]; then
        echo "Corrupted file: $a"
        found=1
        break
    fi
done

if [[ ${found} -eq 0 ]]; then
    echo "Line ${line_number} was not reached: parts 1..500 of chr${chr} hold only ${sum} lines" >&2
fi

#### Move merged dataset into its proper folder

In [None]:
# Move the merged bed/bim/fam trio from the impute2 working folder up to the
# root of the imputation folder, renaming ds.all.* to ds.*.
# p="fin"
target="dbg-scz19"
ds1_name="1kg"
# Other ds1_pop value that has been used: "eur-minus-${p}-aj-snps"
# For a merged panel, also set: ds2_name="ajkg14" and ds2_pop="t101"
ds1_pop="eur100-2"
# Merged-panel alternative: impute2_${ds1_name}_${ds1_pop}-${ds2_name}-${ds2_pop}-merged
ds_merged_imputed="impute2_${ds1_name}_${ds1_pop}"

imp_dir="${PRS_DATASETS}/${target}/${ds_merged_imputed}"

# The three moves run in parallel as background jobs.
for a in bim bed fam; do
    mv "${imp_dir}/raw/impute2/bed/ds.all.${a}" \
       "${imp_dir}/ds.${a}" &
done
# Block until every background mv has finished, so that later steps never see
# a partially moved dataset.
wait

#### Run QC on the data

In [None]:
# p="ceu"

# Values for the commented-out calc_prs.sh follow-up at the end of this cell;
# the active QC command does not read them.
# Other discovery values that have been used: "UKB_ht_EUR_1", "D_t2d_mahajan_2018"
discovery='D2_hght_yengo_2018'
# Other pheno value that has been used: "ht"
pheno=''

# Target dataset to run QC on. Other value that has been used: "ukbb_afr"
target='ukbb_sas'

# Reference panel used for the imputation.
# Other ds1_pop values that have been used: "eur-minus-${p}-aj-snps", "eur"
ds1_name='1kg'
ds1_pop='eur100-minus-gbr'
# ds2_name="ajkg14"
# ds2_pop="t101"

# Merged-panel alternative: impute2_${ds1_name}_${ds1_pop}-${ds2_name}-${ds2_pop}-merged
ds_merged_imputed="impute2_${ds1_name}_${ds1_pop}"

bash qc_target_data.sh \
    --target "${target}" \
    --imp "${ds_merged_imputed}"
# Optional follow-up (append to the command above to compute PRS right after QC):
#   && bash calc_prs.sh --target ${target} --imp ${ds_merged_imputed} --discovery ${discovery} --stage 0 --pheno ${pheno} --pval_th 0.00000005,0.001,0.005,0.01,0.05,0.1,0.2,0.3,0.4,0.5

#### Compute PRS

In [None]:
# p="ceu"

# Target dataset, discovery identifier and phenotype passed to calc_prs.sh.
target='ukbb_afr'
discovery='ukbb_utfi_morton_2019'
pheno='utfi'

# Reference panel used for the imputation.
# Other ds1_pop values that have been used: "eur-minus-${p}-aj-snps", "eur"
ds1_name='1kg'
ds1_pop='afr'
# ds2_name="ajkg14"
# ds2_pop="t101"

# Merged-panel alternative: impute2_${ds1_name}_${ds1_pop}-${ds2_name}-${ds2_pop}-merged
ds_merged_imputed="impute2_${ds1_name}_${ds1_pop}"

# Comma-separated list handed to --pval_th.
pval_thresholds='0.00000005,0.001,0.005,0.01,0.05,0.1,0.2,0.3,0.4,0.5'

bash calc_prs.sh \
    --target "${target}" \
    --imp "${ds_merged_imputed}" \
    --discovery "${discovery}" \
    --pheno "${pheno}" \
    --stage 0 \
    --pval_th "${pval_thresholds}"

### Dealing with part and impute2 files after creating the merged bed file
After generating the merged bed file, part and impute2 files are only needed for debug purposes. Therefore, they could be zipped or deleted.
Below are scripts for zipping these files

**Note that info files can sometimes be useful for QC purposes** (e.g., to remove SNPs with a low confidence score).

#### zip part files:

In [None]:
## Default params
# panel="1kg"
# start_pos=0
# end_pos=500

# Single imp
# imp="impute2_1kg_eur2"

# Zip the part files of every imputation folder of one target, up to 10 folders
# at a time.
target="dbg-scz19"
# Imputation folders live under $PRS_DATASETS/<target>/, so list the target's
# folder rather than the datasets root.
# -I {} already runs one command per input line; -n is mutually exclusive with
# -I in xargs, so it is not passed.
ls -1 "${PRS_DATASETS}/${target}" | grep impute2 | xargs -I {} -P 10 bash zip_part_files.sh --target "${target}" --imp {}



#### zip chr files:

In [None]:
## Default params
# panel="1kg"

# Single imp
# imp="impute2_1kg_eur2"

target="dbg-scz19"
ls -1 $PRS_DATASETS | grep impute2 | xargs -I {} -n 1 -P 10 bash zip_chr_files.sh --target ${target} --imp {}

