## Combine phased and unphased datasets into one imputation panel

### First, phase the unphased dataset using shapeit. This step is usually executed via "run_by_capacity.sh"

In [None]:
# Name of the dataset to phase; it selects the shapeit command file below.
ds2_name="ajkg14"

# Launch phasing through the capacity runner.
# NOTE(review): presumably --cmd_fn names the command template and --input_fn
# the list of chromosomes -- confirm against run_by_capacity.sh.
runner_args=(--cmd_fn "shapeit_${ds2_name}" --input_fn chrs)
bash run_by_capacity.sh "${runner_args[@]}"

### Create a phased vcf file from phased haps/sample files

In [None]:
# p is only referenced by the naming note on ds1_pop; no command reads it.
p="ceu"
ds1_name="1kg"
# NOTE(review): the later cells read chr*.phased.vcf from ds1_pop="eur" --
# confirm that "kdv" is the intended panel here.
ds1_pop="kdv" # eur-minus-${p}-aj-snps

# Convert every autosome's phased haps/sample pair into a VCF.
# Each chromosome runs as its own background job and prints "done chrN"
# only when shapeit exits successfully.
panel_dir="$PRS_DATASETS/${ds1_name}/imputation_panels/${ds1_pop}"
for chr in {1..22}; do
    {
        shapeit -convert --input-haps "${panel_dir}/chr${chr}.phased" \
            --output-vcf "${panel_dir}/chr${chr}.phased.vcf" &&
            echo "done chr${chr}"
    } &
done

### Align datasets

#### Define the export (if it is not already set):

In [None]:
# Directory bcftools searches for plugins; required by `bcftools +fixref`.
BCFTOOLS_PLUGINS=/specific/elkon/tools/bcftools-git/bcftools/plugins
export BCFTOOLS_PLUGINS

#### Create a compressed (bgz) and indexed (tabix) file from the phased genome (e.g., 1kG)

In [None]:
# p is only referenced by the naming note on ds1_pop; no command reads it.
p="ceu"
ds1_name="1kg"
ds1_pop="eur" # eur-minus-${p}-aj-snps

# Compress each chromosome's phased VCF with bgzip, then index the resulting
# .vcf.gz with tabix (-f overwrites an existing index). One background job
# per chromosome; "done chrN" is printed only if both steps succeed.
declare -a chrs=({1..22})
panel_dir="$PRS_DATASETS/${ds1_name}/imputation_panels/${ds1_pop}"
for chr in "${chrs[@]}"; do
    phased_vcf="${panel_dir}/chr${chr}.phased.vcf"
    {
        bgzip "$phased_vcf" &&
            tabix -f "${phased_vcf}.gz" &&
            echo "done chr${chr}"
    } &
done

#### Flip strands in **phased** data (using bcftools +fixref plugin)

In [None]:
# p is only referenced by the naming note on ds1_pop; no command reads it.
p="ceu"
ds1_name="1kg"
ds1_pop="eur" # eur-minus-${p}-aj-snps

# Run the bcftools +fixref plugin on each chromosome's phased VCF and write
# the result as chrN.phased.flipped.vcf.gz (bgzip-compressed, -Oz).
# Everything after "--" goes to the plugin: -d, the reference FASTA (-f)
# and the per-chromosome 1000G VCF (-i).
# One background job per chromosome; "done chrN" is printed on success.
panel_dir="$PRS_DATASETS/${ds1_name}/imputation_panels/${ds1_pop}"
for chr in {1..22}; do
    plugin_opts=(
        -d
        -f "$HOME_ELKON/data-scratch/human_g1k_v37.fasta"
        -i "$HOME_ELKON/data-scratch/1000G_ALL/chr${chr}.vcf.gz"
    )
    {
        bcftools +fixref "${panel_dir}/chr${chr}.phased.vcf.gz" -Oz \
            -o "${panel_dir}/chr${chr}.phased.flipped.vcf.gz" \
            -- "${plugin_opts[@]}" &&
            echo "done chr${chr}"
    } &
done

#### Sort positions in (unsorted) vcf files

In [None]:
# p is only referenced by the naming note on ds1_pop; no command reads it.
p="ceu"
ds1_name="1kg"
ds1_pop="eur" # eur-minus-${p}-aj-snps
# Chromosomes whose flipped VCF needs re-sorting; edit as required.
declare -a chrs=(8 15)

# Re-sort one chromosome's flipped VCF in place.
# Arguments: $1 - chromosome number
# Globals:   PRS_DATASETS, ds1_name, ds1_pop (read)
# The current file is first renamed to *.flipped.unsorted.vcf.gz, then
# bcftools sort writes the sorted result back under the original name.
# Input and output live in the same ${ds1_name} panel directory, and the
# sort is skipped when the rename fails.
# Returns:   0 on success, non-zero if mv or bcftools sort failed.
resort_flipped_vcf() {
    local chr=$1
    local dir="${PRS_DATASETS}/${ds1_name}/imputation_panels/${ds1_pop}"
    local sorted="${dir}/chr${chr}.phased.flipped.vcf.gz"
    local unsorted="${dir}/chr${chr}.phased.flipped.unsorted.vcf.gz"

    mv "$sorted" "$unsorted" &&
        bcftools sort "$unsorted" -Oz -o "$sorted"
}

for chr in "${chrs[@]}"; do
    if resort_flipped_vcf "$chr"; then
        echo "done chr${chr}"
    else
        echo "failed chr${chr}" >&2
    fi
done

#### Index a vcf.gz (bgz) file (create a tbi file):

In [None]:
# p is only referenced by the naming note on ds1_pop; no command reads it.
p="ceu"
ds1_name="1kg"
ds1_pop="eur" # eur-minus-${p}-aj-snps
declare -a chrs=({1..22})

# Build the tabix index (.tbi) for each flipped VCF; -f overwrites an
# existing index. One background job per chromosome, reporting
# "done chrN" on success.
for chr in "${chrs[@]}"; do
    flipped_vcf="$PRS_DATASETS/${ds1_name}/imputation_panels/${ds1_pop}/chr${chr}.phased.flipped.vcf.gz"
    {
        tabix -f "$flipped_vcf" &&
            echo "done chr${chr}"
    } &
done

### Merge files

#### Merge the two datasets' vcf files

In [None]:
p="ceu"
ds1_name="1kg"
ds1_pop="eur"
ds2_name="ajkg14"
ds2_pop="t101"
# Directory name of the combined panel: <ds1_pop>-<ds2_name>-<ds2_pop>-merged.
merged_panel=${ds1_pop}-${ds2_name}-${ds2_pop}-merged

# The merged panel is stored under the first dataset's imputation_panels tree.
panels_root="$PRS_DATASETS/${ds1_name}/imputation_panels"
mkdir -p "${panels_root}/${merged_panel}" || true

# Merge the two datasets' flipped VCFs chromosome by chromosome into a
# bgzip-compressed VCF (-Oz). One background job per chromosome;
# "done chrN" is printed when bcftools merge succeeds.
declare -a chrs=({1..22})
for chr in "${chrs[@]}"; do
    merge_inputs=(
        "${panels_root}/${ds1_pop}/chr${chr}.phased.flipped.vcf.gz"
        "$PRS_DATASETS/${ds2_name}/imputation_panels/${ds2_pop}/phased/chr${chr}.phased.flipped.vcf.gz"
    )
    {
        bcftools merge -Oz -o "${panels_root}/${merged_panel}/chr${chr}.vcf.gz" \
            "${merge_inputs[@]}" &&
            echo "done chr${chr}"
    } &
done

#### Make haps/sample from merged files

In [None]:
# p is only referenced by the naming note on ds1_pop; no command reads it.
p="ceu"
ds1_name="1kg"
ds1_pop="eur" # eur-minus-${p}-aj-snps
ds2_name="ajkg14"
ds2_pop="t101"
# Directory name of the combined panel: <ds1_pop>-<ds2_name>-<ds2_pop>-merged.
merged_panel=${ds1_pop}-${ds2_name}-${ds2_pop}-merged
declare -a chrs=({1..22})

# Export one chromosome of the merged VCF to haps/sample files with plink2.
# Arguments: $1 - chromosome number
# Globals:   PRS_DATASETS, ds1_name, merged_panel (read)
# Input (chrN.vcf.gz) and output (chrN.*) share the same prefix inside the
# ${ds1_name} merged-panel directory.
# Filters: --geno 0 drops variants with any missing call, and
# --min-alleles 2 --max-alleles 2 keeps biallelic variants only.
export_merged_haps() {
    local chr=$1
    local prefix="${PRS_DATASETS}/${ds1_name}/imputation_panels/${merged_panel}/chr${chr}"

    plink2 --vcf "${prefix}.vcf.gz" \
        --export haps --out "$prefix" \
        --geno 0 --min-alleles 2 --max-alleles 2 --threads 50
}

# One background job per chromosome.
# NOTE(review): 22 concurrent jobs x 50 threads each -- confirm the host
# has the capacity for this.
for chr in "${chrs[@]}"; do
    export_merged_haps "$chr" &
done


### Generate imputation panels

In [None]:
# p is only referenced by the naming note on ds1_pop; no command reads it.
p="ceu"
ds1_name="1kg"
ds1_pop="eur" # eur-minus-${p}-aj-snps
ds2_name="ajkg14"
ds2_pop="t101"
# Directory name of the combined panel: <ds1_pop>-<ds2_name>-<ds2_pop>-merged.
merged_panel=${ds1_pop}-${ds2_name}-${ds2_pop}-merged

declare -a chrs=({1..22})

# Convert one chromosome's merged haps/sample files into reference-panel
# format (hap / legend / sample) with shapeit.
# Arguments: $1 - chromosome number
# Globals:   PRS_DATASETS, ds1_name, merged_panel (read)
# --output-ref takes three file names. All three, and the log, are written
# next to the input inside the merged-panel directory, and the whole
# invocation is a single command so --thread reaches shapeit.
convert_merged_chr() {
    local chr=$1
    local prefix="${PRS_DATASETS}/${ds1_name}/imputation_panels/${merged_panel}/chr${chr}"

    shapeit -convert --input-haps "$prefix" \
        --output-log "${prefix}.log" \
        --output-ref "${prefix}.ref.hap" \
            "${prefix}.ref.legend.gz" \
            "${prefix}.ref.sample" \
        --thread 50
}

# One background job per chromosome.
# NOTE(review): 22 concurrent jobs x 50 threads each -- confirm the host
# has the capacity for this.
for chr in "${chrs[@]}"; do
    convert_merged_chr "$chr" &
done