## Compute SNP accuracy in 1kG populations using ethnically homogeneous imputation panels

### Split 1kg subpopulations into panel and test sets

In [None]:
# Write one sample list per 1kG population code.
# For every code, the lines of pop.panel that contain the code (anywhere on
# the line) are selected and their first tab-separated field is stored in
# pop.<code>.panel, with the code lower-cased in the file name.
populations=(ACB ASW BEB GBR CDX CLM ESN GWD GIH CHB CHS ITU JPT KHV LWK MSL MXL PEL PUR PJL STU YRI FIN TSI IBS CEU)
for code in "${populations[@]}"; do
  grep "${code}" pop.panel \
    | cut -f 1 > "pop.${code,,}.panel"
done

In [None]:
# Copy the first 20 lines of each per-population sample list
# (pop.<code>.panel) into pop.<code>20.panel.
# NOTE(review): presumably this is the held-out test set of the
# panel/test split named in the section heading -- confirm.
populations=(ACB ASW BEB GBR CDX CLM ESN GWD GIH CHB CHS ITU JPT KHV LWK MSL MXL PEL PUR PJL STU YRI CEU IBS TSI FIN)
for code in "${populations[@]}"; do
  src="pop.${code,,}.panel"
  head -n 20 "${src}" > "${src%.panel}20.panel"
done

In [None]:
# Copy lines 21..90 (the 70 lines that follow the first 20) of each
# per-population sample list into pop.<code>70.panel.
# Lists shorter than 90 lines simply yield fewer than 70 entries.
populations=(ACB ASW BEB GBR CDX CLM ESN GWD GIH CHB CHS ITU JPT KHV LWK MSL MXL PEL PUR PJL STU YRI CEU IBS TSI FIN)
for code in "${populations[@]}"; do
  lower=${code,,}
  tail -n +21 "pop.${lower}.panel" \
    | head -n 70 > "pop.${lower}70.panel"
done

In [None]:
#######################################
# Concatenate per-population sample lists into one super-population list.
# Arguments:
#   $1  - super-population tag used in the output name (e.g. eur)
#   $2  - subset size tag used in the file names (e.g. 20 or 70)
#   $3+ - lower-case population codes whose pop.<code><size>.panel files
#         are concatenated, in the given order
# Outputs:
#   Overwrites pop.<superpop><size>.panel in the current directory.
# Returns:
#   2 on bad usage, otherwise the exit status of cat.
#######################################
build_superpop_panel() {
  if (( $# < 3 )); then
    printf 'usage: build_superpop_panel SUPERPOP SIZE POP...\n' >&2
    return 2
  fi
  local superpop=$1 size=$2
  shift 2

  local -a parts=()
  local code
  for code in "$@"; do
    parts+=("pop.${code}${size}.panel")
  done

  # A single truncating redirect replaces the former "rm + append" sequence:
  # no error when the target does not exist yet, and re-running the cell
  # never duplicates entries.
  cat -- "${parts[@]}" > "pop.${superpop}${size}.panel"
}

for b in 20 70; do
  build_superpop_panel eur "$b" gbr fin tsi ibs ceu
  # asw and msl are not part of the afr list (they were commented out in
  # the original command as well).
  build_superpop_panel afr "$b" acb esn gwd lwk yri
  build_superpop_panel sas "$b" beb gih itu pjl stu
  build_superpop_panel eas "$b" cdx chb chs jpt khv
  build_superpop_panel amr "$b" clm mxl pel pur
done



### Aggregate 1kG SNP accuracy results

#### Merge subpopulations (Why only two?)

In [None]:
# Build the ID map for --update-ids from ds.fam: every output line is
# <col1> <col2> 0 <col2> (tab-separated), i.e. the family ID is replaced by 0
# while the individual ID is kept.
awk 'BEGIN { OFS = "\t" } { print $1, $2, 0, $2 }' ds.fam > recoded.txt
plink --bfile ds \
  --update-ids recoded.txt \
  --make-bed --out ds2

# First merge attempt against ../true/ds. The next step reads
# ds_merged-merge.missnp, which this attempt is expected to leave behind
# when some variants cannot be merged.
plink --bfile ds2 \
  --bmerge ../true/ds \
  --make-bed --out ds_merged

# Drop the variants listed in the .missnp file from ds2, then merge again;
# the second merge overwrites ds_merged.
plink --bfile ds2 \
  --exclude ds_merged-merge.missnp \
  --make-bed --out ds2_excluded
plink --bfile ds2_excluded \
  --bmerge ../true/ds \
  --make-bed --out ds_merged

#### Filter out SNPs that do not appear on the chip

In [None]:
# Restrict ds_merged to the variants selected by the strict_mask file and
# write the result as ds_merged_strict.
# --range makes plink read the --extract file as a list of ranges rather
# than a list of variant IDs.
# The path is quoted so that a scratch directory containing spaces or glob
# characters is passed to plink as a single argument.
# NOTE(review): the heading speaks of SNPs "on the chip" while the file is
# called strict_mask -- confirm which filter is intended.
plink --bfile ds_merged \
  --extract "${ELKON_SCRATCH}/1000G_ALL/strict_mask" --range \
  --make-bed --out ds_merged_strict

#### QC and break into per-chromosome vcf files

In [None]:
pop="1kg_sas70" # "1kg_afr2"

# For each autosome, apply the QC filters (--geno 0.1, --maf 0.01,
# --hwe 1e-06) to impute2_${pop}/ds_merged and export the chromosome as
# impute2_${pop}/ds.QC_merged_<chr>.vcf.
# NOTE(review): the input is ds_merged, not the ds_merged_strict file
# produced by the mask-filter cell -- confirm this is intended.
for chrom in {1..22}; do
  plink --bfile "impute2_${pop}/ds_merged" \
    --chr "${chrom}" \
    --geno 0.1 --maf 0.01 --hwe 1e-06 \
    --recode vcf \
    --out "impute2_${pop}/ds.QC_merged_${chrom}"
done

#### Reorder samples

In [None]:
pop=afr
idx=3

# Rewrite every per-chromosome QC'ed VCF with its sample columns selected
# and ordered as listed in reordered_samples${idx} (bcftools view -S).
for chr in {1..22}; do
  echo "start $chr"
  vcf_dir="impute2_1kg_${pop}70"
  bcftools view -S "reordered_samples${idx}" \
    "${vcf_dir}/ds.QC_merged_${chr}.vcf" \
    > "${vcf_dir}/ds.QC_merged_reordered_${chr}.vcf"
done

#### Calculate SNP accuracy per individual

In [None]:
pop=eas
# Number of leading lines of each VCF that are skipped as header.
# It must match the real header length of the reordered VCFs.
header_idx=12

#######################################
# Per-sample genotype concordance between two groups of columns of a VCF.
# Sample i (0-based) is compared as column FIRST_ORG+i against column
# FIRST_IMP+i. Records whose FIRST_IMP+i field contains a "." are ignored
# for that sample.
# The file is read once for all samples (instead of once per sample), and
# the numeric settings are passed with -v rather than pasted into the awk
# program text.
# Arguments:
#   $1 - path to the VCF
#   $2 - number of leading lines to skip
#   $3 - number of samples to score
#   $4 - 1-based column of the first sample of the first group
#   $5 - 1-based column of the first sample of the second group
# Outputs:
#   One line per sample on stdout: matches<TAB>compared<TAB>matches/compared.
#   The ratio is "NA" when no record was compared for that sample (this used
#   to abort awk with a division by zero and drop the row).
# Returns:
#   1 if the VCF is not readable, otherwise the exit status of awk.
#######################################
score_genotype_concordance() {
  local vcf=$1 header_lines=$2 n_samples=$3 first_org=$4 first_imp=$5

  if [[ ! -r "$vcf" ]]; then
    printf 'error: cannot read %s\n' "$vcf" >&2
    return 1
  fi

  awk -v hdr="$header_lines" -v n="$n_samples" \
      -v org0="$first_org" -v imp0="$first_imp" '
    NR > hdr {
      for (i = 0; i < n; i++) {
        if (index($(imp0 + i), ".") == 0) {
          count[i]++
          if ($(org0 + i) == $(imp0 + i)) sum[i]++
        }
      }
    }
    END {
      for (i = 0; i < n; i++) {
        s = sum[i] + 0
        c = count[i] + 0
        if (c > 0) {
          ratio = s / c
          print s "\t" c "\t" ratio
        } else {
          print s "\t" c "\tNA"
        }
      }
    }
  ' "$vcf"
}

# 546 samples: columns 10..555 are compared with columns 556..1101.
for chr in {1..22}; do
  echo "start chr $chr"
  score_genotype_concordance \
    "impute2_1kg_${pop}70/ds.QC_merged_reordered_${chr}.vcf" \
    "$header_idx" 546 10 556 \
    > "impute2_1kg_${pop}70/chr${chr}_qc_merged_reordered_imputation_score"
done

#### Calculate SNP accuracy per 1kG ethnic group

In [None]:
# Row ranges (1-based, inclusive) of the per-individual score files that
# belong to each group, plus the label printed for the group.
declare -A eur_ar=([min]=1 [max]=100 [name]=EUR)
declare -A eas_ar=([min]=101 [max]=200 [name]=EAS)
declare -A sas_ar=([min]=201 [max]=300 [name]=SAS)
declare -A afr_ar=([min]=301 [max]=400 [name]=AFR)
declare -A afr_ar2=([min]=401 [max]=600 [name]=AFR2)
# A markdown heading had been pasted onto the end of this line; without a
# space before '#' bash does not treat it as a comment and stops with a
# syntax error. The array itself is not read by the loop below, which
# iterates over all 22 autosomes.
chrs=(22 21)
arr=(eur_ar eas_ar sas_ar afr_ar afr_ar2)

#######################################
# Average the three columns of a score file over a range of rows.
# Arguments:
#   $1 - score file with tab/space separated rows: col1 col2 col3
#   $2 - first row to include (1-based, default 1)
#   $3 - last row to include (default 0 = up to the end of the file)
# Outputs:
#   mean(col2)<TAB>mean(col2 - col1)<TAB>mean(col3) on stdout, or
#   NA<TAB>NA<TAB>NA when the file is unreadable or the range holds no rows
#   (this used to be an awk division by zero that left the row unfinished).
# Returns:
#   1 if the file is not readable, otherwise the exit status of awk.
#######################################
summarize_score_rows() {
  local file=$1 first=${2:-1} last=${3:-0}

  if [[ ! -r "$file" ]]; then
    printf 'warning: cannot read %s\n' "$file" >&2
    printf 'NA\tNA\tNA\n'
    return 1
  fi

  awk -v first="$first" -v last="$last" '
    NR < first { next }
    last > 0 && NR > last { exit }
    { sum0 += $2; sum += ($2 - $1); sum1 += $3; count += 1 }
    END {
      if (count > 0) {
        m0 = sum0 / count; m1 = sum / count; m2 = sum1 / count
        print m0 "\t" m1 "\t" m2
      } else {
        print "NA\tNA\tNA"
      }
    }
  ' "$file"
}

# One output row per (group, chromosome, imputation panel), followed by an
# empty line after each group; everything is collected in res_reordered.
# NOTE(review): the ajkg14 summary cell writes to the same file name, so
# running both in one directory overwrites this result -- confirm.
for chr in {1..22}; do
  for idx in "${arr[@]}"; do
    declare -n cur_ar="$idx"
    for pop in eur eas afr sas; do
      printf '%s\t%s\t%s\t' "${cur_ar[name]}" "$chr" "$pop"
      summarize_score_rows \
        "impute2_1kg_${pop}70/chr${chr}_qc_merged_reordered_imputation_score" \
        "${cur_ar[min]}" "${cur_ar[max]}"
    done
    echo ""
  done
done > res_reordered

## Compute SNP accuracy in the ajkg14 dataset using ethnically homogeneous imputation panels

### Aggregate 1kG SNP accuracy results

#### Merge subpopulations (Why only two?)

In [None]:
# Build the ID map for --update-ids from ds.fam: every output line is
# <col1> <col2> 0 <col2> (tab-separated), i.e. the family ID is replaced by 0
# while the individual ID is kept.
awk 'BEGIN { OFS = "\t" } { print $1, $2, 0, $2 }' ds.fam > recoded.txt
plink --bfile ds \
  --update-ids recoded.txt \
  --make-bed --out ds2

# First merge attempt against ../true/ds. The next step reads
# ds_merged-merge.missnp, which this attempt is expected to leave behind
# when some variants cannot be merged.
plink --bfile ds2 \
  --bmerge ../true/ds \
  --make-bed --out ds_merged

# Drop the variants listed in the .missnp file from ds2, then merge again;
# the second merge overwrites ds_merged.
plink --bfile ds2 \
  --exclude ds_merged-merge.missnp \
  --make-bed --out ds2_excluded
plink --bfile ds2_excluded \
  --bmerge ../true/ds \
  --make-bed --out ds_merged



#### Filter out SNPs that do not appear on the chip

In [None]:
# Restrict ds_merged to the variants selected by the strict_mask file and
# write the result as ds_merged_strict.
# --range makes plink read the --extract file as a list of ranges rather
# than a list of variant IDs.
# The path is quoted so that a scratch directory containing spaces or glob
# characters is passed to plink as a single argument.
# NOTE(review): the heading speaks of SNPs "on the chip" while the file is
# called strict_mask -- confirm which filter is intended.
plink --bfile ds_merged \
  --extract "${ELKON_SCRATCH}/1000G_ALL/strict_mask" --range \
  --make-bed --out ds_merged_strict

#### QC and break into per-chromosome vcf files

In [None]:
pop="1kg_eur-ajkg14-t101-merged" # "1kg_afr2"

# For each autosome, apply the QC filters (--geno 0.1, --maf 0.01,
# --hwe 1e-06) to impute2_${pop}/ds_merged and export the chromosome as
# impute2_${pop}/ds.QC_merged_<chr>.vcf.
# NOTE(review): the input is ds_merged, not the ds_merged_strict file
# produced by the mask-filter cell -- confirm this is intended.
for chrom in {1..22}; do
  plink --bfile "impute2_${pop}/ds_merged" \
    --chr "${chrom}" \
    --geno 0.1 --maf 0.01 --hwe 1e-06 \
    --recode vcf \
    --out "impute2_${pop}/ds.QC_merged_${chrom}"
done

#### Calculate SNP accuracy per individual

In [None]:
pop="1kg_eur-ajkg14-t101-merged" # "ajkg14_t101" # "1kg_afr2"

#######################################
# Per-sample genotype concordance between two groups of columns of a VCF.
# Sample i (0-based) is compared as column FIRST_ORG+i against column
# FIRST_IMP+i. Records whose FIRST_IMP+i field contains a "." are ignored
# for that sample.
# The file is read once for all samples (instead of once per sample), and
# the numeric settings are passed with -v rather than pasted into the awk
# program text.
# Arguments:
#   $1 - path to the VCF
#   $2 - number of leading lines to skip
#   $3 - number of samples to score
#   $4 - 1-based column of the first sample of the first group
#   $5 - 1-based column of the first sample of the second group
# Outputs:
#   One line per sample on stdout: matches<TAB>compared<TAB>matches/compared.
#   The ratio is "NA" when no record was compared for that sample (this used
#   to abort awk with a division by zero and drop the row).
# Returns:
#   1 if the VCF is not readable, otherwise the exit status of awk.
#######################################
score_genotype_concordance() {
  local vcf=$1 header_lines=$2 n_samples=$3 first_org=$4 first_imp=$5

  if [[ ! -r "$vcf" ]]; then
    printf 'error: cannot read %s\n' "$vcf" >&2
    return 1
  fi

  awk -v hdr="$header_lines" -v n="$n_samples" \
      -v org0="$first_org" -v imp0="$first_imp" '
    NR > hdr {
      for (i = 0; i < n; i++) {
        if (index($(imp0 + i), ".") == 0) {
          count[i]++
          if ($(org0 + i) == $(imp0 + i)) sum[i]++
        }
      }
    }
    END {
      for (i = 0; i < n; i++) {
        s = sum[i] + 0
        c = count[i] + 0
        if (c > 0) {
          ratio = s / c
          print s "\t" c "\t" ratio
        } else {
          print s "\t" c "\tNA"
        }
      }
    }
  ' "$vcf"
}

# 27 samples: columns 10..36 are compared with columns 37..63; the first
# 7 lines of each VCF are skipped as header.
for chr in {22..1}; do
  echo "start chr $chr"
  score_genotype_concordance \
    "impute2_${pop}/ds.QC_merged_${chr}.vcf" \
    7 27 10 37 \
    > "impute2_${pop}/chr${chr}_qc_imputation_score1"
done

#### Calculate SNP accuracy for the ajkg14 dataset

In [None]:
# Not read by the loop below, which iterates over all 22 autosomes.
chrs=(1 20 22)

#######################################
# Average the three columns of a score file over a range of rows.
# Arguments:
#   $1 - score file with tab/space separated rows: col1 col2 col3
#   $2 - first row to include (1-based, default 1)
#   $3 - last row to include (default 0 = up to the end of the file)
# Outputs:
#   mean(col2)<TAB>mean(col2 - col1)<TAB>mean(col3) on stdout, or
#   NA<TAB>NA<TAB>NA when the file is unreadable or the range holds no rows
#   (this used to be an awk division by zero that left the row unfinished).
# Returns:
#   1 if the file is not readable, otherwise the exit status of awk.
#######################################
summarize_score_rows() {
  local file=$1 first=${2:-1} last=${3:-0}

  if [[ ! -r "$file" ]]; then
    printf 'warning: cannot read %s\n' "$file" >&2
    printf 'NA\tNA\tNA\n'
    return 1
  fi

  awk -v first="$first" -v last="$last" '
    NR < first { next }
    last > 0 && NR > last { exit }
    { sum0 += $2; sum += ($2 - $1); sum1 += $3; count += 1 }
    END {
      if (count > 0) {
        m0 = sum0 / count; m1 = sum / count; m2 = sum1 / count
        print m0 "\t" m1 "\t" m2
      } else {
        print "NA\tNA\tNA"
      }
    }
  ' "$file"
}

# One output row per (chromosome, imputation run) averaged over all rows of
# the run's score file, with an empty line after each chromosome.
# NOTE(review): the 1kG per-group summary cell writes to the same file name
# (res_reordered) -- confirm that overwriting it is intended.
for chr in {1..22}; do
  for pop in ajkg14_t101 1kg_eur2 1kg_eas2 1kg_afr2 1kg_gbr2 1kg_eur-ajkg14-t101-merged; do
    printf '%s\t%s\t' "$chr" "$pop"
    summarize_score_rows "impute2_${pop}/chr${chr}_qc_imputation_score1"
  done
  echo ""
done > res_reordered