# GD based on QCed Single-ancestry GWAS

In [None]:
%%writefile /home/jupyter/panukbb/scripts/calc_eur_dist.R
# For each single-ancestry training set, read the projected PCs of the test
# samples, compute one distance value per sample, summarise it by assigned
# population and write the per-sample table to disk.
setwd("/home/jupyter/panukbb/outputs")
library(data.table)

# Table providing the per-sample population label (columns IID, Assign_Pop).
pcs2 <- fread("../data/0523_aou_ancestry_combined.txt.gz")

pops <- c("AFR", "AMR")
pc_cols <- paste0("PC", 1:20)

for (pop in pops) {
  # Local copy from google bucket OUTPUT_DIR.
  pca_df <- fread(paste0("training_", pop, "_testPops_projections.txt"))
  pca_mat <- pca_df[, .SD, .SDcols = pc_cols]

  # sqrt(rowMeans(x^2)) is the root-mean-square of the 20 PC coordinates,
  # i.e. the Euclidean norm divided by sqrt(20).
  dist_df <- data.table(
    FID = pca_df$FID,
    IID = pca_df$IID,
    euc_dist = sqrt(rowMeans(pca_mat^2))
  )

  # Attach the population label by matching sample IDs.
  dist_df[, Pop := pcs2[match(dist_df$IID, IID)]$Assign_Pop]
  mus <- dist_df[, lapply(.SD, mean), .SDcols = c("euc_dist"), by = Pop]
  print(pop)
  print(mus[order(euc_dist)])

  fwrite(
    dist_df,
    file = paste0("0427_", pop, "train_proj_euc_test.tsv"),
    col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t"
  )
}

# GD based on multi-ancestry discovery GWAS

## get QCed SNPs based on all pops

In [None]:
%%writefile /home/jupyter/panukbb/scripts/plink_qc_all.sh

# Per-chromosome variant QC on the full genotype set; only a .bim of the
# variants passing the filters is written (--make-just-bim).
# Expected environment: PLINK_FILES (wildcard path to the bed/bim/fam set),
# PLINK2_SOFT (zip that unpacks ./plink2), OUTDIR, CHROM.
set -o errexit
set -o nounset

# Drop the trailing ".*" so the value can be used as a --bfile prefix.
PLINK_BFILE="${PLINK_FILES%.*}"
unzip "${PLINK2_SOFT}"

# The prefix is quoted so it is always passed as a single argument
# (no word splitting or pathname expansion).
./plink2 --bfile "${PLINK_BFILE}" \
  --maf 0.01 --hwe 1e-6 --geno 0.05 --mind 0.1 \
  --make-just-bim --out "${OUTDIR}/aou_v7_allPops_chr${CHROM}_qcd"

In [None]:
%%bash


# Build the dsub tasks file for plink_qc_all.sh: a header row followed by one
# tab-separated row per autosome (CHROM, PLINK_FILES wildcard, OUTDIR).
# The file is written with a single redirect so it is never left half-appended.
tasks_file=/home/jupyter/panukbb/scripts/tasks_plink_qc_all.tsv
geno_dir=gs://fc-secure-8eead535-fb8f-4f53-8f13-49f8f3a4d8b1/data/genos

{
    printf '%s\t%s\t%s\n' "--env CHROM" "--input PLINK_FILES" "--output-recursive OUTDIR"
    for chrom in {1..22}; do
        printf '%s\t%s\t%s\n' "${chrom}" "${geno_dir}/aou_v7_chr${chrom}.*" "${geno_dir}/"
    done
} > "${tasks_file}"

In [None]:
%%bash --out TEST_JOB_ID

# Submit plink_qc_all.sh through aou_dsub using tasks_plink_qc_all.tsv.
source ~/aou_dsub.bash

job_args=(
  --image gcr.io/ukbb-diversepops-neale/yw-prs-r:test
  --input PLINK2_SOFT="${WORKSPACE_BUCKET}/software/plink2.zip"
  --min-cores 4
  --min-ram 10
  --disk-size 10
  --boot-disk-size 10
  --name "plink_qc"
  --label "batch=plink_qc_0410"
  --tasks '/home/jupyter/panukbb/scripts/tasks_plink_qc_all.tsv'
  --script '/home/jupyter/panukbb/scripts/plink_qc_all.sh'
)

aou_dsub "${job_args[@]}"

In [None]:
%%writefile /home/jupyter/panukbb/scripts/plink_pruneAll.sh

# LD-prune one chromosome, restricted to the variants listed in BIMFILE and
# excluding the regions listed in LONG_LD.
# Expected environment: PLINK_FILES, PLINK2_SOFT (zip that unpacks ./plink2),
# BIMFILE, LONG_LD, OUTDIR, CHROM.
set -o errexit
set -o nounset

# Drop the trailing ".*" so the value can be used as a --bfile prefix.
PLINK_BFILE="${PLINK_FILES%.*}"
unzip "${PLINK2_SOFT}"

## pruned SNPs
# The prefix is quoted so it is always passed as a single argument.
./plink2 --bfile "${PLINK_BFILE}" \
  --extract "${BIMFILE}" \
  --indep-pairwise 100 50 0.1 \
  --exclude range "${LONG_LD}" \
  --out "${OUTDIR}/aou_v7_allPops_qcd_prune_chr${CHROM}"


In [None]:
%%bash


# Build the dsub tasks file for plink_pruneAll.sh: a header row followed by
# one tab-separated row per autosome (CHROM, BIMFILE, PLINK_FILES, OUTDIR).
# The file is written with a single redirect so it is never left half-appended.
tasks_file=/home/jupyter/panukbb/scripts/tasks_plink_pruneAll.tsv
geno_dir=gs://fc-secure-8eead535-fb8f-4f53-8f13-49f8f3a4d8b1/data/genos
out_dir=gs://fc-secure-bc8d81b7-7002-4a06-986d-e37eb771200b/pca/

{
    printf '%s\t%s\t%s\t%s\n' \
        "--env CHROM" "--input BIMFILE" "--input PLINK_FILES" "--output-recursive OUTDIR"
    for chrom in {1..22}; do
        printf '%s\t%s\t%s\t%s\n' \
            "${chrom}" \
            "${geno_dir}/aou_v7_allPops_chr${chrom}_qcd.bim" \
            "${geno_dir}/aou_v7_chr${chrom}.*" \
            "${out_dir}"
    done
} > "${tasks_file}"

In [None]:
%%bash --out TEST_JOB_ID

# Submit plink_pruneAll.sh through aou_dsub using tasks_plink_pruneAll.tsv.
source ~/aou_dsub.bash

job_args=(
  --image gcr.io/ukbb-diversepops-neale/yw-prs-r:test
  --input PLINK2_SOFT="${WORKSPACE_BUCKET}/software/plink2.zip"
  --input LONG_LD="${WORKSPACE_BUCKET}/panukbb/data/high-LD-regions-hg38-GRCh38.bed"
  --min-cores 4
  --min-ram 10
  --disk-size 10
  --boot-disk-size 10
  --name "plink_pruneAll"
  --label "batch=plink_pruneall_0410"
  --tasks '/home/jupyter/panukbb/scripts/tasks_plink_pruneAll.tsv'
  --script '/home/jupyter/panukbb/scripts/plink_pruneAll.sh'
)

aou_dsub "${job_args[@]}"

In [None]:
%%writefile /home/jupyter/panukbb/scripts/plink_pruneAll_bfiles.sh

# For one chromosome, write a bed/bim/fam set per sample group containing only
# the variants listed in PRUNE_FILE and the samples in that group's .ids file.
# Expected environment: PLINK_FILES, IND_DIR (wildcard path to the .ids files),
# PLINK2_SOFT (zip that unpacks ./plink2), PRUNE_FILE, OUTDIR, CHROM.
set -o errexit
set -o nounset

# Drop the trailing ".*" so the value can be used as a --bfile prefix.
PLINK_BFILE="${PLINK_FILES%.*}"
# Keep only the directory part of the .ids wildcard path.
ind_dir="${IND_DIR%/*}"

unzip "${PLINK2_SOFT}"

## extract pruned SNPs for training
pops="training_AFR training_EUR training_AMR trainPops testPops"
# ${pops} is intentionally unquoted: the loop relies on word splitting.
for pop in ${pops}; do
  # Paths are quoted so each is always passed as a single argument.
  ./plink2 --bfile "${PLINK_BFILE}" --keep-allele-order \
    --keep "${ind_dir}/aou_v7_${pop}.ids" --extract "${PRUNE_FILE}" \
    --make-bed --out "${OUTDIR}/aou_v7_${pop}_chr${CHROM}"
done

In [None]:
%%bash


# Build the dsub tasks file for plink_pruneAll_bfiles.sh: a header row followed
# by one tab-separated row per autosome
# (CHROM, IND_DIR, PRUNE_FILE, PLINK_FILES, OUTDIR).
# The file is written with a single redirect so it is never left half-appended.
tasks_file=/home/jupyter/panukbb/scripts/tasks_plink_pruneAll_bfiles.tsv
ids_glob='gs://fc-secure-0a267fcd-4f74-4643-b7f3-d0cf2e548bcc/panukbb/phenos/aou_v7*ids'
geno_dir=gs://fc-secure-8eead535-fb8f-4f53-8f13-49f8f3a4d8b1/data/genos
pca_dir=gs://fc-secure-bc8d81b7-7002-4a06-986d-e37eb771200b/pca

{
    printf '%s\t%s\t%s\t%s\t%s\n' \
        "--env CHROM" "--input IND_DIR" "--input PRUNE_FILE" \
        "--input PLINK_FILES" "--output-recursive OUTDIR"
    for chrom in {1..22}; do
        printf '%s\t%s\t%s\t%s\t%s\n' \
            "${chrom}" \
            "${ids_glob}" \
            "${pca_dir}/aou_v7_allPops_qcd_prune_chr${chrom}.prune.in" \
            "${geno_dir}/aou_v7_chr${chrom}.*" \
            "${pca_dir}/pruned_all/"
    done
} > "${tasks_file}"

In [None]:
%%bash --out TEST_JOB_ID3

# Submit plink_pruneAll_bfiles.sh through aou_dsub using
# tasks_plink_pruneAll_bfiles.tsv.
source ~/aou_dsub.bash

job_args=(
  --image gcr.io/ukbb-diversepops-neale/yw-prs-r:test
  --input PLINK2_SOFT="${WORKSPACE_BUCKET}/software/plink2.zip"
  --min-cores 4
  --min-ram 10
  --disk-size 10
  --boot-disk-size 10
  --name "plink_pruneall_bfile"
  --label "batch=plink_pruneall_bfile_0410_2"
  --tasks '/home/jupyter/panukbb/scripts/tasks_plink_pruneAll_bfiles.tsv'
  --script '/home/jupyter/panukbb/scripts/plink_pruneAll_bfiles.sh'
)

aou_dsub "${job_args[@]}"

In [None]:
%%writefile /home/jupyter/panukbb/scripts/merge_all.sh

# Merge the per-chromosome bed/bim/fam sets of each sample group into one
# fileset per group.
# Expected environment: PLINK_FILES (directory holding the per-chromosome
# files), PLINK2_SOFT (zip that unpacks ./plink), OUT_DIR.
set -o errexit
set -o nounset

PLINK_BFILE="${PLINK_FILES%.*}"
echo "${PLINK_BFILE}"

unzip "${PLINK2_SOFT}"

pops="training_AFR training_EUR training_AMR trainPops testPops"
# ${pops} is intentionally unquoted: the loop relies on word splitting.
for pop in ${pops}; do

  # List the per-chromosome prefixes. The dot is escaped and the pattern is
  # anchored so that only a trailing ".bed" is removed, never an earlier
  # "<any char>bed" inside the path.
  ls "${PLINK_BFILE}"/aou_v7_${pop}_chr*bed | sed -e 's/\.bed$//' > ./merge-list.txt
  ./plink --merge-list merge-list.txt --out "${OUT_DIR}/aou_v7_${pop}"

done


In [None]:
%%bash --out TEST_JOB_ID

# Submit merge_all.sh through aou_dsub as a single job; input and output both
# point at the pruned_all directory.
source ~/aou_dsub.bash

job_args=(
  --image gcr.io/ukbb-diversepops-neale/yw-prs-r:test
  --input PLINK2_SOFT="${WORKSPACE_BUCKET}/software/plink_linux_x86_64_20231018.zip"
  --input-recursive PLINK_FILES="gs://fc-secure-bc8d81b7-7002-4a06-986d-e37eb771200b/pca/pruned_all/"
  --output-recursive OUT_DIR="gs://fc-secure-bc8d81b7-7002-4a06-986d-e37eb771200b/pca/pruned_all/"
  --min-cores 4
  --min-ram 30
  --disk-size 30
  --boot-disk-size 30
  --name "merge_plink"
  --label "batch=plink_0410"
  --script '/home/jupyter/panukbb/scripts/merge_all.sh'
)

aou_dsub "${job_args[@]}"

In [None]:
%%writefile /home/jupyter/panukbb/scripts/update_a1s.sh

# Rewrite one merged fileset with the A1 alleles taken from A1_ALLELES.
# Expected environment: PLINK_FILES, PLINK2_SOFT (zip that unpacks ./plink),
# A1_ALLELES, OUT_DIR, POP.
set -o errexit
set -o nounset

# Drop the trailing ".*" so the value can be used as a --bfile prefix.
PLINK_BFILE="${PLINK_FILES%.*}"
echo "${PLINK_BFILE}"

unzip "${PLINK2_SOFT}"

# The prefix is quoted so it is always passed as a single argument.
./plink --bfile "${PLINK_BFILE}" --a1-allele "${A1_ALLELES}" \
    --make-bed --out "${OUT_DIR}/aou_v7_${POP}_a1s"



In [None]:
%%bash --out TEST_JOB_ID

# Submit update_a1s.sh through aou_dsub once per sample group.
source ~/aou_dsub.bash

for pop in training_AFR training_EUR training_AMR trainPops testPops; do
  job_args=(
    --image gcr.io/ukbb-diversepops-neale/yw-prs-r:test
    --input PLINK2_SOFT="${WORKSPACE_BUCKET}/software/plink_linux_x86_64_20231018.zip"
    --input PLINK_FILES="gs://fc-secure-bc8d81b7-7002-4a06-986d-e37eb771200b/pca/pruned_all/aou_v7_${pop}.*"
    --env POP="${pop}"
    --output-recursive OUT_DIR="gs://fc-secure-bc8d81b7-7002-4a06-986d-e37eb771200b/pca/pruned_all/"
    --input A1_ALLELES="gs://fc-secure-bc8d81b7-7002-4a06-986d-e37eb771200b/pca/pruned_all/a1s.list"
    --min-cores 4
    --min-ram 10
    --disk-size 10
    --boot-disk-size 10
    --name "update_a1s_plink"
    --label "batch=update_a1s_0410_2"
    --script '/home/jupyter/panukbb/scripts/update_a1s.sh'
  )
  aou_dsub "${job_args[@]}"
done

## Use intersecting SNPs based on QC & LD pruned in each discovery Pop

In [None]:
%%writefile /home/jupyter/panukbb/scripts/plink_qc_pops.sh

# For one chromosome, run QC filters and LD pruning separately within each
# training population, excluding the regions listed in LONG_LD.
# Expected environment: PLINK_FILES, IND_DIR (wildcard path to the .ids files),
# PLINK2_SOFT (zip that unpacks ./plink2), LONG_LD, OUTDIR, CHROM.
set -o errexit
set -o nounset

# Drop the trailing ".*" so the value can be used as a --bfile prefix.
PLINK_BFILE="${PLINK_FILES%.*}"
# Keep only the directory part of the .ids wildcard path.
ind_dir="${IND_DIR%/*}"

unzip "${PLINK2_SOFT}"

pops="training_AFR training_EUR training_AMR"

# ${pops} is intentionally unquoted: the loop relies on word splitting.
for pop in ${pops}; do

  # Paths are quoted so each is always passed as a single argument.
  ./plink2 --bfile "${PLINK_BFILE}" --keep "${ind_dir}/aou_v7_${pop}.ids" \
    --maf 0.01 --hwe 1e-6 --geno 0.05 --mind 0.1 \
    --indep-pairwise 100 50 0.2 --exclude range "${LONG_LD}" \
    --out "${OUTDIR}/aou_v7_${pop}_chr${CHROM}_qcd_pruned02"

done


In [None]:
%%bash


# Build the dsub tasks file shared by the per-population QC and extraction
# jobs: a header row followed by one tab-separated row per autosome
# (CHROM, IND_DIR, PLINK_FILES, OUTDIR).
tasks_file=/home/jupyter/panukbb/scripts/tasks_plink_qc_pops.tsv
ids_glob='gs://fc-secure-0a267fcd-4f74-4643-b7f3-d0cf2e548bcc/panukbb/phenos/aou_v7*ids'
geno_dir=gs://fc-secure-8eead535-fb8f-4f53-8f13-49f8f3a4d8b1/data/genos
out_dir=gs://fc-secure-bc8d81b7-7002-4a06-986d-e37eb771200b/pca/pruned_all/bims/

{
    printf '%s\t%s\t%s\t%s\n' \
        "--env CHROM" "--input IND_DIR" "--input PLINK_FILES" "--output-recursive OUTDIR"
    for chrom in {1..22}; do
        printf '%s\t%s\t%s\t%s\n' \
            "${chrom}" "${ids_glob}" "${geno_dir}/aou_v7_chr${chrom}.*" "${out_dir}"
    done
} > "${tasks_file}"

In [None]:
%%bash --out TEST_JOB_ID3

# Submit plink_qc_pops.sh through aou_dsub using tasks_plink_qc_pops.tsv.
source ~/aou_dsub.bash

job_args=(
  --image gcr.io/ukbb-diversepops-neale/yw-prs-r:test
  --input PLINK2_SOFT="${WORKSPACE_BUCKET}/software/plink2.zip"
  --input LONG_LD="${WORKSPACE_BUCKET}/panukbb/data/high-LD-regions-hg38-GRCh38.bed"
  --min-cores 4
  --min-ram 10
  --disk-size 10
  --boot-disk-size 10
  --name "plink_qc_pops"
  --label "batch=plink_qc_pops_0410"
  --tasks '/home/jupyter/panukbb/scripts/tasks_plink_qc_pops.tsv'
  --script '/home/jupyter/panukbb/scripts/plink_qc_pops.sh'
)

aou_dsub "${job_args[@]}"

In [None]:
%%R
library(data.table)

pops <- c("EUR", "AFR", "AMR")

# For each training population, stack the 22 per-chromosome prune.in files
# (read without a header, so the single column is V1) into one table.
# rbindlist() binds all chromosomes at once instead of growing a table with
# rbind() inside a loop.
dats <- lapply(pops, function(pop) {
  rbindlist(lapply(1:22, function(chr) {
    fread(
      paste0("aou_v7_training_", pop, "_chr", chr, "_qcd_pruned02.prune.in"),
      header = FALSE
    )
  }))
})

# Variant IDs present in the pruned list of every training population.
coms <- Reduce(intersect, lapply(dats, function(dat) dat$V1))
length(coms) #36565
write.table(coms, file = "aou_v7_training_coms.snplist", col.names = FALSE, quote = FALSE, row.names = FALSE)

#gsutil -m cp /home/jupyter/panukbb/data/bims/aou_v7_training_coms.snplist gs://fc-secure-bc8d81b7-7002-4a06-986d-e37eb771200b/pca/pruned_all/bims/

In [None]:
%%writefile /home/jupyter/panukbb/scripts/plink_prunedPops_bfile.sh

# For one chromosome, write a bed/bim/fam set per sample group containing only
# the variants listed in SNPLIST and the samples in that group's .ids file.
# Expected environment: PLINK_FILES, IND_DIR (wildcard path to the .ids files),
# PLINK2_SOFT (zip that unpacks ./plink2), SNPLIST, OUTDIR, CHROM.
set -o errexit
set -o nounset

# Drop the trailing ".*" so the value can be used as a --bfile prefix.
PLINK_BFILE="${PLINK_FILES%.*}"
# Keep only the directory part of the .ids wildcard path.
ind_dir="${IND_DIR%/*}"

unzip "${PLINK2_SOFT}"

pops="training_AFR training_EUR training_AMR trainPops testPops"

# ${pops} is intentionally unquoted: the loop relies on word splitting.
for pop in ${pops}; do

  ## extract pruned SNPs for training
  # Paths are quoted so each is always passed as a single argument.
  ./plink2 --bfile "${PLINK_BFILE}" --keep "${ind_dir}/aou_v7_${pop}.ids" \
    --keep-allele-order --extract "${SNPLIST}" --make-bed \
    --out "${OUTDIR}/aou_v7_${pop}_qcd_pruned_chr${CHROM}"

done


In [None]:
%%bash --out TEST_JOB_ID3

# Submit plink_prunedPops_bfile.sh through aou_dsub, reusing
# tasks_plink_qc_pops.tsv and supplying the common SNP list as SNPLIST.
source ~/aou_dsub.bash

job_args=(
  --image gcr.io/ukbb-diversepops-neale/yw-prs-r:test
  --input PLINK2_SOFT="${WORKSPACE_BUCKET}/software/plink2.zip"
  --input SNPLIST="gs://fc-secure-bc8d81b7-7002-4a06-986d-e37eb771200b/pca/pruned_all/bims/aou_v7_training_coms.snplist"
  --min-cores 4
  --min-ram 10
  --disk-size 10
  --boot-disk-size 10
  --name "plink_prunedpops_bfile"
  --label "batch=plink_prunedpops_bfile_0410"
  --tasks '/home/jupyter/panukbb/scripts/tasks_plink_qc_pops.tsv'
  --script '/home/jupyter/panukbb/scripts/plink_prunedPops_bfile.sh'
)

aou_dsub "${job_args[@]}"

In [None]:
%%writefile /home/jupyter/panukbb/scripts/merge_pops.sh

# Merge the per-chromosome "_qcd_pruned_chr*" filesets of each sample group
# into one fileset per group, keeping the allele order.
# Expected environment: PLINK_FILES (directory holding the per-chromosome
# files), PLINK2_SOFT (zip that unpacks ./plink), OUT_DIR.
set -o errexit
set -o nounset

PLINK_BFILE="${PLINK_FILES%.*}"
echo "${PLINK_BFILE}"

unzip "${PLINK2_SOFT}"

pops="training_AFR training_EUR training_AMR trainPops testPops"
# ${pops} is intentionally unquoted: the loop relies on word splitting.
for pop in ${pops}; do

  # List the per-chromosome prefixes. The dot is escaped and the pattern is
  # anchored so that only a trailing ".bed" is removed, never an earlier
  # "<any char>bed" inside the path.
  ls "${PLINK_BFILE}"/aou_v7_${pop}_qcd_pruned_chr*bed | sed -e 's/\.bed$//' > ./merge-list.txt

  ./plink --keep-allele-order --merge-list merge-list.txt --out "${OUT_DIR}/aou_v7_${pop}"

done

In [None]:
%%bash --out TEST_JOB_ID

# Submit merge_pops.sh through aou_dsub as a single job; input and output both
# point at the pruned_all/bims directory.
source ~/aou_dsub.bash

job_args=(
  --image gcr.io/ukbb-diversepops-neale/yw-prs-r:test
  --input PLINK2_SOFT="${WORKSPACE_BUCKET}/software/plink_linux_x86_64_20231018.zip"
  --input-recursive PLINK_FILES="gs://fc-secure-bc8d81b7-7002-4a06-986d-e37eb771200b/pca/pruned_all/bims/"
  --output-recursive OUT_DIR="gs://fc-secure-bc8d81b7-7002-4a06-986d-e37eb771200b/pca/pruned_all/bims/"
  --min-cores 4
  --min-ram 30
  --disk-size 30
  --boot-disk-size 30
  --name "merge_pops"
  --label "batch=merge_pops_0410"
  --script '/home/jupyter/panukbb/scripts/merge_pops.sh'
)

aou_dsub "${job_args[@]}"

## run PCA

In [None]:
%%writefile /home/jupyter/panukbb/scripts/flashpca_all.sh

# Run flashpca (-d 20) on one fileset and write loadings, mean/sd,
# eigenvectors, eigenvalues, PCs and PVE, each prefixed with POP.
# Expected environment: PLINK_FILES, OUTDIR, POP.
set -o errexit
set -o nounset

# Drop the trailing ".*" so the value can be used as a --bfile prefix.
PLINK_BFILE="${PLINK_FILES%.*}"

# The prefix is quoted so it is always passed as a single argument.
~/flashpca/flashpca --bfile "${PLINK_BFILE}" -d 20 \
  --outload "${OUTDIR}/${POP}_loadings.txt" \
  --outmeansd "${OUTDIR}/${POP}_meansd.txt" \
  --outvec "${OUTDIR}/${POP}_eigenvectors.txt" \
  --outval "${OUTDIR}/${POP}_eigenvalues.txt" \
  --outpc "${OUTDIR}/${POP}_pcs.txt" \
  --outpve "${OUTDIR}/${POP}_pve.txt"


In [None]:
%%bash
# Build the flashpca tasks file for the "_a1s" filesets in pruned_all/:
# a header row followed by one tab-separated row per training set
# (POP, PLINK_FILES, OUTDIR).
tasks_file=/home/jupyter/panukbb/scripts/tasks_flashpca_all.tsv
pca_dir=gs://fc-secure-bc8d81b7-7002-4a06-986d-e37eb771200b/pca/pruned_all

{
    printf '%s\t%s\t%s\n' "--env POP" "--input PLINK_FILES" "--output-recursive OUTDIR"
    for pop in training_AFR training_EUR training_AMR trainPops; do
        printf '%s\t%s\t%s\n' "${pop}" "${pca_dir}/aou_v7_${pop}_a1s.*" "${pca_dir}/"
    done
} > "${tasks_file}"

In [None]:
%%bash
# Build the flashpca tasks file for the filesets in pruned_all/bims/:
# a header row followed by one tab-separated row per training set
# (POP, PLINK_FILES, OUTDIR).
tasks_file=/home/jupyter/panukbb/scripts/tasks_flashpca_all_v2.tsv
pca_dir=gs://fc-secure-bc8d81b7-7002-4a06-986d-e37eb771200b/pca/pruned_all/bims

{
    printf '%s\t%s\t%s\n' "--env POP" "--input PLINK_FILES" "--output-recursive OUTDIR"
    for pop in training_AFR training_EUR training_AMR trainPops; do
        printf '%s\t%s\t%s\n' "${pop}" "${pca_dir}/aou_v7_${pop}.*" "${pca_dir}/"
    done
} > "${tasks_file}"

In [None]:
%%bash --out TEST_JOB_ID

# Submit flashpca_all.sh through aou_dsub using tasks_flashpca_all_v2.tsv.
source ~/aou_dsub.bash

job_args=(
  --image gcr.io/ukbb-diversepops-neale/yw_flashpca:test
  --min-cores 1
  --min-ram 10
  --disk-size 10
  --boot-disk-size 10
  --name "flashpca_v2"
  --label "batch=flashpca_v2_0410"
  --script '/home/jupyter/panukbb/scripts/flashpca_all.sh'
  --tasks '/home/jupyter/panukbb/scripts/tasks_flashpca_all_v2.tsv'
)

aou_dsub "${job_args[@]}"

In [None]:
%%writefile /home/jupyter/panukbb/scripts/flashpca_all_proj.sh

# Project one fileset onto an existing flashpca solution, using the POP-prefixed
# mean/sd and loadings files found next to INPUT_DIR.
# Expected environment: PLINK_FILES, INPUT_DIR (wildcard path to the POP_*txt
# files), OUTDIR, POP.
set -o errexit
set -o nounset

# Drop the trailing ".*" so the value can be used as a --bfile prefix.
PLINK_BFILE="${PLINK_FILES%.*}"
# Keep only the directory part of the wildcard path.
input_dir="${INPUT_DIR%/*}"

# The prefix is quoted so it is always passed as a single argument.
~/flashpca/flashpca --bfile "${PLINK_BFILE}" --project \
  --inmeansd "${input_dir}/${POP}_meansd.txt" \
  --inload "${input_dir}/${POP}_loadings.txt" \
  --outproj "${OUTDIR}/${POP}_testPops_projections.txt" \
  --outvec "${OUTDIR}/${POP}_testPops_eigenvectors.txt" \
  --outval "${OUTDIR}/${POP}_testPops_eigenvalues.txt" -v



In [None]:
%%bash
# Build the projection tasks file for the "_a1s" testPops fileset in
# pruned_all/: a header row followed by one tab-separated row per training set
# (POP, PLINK_FILES, INPUT_DIR, OUTDIR).
tasks_file=/home/jupyter/panukbb/scripts/tasks_flashpca_all_proj.tsv
pca_dir=gs://fc-secure-bc8d81b7-7002-4a06-986d-e37eb771200b/pca/pruned_all

{
    printf '%s\t%s\t%s\t%s\n' \
        "--env POP" "--input PLINK_FILES" "--input INPUT_DIR" "--output-recursive OUTDIR"
    for pop in training_AFR training_EUR training_AMR trainPops; do
        printf '%s\t%s\t%s\t%s\n' \
            "${pop}" "${pca_dir}/aou_v7_testPops_a1s.*" "${pca_dir}/${pop}_*txt" "${pca_dir}/"
    done
} > "${tasks_file}"

In [None]:
%%bash
# Build the projection tasks file for the testPops fileset in pruned_all/bims/:
# a header row followed by one tab-separated row per training set
# (POP, PLINK_FILES, INPUT_DIR, OUTDIR).
tasks_file=/home/jupyter/panukbb/scripts/tasks_flashpca_all_proj_v2.tsv
pca_dir=gs://fc-secure-bc8d81b7-7002-4a06-986d-e37eb771200b/pca/pruned_all/bims

{
    printf '%s\t%s\t%s\t%s\n' \
        "--env POP" "--input PLINK_FILES" "--input INPUT_DIR" "--output-recursive OUTDIR"
    for pop in training_AFR training_EUR training_AMR trainPops; do
        printf '%s\t%s\t%s\t%s\n' \
            "${pop}" "${pca_dir}/aou_v7_testPops.*" "${pca_dir}/${pop}_*txt" "${pca_dir}/"
    done
} > "${tasks_file}"

In [None]:
%%bash --out TEST_JOB_ID

# Submit flashpca_all_proj.sh through aou_dsub using
# tasks_flashpca_all_proj.tsv.
source ~/aou_dsub.bash

job_args=(
  --image gcr.io/ukbb-diversepops-neale/yw_flashpca:test
  --min-cores 1
  --min-ram 10
  --disk-size 10
  --boot-disk-size 10
  --name "flashpca_proj"
  --label "batch=flashpca_proj_0410"
  --script '/home/jupyter/panukbb/scripts/flashpca_all_proj.sh'
  --tasks '/home/jupyter/panukbb/scripts/tasks_flashpca_all_proj.tsv'
)

aou_dsub "${job_args[@]}"

In [None]:
%%bash --out TEST_JOB_ID

# Submit flashpca_all_proj.sh through aou_dsub using
# tasks_flashpca_all_proj_v2.tsv.
source ~/aou_dsub.bash

job_args=(
  --image gcr.io/ukbb-diversepops-neale/yw_flashpca:test
  --min-cores 1
  --min-ram 10
  --disk-size 10
  --boot-disk-size 10
  --name "flashpca_proj_v2"
  --label "batch=flashpca_proj_v2_0411"
  --script '/home/jupyter/panukbb/scripts/flashpca_all_proj.sh'
  --tasks '/home/jupyter/panukbb/scripts/tasks_flashpca_all_proj_v2.tsv'
)

aou_dsub "${job_args[@]}"

In [None]:
%%R
# For each training set, read the projected PCs, compute one distance value
# per sample, print the per-population means and write the per-sample table.
setwd("/home/jupyter/panukbb/outputs")
library(data.table)

pops <- c("training_AFR", "training_EUR", "training_AMR", "trainPops")

# Table providing the per-sample population label (columns IID, Assign_Pop).
pcs2 <- fread("../data/0523_aou_ancestry_combined.txt.gz")
pc_cols <- paste0("PC", 1:20)

for (pop in pops) {
  print(pop)
  # NOTE(review): the projection script in this notebook writes
  # "<POP>_testPops_projections.txt"; confirm "<POP>_projections.txt" is the
  # intended input here.
  pca_df <- fread(paste0(pop, "_projections.txt"))
  pca_mat <- pca_df[, .SD, .SDcols = pc_cols]

  # sqrt(rowMeans(x^2)) is the root-mean-square of the 20 PC coordinates,
  # i.e. the Euclidean norm divided by sqrt(20).
  dist_df <- data.table(
    FID = pca_df$FID,
    IID = pca_df$IID,
    euc_dist = sqrt(rowMeans(pca_mat^2))
  )

  # Attach the population label by matching sample IDs.
  dist_df[, Pop := pcs2[match(dist_df$IID, IID)]$Assign_Pop]
  mus <- dist_df[, lapply(.SD, mean), .SDcols = c("euc_dist"), by = Pop]
  # Values are not auto-printed inside a for loop, so print explicitly.
  print(mus[order(euc_dist)])

  # One output file per training set. Previously every iteration wrote to
  # "0424_EURtrain_proj_euc_test.tsv", so only the last training set survived.
  fwrite(
    dist_df,
    file = paste0("0424_", pop, "_proj_euc_test.tsv"),
    col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t"
  )
}

#gsutil -m cp 0424_*_proj_euc_test.tsv ${WORKSPACE_BUCKET}/panukbb/individualPRS/

In [None]:
%%R
# For each training set, read the projected test-sample PCs and that training
# set's eigenvalues, compute two per-sample distance values, write the
# per-sample table, and collect the per-population means into one summary file.
setwd("/home/jupyter/panukbb/outputs/bims")
library(data.table)

pops <- c("training_AFR", "training_EUR", "training_AMR", "trainPops")

# Table providing the per-sample population label (columns IID, Assign_Pop).
pcs2 <- fread("../../data/0523_aou_ancestry_combined.txt.gz")
pc_cols <- paste0("PC", 1:20)

# Collect one summary table per training set and bind once after the loop
# instead of growing a table with rbind() on every iteration.
outs_list <- vector("list", length(pops))
names(outs_list) <- pops

for (pop in pops) {
  print(pop)
  pca_df <- fread(paste0(pop, "_testPops_projections.txt"))
  pca_mat <- pca_df[, .SD, .SDcols = pc_cols]
  eigenvalues <- fread(paste0(pop, "_eigenvalues.txt"))$V1

  # euc_dist: root-mean-square of the 20 PC coordinates.
  # mah_dist: each squared PC coordinate is divided by the matching eigenvalue
  # before summing and taking the square root.
  dist_df <- data.table(
    FID = pca_df$FID,
    IID = pca_df$IID,
    euc_dist = sqrt(rowMeans(pca_mat^2)),
    mah_dist = sqrt(rowSums(sweep(pca_mat^2, 2, eigenvalues, "/")))
  )

  # Attach the population label by matching sample IDs.
  dist_df[, Pop := pcs2[match(dist_df$IID, IID)]$Assign_Pop]
  fwrite(
    dist_df,
    file = paste0("0624_", pop, "_gd.tsv"),
    col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t"
  )

  mus <- dist_df[, lapply(.SD, mean), .SDcols = c("euc_dist", "mah_dist"), by = Pop]
  print(mus[order(euc_dist)])
  mus[, training := pop]
  outs_list[[pop]] <- mus
}

outs <- rbindlist(outs_list)
fwrite(outs, file = "0427_proj_gd.tsv", col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t")

# Output recorded from a previous run of this cell:
# [1] "training_AFR"
#    Pop   euc_dist   mah_dist
# 1: AFR 0.02168319 0.01931964
# 2: EAS 0.10006237 0.03862752
# 3: AMR 0.11974947 0.04719078
# 4: CSA 0.12180336 0.04656767
# 5: MID 0.13252304 0.05069538
# 6: EUR 0.15485431 0.05888731
# [1] "training_EUR"
#    Pop    euc_dist   mah_dist
# 1: EUR 0.009635833 0.01067965
# 2: AMR 0.018887601 0.01654855
# 3: CSA 0.030026092 0.02795632
# 4: EAS 0.038432065 0.03783596
# 5: MID 0.044132719 0.02731746
# 6: AFR 0.060390162 0.04550837
# [1] "training_AMR"
#    Pop   euc_dist   mah_dist
# 1: CSA 0.02054483 0.01158777
# 2: AMR 0.02991078 0.02082831
# 3: EAS 0.03192151 0.02179467
# 4: MID 0.04551933 0.01658957
# 5: EUR 0.05058898 0.02134086
# 6: AFR 0.13125855 0.04562398
# [1] "trainPops"
#    Pop   euc_dist    mah_dist
# 1: CSA 0.02490818 0.009728248
# 2: EUR 0.03884104 0.006048028
# 3: MID 0.04117273 0.019880651
# 4: AMR 0.04527470 0.011689324
# 5: EAS 0.06250559 0.012109583
# 6: AFR 0.10344316 0.013524216

#gsutil -m cp 0427_proj_gd.tsv ${WORKSPACE_BUCKET}/panukbb/individualPRS/