# Derive set of variants for GWAS study (NONGENT)

1. SNP level filtering    
1a. Call rate < 0.95    
1b. MAF < 0.01 (in nonGenetic cohort)    
2. Sample level filtering    
2a. High missingness (Sample call rate < 0.95)    
2b. Sex discordance (skipped: sex chromosomes not available. Checked differently)    
2c. Extreme heterozygosity (|F|>0.15)    
2d. Ancestory filtering    
2e. Excess relatedness (pairwise kinship coefficients > 0.125 within cases or controls)    
3. SNP level filtering (second)    
3a.hwe < 1e-4 (among controls)    

In [63]:
%%bash
INFILE="../../../../PPMI_WGS/july_2018/PPMI_july2018"
rm -rf t
mkdir -p t
echo '

################## step 1a ##############################
'
module load plink
plink --bfile $INFILE --geno 0.05 --make-bed --out t/1a
echo '

################## step 1b ##############################
'
FILENAME="t/1a"
DIAGFILE="../../../PDcohorts/PPMI/out181018/DEMOG_DIAG.csv"
echo "library(data.table);library(dplyr)
fam=fread('$INFILE.fam') %>% rename(FID=V1, IID=V2, GENETIC_SEX=V5, GENETIC_PHENO=V6);diag=fread('$DIAGFILE')
data = diag %>% mutate(IID=paste('PPMISI', PATNO, sep='')) %>% left_join(fam, ., by = 'IID')
data %>% with(table(GENETIC_SEX, FEMALE, useNA = 'always'));data %>% with(table(GENETIC_PHENO, RECRUIT, useNA = 'always'))
data %>% filter(RECRUIT %in% c('GENPD', 'GENUN', 'REGPD', 'REGUN'))  %>% select(FID, IID) %>% write.table(., 't/ID_GENETIC.txt', quote = F, row.names = F)
data %>% filter(!RECRUIT %in% c('GENPD', 'GENUN', 'REGPD', 'REGUN')) %>% select(FID, IID) %>% write.table(., 't/ID_NONGENT.txt', quote = F, row.names = F)
data %>% filter(is.na(RECRUIT)) %>% select(FID, IID) %>% write.table(., 't/ID_UNKNOWN.txt', quote = F, row.names = F)
" > t/IDsep.R
FILENAME="t/1a"
module load R
Rscript --vanilla t/IDsep.R
plink --bfile $FILENAME --keep t/ID_NONGENT.txt --maf 0.01 --make-bed --out t/1b
echo '

################## step 2a ##############################
'
FILENAME="t/1b"
plink --bfile $FILENAME --mind 0.05 --make-bed --out t/2a
echo '

################## step 2c ##################################
'
FILENAME="t/2a"
THRES=0.15
PRUNE=0.1
plink --bfile $FILENAME --indep-pairwise 50 5 $PRUNE --out t/pruning
plink --bfile $FILENAME --extract t/pruning.prune.in --make-bed --out t/pruned_data
plink --bfile t/pruned_data --het --out t/prunedHet
awk '{if ($6 <= -'$THRES') print $0 }' t/prunedHet.het > t/outliers1.txt
awk '{if ($6 >= '$THRES') print $0 }' t/prunedHet.het > t/outliers2.txt
cat t/outliers1.txt t/outliers2.txt > t/HETEROZYGOSITY_OUTLIERS.txt
plink --bfile $FILENAME --remove t/HETEROZYGOSITY_OUTLIERS.txt --make-bed --out t/2c
echo '

################## step 2d ##################################
'
FILENAME="t/2c"
HAPMAP="../../../tool/hapmap/HAPMAP_hg19_new"
LONGLD="../../../tool/longLD_hg38.txt"
echo '###### Extract non-palindrome SNPs ############################################'
awk '{geno=$5"_"$6;if (geno=="A_G" || geno=="A_C" ||
                       geno=="C_A" || geno=="C_T" ||
                       geno=="G_A" || geno=="G_T" ||
                       geno=="T_C" || geno=="T_G") print}' $FILENAME.bim | cut -f2 > $FILENAME.extract
plink --bfile $FILENAME --extract $FILENAME.extract --make-bed --out "$FILENAME"p
echo '###### Try merge (.missnp for multi alleic cites (potentially flipped strand)##'
plink --bfile "$FILENAME"p --bmerge $HAPMAP --make-bed --out t/hapmap_trymerge
echo '###### Flip the .missnp #######################################################' 
plink --bfile "$FILENAME"p --flip t/hapmap_trymerge-merge.missnp --make-bed --out "$FILENAME"pf
echo '###### A1/A2 exchange correction ##############################################'
plink --bfile "$FILENAME"pf --a1-allele $HAPMAP.bim 2 5 --make-bed --out "$FILENAME"pfa
echo '###### Second try merge ########################################################'
plink --bfile "$FILENAME"pfa --bmerge $HAPMAP --make-bed --out "$FILENAME"pfah --geno 0.01
echo '######  final merge (Sometimese does not need to create pfah2 (simple case)#####'
plink --bfile "$FILENAME"pfa --exclude "$FILENAME"pfah-merge.missnp --make-bed --out "$FILENAME"pfa2
plink --bfile "$FILENAME"pfa2 --bmerge $HAPMAP --make-bed --out "$FILENAME"pfah --geno 0.01
echo '###### LD prune to reduce the snps to 300K #####################################'
plink --bfile "$FILENAME"pfah --indep-pairwise 50 5 0.5 --maf 0.05 --out t/pruning
plink --bfile "$FILENAME"pfah --extract t/pruning.prune.in --make-bed --out "$FILENAME"pfahp
echo '###### PCA #####################################################################'
# Use maf 0.05 and LD prune
plink --bfile "$FILENAME"pfahp --exclude $LONGLD --pca 4 header --make-bed --out t/pca
echo 'library("ggplot2");library(tidyr);library(dplyr);library(data.table)
pcs <- read.table("t/pca.eigenvec", header = T, stringsAsFactors = F) %>% 
  separate(IID, c("IID", "Population", "Continent"), sep = "_", fill = "right") %>% select(IID, FID, PC1, PC2, PC3)
cohort1 = fread("clean/ID_GENETIC.txt") %>% mutate(GeneticStudy="GeneticStudy")
cohort2 = fread("clean/ID_NONGENT.txt") %>% mutate(GeneticStudy="NonGeneticStudy")
cohort = bind_rows(cohort1, cohort2)
pops = read.table("t/pca.fam", header = F, stringsAsFactors = F) %>%
  separate(V2, c("IID", "Population", "Continent"), sep = "_", fill = "right") %>% 
  mutate(
    Population=case_when(
      is.na(Population) & V6==1 ~ "STUDY_HC",
      is.na(Population) & V6==2 ~ "STUDY_PD",
      is.na(Population) & V6==-9 ~ "STUDY_NA",
      TRUE ~ Population),
    Continent=ifelse(is.na(Continent), "STUDY", Continent)) %>% 
  mutate(Population=factor(Population, levels=c("CEU", "CHB", "CHD", "JPT","LWK", "TSI", "YRI", "STUDY_HC", "STUDY_PD", "STUDY_NA"))) %>% 
  rename(FID=V1) %>% select(FID, IID, Population, Continent)
data = inner_join(pops, pcs, by=c("FID", "IID")) %>% 
  left_join(., cohort, by=c("FID", "IID")) %>% 
  mutate(Continent=ifelse(is.na(GeneticStudy), Continent, GeneticStudy))
thres = data %>% group_by(Continent) %>% summarise_at(vars("PC1","PC2"), funs(L=mean(.)-6*sd(.), H=mean(.)+6*sd(.)))
data2 = data %>% mutate(Ancestry = case_when(
  PC1 >= thres[thres$Continent=="EUROPE",]$PC1_L & PC2 >= thres[thres$Continent=="EUROPE",]$PC2_L & PC1 <= thres[thres$Continent=="EUROPE",]$PC1_H & PC2 <= thres[thres$Continent=="EUROPE",]$PC2_H ~ "European",
  PC1 >= thres[thres$Continent=="AFRICA",]$PC1_L & PC2 >= thres[thres$Continent=="AFRICA",]$PC2_L & PC1 <= thres[thres$Continent=="AFRICA",]$PC1_H & PC2 <= thres[thres$Continent=="AFRICA",]$PC2_H ~ "African",
  PC1 >= thres[thres$Continent=="ASIA",]$PC1_L & PC2 >= thres[thres$Continent=="ASIA",]$PC2_L & PC1 <= thres[thres$Continent=="ASIA",]$PC1_H & PC2 <= thres[thres$Continent=="ASIA",]$PC2_H ~ "Asian",
  TRUE ~ "Admixed"
))
cat("\nTable for Population vs Ancestry");table(data2$Population, data2$Ancestry)
data2 %>% filter(Continent %in% c("GeneticStudy", "NonGeneticStudy") & Ancestry=="European") %>% select(FID, IID) %>% write.table(., "t/cohort_europe.txt", quote=F, row.names = F)
data2 %>% filter(Continent %in% c("GeneticStudy", "NonGeneticStudy") & Ancestry=="Asian") %>% select(FID, IID) %>% write.table(., "t/cohort_asia.txt", quote=F, row.names = F)
data2 %>% filter(Continent %in% c("GeneticStudy", "NonGeneticStudy") & Ancestry=="African") %>% select(FID, IID) %>% write.table(., "t/cohort_africa.txt", quote=F, row.names = F)
data2 %>% filter(Continent %in% c("GeneticStudy", "NonGeneticStudy") & Ancestry=="Admixed") %>% select(FID, IID) %>% write.table(., "t/cohort_admix.txt", quote=F, row.names = F)
f1 = ggplot(data2, aes(PC1, PC2, color = Population, shape = Continent)) + geom_point() + theme_bw() + 
  geom_rect(aes(xmin = thres[thres$Continent=="EUROPE",]$PC1_L, xmax = thres[thres$Continent=="EUROPE",]$PC1_H, ymin = thres[thres$Continent=="EUROPE",]$PC2_L, ymax = thres[thres$Continent=="EUROPE",]$PC2_H), fill = NA, color = "grey", linetype = 2) +
  geom_rect(aes(xmin = thres[thres$Continent=="AFRICA",]$PC1_L, xmax = thres[thres$Continent=="AFRICA",]$PC1_H, ymin = thres[thres$Continent=="AFRICA",]$PC2_L, ymax = thres[thres$Continent=="AFRICA",]$PC1_H), fill = NA, color = "grey", linetype = 2) + 
  geom_rect(aes(xmin = thres[thres$Continent=="ASIA",]$PC1_L,   xmax = thres[thres$Continent=="ASIA",]$PC1_H,   ymin = thres[thres$Continent=="ASIA",]$PC2_L,   ymax = thres[thres$Continent=="ASIA",]$PC2_H), fill = NA, color = "grey", linetype = 2)
f2 = data2 %>% filter(Ancestry=="European") %>% 
  ggplot(., aes(PC1, PC2, color = Population, shape = Continent)) + geom_point() + theme_bw() + 
  geom_rect(aes(xmin = thres[thres$Continent=="EUROPE",]$PC1_L, xmax = thres[thres$Continent=="EUROPE",]$PC1_H, ymin = thres[thres$Continent=="EUROPE",]$PC2_L, ymax = thres[thres$Continent=="EUROPE",]$PC2_H), fill = NA, color = "grey", linetype = 2)
f3 = data2 %>% filter(Ancestry=="European") %>% 
  ggplot(., aes(PC1, PC3, color = Population, shape = Continent)) + geom_point() + theme_bw()
ggsave("t/pca_f1.jpg", plot=f1);ggsave("t/pca_f2.jpg", plot=f2);ggsave("t/pca_EuropeanPC12.jpg", plot=f3)
# Colur by cohort
diag = fread("../../../PDcohorts/PPMI/out181018/DEMOG_DIAG.csv") %>% 
  mutate(ID = paste("PPMISI", PATNO, sep=""),
         Variants = case_when(
           grepl("LRRK2+", DIAG)~ "LRRK2+",
           grepl("GBA+", DIAG)~ "GBA+",
           grepl("SNCA+", DIAG)~"SNCA+")) %>% 
  select(ID, Variants)
table(diag$Variants)
data3 = left_join(data2, diag, by = c("IID"="ID")) %>% mutate(Variants = ifelse(is.na(Variants), "N/A", Variants))
f4 = data3 %>% filter(Ancestry=="European") %>% select(PC1, PC2, Variants, Continent) %>% 
  ggplot(., aes(PC1, PC2, color = Variants, shape = Continent)) + geom_point() + theme_bw()
ggsave("t/pca_f4.jpg", plot=f4)' > t/PCA.R
Rscript --vanilla t/PCA.R
plink --bfile $FILENAME --keep t/cohort_europe.txt --make-bed --out t/2d
echo '

################## step 2e ##################################
'
FILENAME="t/2d"
PRUNE=0.1
plink --bfile $FILENAME --make-bed --indep-pairwise 50 5 $PRUNE --out t/pruning
plink --bfile $FILENAME --extract t/pruning.prune.in --make-bed --out t/pruned_data
plink --bfile t/pruned_data --make-rel square0 bin4 --out t/relatedness
echo 'id=read.table("t/relatedness.rel.id", stringsAsFactors = F)
n=nrow(id)
t=readBin("t/relatedness.rel.bin", what="numeric", n=n*n, size=4)
m=matrix(t, ncol=n)
rel=data.frame(id, NA)
names(rel)=c("IID1", "IID2", "Relatedness")
x=1
for (i in 1:n){for (j in which(m[i,]>0.125)){if(i!=j){rel[x,]=c(id[i,2], id[j,2], m[i,j]);x=x+1}}}
rel=rel[complete.cases(rel),]
write.table(rel, "t/relatedness_raw.txt", quote=F, row.names=F)
library(data.table);library(dplyr)
rel = fread("t/relatedness_raw.txt")
diag=fread("'$DIAGFILE'") %>% 
  mutate(IID = paste("PPMISI", PATNO, sep="")) %>% select(IID, RECRUIT)
rel1= left_join(rel, diag, by=c("IID1"="IID")) %>% rename(RECRUIT1=RECRUIT) %>% 
  left_join(., diag, by=c("IID2"="IID")) %>% rename(RECRUIT2=RECRUIT) %>% 
  rowwise() %>% 
  mutate(PD1 = paste(length(grep("PD", RECRUIT1)), length(grep("PD", RECRUIT2)), sep = "_")) %>% 
  mutate(KEEP=case_when(
    PD1=="0_1" ~ IID2,
    TRUE~IID1)) %>% 
  mutate(EXCL=case_when(
    PD1=="0_1" ~ IID1,
    TRUE~IID2)) %>% 
  mutate(KEEP_EXCL = ifelse(PD1=="0_1", "1_0(FLIP)", PD1)) %>% 
  select(KEEP, EXCL, KEEP_EXCL)
cat("Paring and case (1), control(0) status\n");rel1 %>% distinct(EXCL, .keep_all = T) %>% with(table(KEEP_EXCL))
rel1 %>% filter(KEEP_EXCL %in% c("0_0", "1_1")) %>% distinct(EXCL) %>% mutate(FID=EXCL, IID=EXCL) %>% select(FID, IID) %>% write.table(., "t/relatedness_EXCL.txt", quote=F, row.names=F)
' > t/relatedness.R
Rscript --vanilla t/relatedness.R
plink --bfile $FILENAME --remove t/relatedness_EXCL.txt --make-bed --out t/2e
echo '

################## step 3a ##################################
'
FILENAME="t/2e"
plink --bfile $FILENAME --filter-controls --hwe 1E-4 --write-snplist --out t/hwe
plink --bfile $FILENAME --extract t/hwe.snplist --make-bed --out t/3a



################## step 1a ##############################

PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to t/1a.log.
Options in effect:
  --bfile ../../../../PPMI_WGS/july_2018/PPMI_july2018
  --geno 0.05
  --make-bed
  --out t/1a

257652 MB RAM detected; reserving 128826 MB for main workspace.
49900024 variants loaded from .bim file.
1379 people (777 males, 602 females) loaded from .fam.
1249 phenotype values loaded from .fam.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 1379 founders and 0 nonfounders present.
Calculating allele frequencies... 0%1%2%3%4%5%6%7%8%9%10%11%12%13%14%15%16%17%18%19%20%21%22%23%24%25%26%27%28%29%30%31%32%33%34%35%36%37%38%39%40%41%42%43%44%45%46%47%48%49%50%

[+] Loading plink  1.9.0-beta4.4  on cn3094 
[-] Unloading gcc  7.2.0  ... 
[-] Unloading GSL 2.4 for GCC 7.2.0 ... 
[-] Unloading openmpi 3.0.0  for GCC 7.2.0 
[-] Unloading R 3.5.0_build2 
[+] Loading gcc  7.2.0  ... 
[+] Loading GSL 2.4 for GCC 7.2.0 ... 
[+] Loading openmpi 3.0.0  for GCC 7.2.0 
[+] Loading R 3.5.0_build2 

Attaching package: 'dplyr'

The following objects are masked from 'package:data.table':

    between, first, last

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

Error: 1018 variants with 3+ alleles present.
* If you believe this is due to strand inconsistency, try --flip with
  t/hapmap_trymerge-merge.missnp.
  alleles probably remain in your data.  If LD between nearby SNPs is high,
  --flip-scan should detect them.)
* If you are dealing with genuine multiallelic variants, we recommend exporting
  that subset of the data to VCF (via e.g. '-

In [5]:
%%bash
echo "################## Final Report #################################"
for FILENAME in "../../../../PPMI_WGS/july_2018/PPMI_july2018" "t/1a" "t/1b" "t/2a" "t/2b" "t/2c" "t/2d" "t/2e" "t/3a";do
    echo "STEP $(echo $FILENAME | cut -d'/' -f2):  PEOPLE=$(wc -l $FILENAME.fam | cut -f1 -d' '),  VARIANTS=$(wc -l $FILENAME.bim | cut -f1 -d' ')"
done
echo "#################################################################"

################## Final Report #################################
STEP ..:  PEOPLE=1379,  VARIANTS=49900024
STEP 1a:  PEOPLE=1379,  VARIANTS=48590533
STEP 1b:  PEOPLE=754,  VARIANTS=10376226
STEP 2a:  PEOPLE=754,  VARIANTS=10376226
STEP 2b:  PEOPLE=,  VARIANTS=
STEP 2c:  PEOPLE=746,  VARIANTS=10376226
STEP 2d:  PEOPLE=703,  VARIANTS=10376226
STEP 2e:  PEOPLE=703,  VARIANTS=10376226
STEP 3a:  PEOPLE=703,  VARIANTS=10308883
#################################################################


wc: t/2b.fam: No such file or directory
wc: t/2b.bim: No such file or directory


whole cohort
![pca_f1.jpg](t/pca_f1.jpg)

Europeans PC1 vs PC2
![pca_f2.jpg](t/pca_f2.jpg)

Europeans PC1 vs PC3
![t/pca_EuropeanPC13.jpg](t/pca_EuropeanPC12.jpg)

In [6]:
%%bash
mkdir clean_NONGENT
mv t/3a* clean_NONGENT

# Creating PCs and see the population structure within Europeans
1. Calculate PCs
    * for whole cohorts
    * for genetic study and ordinary study separately
2. Create PC plots and write out outliers: if any PC1-5 outside of mean +/- 8SD

In [7]:
%%bash
# Create PCs for whole dataset
BFILE="clean_NONGENT/3a"
LONGLD="../../../tool/longLD_hg38.txt"
module load plink
mkdir -p t
plink --bfile $BFILE --exclude $LONGLD --indep-pairwise 50 5 0.1 --out t/prune
plink --bfile $BFILE --extract t/prune.prune.in --make-bed --out t/prune 
plink --bfile t/prune --pca header --out $BFILE

PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to t/prune.log.
Options in effect:
  --bfile clean_NONGENT/3a
  --exclude ../../../tool/longLD_hg38.txt
  --indep-pairwise 50 5 0.1
  --out t/prune

257652 MB RAM detected; reserving 128826 MB for main workspace.
10308883 variants loaded from .bim file.
703 people (467 males, 236 females) loaded from .fam.
580 phenotype values loaded from .fam.
--exclude: 10308883 variants remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 703 founders and 0 nonfounders present.
Calculating allele frequencies... 0%1%2%3%4%5%6%7%8%9%10%11%12%13%14%15%16%17%18%19%20%21%22%23%24%25%26%27%28%29%30%31%32%33%34%35%36%37%38%39%40%41%42%43%44%45%46%47%48%49%50%51

[+] Loading plink  1.9.0-beta4.4  on cn3390 


In [13]:
%%bash
BFILE=clean_NONGENT/3a
echo 'library(data.table);library(ggplot2);library(dplyr)
INPUT=commandArgs(trailingOnly = TRUE)
PCA = paste(INPUT, "eigenvec", sep=".")
PCs = fread(PCA) %>% inner_join(., fread("clean/3a.fam")[,c(1,2,6)], by=c("FID"="V1", "IID"="V2")) %>% mutate(PATNO=sub("PPMISI", "",IID))
demog = fread("../../../PDcohorts/PPMI/out181018/DEMOG_DIAG.csv", colClasses = c("PATNO"="character"))
df = left_join(PCs, demog, by ="PATNO") %>% mutate(PHENO=as.factor(V6))
df %>% ggplot(., aes(PC1, PC3, color=RECRUIT, shape=PHENO)) + geom_point()
f1 = df %>% ggplot(., aes(PC1, PC2, color=RECRUIT, shape=PHENO)) + geom_point()
ggsave(paste(INPUT, "jpg", sep = "."), plot=f1)
thres = df %>% summarise_at(vars("PC1", "PC2", "PC3", "PC4", "PC5"), funs(L = mean(.)-8*sd(.), H = mean(.)+8*sd(.)))
# outlier = df %>% mutate(EXCL = ifelse(thres$PC1_L < PC1 & PC1 < thres$PC1_H & thres$PC2_L < PC2 & PC2 < thres$PC2_H, 0, 1)) %>% 
outlier = df %>% mutate(EXCL = ifelse(thres$PC1_L < PC1 & PC1 < thres$PC1_H & thres$PC2_L < PC2 & PC2 < thres$PC2_H & thres$PC3_L < PC3 & PC3 < thres$PC3_H & thres$PC3_L < PC3 & PC3 < thres$PC3_H & thres$PC4_L < PC4 & PC4 < thres$PC4_H & thres$PC5_L < PC5 & PC5 < thres$PC5_H, 0, 1)) %>% 
  filter(EXCL==1) %>% select(FID, IID)
write.table(outlier, paste(INPUT, "PCoutliers.txt", sep = "_"), quote = F, row.names = F)
if(nrow(outlier)>0){
  f2 = anti_join(df, outlier, by = c("FID", "IID")) %>% anti_join(., outlier, by = c("FID", "IID")) %>% 
    ggplot(., aes(PC1, PC2, color=RECRUIT, shape=PHENO)) + geom_point()
  ggsave(paste(INPUT, "woOutlier.jpg", sep = "_"), plot=f2)
}' > t/PC12.R
Rscript --vanilla t/PC12.R $BFILE/3a
# create the bfile without outliers
module load plink
plink --bfile $BFILE --remove "$BFILE"_PCoutliers.txt --make-bed --out "$BFILE"rmOUTR

PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to clean_NONGENT/3armOUTR.log.
Options in effect:
  --bfile clean_NONGENT/3a
  --make-bed
  --out clean_NONGENT/3armOUTR
  --remove clean_NONGENT/3a_PCoutliers.txt

257652 MB RAM detected; reserving 128826 MB for main workspace.
10308883 variants loaded from .bim file.
703 people (467 males, 236 females) loaded from .fam.
580 phenotype values loaded from .fam.
--remove: 700 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 700 founders and 0 nonfounders present.
Calculating allele frequencies... 0%1%2%3%4%5%6%7%8%9%10%11%12%13%14%15%16%17%18%19%20%21%22%23%24%25%26%27%28%29%30%31%32%33%34%35%36%37%38%39%40%41%42%43%44%45%46%47%48%49%

[+] Loading plink  1.9.0-beta4.4  on cn3390 


PCA for NONGENT
![clean_NOPNGENT/3a.jpg](clean_NONGENT/3a.jpg)

### Number of outliers

In [9]:
%%bash
echo 'Number minus 1 is the Number of outliers'
wc -l clean_NONGENT/3a_PCoutliers.txt

Number minus 1 is the Number of outliers
4 clean_NONGENT/3a_PCoutliers.txt


PCA for NONGENT without outliers
![clean_NONGENT_NOOUTLIERS/3a.jpg](clean_NONGENT/3a_woOutlier.jpg)

# Liftover to hg19

In [13]:
%%bash
cat clean_NONGENT/3a.bim | awk '{print "chr"$1"\t"$4"\t"$4+1"\t"$2}' | sed 's/:/_/g' > 3a.forhg19

In [14]:
%%bash
head 3a.forhg19 
# get the result form 'https://genome.ucsc.edu/cgi-bin/hgLiftOver' It doesn't accept ":" for variants name. 

chr1	10390	10391	chr1_10390
chr1	10403	10404	chr1_10403
chr1	10406	10407	chr1_10406
chr1	10413	10414	chr1_10413
chr1	10415	10416	chr1_10415
chr1	10416	10417	chr1_10416
chr1	10421	10422	chr1_10421
chr1	10423	10424	chr1_10423
chr1	10427	10428	chr1_10427
chr1	10428	10429	chr1_10428


In [26]:
%%bash
# Use return from liftover procedure
cat clean_NONGENT/hglft_genome_699c_278060.bed | awk '{print $4"\t"$2}' | sed 's/_/:/' > clean_NONGENT/3a_hg19.update
grep -v 'rs' clean_NONGENT/hglft_genome_699c_278060.bed | sed 's/_/\t/' |\
 awk '{if($1!=$4) print $4":"$5"\t"$1,$2}' > clean_NONGENT/3a_hg19.chrmchange
head clean_NONGENT/3a_hg19.chrmchange # chromosome change between hg19 and hg38

chr1:180944	chr15 102521211
chr1:180951	chr15 102521204
chr1:181844	chr2 114359688
chr1:182597	chr2 114358936
chr1:183189	chr2 114358344
chr1:183192	chr2 114358341
chr1:184461	chr15 102517224
chr1:184773	chr2 114356759
chr1:185313	chr2 114356219
chr1:185336	chr2 114356196


In [28]:
%%bash
module load plink
plink --bfile clean_NONGENT/3a --extract clean_NONGENT/3a_hg19.update --update-map clean_NONGENT/3a_hg19.update --make-bed --out _temp
plink --bfile _temp --exclude clean_NONGENT/3a_hg19.chrmchange --make-bed --out clean_NONGENT/3a_hg19 # To sort by position, need to reload.
rm _temp*

PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to _temp.log.
Options in effect:
  --bfile clean_NONGENT/3a
  --extract clean_NONGENT/3a_hg19.update
  --make-bed
  --out _temp
  --update-map clean_NONGENT/3a_hg19.update

257652 MB RAM detected; reserving 128826 MB for main workspace.
10308883 variants loaded from .bim file.
703 people (467 males, 236 females) loaded from .fam.
580 phenotype values loaded from .fam.
--update-map: 10242886 values updated.
--extract: 10242886 variants remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 703 founders and 0 nonfounders present.
Calculating allele frequencies... 0%1%2%3%4%5%6%7%8%9%10%11%12%13%14%15%16%17%18%19%20%21%22%23%24%25%26%27%28%29%30%31%32%33%34%35%36%37%38%39%40%

[+] Loading plink  1.9.0-beta4.4  on cn3566 


In [29]:
%%bash
head clean_NONGENT/3a_hg19.bim

1	chr1:180899	0	10236	AAACCCT	AACCCTAACCCT
1	chr1:180900	0	10237	*	ACCCT
1	chr1:180901	0	10238	*	CCCTAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTAA
1	chr1:180905	0	10242	*	AACCCT
1	chr1:180906	0	10243	*	ACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAAC
1	chr1:10390	0	10390	C	CCCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAA
1	chr1:10403	0	10403	*	ACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
1	chr1:10406	0	10406	*	CT
1	chr1:10413	0	10413	*	TA
1	chr1:10415	0	10415	*	ACCCTAACCCTAACCCTAACCCTAAC
