# Creating a 1000G reference panel

This creates the 1000G reference panel to merge with the other genetics data to determine the ancestry of the samples.

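Starting from the 1000 Genomes reference at https://www.cog-genomics.org/plink/2.0/resources#1kg_phase3, keep only the biallelic SNPs on autosomes with MAF > 0.01, a variant call rate > 0.95 (`--geno 0.05`) and a HWE p-value > 1e-6. Palindromic SNPs (A/T and C/G) and long-range LD regions are also excluded. At the sample level, the samples listed in `deg2_hg38.king.cutoff.out.id` and samples with more than 1% missing genotypes (`--mind 0.01`) are removed.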



In [None]:
import pandas as pd

In [None]:
%%bash
# decompress pgen and rename psam as suggested
# plink2 writes the decompressed stream to stdout, which is redirected into all_hg38.pgen
plink2 --zst-decompress all_hg38.pgen.zst > all_hg38.pgen
# copy the corrected sample file under the all_hg38 prefix so that `--pfile all_hg38` picks it up
cp hg38_corrected.psam all_hg38.psam

In [None]:
%%bash
# sample info (the Sample and Population columns of this sheet are used later)
# Download only when the sheet is not on disk yet: re-running a bare wget would
# save a duplicate "20130606_sample_info.xlsx.1" next to the existing file.
# The download goes to a .part file first so an interrupted transfer is never
# mistaken for a complete sheet on the next run.
if [ ! -f 20130606_sample_info.xlsx ]; then
  wget -O 20130606_sample_info.xlsx.part http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130606_sample_info/20130606_sample_info.xlsx \
    && mv 20130606_sample_info.xlsx.part 20130606_sample_info.xlsx
fi

In [None]:
# Load only the sample ID and population columns of the sheet and call the
# ID column IID, the key used for the merge with the .fam file later on.
t = pd.read_excel(
    '20130606_sample_info.xlsx',
    usecols=['Sample', 'Population'],
).rename(columns={'Sample': 'IID'})

# Keep a tab-separated copy without the pandas index.
t.to_csv('20130606_sample_info.txt', sep='\t', index=False)

In [None]:
%%bash
# Both downloads are skipped when the target already exists, so re-running the
# cell neither creates "<name>.1" duplicates nor re-fetches the large FASTA.
# Each file is fetched to a .part name and renamed only after wget succeeds.

# hg38 long LD region
LD_REGIONS=high-LD-regions-hg38-GRCh38.txt
if [ ! -f "$LD_REGIONS" ]; then
  wget -O "$LD_REGIONS.part" https://raw.githubusercontent.com/meyer-lab-cshl/plinkQC/master/inst/extdata/high-LD-regions-hg38-GRCh38.txt \
    && mv "$LD_REGIONS.part" "$LD_REGIONS"
fi
# Strip every "chr" from the region file (presumably so chromosome codes match
# the un-prefixed codes used by plink2 - confirm). Running it again is a no-op.
sed -i 's/chr//g' "$LD_REGIONS"

# hg38 fasta file
if [ ! -f hg38.fa.gz ]; then
  wget -O hg38.fa.gz.part https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz \
    && mv hg38.fa.gz.part hg38.fa.gz
fi

In [None]:
%%bash
# filter the snps
#
# Reference FASTA used by --ref-from-fa. Defaults to the hg38.fa.gz that the
# download cell fetches into the working directory; export HG38_FASTA to use a
# copy stored elsewhere instead of hard-coding a user-specific absolute path.
HG38_FASTA="${HG38_FASTA:-hg38.fa.gz}"

# Variant filters : autosomes only, SNPs with A/C/G/T alleles, at most 2 alleles,
#                   missing call rate <= 5% (--geno), HWE p >= 1e-6 (--hwe),
#                   MAF >= 0.01 (--maf), outside the listed high-LD ranges.
# Sample filters  : drop samples listed in deg2_hg38.king.cutoff.out.id and
#                   samples with more than 1% missing calls (--mind).
# REF alleles are set from the FASTA and the variants are written out sorted.
plink2\
  --autosome\
  --allow-extra-chr\
  --exclude bed0 high-LD-regions-hg38-GRCh38.txt\
  --fa "$HG38_FASTA"\
  --geno 0.05\
  --hwe 0.000001\
  --maf 0.01\
  --make-pgen\
  --max-alleles 2\
  --mind 0.01\
  --out all_hg38_filtered\
  --pfile all_hg38 vzs\
  --ref-from-fa force\
  --remove deg2_hg38.king.cutoff.out.id\
  --snps-only just-acgt\
  --sort-vars

In [None]:
!grep '##' all_hg38_filtered.pvar | wc -l

In [None]:
# Count the leading '##' meta-information lines instead of hard-coding their
# number, so the column header line ('#CHROM ...') is found even when the
# header length of the .pvar changes.
n_meta_lines = 0
with open('all_hg38_filtered.pvar') as pvar_file:
    for line in pvar_file:
        if not line.startswith('##'):
            break
        n_meta_lines += 1

# sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True.
df = pd.read_csv('all_hg38_filtered.pvar', sep=r'\s+', skiprows=n_meta_lines, engine='c')
df.head()

In [None]:
# A SNP is palindromic (strand-ambiguous) when ALT is the complement of REF,
# i.e. the allele pair is A/T or C/G. The lookup is applied to the REF column
# only, rather than recoding every column of the frame to numbers.
COMPLEMENT = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}

df = df[['#CHROM', 'POS', 'ID', 'REF', 'ALT']].copy()
is_palindrome = df['REF'].map(COMPLEMENT) == df['ALT']
IDexclude = df.loc[is_palindrome, 'ID']  # IDs as they appear in the filtered .pvar

# NOTE(review): the next plink2 run combines --exclude palindrome.txt with
# --set-all-var-ids 'chr@:#:$r:$a'. plink2 appears to assign the new IDs while
# loading the .pvar, i.e. before --exclude is evaluated - confirm. To keep the
# exclusion effective either way, the list carries both the current ID and the
# ID in the renamed form; IDs that match no variant are simply ignored.
palindromes = df.loc[is_palindrome]
renamed_ids = (
    'chr' + palindromes['#CHROM'].astype(str)
    + ':' + palindromes['POS'].astype(str)
    + ':' + palindromes['REF']
    + ':' + palindromes['ALT']
)
pd.concat([IDexclude, renamed_ids]).drop_duplicates().to_csv(
    'palindrome.txt', index=False, header=False
)

# Number of palindromic variants (not the number of lines written).
print(IDexclude.shape)

In [None]:
%%bash
# Remove the variants listed in palindrome.txt, give every remaining variant an
# ID of the form chr<chrom>:<pos>:<ref>:<alt>, and write a .bed/.bim/.fam fileset.
# NOTE(review): --exclude matches the IDs as plink2 sees them when the filter is
# evaluated; since the IDs are rewritten in this same run, check in the .log
# that the expected number of variants was actually removed.
plink2 \
  --pfile all_hg38_filtered \
  --exclude palindrome.txt \
  --set-all-var-ids 'chr@:#:$r:$a' \
  --make-bed \
  --out all_hg38_filtered_chrpos

# Ancestry mapping of the plink file

In [None]:
# Read the file into a pandas DataFrame
t=pd.read_csv('20130606_sample_info.txt', sep='\t')

# merge with fam file
d=pd.read_csv('all_hg38_filtered_chrpos.fam', header=None, sep='\t', usecols=[1], names=['IID'])
df=pd.merge(d, t, on=['IID'], how='left')

# One individual is missing for the ancestry so hand label it
df.loc[df.IID=='NA12236', 'Population'] = 'CEU'


# relabel them
ancestory_map={
    'YRI': 'AFR',
    'GWD': 'AFR',
    'ESN': 'AFR',
    'LWK': 'AFR',
    'ACB': 'AFR',
    'ASW': 'AFR',
    'MSL': 'AFR',
    'CEU': 'EUR',
    'IBS': 'EUR',
    'TSI': 'EUR',
    'FIN': 'EUR',
    'GBR': 'EUR',
    'CHS': 'EAS',
    'JPT': 'EAS',
    'CHB': 'EAS',
    'KHV': 'EAS',
    'CDX': 'EAS',
    'PJL': 'SAS',
    'ITU': 'SAS',
    'STU': 'SAS',
    'BEB': 'SAS',
    'GIH': 'SAS',
    'PUR': 'AMR',
    'CLM': 'AMR',
    'PEL': 'AMR',
    'MXL': 'AMR',
    
}


df['Population2']=df['Population'].map(ancestory_map).fillna('Study')

In [26]:
# Sanity check: row count and non-null counts of IID, Population and Population2
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2573 entries, 0 to 2572
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   IID          2573 non-null   object
 1   Population   2573 non-null   object
 2   Population2  2573 non-null   object
dtypes: object(3)
memory usage: 60.4+ KB


In [None]:
# Sample counts per Population (rows) and Population2 label (columns),
# with row/column totals and 0 for combinations that do not occur.
df.pivot_table(
    values='IID',
    index='Population',
    columns='Population2',
    aggfunc='count',
    fill_value=0,
    margins=True,
)

In [None]:
# Save IID, Population and Population2 as a tab-separated file without the pandas index
df.to_csv('all_hg38_filtered_chrpos_pop.txt', index=False, sep='\t')

In [27]:
# Show the disk usage of every entry in the working directory
!du -sh *

128K	20130606_sample_info.txt
1.0M	20130606_sample_info.xlsx
5.7G	all_hg38_filtered_chrpos.bed
329M	all_hg38_filtered_chrpos.bim
128K	all_hg38_filtered_chrpos.fam
512	all_hg38_filtered_chrpos.log
128K	all_hg38_filtered_chrpos_pop.txt
512	all_hg38_filtered.log
8.9G	all_hg38.pgen
3.2G	all_hg38.pgen.zst
128K	all_hg38.psam
2.6G	all_hg38.pvar.zst
128K	deg2_hg38.king.cutoff.out.id
128K	hg38_corrected.psam
512	high-LD-regions-hg38-GRCh38.txt
128K	main.ipynb
23M	palindrome.txt
512	reame.md
