# Notebook to Prepare 1000 Genomes Data
## Requires plink in binary folder and blink version: 1.9

In [1]:
import allel
import h5py  # Python Package to do the HDF5.
import numpy as np
import pandas as pd

In [2]:
#path_vcf = "../Data/1000Genomes/ALL.chrX.phase3_shapeit2_mvncall_integrated_v1b.20130502.genotypes.vcf.gz" # Path of VCF
path_vcf = "../Data/1000Genomes/1000gX1240kEur.vcf"
#h5_path = "../Data/1000Genomes/1000Genomes_X.hdf5"
h5_path = "../Data/1000Genomes/1000gX1240kEur.hdf5"
ind_path = "../Data/1000Genomes/integrated_call_samples_v2.20130502.ALL.ped"  # Family Relationships
pop_path = "../Data/1000Genomes/integrated_call_samples_v3.20130502.ALL.panel" # Population Information

## Do the Conversion to HDF 5 
Comment out / one can skip down for standalone extraction of autosomes

In [4]:
### Do the conversion to hdf5 (if not done already)
# geno = allel.read_vcf(path_vcf) # Load the VCF # Load the VCF # Needs too much Memory for my laptop
#allel.vcf_to_hdf5(input=path_vcf, output=h5_path, compression="gzip") # Do the conversion to hdf5. Takes 10 Minutes

In [3]:
## Load HDF5
f = h5py.File(h5_path, "r") # Load for Sanity Check. See below!
print("Loaded %i variants" % np.shape(f["calldata/GT"])[0])
print("Loaded %i individuals" % np.shape(f["calldata/GT"])[1])
print(list(f["calldata"].keys()))
print(list(f["variants"].keys()))
#print(list(f["samples"].keys()))

Loaded 47297 variants
Loaded 503 individuals
['GT']
['ALT', 'CHROM', 'FILTER_PASS', 'ID', 'POS', 'QUAL', 'REF']


In [4]:
### Load the ID of the Individuals
ids = np.array(f["variants/ID"])
df_s_empirical = pd.DataFrame({'Individual ID' : list(f["samples"])})

In [5]:
df_s_empirical["Individual ID"] = df_s_empirical["Individual ID"].str.split("_").str[-1]

# Load and Merge Individual Meta Data

In [8]:
df_i = pd.read_csv(ind_path, sep="\t")
print(f"Loaded {np.shape(df_i)[0]} Individuals")

df_pops = pd.read_csv(pop_path, sep="\t")
print(f"Loaded {np.shape(df_i)[0]} Population Data")

### Merge with IDs in Genotype File
df = pd.merge(df_s_empirical, df_i, on='Individual ID', how='inner')
df = pd.merge(df, df_pops, left_on="Individual ID", right_on="sample", how="inner")
print(f"Merged from {len(df_s_empirical)} to {len(df)} individuals")
assert(len(df_s_empirical) == len(df))  # Sanity Check

Loaded 3691 Individuals
Loaded 3691 Population Data
Merged from 503 to 503 individuals


In [9]:
df["Population"].value_counts()

TSI    107
IBS    107
CEU     99
FIN     99
GBR     91
Name: Population, dtype: int64

# Save Individual List
Save table with Individual and Family IDs

In [5]:
save_path = "../Data/1000Genomes/EUR.csv"

eur_inds = df["super_pop"]=="EUR"
print(f"Nr of European Samples: {np.sum(eur_inds)}")

df_save = df[eur_inds]
df_save = df_save[["Family ID", "Individual ID"]] # Extract column of Individual and Family ID
df_save.to_csv(save_path, sep="\t", header=None, index=False)
print(f"Saved to {save_path}. Nr Individuals: {len(df_save)}")

### Do the same, but for families
save_path = "../Data/1000Genomes/EUR_fam.csv"

eur_inds = df["super_pop"]=="EUR"
print(f"Nr of European Samples: {np.sum(eur_inds)}")

df_save = df[eur_inds]
df_save = df_save["Individual ID"] # Extract column of Individual and Family ID
df_save.to_csv(save_path, sep="\t", header=None, index=False)
print(f"Saved to {save_path}. Nr Individuals: {len(df_save)}")

NameError: name 'df' is not defined

# Check against Sardinian X data

In [11]:
path_snp = "../../ancient-sardinia/data/bed/full230.snp" # All SNPs found in the 1240k Ancient Panel

df_snp = pd.read_csv(path_snp, header=None, sep=r"\s*", engine="python")
df_snp.columns = ["SNP", "chr", "map", "pos", "ref", "alt"]
df_snp = df_snp[df_snp["chr"]==23]

print(f"Loaded {len(df_snp)} X SNPs.")

  yield pat.split(line.strip())
  yield pat.split(line.strip())


Loaded 49711 X SNPs.


# Save  interesection with 1240k Marker 
Prepare txt List for Plink filtering

In [6]:
save_path = "../Data/1000Genomes/variants1240k"

found = np.isin(f["variants/POS"], df_snp["pos"])
print(f"Intersection: {np.sum(found)} out of {len(found)} SNPS")
variants = f["variants/ID"][found]

dots = np.where(variants == ".")[0]
print(f"Found {len(dots)} unnamed SNPs")
variants = np.delete(variants, dots)

np.savetxt(save_path, variants, fmt="%s")
print(f"Successfully saved to {save_path}. Length: {len(variants)}")

NameError: name 'df_snp' is not defined

# Prepare downsampled Autosomal hdf5s
Prepare a 1000 Genome autosomal hdf5 file. Include Recombination Map
Input: 1000 Genome vcf file, Recombination Map from a 1240k Eigenstrat
## Standalone from here forward.

In [3]:
### Important Parameters and paths
ch = 3 # Which Chromosome to use:

# Path of the 1000 Genome VCF:
p1, p2 = "", ""
file_vcf100g, path_vcf100g = "", ""
out_vcf_path0, out_vcf_path = "", ""
path_hdf5temp, path_hdf5final = "", ""

snp1240k_path, ind_path = "", ""   # Where to find the 1240k SNPs
snp_filter_path = ""

def prepare_paths(ch = 3):
    """Prepares all the Paths need for processing Steps.
    ch: Which Chromosomes to use"""
    global p1, p2, file_vcf100g, path_vcf100g, out_vcf_path0, out_vcf_path, path_hdf5temp, path_hdf5final
    global snp1240k_path, ind_path, snp_filter_path, out_vcf_path_gz, marker_path
    # Path of the 1000 Genome VCF:
    p1 = "../Data/1000Genomes/AutosomeVCF/"
    p2 = ".phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
    file_vcf100g = "ALL.chr" + str(ch) + p2
    path_vcf100g = p1 + file_vcf100g
    print(f"Full Input path:\n{path_vcf100g}")
    out_vcf_path0 = "../Data/1000Genomes/AutosomeVCF/Subset/" + "1240EURchr" + str(ch) # needs no .vcf
    out_vcf_path = out_vcf_path0 + ".vcf"
    out_vcf_path_gz = out_vcf_path + ".gz"
    path_hdf5temp = "../Data/1000Genomes/HDF5/FULLHDF5/cr" + str(ch) + ".hdf5"
    path_hdf5final = "../Data/1000Genomes/HDF5/1240kHDF5/Eur1240chr" + str(ch) + ".hdf5"
    snp1240k_path = "../Data/1000Genomes/Markers/MinMyc.snp"   # Where to find the 1240k SNPs
    ind_path = "../Data/1000Genomes/Individuals/EUR_fam.csv"   # Where to find the individual lists
    marker_path = "../Data/1000Genomes/Markers/1240k/chr" + str(ch) + ".csv"

    # Path of SNP Filter
    snp_filter_path = "../Data/1000Genomes/Markers/variants1240k" + str(ch) + ".txt"

### Step 0: Download the Data
### Step 1: Produce hdf5 file for all markers
### Step 2: Extract Positions. Match with Eigenstrat File Positions
### Step 3: Create new vcf based on subset of Individuals and Markers
### Step 4: Transfer to hdf5. 
### Step 5: Merge in Linkage Map
### Step 6: Quality Check? (Control ref/alt against hdf5 we have for Sardinians)

In [5]:
### Step 0: Download the Data
def download_1kg():
    path_cl = "/project/jnovembre/data/external_public/1kg_phase3/haps/"
    path_cluster = "hringbauer@midway.rcc.uchicago.edu:" + path_cl + file_vcf100g
    
    p_c = path_cluster + ".tbi"
    p_v = path_vcf100g + ".tbi"
    #!scp $p_c $p_v # Download the tbi
    !scp $path_cluster $path_vcf100g # Only Download the .vcf (not the .tbi)
    
### Step 1: Produce hdf5 file for all markers
def vcf_to_hdf5(in_path, out_path):
    """Transform Full VCF to FULL HDF5"""
    allel.vcf_to_hdf5(input=in_path, output=out_path, compression="gzip") # Takes 10 Minutes
    
### Step 2: Extract Positions. Match with Eigenstrat File Positions
### Load HDF5

def merge_positions():
    """Creates the Filter File to filter SNPs for"""
    f_full = h5py.File(path_hdf5temp, "r") # Load for Sanity Check. See below!
    print("Loaded %i variants" % np.shape(f_full["calldata/GT"])[0])
    print("Loaded %i individuals" % np.shape(f_full["calldata/GT"])[1])
    print(list(f_full["calldata"].keys()))
    print(list(f_full["variants"].keys()))
    #print(list(f["samples"].keys()))

    ### Load Eigenstrat
    df_snp = pd.read_csv(snp1240k_path, header=None, sep=r"\s*", engine="python")
    df_snp.columns = ["SNP", "chr", "map", "pos", "ref", "alt"]
    df_snp = df_snp[df_snp["chr"] == ch]
    print(f"Loaded {len(df_snp)} Chr.{ch} SNPs.")

    ### Prepare SNP File for Eigenstrat filtering 
    found = np.isin(f_full["variants/POS"], df_snp["pos"])
    print(f"Intersection: {np.sum(found)} out of {len(found)} SNPS")
    variants = f_full["variants/ID"][found]

    dots = np.where(variants == ".")[0]
    print(f"Found {len(dots)} unnamed SNPs")
    variants = np.delete(variants, dots)

    np.savetxt(snp_filter_path, variants, fmt="%s")
    print(f"Successfully saved to {snp_filter_path}. Length: {len(variants)}")
    
def save_1240kmarkers():
    """Save all 1240 Markers in csv"""
    df_snp = pd.read_csv(snp1240k_path, header=None, sep=r"\s*", engine="python")
    df_snp.columns = ["SNP", "chr", "map", "pos", "ref", "alt"]
    df_snp = df_snp[df_snp["chr"] == ch]
    print(f"Loaded {len(df_snp)} Chr.{ch} SNPs.")

    df_save = df_snp[["chr", "pos"]]
    df_save.to_csv(marker_path, sep="\t", header=None, index=False)
    print(f"Saved {len(df_save)} 1240k Markers on Chr. {ch} to {marker_path}")
    
### Step 3: Create new vcf based on subset of Individuals and Markers
def plink_new_vcf():
    !plink --vcf $path_vcf100g --extract $snp_filter_path --keep-fam $ind_path --recode vcf --out $out_vcf_path0 --biallelic-only strict --keep-allele-order

### Step 3b
def bctools_new_vcf0():
    """Same as PLINK, but with bcftools"""
    str_ex = "ID=@" + snp_filter_path
    #!echo bcftools view -Oz -o $out_vcf_path_gz -S $ind_path -i $str_ex -m2 -M2 -v snps $path_vcf100g
    !bcftools view -Oz -o $out_vcf_path_gz -S $ind_path -i $str_ex -m2 -M2 -v snps $path_vcf100g
    print("Finished BCF tools runs.")
    
def bctools_new_vcf():
    """Same as PLINK, but with bcftools and directly via Marker Positions"""
    !bcftools view -Oz -o $out_vcf_path_gz -S $ind_path -T $marker_path -m2 -M2 -v snps $path_vcf100g
    print("Finished BCF tools runs.")

### Step 4: Transfer to hdf5.
#allel.vcf_to_hdf5(input=out_vcf_path, output=path_hdf5final, compression="gzip") # Takes 1s
 
### Step 5: Merge in Linkage Map
### Load HDF5
def merge_in_ld_map():
    """Merge in ld_map into HDF5!"""
    f = h5py.File(path_hdf5final, "r") # Load for Sanity Check. See below!
    print("Loaded %i variants" % np.shape(f["calldata/GT"])[0])
    print("Loaded %i individuals" % np.shape(f["calldata/GT"])[1])
    print(list(f["calldata"].keys()))
    print(list(f["variants"].keys()))
    #print(list(f["samples"].keys()))

    ### Load Eigenstrat
    df_snp = pd.read_csv(snp1240k_path, header=None, sep=r"\s*", engine="python")
    df_snp.columns = ["SNP", "chr", "map", "pos", "ref", "alt"]
    df_snp = df_snp[df_snp["chr"] == ch]
    print(f"Loaded {len(df_snp)} Chr.{ch} SNPs.")

    ### Intersect SNP positions
    its, i1, i2 = np.intersect1d(f["variants/POS"], df_snp["pos"], return_indices=True)

    l = len(f["variants/POS"])
    print(f"Intersection {len(i2)} out of {l}")

    ### Extract 
    rec = np.zeros(len(f["variants/POS"]))
    rec[i1] = df_snp["map"].values[i2]  # Fill in the values in Recombination map

    ids0 = np.where(rec == 0)[0] # The 0 Values
    rec[ids0] = (rec[ids0-1] + rec[ids0+1]) / 2.0 # Interpolate

    ### Make sure that sorted
    assert(np.all(np.diff(rec)>=0))  # Assert the Recombination Map is sorted! (no 0 left and no funky stuff)

    f.close()
    with h5py.File(path_hdf5final, 'a') as f0:
        group = f0["variants"]
        group.create_dataset('MAP', (l,), dtype='f')   
        f0["variants/MAP"][:] = rec[:]

    print(f"Finished Chromosome {ch}")
    
### Step 6: Delete the Data:
def del_temp_data():
    !rm $path_vcf100g # Remove the full .vcf
    #!rm $out_vcf_path
    #!rm $path_hdf5temp

# All in one Run

In [6]:
def prep_1000genomes_full(ch):
    """ch: Which Chromosome to prepare"""
    prepare_paths(ch = ch)
    download_1kg()
    print("Download Complete")
    #vcf_to_hdf5(in_path=path_vcf100g, out_path=path_hdf5temp) # Takes 10 Minutes
    #print("Transformation to HDF5 Complete.")
    #merge_positions()
    save_1240kmarkers()
    #plink_new_vcf()
    bctools_new_vcf()
    vcf_to_hdf5(in_path=out_vcf_path_gz, out_path=path_hdf5final)
    merge_in_ld_map()
    #del_temp_data()
    print(f"Finished Preparing HDF5 Chromosome {ch}. GZ!")

In [7]:
#ch=2  # Which Chromosome to prepare

for ch in range(1, 23):
    print(f"Preparing Chromosome: {ch}")
    prep_1000genomes_full(ch)
    
print

Preparing Chromosome: 1
Full Input path:
../Data/1000Genomes/AutosomeVCF/ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz
ALL.chr1.phase3_shapeit2_mvncall_integrated_v 100% 1161MB  11.1MB/s   01:44    
Download Complete


  yield pat.split(line.strip())
  yield pat.split(line.strip())


Loaded 93166 Chr.1 SNPs.
Saved 93166 1240k Markers on Chr. 1 to ../Data/1000Genomes/Markers/1240k/chr1.csv
Finished BCF tools runs.
Loaded 89147 variants
Loaded 503 individuals
['GT']
['ALT', 'CHROM', 'FILTER_PASS', 'ID', 'POS', 'QUAL', 'REF']
Loaded 93166 Chr.1 SNPs.
Intersection 89147 out of 89147
Finished Chromosome 1
Finished Preparing HDF5 Chromosome 1. GZ!
Preparing Chromosome: 2
Full Input path:
../Data/1000Genomes/AutosomeVCF/ALL.chr2.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz
ALL.chr2.phase3_shapeit2_mvncall_integrated_v 100% 1252MB  11.2MB/s   01:52    
Download Complete
Loaded 98657 Chr.2 SNPs.
Saved 98657 1240k Markers on Chr. 2 to ../Data/1000Genomes/Markers/1240k/chr2.csv
Finished BCF tools runs.
Loaded 94239 variants
Loaded 503 individuals
['GT']
['ALT', 'CHROM', 'FILTER_PASS', 'ID', 'POS', 'QUAL', 'REF']
Loaded 98657 Chr.2 SNPs.
Intersection 94239 out of 94239
Finished Chromosome 2
Finished Preparing HDF5 Chromosome 2. GZ!
Preparing Chromosome: 3
F

Loaded 37903 Chr.14 SNPs.
Intersection 36301 out of 36301
Finished Chromosome 14
Finished Preparing HDF5 Chromosome 14. GZ!
Preparing Chromosome: 15
Full Input path:
../Data/1000Genomes/AutosomeVCF/ALL.chr15.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz
ALL.chr15.phase3_shapeit2_mvncall_integrated_ 100%  437MB  11.1MB/s   00:39    
Download Complete
Loaded 35991 Chr.15 SNPs.
Saved 35991 1240k Markers on Chr. 15 to ../Data/1000Genomes/Markers/1240k/chr15.csv
Finished BCF tools runs.
Loaded 34429 variants
Loaded 503 individuals
['GT']
['ALT', 'CHROM', 'FILTER_PASS', 'ID', 'POS', 'QUAL', 'REF']
Loaded 35991 Chr.15 SNPs.
Intersection 34429 out of 34429
Finished Chromosome 15
Finished Preparing HDF5 Chromosome 15. GZ!
Preparing Chromosome: 16
Full Input path:
../Data/1000Genomes/AutosomeVCF/ALL.chr16.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz
ALL.chr16.phase3_shapeit2_mvncall_integrated_ 100%  472MB  11.2MB/s   00:42    
Download Complete
Loaded 3600

### Trouble Shooting

In [49]:
def prep_1000genomes_full_error(ch):
    """ch: Which Chromosome to prepare"""
    prepare_paths(ch = ch)
    #download_1kg()
    #print("Download Complete")
    #vcf_to_hdf5(in_path=path_vcf100g, out_path=path_hdf5temp) # Takes 10 Minutes
    #print("Transformation to HDF5 Complete.")
    #merge_positions()
    #plink_new_vcf()
    #bctools_new_vcf()
    vcf_to_hdf5(in_path=out_vcf_path_gz, out_path=path_hdf5final)
    #merge_in_ld_map()
    #del_temp_data()
    print("Finished Preparing HDF5. GZ!")

In [50]:
ch = 21
prep_1000genomes_full_error(ch=21)

Full Input path:
../Data/1000Genomes/AutosomeVCF/ALL.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz
Finished Preparing HDF5. GZ!


# Downsample to 1240k SNPs withing VCF
Use bcftools

In [18]:
### Downsample to 1240k SNPs

### Step1:
# Produce the .csv




Loaded 81416 Chr.3 SNPs.
Saved 81416 1240k Markers on Chr. 3 to ../Data/1000Genomes/Markers/1240k/chr3.csv


In [21]:
### Step2:
# Do the BCF Tools Step
str_ex = "ID=@" + snp_filter_path
#!echo bcftools view -Oz -o $out_vcf_path_gz -S $ind_path -i $str_ex -m2 -M2 -v snps $path_vcf100g
!bcftools view -Oz -o $out_vcf_path_gz -S $ind_path -i $str_ex -T $save_path -m2 -M2 -v snps $path_vcf100g
print("Finished BCF tools runs.")

Finished BCF tools runs.


In [22]:
%%time
#!echo bcftools view -Oz -o $out_vcf_path_gz -S $ind_path -i $str_ex -m2 -M2 -v snps $path_vcf100g
!bcftools view -Oz -o $out_vcf_path_gz -S $ind_path -T $save_path -m2 -M2 -v snps $path_vcf100g
print("Finished BCF tools runs.")

Finished BCF tools runs.
CPU times: user 10.6 s, sys: 1.63 s, total: 12.3 s
Wall time: 13min 29s


In [29]:
#marker_path
download_1kg()

ALL.chr3.phase3_shapeit2_mvncall_integrated_v 100%  192KB   1.5MB/s   00:00    


In [30]:
%%time
#!echo bcftools view -Oz -o $out_vcf_path_gz -S $ind_path -i $str_ex -m2 -M2 -v snps $path_vcf100g
!bcftools view -Oz -o $out_vcf_path_gz -S $ind_path -R $save_path -m2 -M2 -v snps $path_vcf100g
print("Finished BCF tools runs.")

Finished BCF tools runs.
CPU times: user 12.1 s, sys: 1.91 s, total: 14 s
Wall time: 13min 10s
