# Notebook to Prepare 1000 Genomes Data
This notebooks prepares the downsampled HDF5 for 1240k Data.
Requires bcftools binary.
Atm, only runs on Harald's local machine (where BCFtools is installed)
Runtime on one CPU: Ca. 15 Min for Chr. 3

In [9]:
import allel
import h5py  # Python Package to do the HDF5.
import numpy as np
import pandas as pd
import socket
import os as os
import sys as sys
import multiprocessing as mp

socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else: 
    raise RuntimeWarning("Not compatible machine. Check!!")
    
os.chdir(path)  # Set the right Path (in line with Atom default)

sys.path.append("./Python3/")  # Since now we are in the Root Directory
from hmm_inference import HMM_Analyze   # Do not move. Should be after sys.path..
#sys.path.append("./Python3/create1000G_Mosaic/")  # Since now we are in the Root Directory
#from createMosaicsMulti import Mosaic_1000G_Multi  # Import the object that can create the Multiruns

print(os.getcwd()) # Show the current working directory. Should be HAPSBURG/Notebooks/ParallelRuns
print(f"CPU Count: {mp.cpu_count()}")

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


In [10]:
#path_vcf = "../Data/1000Genomes/ALL.chrX.phase3_shapeit2_mvncall_integrated_v1b.20130502.genotypes.vcf.gz" # Path of VCF
path_vcf = "./Data/1000Genomes/1000gX1240kEur.vcf"
#h5_path = "../Data/1000Genomes/1000Genomes_X.hdf5"
h5_path = "./Data/1000Genomes/1000gX1240kEur.hdf5"
ind_path = "./Data/1000Genomes/integrated_call_samples_v2.20130502.ALL.ped"  # Family Relationships
pop_path = "./Data/1000Genomes/integrated_call_samples_v3.20130502.ALL.panel" # Population Information

## Do the Conversion to HDF 5 
Comment out / one can skip down for standalone extraction of autosomes

In [4]:
### Do the conversion to hdf5 (if not done already)
# geno = allel.read_vcf(path_vcf) # Load the VCF # Load the VCF # Needs too much Memory for my laptop
#allel.vcf_to_hdf5(input=path_vcf, output=h5_path, compression="gzip") # Do the conversion to hdf5. Takes 10 Minutes

In [3]:
## Load HDF5
f = h5py.File(h5_path, "r") # Load for Sanity Check. See below!
print("Loaded %i variants" % np.shape(f["calldata/GT"])[0])
print("Loaded %i individuals" % np.shape(f["calldata/GT"])[1])
print(list(f["calldata"].keys()))
print(list(f["variants"].keys()))
#print(list(f["samples"].keys()))

Loaded 47297 variants
Loaded 503 individuals
['GT']
['ALT', 'CHROM', 'FILTER_PASS', 'ID', 'POS', 'QUAL', 'REF']


In [4]:
### Load the ID of the Individuals
ids = np.array(f["variants/ID"])
df_s_empirical = pd.DataFrame({'Individual ID' : list(f["samples"])})

In [5]:
df_s_empirical["Individual ID"] = df_s_empirical["Individual ID"].str.split("_").str[-1]

# Load and Merge Individual Meta Data

In [8]:
df_i = pd.read_csv(ind_path, sep="\t")
print(f"Loaded {np.shape(df_i)[0]} Individuals")

df_pops = pd.read_csv(pop_path, sep="\t")
print(f"Loaded {np.shape(df_i)[0]} Population Data")

### Merge with IDs in Genotype File
df = pd.merge(df_s_empirical, df_i, on='Individual ID', how='inner')
df = pd.merge(df, df_pops, left_on="Individual ID", right_on="sample", how="inner")
print(f"Merged from {len(df_s_empirical)} to {len(df)} individuals")
assert(len(df_s_empirical) == len(df))  # Sanity Check

Loaded 3691 Individuals
Loaded 3691 Population Data
Merged from 503 to 503 individuals


In [9]:
df["Population"].value_counts()

TSI    107
IBS    107
CEU     99
FIN     99
GBR     91
Name: Population, dtype: int64

# Save Table with individual IDs [European, needed later for bcftools]
Save table with Individual and Family IDs of all European 1000 Genome Data

In [None]:
save_path = "./Data/1000Genomes/EUR.csv"

eur_inds = df["super_pop"]=="EUR"
print(f"Nr of European Samples: {np.sum(eur_inds)}")

df_save = df[eur_inds]
df_save = df_save[["Family ID", "Individual ID"]] # Extract column of Individual and Family ID
df_save.to_csv(save_path, sep="\t", header=None, index=False)
print(f"Saved to {save_path}. Nr Individuals: {len(df_save)}")

### Do the same, but for only individual ID
save_path = "./Data/1000Genomes/EUR_fam.csv"

eur_inds = df["super_pop"]=="EUR"
print(f"Nr of European Samples: {np.sum(eur_inds)}")

df_save = df[eur_inds]   # Extract the European Individuals
df_save = df_save["Individual ID"] # Extract column of Individual ID
df_save.to_csv(save_path, sep="\t", header=None, index=False)
print(f"Saved to {save_path}. Nr Individuals: {len(df_save)}")

# Check against Sardinian X data

In [11]:
path_snp = "../ancient-sardinia/data/bed/full230.snp" # All SNPs found in the 1240k Ancient Panel

df_snp = pd.read_csv(path_snp, header=None, sep=r"\s*", engine="python")
df_snp.columns = ["SNP", "chr", "map", "pos", "ref", "alt"]
df_snp = df_snp[df_snp["chr"]==23]

print(f"Loaded {len(df_snp)} X SNPs.")

  yield pat.split(line.strip())
  yield pat.split(line.strip())


Loaded 49711 X SNPs.


# Save  interesection with 1240k Marker 
Prepare txt List for Plink filtering

In [1]:
save_path = "./Data/1000Genomes/variants1240k"

found = np.isin(f["variants/POS"], df_snp["pos"])
print(f"Intersection: {np.sum(found)} out of {len(found)} SNPS")
variants = f["variants/ID"][found]

dots = np.where(variants == ".")[0]
print(f"Found {len(dots)} unnamed SNPs")
variants = np.delete(variants, dots)

np.savetxt(save_path, variants, fmt="%s")
print(f"Successfully saved to {save_path}. Length: {len(variants)}")

NameError: name 'np' is not defined

# Prepare downsampled Autosomal hdf5s
Prepare a 1000 Genome autosomal hdf5 file. Include Recombination Map
Input: 1000 Genome vcf file, Recombination Map from a 1240k Eigenstrat
## Standalone from here onward.

In [3]:
### Important Parameters and paths
ch = 3 # Which Chromosome to use:

# Path of the 1000 Genome VCF:
p1, p2 = "", ""
file_vcf100g, path_vcf100g = "", ""
out_vcf_path0, out_vcf_path = "", ""
path_hdf5temp, path_hdf5final = "", ""

snp1240k_path, ind_path = "", ""   # Where to find the 1240k SNPs
snp_filter_path = ""

def prepare_paths(ch = 3):
    """Prepares all the Paths need for processing Steps.
    ch: Which Chromosomes to use"""
    global p1, p2, file_vcf100g, path_vcf100g, out_vcf_path0, out_vcf_path, path_hdf5temp, path_hdf5final
    global snp1240k_path, ind_path, snp_filter_path, out_vcf_path_gz, marker_path
    # Path of the 1000 Genome VCF:
    p1 = "./Data/1000Genomes/AutosomeVCF/"
    p2 = ".phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
    file_vcf100g = "ALL.chr" + str(ch) + p2
    path_vcf100g = p1 + file_vcf100g
    print(f"Full Input path:\n{path_vcf100g}")
    out_vcf_path0 = "./Data/1000Genomes/AutosomeVCF/Subset/" + "1240EURchr" + str(ch) # needs no .vcf
    out_vcf_path = out_vcf_path0 + ".vcf"
    out_vcf_path_gz = out_vcf_path + ".gz"
    path_hdf5temp = "./Data/1000Genomes/HDF5/FULLHDF5/cr" + str(ch) + ".hdf5"
    path_hdf5final = "./Data/1000Genomes/HDF5/1240kHDF5/Eur1240chr" + str(ch) + ".hdf5"
    snp1240k_path = "./Data/1000Genomes/Markers/MinMyc.snp"   # Where to find the 1240k SNPs
    ind_path = "./Data/1000Genomes/Individuals/EUR_fam.csv"   # Where to find the individual lists
    marker_path = "./Data/1000Genomes/Markers/1240k/chr" + str(ch) + ".csv"

    # Path of SNP Filter
    snp_filter_path = "./Data/1000Genomes/Markers/variants1240k" + str(ch) + ".txt"

### Step 0: Download the Data
### Step 1: Produce hdf5 file for all markers
### Step 2: Extract Positions. Match with Eigenstrat File Positions
### Step 3: Create new vcf based on subset of Individuals and Markers
### Step 4: Transfer to hdf5. 
### Step 5: Merge in Linkage Map
### Step 6: Quality Check? (Control ref/alt against hdf5 we have for Sardinians)

In [30]:
### Step 0: Download the Data
def download_1kg(cluster=False):
    """cluster: Whether program is run on cluster"""
    if cluster==False:
        path_cl = "/project/jnovembre/data/external_public/1kg_phase3/haps/"
        path_cluster = "hringbauer@midway.rcc.uchicago.edu:" + path_cl + file_vcf100g
    
    elif cluster==True:
        path_cl = "/project/jnovembre/data/external_public/1kg_phase3/haps/"
        path_cluster = path_cl + file_vcf100g
    
    p_c = path_cluster + ".tbi"
    p_v = path_vcf100g + ".tbi"
    #!scp $p_c $p_v # Download the tbi
    !scp $path_cluster $path_vcf100g # Only Download the .vcf (not the .tbi)
    
### Step 1: Produce hdf5 file for all markers
def vcf_to_hdf5(in_path, out_path):
    """Transform Full VCF to full HDF5"""
    allel.vcf_to_hdf5(input=in_path, output=out_path, compression="gzip") # Takes 10 Minutes
    
### Step 2: Extract Positions. Match with Eigenstrat File Positions
### Load HDF5

def merge_positions():
    """Creates the Filter File to filter SNPs for"""
    f_full = h5py.File(path_hdf5temp, "r") # Load for Sanity Check. See below!
    print("Loaded %i variants" % np.shape(f_full["calldata/GT"])[0])
    print("Loaded %i individuals" % np.shape(f_full["calldata/GT"])[1])
    print(list(f_full["calldata"].keys()))
    print(list(f_full["variants"].keys()))
    #print(list(f["samples"].keys()))

    ### Load Eigenstrat
    df_snp = pd.read_csv(snp1240k_path, header=None, sep=r"\s+", engine="python")
    df_snp.columns = ["SNP", "chr", "map", "pos", "ref", "alt"]
    df_snp = df_snp[df_snp["chr"] == ch]
    print(f"Loaded {len(df_snp)} Chr.{ch} SNPs.")

    ### Prepare SNP File for Eigenstrat filtering 
    found = np.isin(f_full["variants/POS"], df_snp["pos"])
    print(f"Intersection: {np.sum(found)} out of {len(found)} SNPS")
    variants = f_full["variants/ID"][found]

    dots = np.where(variants == ".")[0]
    print(f"Found {len(dots)} unnamed SNPs")
    variants = np.delete(variants, dots)

    np.savetxt(snp_filter_path, variants, fmt="%s")
    print(f"Successfully saved to {snp_filter_path}. Length: {len(variants)}")
    
def save_1240kmarkers():
    """Save all 1240 Markers in csv"""
    df_snp = pd.read_csv(snp1240k_path, header=None, sep=r"\s+", engine="python")
    df_snp.columns = ["SNP", "chr", "map", "pos", "ref", "alt"]
    df_snp = df_snp[df_snp["chr"] == ch]
    print(f"Loaded {len(df_snp)} Chr.{ch} SNPs.")

    df_save = df_snp[["chr", "pos"]]
    df_save.to_csv(marker_path, sep="\t", header=None, index=False)
    print(f"Saved {len(df_save)} 1240k Markers on Chr. {ch} to {marker_path}")
    
### Step 3: Create new vcf based on subset of Individuals and Markers
def plink_new_vcf():
    !plink --vcf $path_vcf100g --extract $snp_filter_path --keep-fam $ind_path --recode vcf --out $out_vcf_path0 --biallelic-only strict --keep-allele-order

### Step 3b
def bctools_new_vcf0():
    """Same as PLINK, but with bcftools 
    [small hack with marker strings, so LEGACY code and replaced by bcftools_new_vcf]"""
    str_ex = "ID=@" + snp_filter_path
    #!echo bcftools view -Oz -o $out_vcf_path_gz -S $ind_path -i $str_ex -m2 -M2 -v snps $path_vcf100g
    !bcftools view -Oz -o $out_vcf_path_gz -S $ind_path -i $str_ex -m2 -M2 -v snps $path_vcf100g
    print("Finished BCF tools runs.")
    
def bctools_new_vcf(filter_iids=True, cluster=False):
    """Same as PLINK, but with bcftools and directly via Marker Positions.
    filter_iids: Whether to use the .csv with Indivdiduals to extract"""
    if filter_iids==True:
        if cluster==False:
            !bcftools view -Oz -o $out_vcf_path_gz -S $ind_path -T $marker_path -m2 -M2 -v snps $path_vcf100g
        elif cluster==True:
            !module load bcftools; bcftools view -Oz -o $out_vcf_path_gz -S $ind_path -T $marker_path -m2 -M2 -v snps $path_vcf100g     
    elif filter_iids==False:
        if cluster==False:
            !bcftools view -Oz -o $out_vcf_path_gz -T $marker_path -m2 -M2 -v snps $path_vcf100g
        elif cluster==True:
            !module load bcftools; bcftools view -Oz -o $out_vcf_path_gz -T $marker_path -m2 -M2 -v snps $path_vcf100g
    print("Finished BCF tools runs.")

### Step 4: Transfer to hdf5.
#allel.vcf_to_hdf5(input=out_vcf_path, output=path_hdf5final, compression="gzip") # Takes 1s
 
### Step 5: Merge in Linkage Map
### Load HDF5
def merge_in_ld_map():
    """Merge in ld_map into HDF5!"""
    f = h5py.File(path_hdf5final, "r") # Load for Sanity Check. See below!
    print("Merging in LD Map into HDF5...")
    print("Loaded %i variants" % np.shape(f["calldata/GT"])[0])
    print("Loaded %i individuals" % np.shape(f["calldata/GT"])[1])
    print(list(f["calldata"].keys()))
    print(list(f["variants"].keys()))
    #print(list(f["samples"].keys()))

    ### Load Eigenstrat
    df_snp = pd.read_csv(snp1240k_path, header=None, sep=r"\s+", engine="python")
    df_snp.columns = ["SNP", "chr", "map", "pos", "ref", "alt"]
    df_snp = df_snp[df_snp["chr"] == ch]
    print(f"Loaded {len(df_snp)} Chr.{ch} SNPs.")

    ### Intersect SNP positions
    its, i1, i2 = np.intersect1d(f["variants/POS"], df_snp["pos"], return_indices=True)

    l = len(f["variants/POS"])
    print(f"Intersection {len(i2)} out of {l}")

    ### Extract 
    rec = np.zeros(len(f["variants/POS"]))
    rec[i1] = df_snp["map"].values[i2]  # Fill in the values in Recombination map

    ids0 = np.where(rec == 0)[0] # The 0 Values
    rec[ids0] = (rec[ids0-1] + rec[ids0+1]) / 2.0 # Interpolate

    ### Make sure that sorted
    assert(np.all(np.diff(rec)>=0))  # Assert the Recombination Map is sorted! (no 0 left and no funky stuff)

    f.close()
    with h5py.File(path_hdf5final, 'a') as f0:
        group = f0["variants"]
        group.create_dataset('MAP', (l,), dtype='f')   
        f0["variants/MAP"][:] = rec[:]

    print(f"Finished Chromosome {ch}")
    
### Step 6: Delete the Data:
def del_temp_data():
    !rm $path_vcf100g # Delete the full 1000 genome .vcf
    !rm $out_vcf_path_gz # Delete the extracted .vcf
    #!rm $path_hdf5temp # The originally intermediate hdf5 (for 1240k intersection)

# Do all steps in one run (function)

In [6]:
def prep_1000genomes_full(ch):
    """ch: Which Chromosome to prepare"""
    prepare_paths(ch = ch)
    download_1kg(cluster=False)
    print("Download Complete")
    #vcf_to_hdf5(in_path=path_vcf100g, out_path=path_hdf5temp) # Takes 10 Minutes
    #print("Transformation to HDF5 Complete.")
    #merge_positions()
    save_1240kmarkers()
    #plink_new_vcf()
    bctools_new_vcf()
    vcf_to_hdf5(in_path=out_vcf_path_gz, out_path=path_hdf5final)
    merge_in_ld_map()
    #del_temp_data()
    print(f"Finished Preparing HDF5 Chromosome {ch}. GZ!")

In [None]:
#Loop to run all chromosomes

for ch in range(1, 23):
    print(f"Preparing Chromosome: {ch}")
    prep_1000genomes_full(ch)

# Prepare 1240k SNP HDF5 for CHB, CLM, YRI.
(Needs functions from downsample autosomal hdf5s but otherwise stand-alone)

### Prepare the Indivdual .csv for bcftools

In [9]:
def prepare_iid_csv(pops_oi=[], save_path = "../Data/1000Genomes/Individuals/CHB_CLM_YRI_fam.csv", 
                   pop_path="../Data/1000Genomes/integrated_call_samples_v3.20130502.ALL.panel", 
                    output=True):
    """Prepare .csv with Individuals.
    pops_oi: List of Population IIDs to extract. If empty, extract all
    save_path: Where to save the .csv File to
    pop_path: Input .csv with the Population information
    output: Whether to print output
    """
    df_pops = pd.read_csv(pop_path, sep="\t")
    inds = df_pops["pop"].isin(pops_oi)
    
    if output == True:
        print(f"Loaded {np.shape(df_pops)[0]} Individuals with Population Data")
        print(f"Found {np.sum(inds)} Individuals from target populations")
    
    df_save = df_pops[inds]   # Filter to target Individual rows
    df_save = df_save["sample"] # Extract column of IIds
    df_save.to_csv(save_path, sep="\t", header=None, index=False)
    
    if output == True:
        print(f"Saved to {save_path}. Nr saved IIDs: {len(df_save)}")

In [10]:
prepare_iid_csv(pops_oi = ["CHB", "CLM", "YRI"], save_path = "../Data/1000Genomes/Individuals/CHB_CLM_YRI_fam.csv")

Loaded 2504 Individuals with Population Data
Found 305 Individuals from target populations
Saved to ../Data/1000Genomes/Individuals/CHB_CLM_YRI_fam.csv. Nr saved IIDs: 305


### Overwrite paths for HDF5 creation

In [27]:
### Important Parameters and paths
ch = 3 # Which Chromosome to use:

# Path of the 1000 Genome VCF:
p1, p2 = "", ""
file_vcf100g, path_vcf100g = "", ""
out_vcf_path0, out_vcf_path = "", ""
path_hdf5temp, path_hdf5final = "", ""

snp1240k_path, ind_path = "", ""   # Where to find the 1240k SNPs
snp_filter_path = ""

def prepare_paths(ch = 3):
    """Prepares all the Paths need for processing Steps.
    ch: Which Chromosomes to use"""
    global p1, p2, file_vcf100g, path_vcf100g, out_vcf_path0, out_vcf_path, path_hdf5temp, path_hdf5final
    global snp1240k_path, ind_path, snp_filter_path, out_vcf_path_gz, marker_path
    # Path of the 1000 Genome VCF:
    p1 = "./Data/1000Genomes/AutosomeVCF/"
    p2 = ".phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
    file_vcf100g = "ALL.chr" + str(ch) + p2
    path_vcf100g = p1 + file_vcf100g
    print(f"Full Input path:\n{path_vcf100g}")
    out_vcf_path0 = "./Data/1000Genomes/AutosomeVCF/Subset/" + "1240NonEURchr" + str(ch) # needs no .vcf
    out_vcf_path = out_vcf_path0 + ".vcf"
    out_vcf_path_gz = out_vcf_path + ".gz"
    path_hdf5temp = "./Data/1000Genomes/HDF5/FULLHDF5/cr" + str(ch) + ".hdf5"
    path_hdf5final = "./Data/1000Genomes/HDF5/1240kHDF5/NonEur1240chr" + str(ch) + ".hdf5"
    snp1240k_path = "./Data/1000Genomes/Markers/MinMyc.snp"   # Where to find the 1240k SNPs
    ind_path = "./Data/1000Genomes/Individuals/CHB_CLM_YRI_fam.csv"   # Where to find the individual lists
    marker_path = "./Data/1000Genomes/Markers/1240k/chr" + str(ch) + ".csv"

    ### Path of SNP Filter
    snp_filter_path = "./Data/1000Genomes/Markers/variants1240k" + str(ch) + ".txt"

### Do the Extraction run for the HDF5 
(don't forget to have the other functions defined but program would complain)

In [30]:
def prep_1000genomes_full(ch):
    """ch: Which Chromosome to prepare"""
    prepare_paths(ch = ch)
    download_1kg()
    print("Download Complete")
    # Merge not needed here: As interesecting 1240k SNPs are already extracted above
    bctools_new_vcf()
    vcf_to_hdf5(in_path=out_vcf_path_gz, out_path=path_hdf5final)
    merge_in_ld_map()
    del_temp_data()
    print("Finished Preparing HDF5. GZ!")

In [31]:
### Only do Chromosome 3 [suffices for testing]
prep_1000genomes_full(ch = 3)   ### Takes about X min

Full Input path:
../Data/1000Genomes/AutosomeVCF/ALL.chr3.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz
ALL.chr3.phase3_shapeit2_mvncall_integrated_v 100% 1055MB  10.9MB/s   01:36    
Download Complete
Finished BCF tools runs.
Loaded 77652 variants
Loaded 305 individuals
['GT']
['ALT', 'CHROM', 'FILTER_PASS', 'ID', 'POS', 'QUAL', 'REF']


  yield pat.split(line.strip())
  yield pat.split(line.strip())


Loaded 81416 Chr.3 SNPs.
Intersection 77652 out of 77652
Finished Chromosome 3
Finished Preparing HDF5. GZ!


## Prepare 1240k HDF5 with all 1000g reference Indviduals

In [12]:
### Important Parameters and paths
ch = 3 # Which Chromosome to use:

# Path of the 1000 Genome VCF:
p1, p2 = "", ""
file_vcf100g, path_vcf100g = "", ""
out_vcf_path0, out_vcf_path = "", ""
path_hdf5temp, path_hdf5final = "", ""

snp1240k_path, ind_path = "", ""   # Where to find the 1240k SNPs
snp_filter_path = ""

def prepare_paths(ch = 3):
    """Prepares all the Paths need for processing Steps.
    ch: Which Chromosomes to use"""
    global p1, p2, file_vcf100g, path_vcf100g, out_vcf_path0, out_vcf_path, path_hdf5temp, path_hdf5final
    global snp1240k_path, ind_path, snp_filter_path, out_vcf_path_gz, marker_path
    # Path of the 1000 Genome VCF:
    p1 = "./Data/1000Genomes/AutosomeVCF/"
    p2 = ".phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
    file_vcf100g = "ALL.chr" + str(ch) + p2
    path_vcf100g = p1 + file_vcf100g
    print(f"Full Input path:\n{path_vcf100g}")
    out_vcf_path0 = "./Data/1000Genomes/AutosomeVCF/Subset/" + "1240all/chr" + str(ch) # needs no .vcf
    out_vcf_path = out_vcf_path0 + ".vcf"
    out_vcf_path_gz = out_vcf_path + ".gz"
    path_hdf5temp = "./Data/1000Genomes/HDF5/FULLHDF5/cr" + str(ch) + ".hdf5"
    path_hdf5final = "./Data/1000Genomes/HDF5/1240kHDF5/all1240/chr" + str(ch) + ".hdf5"
    snp1240k_path = "./Data/1000Genomes/Markers/MinMyc.snp"   # Where to find the 1240k SNPs
    ind_path = "./Data/1000Genomes/Individuals/NO_EXIST.csv"  # non-existing place-holder (sanity check)
    marker_path = "./Data/1000Genomes/Markers/1240k/chr" + str(ch) + ".csv"   
    
    for path in [out_vcf_path, path_hdf5final]:
        path_dir = os.path.dirname(path)
    
        if not os.path.exists(path_dir):
            os.makedirs(path_dir)
            print(f"Created new directory: {path_dir}")
    
    ### Path of SNP Filter
    snp_filter_path = "../Data/1000Genomes/Markers/variants1240k" + str(ch) + ".txt"

In [32]:
def prep_1000genomes_full(ch, cluster):
    """ch: Which Chromosome to prepare
    cluster: Whether Function is run on Cluster"""
    prepare_paths(ch = ch)
    download_1kg(cluster=cluster)  # Since we run it on the cluster
    print("Download Complete")
    ### SNP prep not needed here: Interesecting 1240k SNPs were already extracted above
    bctools_new_vcf(filter_iids=False, cluster=cluster)  # Important, turn off filter individuals here!
    vcf_to_hdf5(in_path=out_vcf_path_gz, out_path=path_hdf5final)
    merge_in_ld_map()
    del_temp_data()
    print("Finished Preparing HDF5. GZ!")

In [7]:
prep_1000genomes_full(ch=3, cluster=True)  ### Test with Chromosome 3

Full Input path:
../Data/1000Genomes/AutosomeVCF/ALL.chr3.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz
Created new directory: ../Data/1000Genomes/AutosomeVCF/Subset/1240all
Created new directory: ../Data/1000Genomes/HDF5/1240kHDF5/all1240
ALL.chr3.phase3_shapeit2_mvncall_integrated_v 100% 1055MB  11.2MB/s   01:34    
Download Complete
Finished BCF tools runs.
Loaded 77652 variants
Loaded 2504 individuals
['GT']
['ALT', 'CHROM', 'FILTER_PASS', 'ID', 'POS', 'QUAL', 'REF']


  yield pat.split(line.strip())
  yield pat.split(line.strip())


Loaded 81416 Chr.3 SNPs.
Intersection 77652 out of 77652
Finished Chromosome 3
Finished Preparing HDF5. GZ!


In [None]:
%%time
for ch in range(1,23):
    prep_1000genomes_full(ch, cluster=True)

Full Input path:
./Data/1000Genomes/AutosomeVCF/ALL.chr2.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz
Download Complete


### Prepare lean Meta File for the new Reference
(sample, pop, super_pop)

In [2]:
pop_path = "../Data/1000Genomes/integrated_call_samples_v3.20130502.ALL.panel" # Population Information
save_path ="../Data/1000Genomes/Individuals/meta_df_all.csv"

df_pops = pd.read_csv(pop_path, sep="\t")
print(f"Loaded {np.shape(df_pops)[0]} Rows of Population Data")

### Merge with IDs in Genotype File
df_save = df_pops[["sample", "pop", "super_pop"]]
df_save.to_csv(save_path, index=False, sep="\t") # Tab seperation!!
print(f"Successfully saved to {save_path}")

Loaded 2504 Rows of Population Data
Successfully saved to ../Data/1000Genomes/Individuals/meta_df_all.csv


In [6]:
df_save["super_pop"].value_counts()

AFR    661
EAS    504
EUR    503
SAS    489
AMR    347
Name: super_pop, dtype: int64

# Area51
Test Code here

### Debugging / Trouble Shooting
Run specific parts of the pipeline for trouble shooting

In [49]:
def prep_1000genomes_full_error(ch):
    """ch: Which Chromosome to prepare"""
    prepare_paths(ch = ch)
    #download_1kg()
    #print("Download Complete")
    #vcf_to_hdf5(in_path=path_vcf100g, out_path=path_hdf5temp) # Takes 10 Minutes
    #print("Transformation to HDF5 Complete.")
    #merge_positions()
    #plink_new_vcf()
    #bctools_new_vcf()
    vcf_to_hdf5(in_path=out_vcf_path_gz, out_path=path_hdf5final)
    #merge_in_ld_map()
    #del_temp_data()
    print("Finished Preparing HDF5. GZ!")

In [50]:
prep_1000genomes_full_error(ch=21)

Full Input path:
../Data/1000Genomes/AutosomeVCF/ALL.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz
Finished Preparing HDF5. GZ!


In [10]:
### Test that. Possible Solution to warning: s+
df_snp = pd.read_csv(snp1240k_path, header=None, sep=r"\s*", engine="python")

In [11]:
### Test the Final HDF5 just created
f = h5py.File(path_hdf5final, "r") # Load for Sanity Check. See below!
print("Merging in LD Map into HDF5...")
print("Loaded %i variants" % np.shape(f["calldata/GT"])[0])
print("Loaded %i individuals" % np.shape(f["calldata/GT"])[1])
print(list(f["calldata"].keys()))
print(list(f["variants"].keys()))

Merging in LD Map into HDF5...
Loaded 77652 variants
Loaded 2504 individuals
['GT']
['ALT', 'CHROM', 'FILTER_PASS', 'ID', 'MAP', 'POS', 'QUAL', 'REF']
