## Pilot for downsampling / throwing in errors
Plan: Input is an HDF5, output is an HDF5. Eventually make a class that can downsample and/or throw errors on the reads.

Class is a wrapper of an HDF5, and applies the operations to it.

In [16]:
import allel
import h5py  # Python Package to do the HDF5.
import numpy as np
import pandas as pd
import socket
import os

# Pick the project root depending on the machine this notebook runs on.
socket_name = socket.gethostname()
print(socket_name)

if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # Harald's local machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # Midway cluster
else:
    raise RuntimeWarning("Not compatible machine. Check!!")

# All relative paths below are resolved against the project root.
os.chdir(path)
print(os.getcwd())  # Show the current working directory (the HAPSBURG root set above)

midway2-0402.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG


In [17]:
class ModifyHDF5Genotypes(object):
    """Class for modifying HDF5 genotypes and saving new HDF5s.

    The original file is opened read-only; every create_*/downsample_*/generate_*
    method derives new arrays from it and writes them to save_path via save_data
    (flip errors, downsampling of loci, read counts, pseudo-haploid read counts).
    The instance can be used as a context manager, or closed with close(), to
    release the handle of the original file.
    Plan: Also do contamination"""

    f = None    # The open hdf5 object to modify (None as long as nothing is loaded)
    original_path = "" # Where to find the original HDF5
    save_path = ""  # Where to save the modified HDF5 to
    output = True # Whether to print any output
    gt_new = []  # Not used by any method; kept so existing attribute access keeps working

    def __init__(self, original_path="", save_path="", output=True):
        """original_path: Where to load a HDF5 from (empty string: load nothing yet)
           save_path: Where to save the new HDF5 to
           output: Whether to print progress messages"""
        self.output = output
        self.save_path = save_path

        if self.output:
            print("Heyho back old friend. I started running")

        if len(original_path)>0:
            self.original_path = original_path
            self.load_data()
        else:
            print("No HDF5 Loaded! Alarm. Alarm. Alarm.")

    def __enter__(self):
        """Allow `with ModifyHDF5Genotypes(...) as m:` usage."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the original HDF5 when leaving the with-block. Never swallows exceptions."""
        self.close()
        return False

    def close(self):
        """Close the handle of the original HDF5, if one is open."""
        if self.f is not None:
            self.f.close()
            self.f = None

    def load_data(self, path=""):
        """Open the HDF5 at path read-only (default: self.original_path)
        and run a sanity check on its genotypes.
        Raises AssertionError if the smallest genotype value is not 0
        or the largest is not 1."""
        if len(path)==0:
            path = self.original_path

        self.close()  # Do not leak the handle of a previously loaded file
        self.f = h5py.File(path, "r")

        if self.output:
            print("Loaded HDF5")
            print("Loaded %i variants" % np.shape(self.f["calldata/GT"])[0])
            print("Loaded %i individuals" % np.shape(self.f["calldata/GT"])[1])
            print(list(self.f["calldata"].keys()))
            print(list(self.f["variants"].keys()))

        ### Sanity Check whether both Genotypes are there and nothing else
        gt_values = self.f["calldata/GT"][:]  # Read once instead of once per check
        assert(np.min(gt_values) == 0)
        assert(np.max(gt_values) == 1)

    def save_data(self, gt, ad, ref, alt, pos, 
                  rec, samples, path, compression="gzip", ad_group=True,
                  gt_type="int8", ad_type="int8"):
        """Create a new HDF5 File with Input Data.
        gt: Genotype data [l,k,2]
        ad: Allele depth [l,k,2] (not touched if ad_group is False)
        ref: Reference Allele [l]
        alt: Alternate Allele [l]
        pos: Position  [l]
        rec: Map position [l]
        samples: Sample IDs [k].
        path: Where to write the new HDF5. An existing file there is deleted first
        compression: Compression filter used for the GT and AD datasets
        ad_group: Whether to save allele depth
        gt_type: What genotype data type to save
        ad_type: What allele depth data type to save. The default stays int8;
        pass a wider type (e.g. "int16") if read counts can exceed the int8 range.
        Note: REF/ALT are cut to 1 byte and sample IDs to 10 bytes before writing."""

        l, k, _ = np.shape(gt)  # Nr loci and Nr of Individuals

        if os.path.exists(path):  # Do a Deletion of existing File there
            os.remove(path)

        dt = h5py.special_dtype(vlen=str)  # To have no problem with saving

        with h5py.File(path, 'w') as f0:
            ### Create all the Groups
            f_map = f0.create_dataset("variants/MAP", (l,), dtype='f')
            if ad_group:
                f_ad = f0.create_dataset("calldata/AD", (l, k, 2), dtype=ad_type, compression=compression)
            f_ref = f0.create_dataset("variants/REF", (l,), dtype=dt)
            f_alt = f0.create_dataset("variants/ALT", (l,), dtype=dt)
            f_pos = f0.create_dataset("variants/POS", (l,), dtype='int32')
            f_gt = f0.create_dataset("calldata/GT", (l, k, 2), dtype=gt_type, compression=compression)
            f_samples = f0.create_dataset("samples", (k,), dtype=dt)

            ### Save the Data
            f_map[:] = rec
            if ad_group:
                f_ad[:] = ad
            f_ref[:] = np.asarray(ref).astype("S1")
            f_alt[:] = np.asarray(alt).astype("S1")
            f_pos[:] = pos
            f_gt[:] = gt
            f_samples[:] = np.array(samples).astype("S10")

        if self.output:
            print(f"Successfully saved {k} individuals to: {path}")

    def create_error_gt(self, freq_flips=0.01):
        """Create Error on the HDF5 of genotypes and save the result to self.save_path.
        freq_flips: Probability with which each single genotype entry gets flipped"""
        f = self.f
        gt = f["calldata/GT"][:]  # In memory, so the arithmetic below is plain numpy

        switch = np.random.random(np.shape(gt)) < freq_flips

        if self.output:
            print(f"Swapping frac of SNPs: {np.mean(switch):.6f}")

        ### Switch the Genotypes (0 -> 1 and 1 -> 0 where switch is True)
        gt_new = (gt + switch) %2

        self.save_data(gt_new, f["calldata/AD"], f["variants/REF"][:], f["variants/ALT"][:], f["variants/POS"], 
                       f["variants/MAP"], f["samples"][:], self.save_path)

    def downsample_gt(self, frac=0.9, ad=True, mult_alt=False, 
                      gt_type="int8", compression=None):
        """Downsample the HDF5 to fewer loci and save the result to self.save_path.
        The map, position, ref and alt vectors are subset to the same loci.
        frac: To what fraction of markers one downsamples
        ad: Whether original HDF5 has AD field (if not, no AD is saved)
        mult_alt: Whether there are multiple alternative Alleles in the original HDF5
        (then only the first one is kept)
        gt_type: What genotype data type to save
        compression: Compression filter handed on to save_data"""
        f = self.f
        gt = f["calldata/GT"]

        ### Decide on SNPs
        l, _, _ = np.shape(gt)
        survive = np.random.random(l) <= frac
        if self.output:
            print(f"Fraction Loci surviving {np.mean(survive):.6f}")

        ### Downsample
        gt_new = gt[survive,:,:].astype(gt_type)
        r_map_new = f["variants/MAP"][survive]
        if ad:
            ad_new = f["calldata/AD"][survive,:,:]
        else:
            ad_new = None  # save_data never touches ad when ad_group is False

        ref_new = f["variants/REF"][survive]

        if mult_alt:
            alt_new = f["variants/ALT"][survive,0]   
        else:
            alt_new = f["variants/ALT"][survive]

        pos_new = f["variants/POS"][survive]

        ### Save the downsampled data
        self.save_data(gt_new, ad_new, ref_new, alt_new, pos_new, r_map_new, 
                       f["samples"], self.save_path, 
                       ad_group=ad, gt_type=gt_type, compression=compression)

    def generate_binomial_rc(self, mean_rc=1):
        """Generate Readcount Data from GT data and save it, together with
        the unchanged genotypes, to self.save_path.
        mean_rc: The Mean total Readcount per site"""

        f = self.f
        gt = f["calldata/GT"][:]

        ### Create the Poisson Readcounts with the right mean
        rc_full = poisson_readcounts(gt, mean_rc, output=self.output) 

        self.save_data(gt, rc_full, f["variants/REF"][:], f["variants/ALT"][:], f["variants/POS"], 
               f["variants/MAP"], f["samples"][:], self.save_path)

    def generate_lambda_rc(self, mean_rc = 1, norm_counts=True,
                           lambda_path = "./Data/1000Genomes/Coverage/mean_cov1240k_Marcus.csv"):
        """Generate Readcount Data from GT data, with a per-locus mean read count.
        Use Table found at lambda_path for Lambdas 
        (relative. mean coverages, normed to 1 genome-wide).
        Only loci whose position is found in the table are saved.
        mean_rc: Mean total Readcount that the Lambdas get multiplied with
        norm_counts: Whether to normalize on overlapping Readcounts
        Raises ValueError if no locus of the HDF5 is found in the table."""

        df_lambda = load_lambda(lambda_path, output=self.output)  ### Load the Lambda Data

        f = self.f
        gt = f["calldata/GT"][:]
        l, _, _ = np.shape(gt)

        pos_f = f["variants/POS"][:]  # The Position of the Original 
        _, i1, i2 = np.intersect1d(pos_f, df_lambda["Pos"], return_indices=True)

        if self.output:
            print(f"Found {len(i1)} / {l} Loci in Lambda Table")

        if len(i1) == 0:
            raise ValueError(f"No loci of the HDF5 found in Lambda Table: {lambda_path}")

        lambdas = df_lambda["Lambda"].values[i2]
        if norm_counts == True: # Normalize to extracted lambdas
            lambdas = lambdas / np.mean(lambdas)

        mean_cov = lambdas * mean_rc  # Extract the Means that Intersect
        gt = gt[i1,:,:]  # Downsample to Loci intersecting the Lambda Table 

        ### Do the Binomial Readcount Sampling (one mean per locus, broadcast over individuals)
        rc_full = poisson_readcounts(gt, mean_cov[:,None], output=self.output) 

        # Subset in numpy (after reading), so the order of i1 does not matter
        self.save_data(gt, rc_full, f["variants/REF"][:][i1], f["variants/ALT"][:][i1], pos_f[i1], 
               f["variants/MAP"][:][i1], f["samples"][:], self.save_path)

    def generate_ph(self, coverage = 1.0, error = 0.0):
        """Generate Pseudo-Haploid Data with fraction coverage sites covered,
        and then error thrown down. Saves the result to self.save_path.
        coverage: Fraction of sites covered
        error: Fraction of sites with error. If >0, flip error added at random"""

        f = self.f
        gt = f["calldata/GT"]
        l, _, _ = np.shape(gt)

        idx = np.random.random(l)<=coverage  # Which sites are covered
        gt = gt[idx, :, :]  # Extract downsampled SNPs

        switch = [0,]  # Placeholder so the summary below also works without errors
        if error>0:
            # Only flip entries that are >= 0
            switch = (np.random.random(np.shape(gt)) < error) & (gt >= 0)
            gt = (gt + switch) %2 # Switch the Genotypes

        if self.output:
            print(f"{np.sum(idx)} / {len(idx)} SNPs pseudohaploidized.")
            print(f"Added fraction errors to SNPs: {np.mean(switch):.6f}")
            print(f"Added sum errors: {np.sum(switch):.0f}")

        rc = pseudo_haploid(gt) # Generate Pseudo-Haploid Readcounts

        # idx is a boolean numpy array, which HDF5 datasets accept as index
        self.save_data(gt, rc, f["variants/REF"][idx], f["variants/ALT"][idx], f["variants/POS"][idx], 
                       f["variants/MAP"][idx], f["samples"][:], self.save_path)

    def copy_rohinfo(self, load_path="", save_path="", file="roh_info.csv"):
        """Copy in the ROH Info from folder of load path into folder of save_path.
        load_path: Path of the original HDF5 (default: self.original_path)
        save_path: Path of the new HDF5 (default: self.save_path)
        file: Which file to copy (roh_info by default)
        If the file does not exist next to load_path, a warning is printed and
        nothing is copied. If both folders are the same, nothing is done."""
        import shutil  # Local import: only needed here

        if len(load_path) == 0:
            load_path = self.original_path

        if len(save_path) == 0:
            save_path = self.save_path

        # os.path.join keeps relative paths relative, also if the dirname is empty
        source = os.path.join(os.path.dirname(load_path), file)
        target = os.path.join(os.path.dirname(save_path), file)

        if not os.path.isfile(source):
            print(f"Warning: {source} not found. Nothing copied.")
            return

        if os.path.exists(target) and os.path.samefile(source, target):
            return  # Source and target are the same file, nothing to do

        ### Copy the file
        shutil.copy(source, target)
        
##########################################
#### Some Small Helper Functions

def load_lambda(loadpath, ch=3, output=True):
    """Read the Lambda table at loadpath and return its rows for chromosome ch.
    The table needs the columns "Lambda" and "Ch"; the mean of "Lambda"
    over the whole table has to be close to 1 (AssertionError otherwise).
    output: Whether to print how many loci were extracted"""
    full_table = pd.read_csv(loadpath)

    # Sanity Check if Valid Lambda Vector
    assert(np.isclose(np.mean(full_table["Lambda"]), 1))

    n_all = len(full_table)
    on_chromosome = full_table["Ch"] == ch
    df_lambda = full_table[on_chromosome]

    if output==True:
        print(f"Extracted {len(df_lambda)} / {n_all} Loci on Chr.{ch}")
    return df_lambda

def poisson_readcounts(gt, mean_rc, output=True):
    """Draw read counts for a genotype matrix and return them.
    The total depth per locus and individual is Poisson with mean mean_rc,
    the derived reads are then binomial with the mean genotype as probability.
    gt: Underlying Genotype Matrix [l, n, 2]
    mean_rc: Mean of the Poisson (anything that broadcasts to [l, n])
    output: Whether to print the realized mean read count
    Return readcount array: [l, n, 2], ref counts first, derived counts second"""
    n_loci, n_ind, _ = np.shape(gt)

    # Total depth first, so the order of random draws is depth -> derived reads
    depth = np.random.poisson(lam=mean_rc, size=(n_loci, n_ind))

    # Fraction of derived alleles per locus and individual
    frac_der = np.mean(gt, axis=2)
    assert(np.max(frac_der)<=1) ### Sanity Check whether allele freqs are right
    assert(np.min(frac_der)>=0)

    der_reads = np.random.binomial(n=depth, p=frac_der)
    rc_full = np.stack([depth - der_reads, der_reads], axis=2)
    assert(np.shape(rc_full) == np.shape(gt))  # Check whether data was created properly

    if output == True:
        print(f"Mean Readcount: {np.mean(depth):.4f}")

    return rc_full

def pseudo_haploid(gt):
    """Draw one read per locus and individual and return it as read counts.
    The read is derived with probability equal to the mean genotype.
    gt: Underlying Genotype Matrix [l, n, 2]
    Return readcount array: [l, n, 2], ref count first, derived count second"""
    frac_der = np.mean(gt, axis=2)  # Probability that the single read is derived

    der_read = np.random.binomial(n=1, p=frac_der)
    rc_full = np.stack([1 - der_read, der_read], axis=2)

    # Exactly one read per entry: every count is 0 or 1
    assert(np.max(rc_full)<=1)
    assert(np.min(rc_full)==0)
    assert(np.shape(rc_full) == np.shape(gt))  # Check whether data was created properly
    return rc_full

### Prepare Pseudohaploid data hdf5s that are used for parameter setting
.) Simulate 0.5x coverage and a 0.01 error rate from TSI5 (perfect genotypes) into TSI6  
.) Simulate 1x coverage with 0.001 error rate from TSI5 into TSI7

In [None]:
%%time
org_folder = "./Simulated/1000G_Mosaic/TSI5/"
out_folder = "./Simulated/1000G_Mosaic/TSI7/"

snps_cov = 1.0 # 0.5 for TSI6
e_rate = 0.001
lengths = [0, 2, 4, 6, 8, 10]

# One pseudo-haploid HDF5 per block length, written into the matching sub-folder of out_folder
for l in lengths:
    org_folder1 = f"{org_folder}ch3_{l}cm/"
    load_path = f"{org_folder1}data.h5"
    save_path = f"{out_folder}ch3_{l}cm/data.h5"

    # Make Directory if not already there
    save_dir = os.path.dirname(save_path)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    m = ModifyHDF5Genotypes(original_path=load_path, save_path=save_path)
    m.generate_ph(coverage=snps_cov, error=e_rate)
    m.copy_rohinfo()   # Copy the ROH Info!!

# Save the reference with int8 Genotype data. Faster to read!
2020 Version: Include gzip and don't save Allele Depths
gt_type: Specify the dtype of the genotype data (int8 or np.bool)

In [None]:
%%time
org_folder = "./Data/1000Genomes/HDF5/1240kHDF5/all1240/chr"
out_folder = "./Data/1000Genomes/HDF5/1240kHDF5/all1240bool0/chr"

chs = range(1, 23)

# Re-save every chromosome with all loci kept (frac=1.0), int8 genotypes and no AD
for ch in chs:
    load_path = f"{org_folder}{ch}.hdf5"
    save_path = f"{out_folder}{ch}.hdf5"

    # Make Directory if not already there
    save_dir = os.path.dirname(save_path)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    m = ModifyHDF5Genotypes(original_path=load_path, save_path=save_path)
    m.downsample_gt(frac=1.0, ad=False, mult_alt=True, gt_type="int8")

## Prepare data hdf5s with various levels of errors

In [None]:
error_vec = np.logspace(-3,-1, 8)
org_folder = "./Simulated/1000G_Mosaic/TSI5/"
lengths = [0, 2, 4, 6, 8, 10]

for l in lengths:
    # Source folder and file are the same for all error levels of one block length
    org_folder1 = f"{org_folder}ch3_{l}cm/"
    load_path = f"{org_folder1}data.h5"

    for e in error_vec:
        print(f"Doing Error Frac: {e}")

        # Folder name: the digits after the decimal point of the error rounded to 4 digits
        e_rounded = round(e, 4)
        e_print = str(e_rounded).split(".")[1]
        save_path = f"{org_folder1}error/{e_print}/data.h5"

        # Make Directory if not already there
        save_dir = os.path.dirname(save_path)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        m = ModifyHDF5Genotypes(original_path=load_path, save_path=save_path)
        m.create_error_gt(freq_flips=e)
        m.copy_rohinfo()   # Copy the ROH Info!!

## Prepare data hdf5s with various levels of missingness

In [None]:
missing_vec = np.linspace(0.1, 1.0, 10)
org_folder = "./Simulated/1000G_Mosaic/CHB/"
lengths = [0,]

for l in lengths:
    # Source folder and file are the same for all fractions of one block length
    org_folder1 = f"{org_folder}ch3_{l}cm/"
    load_path = f"{org_folder1}data.h5"

    for m in missing_vec:
        print(f"Doing Missing Fraction: {m}")

        # Folder name: the digits after the decimal point of the fraction rounded to 4 digits
        m_rounded = round(m, 4)
        m_print = str(m_rounded).split(".")[1]
        save_path = f"{org_folder1}missing/{m_print}/data.h5"

        # Make Directory if not already there
        save_dir = os.path.dirname(save_path)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        modh5 = ModifyHDF5Genotypes(original_path=load_path, save_path=save_path)
        modh5.downsample_gt(frac=m)
        modh5.copy_rohinfo()   # Copy the ROH Info!!

## Create data hdf5s with Lambda Readcounts around Poisson mean

In [3]:
def create_poisson_mean_hdf5s(lengths, mean_rcs, org_folder="./Simulated/1000G_Mosaic/TSI5/", 
                              lambda_rc_string="lambda_rc", output=False):
    """Create HDF5s with Lambda Readcount Data for every mean read count and block length.
    Each result is saved below the original folder, in a sub-folder
    named lambda_rc_string followed by the mean read count (one decimal).
    lengths: Block Lengths Array [in CM]
    mean_rcs: Mean Readcount Array
    org_folder: The original Folder
    lambda_rc_string: Prefix of the sub-folder to save into
    output: Whether the HDF5 modifier prints its progress"""

    for mean_rc in mean_rcs:
        print(f"Simulating Mean RC: {mean_rc}x ")
        for l in lengths:
            print(f"Doing block length: {l} cM")
            block = f"ch3_{l}cm/data.h5"
            original_path = org_folder + block
            save_path = f"{org_folder}{lambda_rc_string}{mean_rc:.1f}/{block}"

            target_dir = os.path.dirname(save_path)
            if not os.path.exists(target_dir):
                print(f"Creating DIR: {save_path}")
                os.makedirs(target_dir)

            m = ModifyHDF5Genotypes(original_path = original_path, save_path = save_path, output=output)
            m.generate_lambda_rc(mean_rc = mean_rc)
            m.copy_rohinfo()

    print("Once more: Finished the Job.")

In [None]:
# Lambda read-count HDF5s for the CHB simulations: every block length at every mean read count
lengths = [0, 2, 4, 6, 8, 10] # The Block Lengths to simulate
mean_rcs = np.linspace(0.1, 1, 10)
# Alternative settings used before (kept for reference):
#mean_rcs = np.linspace(2, 6, 5)
#lengths =[8]
#mean_rcs = [1.]

org_folder = "./Simulated/1000G_Mosaic/CHB/"
create_poisson_mean_hdf5s(lengths=lengths, mean_rcs=mean_rcs, org_folder=org_folder)

### Create Downsampled Readcount File from TSI6 (the test case with 0.5x PH and 0.01 error)

In [4]:
# Lambda read counts with mean 1.0 for the 0 cM and 4 cM blocks of TSI6
create_poisson_mean_hdf5s(lengths=[0,4,], mean_rcs=[1.0,], 
                          org_folder="./Simulated/1000G_Mosaic/TSI6/")

Simulating Mean RC: 1.0x 
Doing block length: 0 cM
Creating DIR: ./Simulated/1000G_Mosaic/TSI6/lambda_rc1.0/ch3_0cm/data.h5
Doing block length: 4 cM
Creating DIR: ./Simulated/1000G_Mosaic/TSI6/lambda_rc1.0/ch3_4cm/data.h5
Once more: Finished the Job.


# Prepare data hdf5s with downsampled Pseudohaploid Data

In [10]:
def create_pseudohaploid_hdf5s(lengths, coverages, org_folder="./Simulated/1000G_Mosaic/TSI5/", 
                              lambda_rc_string="ph", output=False, error=0.0):
    """Create pseudo-haploid HDF5s for every coverage and block length.
    Each result is saved below the original folder, in a sub-folder
    named lambda_rc_string followed by the coverage (one decimal),
    and the ROH info file is copied next to it.
    lengths: Block Lengths Array [in CM]
    coverages: Array of coverages (fraction of sites covered, handed to generate_ph)
    org_folder: The original Folder
    lambda_rc_string: Prefix of the sub-folder to save into
    output: Whether the HDF5 modifier prints its progress
    error: Flip error rate handed to generate_ph. The default 0.0 adds no errors"""

    for cov in coverages:
        print(f"Simulating Coverage: {cov}x ")
        for l in lengths:
            print(f"Doing block length: {l} cM")
            original_path = org_folder + "ch3_" + str(l) + "cm/data.h5"  # Hardcoded Path to Original Data
            save_path = org_folder + lambda_rc_string + f"{cov:.1f}" + "/ch3_" + str(l) + "cm/data.h5"

            save_dir = os.path.dirname(save_path)
            if not os.path.exists(save_dir):
                print(f"Creating DIR: {save_path}")
                os.makedirs(save_dir)

            m = ModifyHDF5Genotypes(original_path = original_path, save_path = save_path, output=output)
            m.generate_ph(coverage=cov, error=error)
            m.copy_rohinfo()
    print("Great Job: Finished Successfully!")

In [13]:
# Pseudo-haploid HDF5s for TSI5: every block length at coverage 1.0, saved into "ph1.0" sub-folders
lengths = [0, 2, 4, 6, 8, 10] # The Block Lengths to simulate
coverages = [1.0,]

org_folder = "./Simulated/1000G_Mosaic/TSI5/"
create_pseudohaploid_hdf5s(lengths=lengths, coverages=coverages, lambda_rc_string="ph", org_folder=org_folder)

Simulating Coverage: 1.0x 
Doing block length: 0 cM
Doing block length: 2 cM
Creating DIR: ./Simulated/1000G_Mosaic/TSI5/ph1.0/ch3_2cm/data.h5
Doing block length: 4 cM
Creating DIR: ./Simulated/1000G_Mosaic/TSI5/ph1.0/ch3_4cm/data.h5
Doing block length: 6 cM
Creating DIR: ./Simulated/1000G_Mosaic/TSI5/ph1.0/ch3_6cm/data.h5
Doing block length: 8 cM
Creating DIR: ./Simulated/1000G_Mosaic/TSI5/ph1.0/ch3_8cm/data.h5
Doing block length: 10 cM
Creating DIR: ./Simulated/1000G_Mosaic/TSI5/ph1.0/ch3_10cm/data.h5
Great Job: Finished Successfullyb.


# Area 51

In [36]:
# Scratch test: load one HDF5 and only copy the ROH info.
# Load and save path are in the same folder here, so the copy has source == target.
m = ModifyHDF5Genotypes(original_path="./Simulated/1000G_Mosaic/TSI5/ch3_2cm/data.h5", save_path="./Simulated/1000G_Mosaic/TSI5/ch3_2cm/data_shitty.h5")
# Modifications that can be switched on for testing:
#m.downsample_gt(frac=0.8)
#m.create_error_gt(freq_flips=0.01)
m.copy_rohinfo()

Heyho back old friend. I started running
Loaded HDF5
Loaded 77652 variants
Loaded 100 individuals
['AD', 'GT']
['ALT', 'MAP', 'POS', 'REF']
./Simulated/1000G_Mosaic/TSI5/ch3_2cm/roh_info.csv
./Simulated/1000G_Mosaic/TSI5/ch3_2cm/roh_info.csv


### Creating Readcount Data from Genotype Data: Test

In [3]:
original_path = "./Simulated/1000G_Mosaic/TSI5/ch3_4cm/data.h5"
save_path = "./Simulated/1000G_Mosaic/TSI5/rc/ch3_4cm/data.h5"

# Make sure the target folder exists before anything gets written
save_dir = os.path.dirname(save_path)
if not os.path.exists(save_dir):
    print(f"Creating DIR: {save_path}")
    os.makedirs(save_dir)

# Read counts with a mean of 2 reads per site
m = ModifyHDF5Genotypes(original_path=original_path, save_path=save_path)
m.generate_binomial_rc(mean_rc=2)
m.copy_rohinfo()

Creating DIR: ./Simulated/1000G_Mosaic/TSI5/rc/ch3_4cm/data.h5
Heyho back old friend. I started running
Loaded HDF5
Loaded 77652 variants
Loaded 100 individuals
['AD', 'GT']
['ALT', 'MAP', 'POS', 'REF']
Mean Readcount: 2.0003
Successfully saved 100 individuals to: ./Simulated/1000G_Mosaic/TSI5/rc/ch3_4cm/data.h5


### Create Lambda Readcount Data from Genotypes Data: Test Case

In [28]:
original_path = "./Simulated/1000G_Mosaic/TSI5/ch3_4cm/data.h5"
save_path = "./Simulated/1000G_Mosaic/TSI5/lambda_rc0.3/ch3_4cm/data.h5"

# Make sure the target folder exists before anything gets written
save_dir = os.path.dirname(save_path)
if not os.path.exists(save_dir):
    print(f"Creating DIR: {save_path}")
    os.makedirs(save_dir)

# Lambda read counts with a mean of 0.3 reads per site
m = ModifyHDF5Genotypes(original_path=original_path, save_path=save_path)
m.generate_lambda_rc(mean_rc=0.3)
m.copy_rohinfo()

Heyho back old friend. I started running
Loaded HDF5
Loaded 77652 variants
Loaded 100 individuals
['AD', 'GT']
['ALT', 'MAP', 'POS', 'REF']
Extracted 81079 / 1145647 Loci on Chr.3
Found 77650 / 77652 Loci in Lambda Table
Mean Readcount: 0.1999
Successfully saved 100 individuals to: ./Simulated/1000G_Mosaic/TSI5/lambda_rc0.2/ch3_4cm/data.h5


## Testing HDF5s

In [25]:
# Other files that were inspected with this cell:
#path = "./Simulated/1000G_Mosaic/TSI5/rc0.2/ch3_4cm/data.h5"
#path = "./Simulated/1000G_Mosaic/TSI5/ch3_10cm/data.h5"
#path = "./Simulated/1000G_Mosaic/TSI5/lambda_rc0.2/ch3_4cm/data.h5"
path = "./Simulated/1000G_Mosaic/CHB/ch3_4cm/data.h5"

# Opened read-only; the handle f stays open for the cells below
f = h5py.File(path, "r")

print("Loaded HDF5")
gt_shape = np.shape(f["calldata/GT"])
print("Loaded %i variants" % gt_shape[0])
print("Loaded %i individuals" % gt_shape[1])
for group in ("calldata", "variants"):
    print(list(f[group].keys()))

Loaded HDF5
Loaded 77652 variants
Loaded 100 individuals
['AD', 'GT']
['ALT', 'MAP', 'POS', 'REF']


In [26]:
%%time
# Genotypes of the individual at index 0, for all variants
f["calldata/GT"][:,0,:]

CPU times: user 6.51 ms, sys: 22.8 ms, total: 29.3 ms
Wall time: 688 ms


array([[1, 1],
       [1, 0],
       [0, 0],
       ...,
       [0, 0],
       [1, 1],
       [0, 0]], dtype=int32)

In [27]:
%%time
# Allele depths of the individual at index 0, for all variants
f["calldata/AD"][:,0,:]

CPU times: user 3.33 ms, sys: 24.2 ms, total: 27.6 ms
Wall time: 648 ms


array([[1, 1],
       [1, 0],
       [0, 0],
       ...,
       [0, 0],
       [1, 1],
       [0, 0]], dtype=int32)

In [20]:
# Total depth per variant and individual: both allele depth entries summed up
tot_cov = np.sum(f["calldata/AD"], axis=2)
print(f"Mean Coverage: {np.mean(tot_cov):.5f}")

Mean Coverage: 0.39964


In [21]:
# Variance of the total depth over all variants and individuals
np.var(tot_cov)

0.5753332670021388

In [23]:
# Fraction of entries without any read
np.mean(tot_cov==0)

0.7180924661944623

In [14]:
# Release the handle of the inspected HDF5
# NOTE(review): cells further down use f again - it has to be re-opened before running them
f.close()

## Testing the Lambda RCs:

In [None]:
# Lambda table restricted to the chromosome under test
ch = 3
loadpath = "./Data/1000Genomes/Coverage/mean_cov1240k_Marcus.csv"

df_lambda = load_lambda(loadpath, ch=ch)

In [33]:
# Indices of the positions shared by the HDF5 (i1) and the Lambda table (i2)
# NOTE(review): relies on f and df_lambda left over from other cells; in this notebook
# f is closed above and only re-opened in the next cell - confirm the run order
pos_f = f["variants/POS"][:]
_, i1, i2 = np.intersect1d(pos_f, df_lambda["Pos"], return_indices=True)

In [10]:
path = "./Simulated/1000G_Mosaic/YRI/ch3_4cm/data.h5"

# Opened read-only; the handle f stays open for the cells below
f = h5py.File(path, "r")

print("Loaded HDF5")
ad_shape = np.shape(f["calldata/AD"])
print("Loaded %i variants" % ad_shape[0])
print("Loaded %i individuals" % ad_shape[1])
for group in ("calldata", "variants"):
    print(list(f[group].keys()))

Loaded HDF5
Loaded 77652 variants
Loaded 100 individuals
['AD', 'GT']
['ALT', 'MAP', 'POS', 'REF']


In [11]:
# Sample IDs stored in the HDF5
f["samples"][:]

array(['iid0', 'iid1', 'iid2', 'iid3', 'iid4', 'iid5', 'iid6', 'iid7',
       'iid8', 'iid9', 'iid10', 'iid11', 'iid12', 'iid13', 'iid14',
       'iid15', 'iid16', 'iid17', 'iid18', 'iid19', 'iid20', 'iid21',
       'iid22', 'iid23', 'iid24', 'iid25', 'iid26', 'iid27', 'iid28',
       'iid29', 'iid30', 'iid31', 'iid32', 'iid33', 'iid34', 'iid35',
       'iid36', 'iid37', 'iid38', 'iid39', 'iid40', 'iid41', 'iid42',
       'iid43', 'iid44', 'iid45', 'iid46', 'iid47', 'iid48', 'iid49',
       'iid50', 'iid51', 'iid52', 'iid53', 'iid54', 'iid55', 'iid56',
       'iid57', 'iid58', 'iid59', 'iid60', 'iid61', 'iid62', 'iid63',
       'iid64', 'iid65', 'iid66', 'iid67', 'iid68', 'iid69', 'iid70',
       'iid71', 'iid72', 'iid73', 'iid74', 'iid75', 'iid76', 'iid77',
       'iid78', 'iid79', 'iid80', 'iid81', 'iid82', 'iid83', 'iid84',
       'iid85', 'iid86', 'iid87', 'iid88', 'iid89', 'iid90', 'iid91',
       'iid92', 'iid93', 'iid94', 'iid95', 'iid96', 'iid97', 'iid98',
       'iid99'], dtyp

In [86]:
# Variance of the total depth of the individual at index 15
# NOTE(review): ad is not defined in any cell of this notebook - presumably the
# allele depths f["calldata/AD"][:]; confirm before re-running
tot_cov = np.sum(ad, axis=2)
np.var(tot_cov[:, 15])

6.375369456536567