# French Canadian

This notebook measures how long it takes to load the French Canadian dataset (stored as a tszip-compressed tree sequence) with tszip, and reports the on-disk size of the compressed file. It then estimates how large the same dataset would be as an uncompressed VCF, by writing a 1000-site subset to VCF and extrapolating its size to all sites.

In [1]:
import numpy as np
import os
import time
import tszip

In [2]:
# Load the French Canadian dataset from its tszip-compressed file and time the
# load. `duration` is the elapsed time in seconds as measured by
# time.perf_counter(); leaving it as the last expression displays it as the
# cell's output.
before = time.perf_counter()
ts = tszip.decompress("data/simulated_genomes_chr9.tsz")
duration = time.perf_counter() - before
duration

76.64165680005681

In [3]:
# Display the number of individuals in the loaded dataset.
ts.num_individuals

2723339

In [4]:
# On-disk size, in bytes, of the tszip-compressed dataset file.
tszip_size = os.stat("data/simulated_genomes_chr9.tsz").st_size
print("zipped size is ", tszip_size, "bytes")

zipped size is  1356776810 bytes


In [5]:
# Build a small stand-in for the full dataset: delete every site whose ID is
# 1000 or higher (so at most the first 1000 sites remain) and write what is
# left to a VCF file whose size can then be measured.
tmp_vcf = "data/tmp.vcf"
dropped_site_ids = np.arange(1000, ts.num_sites)
ts_sub = ts.delete_sites(dropped_site_ids)
with open(tmp_vcf, "w") as vcf_out:
    ts_sub.write_vcf(vcf_out)

In [6]:
# Estimate the VCF size of the whole dataset from the subset written to
# `tmp_vcf`: scale the subset's byte count by (all sites / subset sites).
# NOTE(review): the subset file also contains the VCF header, which presumably
# does not grow with the number of sites, so this may overestimate slightly.
sub_size = os.stat(tmp_vcf).st_size
# Multiply first, then divide -- the same evaluation order as `a * b / c`.
scaled_bytes = ts.num_sites * sub_size
total_size = scaled_bytes / ts_sub.num_sites

print("sub is ", sub_size, "bytes")
print("extrapolated size = ", total_size, "bytes")

sub is  5723035308 bytes
extrapolated size =  279836956795082.2 bytes


In [7]:
# Pick 1000 distinct sites uniformly at random to keep, and collect every other
# site ID in `site_list` for deletion.
# - replace=False: sampling with replacement (the np.random.choice default) can
#   draw the same site twice, which would leave fewer than 1000 sites.
# - Fixed seed: the same sites are chosen on every Restart & Run All, so the
#   size estimate derived from them is reproducible.
# - min(...): a dataset with fewer than 1000 sites keeps all of them instead of
#   raising ValueError for an oversized sample.
site_rng = np.random.default_rng(42)
kept_site_ids = site_rng.choice(
    ts.num_sites, size=min(1000, ts.num_sites), replace=False
)
site_list = np.delete(np.arange(ts.num_sites), kept_site_ids)

In [8]:
# Delete every site listed in `site_list` and write the sites that remain to a
# second VCF file. This rebinds `ts_sub` and `tmp_vcf`, replacing the values
# assigned for the first-1000-sites subset.
tmp_vcf = "data/tmp1.vcf"
ts_sub = ts.delete_sites(site_list)
with open(tmp_vcf, mode="w") as vcf_out:
    ts_sub.write_vcf(vcf_out)

In [9]:
# Estimate the VCF size of the whole dataset from the file currently named by
# `tmp_vcf`: scale its byte count by (all sites / sites in `ts_sub`).
# NOTE(review): the file also contains the VCF header, which presumably does
# not grow with the number of sites, so this may overestimate slightly.
sub_size = os.stat(tmp_vcf).st_size
# Multiply first, then divide -- the same evaluation order as `a * b / c`.
scaled_bytes = ts.num_sites * sub_size
total_size = scaled_bytes / ts_sub.num_sites

print("sub is ", sub_size, "bytes")
print("extrapolated size = ", total_size, "bytes")

sub is  5723038511 bytes
extrapolated size =  279837113410885.56 bytes
