# French Canadian

This notebook estimates the computational size of the French Canadian dataset, and computes the time that it takes to load the dataframe by using tszip. This also examines the computational size of the compressed French Canadian dataset and the compressed vcf file.

The chromosome 9 data from the French Canadian dataset (`simulated_genomes_chr9.tsz`) is installed from https://zenodo.org/record/6839683.

Please put it inside `data` folder before running the code.

In [1]:
import gzip
import humanize
import numpy as np
import os
import shutil
import time
import tszip

In [2]:
# Location of French Canadian dataset
before = time.perf_counter()
ts = tszip.decompress("data/simulated_genomes_chr9.tsz")
duration = time.perf_counter() - before
duration

62.88050389988348

In [3]:
ts.num_individuals

2723339

In [4]:
tszip_size = os.path.getsize("data/simulated_genomes_chr9.tsz")
print("tszip size is ", humanize.naturalsize(tszip_size, format='%.3f'))

tszip size is  1.357 GB


In [5]:
ts_sub = ts.delete_sites(np.arange(1000, ts.num_sites))
tmp_vcf = "data/tmp.vcf"
with open(tmp_vcf, "w") as f:
    ts_sub.write_vcf(f)

In [6]:
with open(tmp_vcf, "rb") as f_in:
    with gzip.open("data/tmp.gz", "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)

In [7]:
sub_size = os.path.getsize(tmp_vcf)
total_size = ts.num_sites * sub_size / ts_sub.num_sites

print("sub is ", humanize.naturalsize(sub_size, format='%.3f'))
print("extrapolated size is ", humanize.naturalsize(total_size, format='%.3f'))

sub is  5.723 GB
extrapolated size is  279.837 TB


In [8]:
gzip_vcf_sub_size = os.path.getsize("data/simulated_genomes_chr9.tsz")
gzip_vcf_size = ts.num_sites * gzip_vcf_sub_size / ts_sub.num_sites
print("gzip vcf sub size is ", humanize.naturalsize(gzip_vcf_sub_size, format='%.3f'))
print("gzip vcf extrapolated size is ", humanize.naturalsize(gzip_vcf_size, format='%.3f'))

gzip vcf sub size is  1.357 GB
gzip vcf extrapolated size is  66.342 TB


In [9]:
ts_sub = ts.delete_sites(np.arange(10000, ts.num_sites))
tmp_vcf = "data/tmp.vcf"
with open(tmp_vcf, "w") as f:
    ts_sub.write_vcf(f)

In [10]:
with open(tmp_vcf, "rb") as f_in:
    with gzip.open("data/tmp.gz", "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)

In [11]:
sub_size = os.path.getsize(tmp_vcf)
total_size = ts.num_sites * sub_size / ts_sub.num_sites

print("sub is ", humanize.naturalsize(sub_size, format='%.3f'))
print("extrapolated size is ", humanize.naturalsize(total_size, format='%.3f'))

sub is  57.086 GB
extrapolated size is  279.132 TB


In [12]:
gzip_vcf_sub_size = os.path.getsize("data/simulated_genomes_chr9.tsz")
gzip_vcf_size = ts.num_sites * gzip_vcf_sub_size / ts_sub.num_sites
print("gzip vcf sub size is ", humanize.naturalsize(gzip_vcf_sub_size, format='%.3f'))
print("gzip vcf extrapolated size is ", humanize.naturalsize(gzip_vcf_size, format='%.3f'))

gzip vcf sub size is  1.357 GB
gzip vcf extrapolated size is  6.634 TB
