## Imports

In [1]:
import os
import sys

import allel
import matplotlib.pyplot as plt
import numcodecs
import numpy as np
import pandas as pd
import seaborn as sns
import zarr

%matplotlib inline

In [2]:
def df_to_zarr(df, path):
    """Save every column of ``df`` as its own zarr array under ``path``.

    Each column is written with ``zarr.save`` to ``<path>/<column name>``.
    Columns whose NumPy dtype is ``object`` are converted to unicode string
    arrays before saving.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are written out, one array per column.
    path : str
        Directory that receives the per-column arrays. A trailing path
        separator is optional.
    """
    for col_name, column in df.items():
        # to_numpy() always yields a plain NumPy array, including for
        # extension-backed columns.
        array = column.to_numpy()

        # If dtype of array is object, save it as a string array instead:
        if array.dtype == 'O':
            array = array.astype('U')

        # os.path.join supplies the separator when `path` lacks a trailing
        # one, so the array never lands in a sibling "<path><column>" folder;
        # str() lets non-string column labels be used as folder names.
        save_loc = os.path.join(path, str(col_name))
        zarr.save(save_loc, array)

## Extract Zarr

In [3]:
# Path (relative to this notebook) of the zarr store that is opened as the callset.
zarr_path = '../data/ALL_30X_Chr22_GR38.zarr/'

In [4]:
# Open the existing callset group read-only. The default mode ('a') would
# silently create an empty group if zarr_path were wrong, and would leave the
# source data writable from this notebook.
callset_h1k = zarr.open_group(zarr_path, mode='r')
callset_h1k

<zarr.hierarchy.Group '/'>

In [5]:
# Display the group hierarchy with every level expanded.
callset_h1k.tree(expand=True)

Tree(nodes=(Node(disabled=True, name='/', nodes=(Node(disabled=True, name='22', nodes=(Node(disabled=True, nam…

# Generating dataframes from arrays for easier wrangling

In [6]:
# Look up the chromosome group once, then take its two sub-groups.
chrom = "22"
chrom_group = callset_h1k[chrom]
calldata, variants = chrom_group['calldata'], chrom_group['variants']

In [7]:
# Despite the name, these are superpopulation codes, not countries: they are
# the keys of the `super_pop` counts and are used to build the per-population
# column names.
countries = ['AFR', 'AMR', 'EAS', 'EUR', 'SAS']

In [8]:
# Heterozygous allele counts, i.e. REF|ALT or ALT|REF counts, by superpopulation.
ac_het = [f'AC_Het_{country}_unrel' for country in countries]
# Take the first entry of every row with one array slice instead of a
# Python-level loop over each variant.
# Assumes each array is 2-D (variants x values per variant) -- TODO confirm.
het_data = [variants[col][:, 0] for col in ac_het]
AC_het_df = pd.DataFrame(dict(zip(ac_het, het_data)))
AC_het_df.head()

Unnamed: 0,AC_Het_AFR_unrel,AC_Het_AMR_unrel,AC_Het_EAS_unrel,AC_Het_EUR_unrel,AC_Het_SAS_unrel
0,0,0,0,0,1
1,0,0,1,0,0
2,0,1,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0


In [9]:
# Homozygous allele counts, i.e. ALT|ALT, by superpopulation.
ac_hom = [f'AC_Hom_{country}_unrel' for country in countries]
# Take the first entry of every row with one array slice instead of a
# Python-level loop over each variant.
# Assumes each array is 2-D (variants x values per variant) -- TODO confirm.
hom_data = [variants[col][:, 0] for col in ac_hom]
AC_hom_df = pd.DataFrame(dict(zip(ac_hom, hom_data)))
AC_hom_df.head()

Unnamed: 0,AC_Hom_AFR_unrel,AC_Hom_AMR_unrel,AC_Hom_EAS_unrel,AC_Hom_EUR_unrel,AC_Hom_SAS_unrel
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,2,2,0,0,0
4,0,0,0,0,2


In [10]:
# Read the tab-separated sample panel (used for sample counts by
# superpopulation) and keep only the columns whose name does not match
# '^Unnamed'.
pop_data = (
    pd.read_csv('../data/integrated_call_samples_v3.20130502.ALL.panel', sep='\t')
    .loc[:, lambda panel: ~panel.columns.str.contains('^Unnamed')]
)

In [11]:
# Number of panel samples in each superpopulation, as a plain dict keyed by
# superpopulation code.
super_pop_counts = pop_data['super_pop'].value_counts()
population_totals = super_pop_counts.to_dict()
population_totals

{'AFR': 661, 'EAS': 504, 'EUR': 503, 'SAS': 489, 'AMR': 347}

### Calculating ALT allele frequency using ALT allele counts

1. 1 x heterozygous count, because each heterozygous individual has 1 copy of the ALT allele.
2. 2 x homozygous count, because each homozygous individual has 2 copies of the ALT allele.
3. Divide the sum of 1. and 2. by 2 x the population size: each individual has 2 alleles, so the total number of alleles is 2 x population.


In [12]:
# ALT allele frequency per superpopulation:
#   (het count + 2 * hom count) / (2 * number of samples in the superpopulation)
alt_freq_cols = [f'ALT_AF_{country}' for country in countries]
# Pair each superpopulation code with its own count columns instead of slicing
# the code back out of the column name by character position.
# NOTE(review): if AC_Hom_* already counts ALT alleles rather than individuals
# (the hom values displayed in this notebook are all even), the factor of 2
# double-counts homozygotes -- confirm against the VCF header before relying
# on these frequencies.
alt_freq_data = [
    (AC_het_df[het_col] + 2 * AC_hom_df[hom_col]) / (2 * population_totals[country])
    for country, het_col, hom_col in zip(countries, ac_het, ac_hom)
]
alt_freq_df = pd.DataFrame(dict(zip(alt_freq_cols, alt_freq_data)))
alt_freq_df.head()

Unnamed: 0,ALT_AF_AFR,ALT_AF_AMR,ALT_AF_EAS,ALT_AF_EUR,ALT_AF_SAS
0,0.0,0.0,0.0,0.0,0.001022
1,0.0,0.0,0.000992,0.0,0.0
2,0.0,0.001441,0.0,0.0,0.0
3,0.003026,0.005764,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.00409


### Calculating genotype frequency from allele frequency

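1. Given the ALT allele frequency, REF allele frequency = 1 - ALT allele frequency.
2. The Hardy-Weinberg law states that $p^2 + 2pq + q^2 = 1$, where $p$ = REF allele frequency and $q$ = ALT allele frequency.
3. Therefore the expected homozygous ALT genotype frequency is $q^2$ (the ALT allele *frequency* squared, not the allele count), the expected homozygous REF genotype frequency is $p^2 = (1 - q)^2$, and the expected heterozygous genotype frequency is the remainder, $2pq = 1 - p^2 - q^2$.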

* expected homozygous alt genotype frequency is alt allele frequency squared
* expected homozygous ref genotype frequency is (1 - alt allele frequency)^2
* expected heterozygous genotype frequency is (1 - sum of homozygous GF)

In [13]:
# Expected genotype frequencies, computed element-wise on the whole frame:
# hom-alt = q^2, hom-ref = (1 - q)^2, het = 1 - (hom-alt + hom-ref).
homozygous_alt_GT_freq = alt_freq_df ** 2
homozygous_ref_GT_freq = (1 - alt_freq_df) ** 2
heterozygous_GT_freq = 1 - (homozygous_alt_GT_freq + homozygous_ref_GT_freq)

In [14]:
# Build one label list per genotype class, then relabel each frame so the
# three frames no longer share the ALT_AF_* column names.
hom_ref_labels = [f'GT_FREQ_HOM_REF_{x}' for x in countries]
het_labels = [f'GT_FREQ_HET_{x}' for x in countries]
hom_alt_labels = [f'GT_FREQ_HOM_ALT_{x}' for x in countries]

homozygous_ref_GT_freq.columns = hom_ref_labels
heterozygous_GT_freq.columns = het_labels
homozygous_alt_GT_freq.columns = hom_alt_labels

## Saving files to zarr

In [15]:
# Place the three genotype-frequency frames side by side: hom-ref, het, hom-alt.
gt_freq_frames = [homozygous_ref_GT_freq, heterozygous_GT_freq, homozygous_alt_GT_freq]
final_data = pd.concat(gt_freq_frames, axis=1)

In [16]:
# Write every genotype-frequency column as its own zarr array under ../data/GF.zarr/.
df_to_zarr(df=final_data, path = '../data/GF.zarr/')

In [17]:
# Sanity check: number of non-missing values in each saved column.
final_data.count()

GT_FREQ_HOM_REF_AFR    1927372
GT_FREQ_HOM_REF_AMR    1927372
GT_FREQ_HOM_REF_EAS    1927372
GT_FREQ_HOM_REF_EUR    1927372
GT_FREQ_HOM_REF_SAS    1927372
GT_FREQ_HET_AFR        1927372
GT_FREQ_HET_AMR        1927372
GT_FREQ_HET_EAS        1927372
GT_FREQ_HET_EUR        1927372
GT_FREQ_HET_SAS        1927372
GT_FREQ_HOM_ALT_AFR    1927372
GT_FREQ_HOM_ALT_AMR    1927372
GT_FREQ_HOM_ALT_EAS    1927372
GT_FREQ_HOM_ALT_EUR    1927372
GT_FREQ_HOM_ALT_SAS    1927372
dtype: int64