In [1]:
###################################################
##              Import libraries                 ##
###################################################

import pandas as pd
import numpy as np
import itertools
from collections import defaultdict
import  math

In [2]:
######################################################################
##                        Read the files                            ##
######################################################################
data = {}
for file in ['chania','rethymno','heraklion','lasithi']:
    data[file] = pd.read_excel(f'/Users/vasou/Documents/HLA/DHTOB/{file}.xlsx')

In [3]:
##############################################################################  
##                 Found the alleles for all regions                        ##
##############################################################################

def unique_and_sorted_allele(data, #Dictionary: key (string) = region (chania,rethymno etc), value(dataframe) = allleles
                            ):
    
    alleles_all_regions = defaultdict(list)
    
    for region in data:
        for allele in ['A', 'B', 'C', 'DRB1','DQB1','DPB1']:#set([i.split('_')[0] if '_' in i else i[0] for i in data[key].columns[1:]]):
            alleles_all_regions[allele].append(list(pd.unique(
                                                data[region][[allele+'_1' if 'D' in allele else allele + '1',
                                                allele+'_2' if 'D' in allele else allele + '2']].values.ravel())))

    alleles_all_regions = {allele:sorted(set(list(itertools.chain(*alleles_all_regions[allele])))) for allele in alleles_all_regions}
    
    return alleles_all_regions

In [4]:
#############################################################################################  
##                            Allele and Genotype counts                                   ##
#############################################################################################


def allele_genotype_counts(data, #Dictionary: key = region(chania,rethymno etc), value(dataframe) = allleles
                           type_count = 'Allele', #Allele: returns allele counts, Genotype:returns observed genotype counts
                           alleles_all_regions = None
                          ):
    
    count_dict = defaultdict(lambda:defaultdict())
    
    if not alleles_all_regions:
        ########################################################
        ##   First found the unique alleles from all regions  ##
        ########################################################
        alleles_all_regions = unique_and_sorted_allele(data)

    for region in data:
        for allele in alleles_all_regions:
            column_1 = allele + '_1' if 'D' in allele else allele + '1'
            column_2 = allele + '_2' if 'D' in allele else allele + '2'
            total_het = 0
            total_hom = 0
            for i in range(len(alleles_all_regions[allele])):

                #Check Hom
                count_hom = sum((data[region][column_1] == alleles_all_regions[allele][i]) & (
                            data[region][column_2] == alleles_all_regions[allele][i]))
                
                if type_count == 'Allele': 
                    count_hom = 2*count_hom
                    
                    #Het
                    count_het_1 = sum((data[region][column_1] == alleles_all_regions[allele][i]))
                    count_het_2 = sum((data[region][column_2] == alleles_all_regions[allele][i]))
                    count_het = (count_het_1 + count_het_2) - count_hom
                else:
                    #Het
                    count_het = sum((data[region][column_1] == alleles_all_regions[allele][i])) - count_hom
                    
                total_het += count_het
                total_hom += count_hom
                
                count_dict[f'hom_{region}_{allele}'][alleles_all_regions[allele][i]] = count_hom
                
                #Store Het
                count_dict[f'het_{region}_{allele}'][alleles_all_regions[allele][i]] = count_het
                
            count_dict[f'n_hom_{allele}_{region}'] = total_hom
            count_dict[f'n_het_{allele}_{region}'] = total_het
                        

            #for i in range(len(alleles_all_regions[allele])):
            #    
            #    count_dict[f'het_rest_{region}_{allele}'][alleles_all_regions[allele][i]] = total_het - count_dict[f'het_{region}_{allele}'][alleles_all_regions[allele][i]]
            #    count_dict[f'hom_rest_{region}_{allele}'][alleles_all_regions[allele][i]] = total_hom - count_dict[f'hom_{region}_{allele}'][alleles_all_regions[allele][i]]
    
    
    for allele in alleles_all_regions:
        hom_total = 0
        het_total = 0
        for i in range(len(alleles_all_regions[allele])):
            hom = 0
            het = 0
            for region in data:
                hom += count_dict[f'hom_{region}_{allele}'][alleles_all_regions[allele][i]]
                het += count_dict[f'het_{region}_{allele}'][alleles_all_regions[allele][i]]
                
                 
                count_dict[f'het_rest_{region}_{allele}'][alleles_all_regions[allele][i]] = count_dict[f'n_het_{allele}_{region}'] - count_dict[f'het_{region}_{allele}'][alleles_all_regions[allele][i]]
                count_dict[f'hom_rest_{region}_{allele}'][alleles_all_regions[allele][i]] = count_dict[f'n_hom_{allele}_{region}'] - count_dict[f'hom_{region}_{allele}'][alleles_all_regions[allele][i]]

                    
            count_dict[f'hom_total_{allele}'][alleles_all_regions[allele][i]] = hom 
            count_dict[f'het_total_{allele}'][alleles_all_regions[allele][i]] = het
            hom_total += hom
            het_total += het
            
        count_dict[f'n_hom_total_{allele}'] = hom_total
        count_dict[f'n_het_total_{allele}'] = het_total
            

        for i in range(len(alleles_all_regions[allele])):

            count_dict[f'hom_rest_total_{allele}'][alleles_all_regions[allele][i]] = hom_total - count_dict[f'hom_total_{allele}'][alleles_all_regions[allele][i]]
            count_dict[f'het_rest_total_{allele}'][alleles_all_regions[allele][i]] = het_total - count_dict[f'het_total_{allele}'][alleles_all_regions[allele][i]]
           
    return count_dict


#allele_count = allele_genotype_counts(data, type_count = 'Allele')
#obs_gen_count = allele_genotype_counts(data, type_count = 'Genotype')
allele_count = allele_genotype_counts(data, type_count = 'Allele')


In [5]:
#######################################################################################
##                         Step 1: Allele Frequencies                                ##
#######################################################################################


def help_func_freq(a,b,c,d, allele_freq, key_part1, key_part2):
    nominator = a +b
    denominator = c + d
    calc_p = nominator/denominator
    
    allele_freq[f'p_{key_part1}'][key_part2] = calc_p
    allele_freq[f'p^2_{key_part1}'][key_part2] = calc_p * calc_p
    allele_freq[f'q_{key_part1}'][key_part2] = 1 - calc_p
    allele_freq[f'q^2_{key_part1}'][key_part2] = (1 - calc_p)**2
    
    
def allele_frequencies(data, #Dictionary: key (string) = region (chania,rethymno etc), value(dataframe) = allleles
                      alleles_all_regions = None,
                      allele_count = None
                      ):
    
    n_allele = defaultdict(lambda: defaultdict(lambda: 0))
    allele_freq = defaultdict(lambda: defaultdict(lambda: 0))
    
    if not alleles_all_regions and not allele_count:
        ########################################################
        ##   First found the unique alleles from all regions  ##
        ########################################################
        alleles_all_regions = unique_and_sorted_allele(data)

        ########################################################
        ##                   Allele counts                    ##
        ########################################################
        allele_count = allele_genotype_counts(data, type_count = 'Allele')

    ########################################################
    ##              Calculate frequencies                 ##
    ########################################################

    for allele in alleles_all_regions:
        for i in range(len(alleles_all_regions[allele])):
            total_samples = 0
            for region in data:
                total_samples += data[region].shape[0]
                help_func_freq(allele_count[f'hom_{region}_{allele}'][alleles_all_regions[allele][i]],
                          allele_count[f'het_{region}_{allele}'][alleles_all_regions[allele][i]],
                          allele_count[f'n_hom_{allele}_{region}'],
                          allele_count[f'n_het_{allele}_{region}'], 
                          allele_freq, 
                          f"{region}_{allele}", 
                          alleles_all_regions[allele][i])
                
               
            help_func_freq(allele_count[f'hom_total_{allele}'][alleles_all_regions[allele][i]],
                      allele_count[f'het_total_{allele}'][alleles_all_regions[allele][i]],
                      allele_count[f'n_hom_total_{allele}'], 
                      allele_count[f'n_het_total_{allele}'],
                      allele_freq,
                      f"total_{allele}", 
                      alleles_all_regions[allele][i])


    return allele_freq


#allele_freq = allele_frequencies(data)


In [7]:
#######################################################################################
##             Step 2: expected genotypic counts (under HWE)                         ##
#######################################################################################


def help_func_hwe(a,b,c,d,counts_HWE, key_part1, key_part2):
    counts_HWE[f'hom_p^2_{key_part1}'][key_part2] = a*b
    counts_HWE[f'het_2pq_{key_part1}'][key_part2] = 2*c*d*b

def HWE(data,
        alleles_all_regions = None,
        allele_freq = None
       ):
    
    counts_HWE = defaultdict(lambda: defaultdict(lambda: 0))
    
    if not alleles_all_regions and not allele_freq:
        ########################################################
        ##   First found the unique alleles from all regions  ##
        ########################################################
        alleles_all_regions = unique_and_sorted_allele(data)

        ########################################################
        ##            Calculate allele frequencies            ##
        ########################################################
        allele_freq = allele_frequencies(data)
    
    for allele in alleles_all_regions:
        for i in range(len(alleles_all_regions[allele])):
            total_samples = 0
            for region in data:
                total_samples += data[region].shape[0]
                
                help_func_hwe(allele_freq[f'p^2_{region}_{allele}'][alleles_all_regions[allele][i]],
                              data[region].shape[0],
                              allele_freq[f'p_{region}_{allele}'][alleles_all_regions[allele][i]],
                              allele_freq[f'q_{region}_{allele}'][alleles_all_regions[allele][i]],
                              counts_HWE,
                              f'{region}_{allele}',
                              alleles_all_regions[allele][i] 
                             )
                
            help_func_hwe(allele_freq[f'p^2_total_{allele}'][alleles_all_regions[allele][i]],
                          total_samples,
                          allele_freq[f'p_total_{allele}'][alleles_all_regions[allele][i]],
                          allele_freq[f'q_total_{allele}'][alleles_all_regions[allele][i]],
                          counts_HWE,
                          f'total_{allele}',
                          alleles_all_regions[allele][i] 
                         )
    return counts_HWE
              
#counts_HWE = HWE(data)

In [6]:
#######################################################################################
##                 Step 3: excess or deficiencies of observed                        ##
##      homozygotes in each subpopulation relative to HWE genotypic counts           ##
#######################################################################################

def deficiencies_sub(data,
                     alleles_all_regions = None,
                     obs_gen_count = None,
                     counts_HWE = None,
                    ):
    
    obs_exp = defaultdict(lambda: defaultdict(lambda: 0))
    
    if not alleles_all_regions and not obs_gen_count and not counts_HWE:
        ########################################################
        ##   First found the unique alleles from all regions  ##
        ########################################################
        alleles_all_regions = unique_and_sorted_allele(data)

        ########################################################
        ##                   Allele counts                    ##
        ########################################################
        obs_gen_count = allele_genotype_counts(data, type_count = 'Genotype')

        ########################################################
        ##                   Calculate HWE                    ##
        ########################################################
        counts_HWE = HWE(data)
    
    ########################################################
    ##             Calculate deficiencies                 ##
    ########################################################
    for allele in alleles_all_regions:
        for i in range(len(alleles_all_regions[allele])):
            total_samples = 0
            for region in data:
    
                obs_exp[f'obs-exp_p^2_{region}_{allele}'][alleles_all_regions[allele][i]] = (
                                obs_gen_count[f'hom_{region}_{allele}'][alleles_all_regions[allele][i]] -
                                counts_HWE[f'hom_p^2_{region}_{allele}'][alleles_all_regions[allele][i]])
            obs_exp[f'obs-exp_p^2_total_{allele}'][alleles_all_regions[allele][i]] = (
                            obs_gen_count[f'hom_total_{allele}'][alleles_all_regions[allele][i]]-
                            counts_HWE[f'hom_p^2_total_{allele}'][alleles_all_regions[allele][i]])

    return obs_exp


#excess = deficiencies_sub(data)


In [8]:
#######################################################################################
##   Step4: excess or deficiencies of observed homozygotes in the total population   ##
#######################################################################################

def deficiencies_total(data,
                       excess_sub = None
                      ):
    
    sum_obs_exp = defaultdict(lambda: 0)
    
    if not excess_sub:
        ########################################################
        ##          Calculate deficiencies_sub                ##
        ########################################################
        excess_sub = deficiencies_sub(data)
    
    ########################################################
    ##             Calculate deficiencies                 ##
    ########################################################
    for key in excess_sub:

        sum_obs_exp[f'sum_{key}'] = "{:.2f}".format(sum(excess_sub[key].values()))

    return sum_obs_exp


In [9]:
#######################################################################################
##             Step 5: local observed heterozygosities of each subpopulation         ##
#######################################################################################


def obs_heterozygosities(data,
                         obs_gen_count = None,
                        ):
    obs_het = defaultdict(lambda:0)
    
    if not obs_gen_count:
        ########################################################
        ##                   Allele counts                    ##
        ########################################################
        obs_gen_count = allele_genotype_counts(data, type_count = 'Genotype')
    
    ########################################################
    ##           local observed heterozygosities          ##
    ########################################################
    
    for allele in data['chania'].columns[1:]:
        total_samples = 0
        for region in data:
            total_samples+=data[region].shape[0]
            allele = allele.split('_')[0] if 'D' in allele else allele[0]
            obs_het[f'Het_{region}_{allele}'] = obs_gen_count[f'n_het_{allele}_{region}']/data[region].shape[0]
        obs_het[f'Het_total_{allele}'] = obs_gen_count[f'n_het_total_{allele}']/total_samples
            
    return obs_het


#obs_het = obs_heterozygosities(data)


In [10]:
#######################################################################################
##               Step 6: expected heterozygosities of each subpopulation             ##
#######################################################################################

def exp_heterozygosities(data,
                        allele_freq = None,
                        ):
    
    exp_het = defaultdict(lambda:0)
    
    if not allele_freq:
        ########################################################
        ##            Calculate allele frequencies            ##
        ########################################################
        allele_freq = allele_frequencies(data)
    
    for allele in data['chania'].columns[1:]:
        allele = allele.split('_')[0] if 'D' in allele else allele[0]
        for region in data:
            exp_het[f'Het_{region}_{allele}'] = 1 - sum(allele_freq[f'p^2_{region}_{allele}'].values())
        exp_het[f'Het_total_{allele}'] = 1 - sum(allele_freq[f'p^2_total_{allele}'].values())
    return exp_het


#expected_het = exp_heterozygosities(data)

In [17]:
#######################################################################################
##  Step 7: local inbreeding coefficient of each subpopulation (Fs=Hexp-Hobs/Hexp)   ##
#######################################################################################

def Fs_coefficient(data,
                   expected_het = None,
                   obs_het = None,
                  ):
    
    Fs = defaultdict(lambda: 0)
    
    if not expected_het and not obs_het:
        
        ########################################################
        ##         Calculate expected heterozygosities        ##
        ########################################################
        expected_het = exp_heterozygosities(data)

        ########################################################
        ##         Calculate observed heterozygosities        ##
        ########################################################
        obs_het = obs_heterozygosities(data)
    
    for allele in data['chania'].columns[1:]:
        allele = allele.split('_')[0] if 'D' in allele else allele[0]
        for region in data:
            Fs[f'Fs_{region}_{allele}'] = (expected_het[f'Het_{region}_{allele}'] - 
                                           obs_het[f'Het_{region}_{allele}'])/(
                                           expected_het[f'Het_{region}_{allele}'])
            
        Fs[f'Fs_total_{allele}'] = (expected_het[f'Het_total_{allele}'] - 
                                           obs_het[f'Het_total_{allele}'])/(
                                            expected_het[f'Het_total_{allele}'])
            
    return Fs
            

#fs_coef = Fs_coefficient(data)

In [21]:
#######################################################################################
##            Step8: p_allele bar (global frequency of each allele                   ##
##       across all subpopulations and weighted by each subpopulation size)          ##
#######################################################################################


def help_func_bar(a,b,c,d,allele_bar, key_part1, key_part2):
    
    if d == 0:
        denominator = b
        nominator = c*a
        allele_bar[f'{key_part1}'][key_part2] += nominator/denominator
        allele_bar[f'square_{key_part1}'][key_part2] = allele_bar[f'{key_part1}'][key_part2]**2
    else: 
        denominator = a+b
        nominator = c*a + d*b
        allele_bar[f'{key_part1}'][key_part2] = nominator/denominator
        allele_bar[f'square_{key_part1}'][key_part2] = (nominator/denominator)**2

def p_allele_bar(data,
                 alleles_all_regions = None,
                 allele_freq = None,
                 allele_count = None
                ):
    
    n_allele = defaultdict(lambda: 0)
    allele_bar = defaultdict(lambda: defaultdict(lambda: 0))
    
    if not alleles_all_regions and not allele_freq and not allele_count:
    
        ########################################################
        ##   First found the unique alleles from all regions  ##
        ########################################################
        alleles_all_regions = unique_and_sorted_allele(data)

        ########################################################
        ##            Calculate allele frequencies            ##
        ########################################################
        allele_freq = allele_frequencies(data)

        ########################################################
        ##                   Allele counts                    ##
        ########################################################
        allele_count = allele_genotype_counts(data, type_count = 'Allele')
    
    total_samples = sum([data[region].shape[0] for region in data])*2
    for allele in alleles_all_regions:
        for region in data:
            for i in range(len(alleles_all_regions[allele])):
                
                help_func_bar(data[region].shape[0]*2,
                              total_samples,
                              allele_freq[f'p_{region}_{allele}'][alleles_all_regions[allele][i]],
                              0,
                              allele_bar,
                              f'all_{allele}',
                              alleles_all_regions[allele][i]
                             )


        for i in range(len(alleles_all_regions[allele])):
            for region_1, region_2 in list(itertools.combinations(data.keys(), 2)):

                help_func_bar(data[region_1].shape[0]*2,
                              data[region_2].shape[0]*2,
                              allele_freq[f'p_{region_1}_{allele}'][alleles_all_regions[allele][i]],
                              allele_freq[f'p_{region_2}_{allele}'][alleles_all_regions[allele][i]],
                              allele_bar,
                              f'{region_1}_{region_2}_{allele}',
                              alleles_all_regions[allele][i]
                             )

    return allele_bar

In [27]:
#######################################################################################
##                   Step9&10&11: heterozygosity indices                             ##
##              (Hi=weighted average of observed heterozygosities),                  ##
##              (Hs =weighted average of expected heterozygosities),                 ##
## (Ht=expected hetrozygosty based on the global gene frequencies i.e. p_allele-bar) ##
#######################################################################################

def heterozygosity_indices(data,
                           expected_het = None,
                           obs_het = None,
                           allele_bar = None,
                          ):
    
    Het_indices = defaultdict(lambda: 0)
    
    if not expected_het and not obs_het and not allele_bar:
        ########################################################
        ##         Calculate expected heterozygosities        ##
        ########################################################
        expected_het = exp_heterozygosities(data)

        ########################################################
        ##         Calculate observed heterozygosities        ##
        ########################################################
        obs_het = obs_heterozygosities(data)

        ########################################################
        ##         Calculate observed heterozygosities        ##
        ########################################################
        allele_bar = p_allele_bar(data)

    
    for allele in data['chania'].columns[1:]:
        total_samples = 0
        allele = allele.split('_')[0] if 'D' in allele else allele[0]
        for region in data:
            total_samples+=data[region].shape[0]
            Het_indices[f'Hi_all_{allele}'] += (obs_het[f'Het_{region}_{allele}'] * data[region].shape[0])
            Het_indices[f'Hs_all_{allele}'] += (expected_het[f'Het_{region}_{allele}'] * data[region].shape[0])
            
        Het_indices[f'Hi_all_{allele}'] /= total_samples
        Het_indices[f'Hs_all_{allele}'] /= total_samples
        Het_indices[f'Ht_all_{allele}'] = 1 - sum(allele_bar[f'square_all_{allele}'].values())
        

        for region_1, region_2 in list(itertools.combinations(data.keys(), 2)):  
            Het_indices[f'Hi_{region_1}_{region_2}_{allele}'] = (
                                                    (obs_het[f'Het_{region_1}_{allele}'] * data[region_1].shape[0])
                                                     + (obs_het[f'Het_{region_2}_{allele}'] * data[region_2].shape[0]))\
                                                     / (data[region_1].shape[0] + data[region_2].shape[0])

            Het_indices[f'Hs_{region_1}_{region_2}_{allele}'] = (
                                                    (expected_het[f'Het_{region_1}_{allele}'] * data[region_1].shape[0])
                                                 + (expected_het[f'Het_{region_2}_{allele}'] * data[region_2].shape[0]))\
                                                 / (data[region_1].shape[0] + data[region_2].shape[0])
            Het_indices[f'Ht_{region_1}_{region_2}_{allele}'] = 1 - \
                                    sum(allele_bar[f'square_{region_1}_{region_2}_{allele}'].values())
            
    return Het_indices
 
    
    
#Het_indices = heterozygosity_indices(data)


In [14]:
#######################################################################################
##           Step12: global Fstatistics (FIS_observed heterozygosities,              ## 
##                    FST_expected heterozygosities,                                 ##
##                    FIT_expected heterozygosities)                                 ##
#######################################################################################

def help_func_Fstats(a,b,c, Fstatistics, key_part):
    
    Fstatistics[f'Fis_{key_part}'] = (a-b)/a
    Fstatistics[f'Fst_{key_part}'] = (c-a)/c
    Fstatistics[f'Fit_{key_part}'] = (c-b)/c

def global_Fstatistics(data,
                       Het_indices = None,
                      ):
    
    Fstatistics = defaultdict(lambda:0)
    
    if not Het_indices:
        ########################################################
        ##          Calculate heterozygosity indices          ##
        ########################################################
        Het_indices = heterozygosity_indices(data)
    
    for allele in data['chania'].columns[1:]:
        allele = allele.split('_')[0] if 'D' in allele else allele[0]
        help_func_Fstats(Het_indices[f'Hs_all_{allele}'],
                         Het_indices[f'Hi_all_{allele}'],
                         Het_indices[f'Ht_all_{allele}'],
                         Fstatistics,
                         f'all_{allele}',
                         )
        
        for region_1, region_2 in list(itertools.combinations(data.keys(), 2)):
            
            help_func_Fstats(Het_indices[f'Hs_{region_1}_{region_2}_{allele}'],
                             Het_indices[f'Hi_{region_1}_{region_2}_{allele}'],
                             Het_indices[f'Ht_{region_1}_{region_2}_{allele}'],
                             Fstatistics,
                             f'_{region_1}_{region_2}_{allele}',
                            )
    return Fstatistics
    
        
#Fstatistics = global_Fstatistics(data)


In [31]:
def fst_pipeline(data):
    ########################################################
    ##   First found the unique alleles from all regions  ##
    ########################################################
    alleles_all_regions = unique_and_sorted_allele(data)
    
    ########################################################
    ##            Allele and Genotype counts              ##
    ########################################################
    allele_count = allele_genotype_counts(data, 
                                          'Allele', 
                                          alleles_all_regions)
    
    obs_gen_count = allele_genotype_counts(data, 
                                          'Genotype', 
                                           alleles_all_regions
                                          )
    
    ########################################################
    ##            Calculate allele frequencies            ##
    ########################################################
    allele_freq = allele_frequencies(data,
                                     alleles_all_regions,
                                     allele_count
                                    )
    
    ########################################################
    ##                   Calculate HWE                    ##
    ########################################################
    counts_HWE = HWE(data,
                     alleles_all_regions,
                     allele_freq
                    )
    
    ########################################################
    ##          Calculate deficiencies_sub                ##
    ########################################################
    excess_sub = deficiencies_sub(data,
                                  alleles_all_regions,
                                  obs_gen_count,
                                  counts_HWE
                                 )
    
    ########################################################
    ##          Calculate deficiencies_sub                ##
    ########################################################
    excess_total = deficiencies_total(data,
                                      excess_sub
                                     )
    
    ########################################################
    ##         Calculate observed heterozygosities        ##
    ########################################################
    obs_het = obs_heterozygosities(data,
                                   obs_gen_count
                                  )
 
    ########################################################
    ##         Calculate expected heterozygosities        ##
    ########################################################
    expected_het = exp_heterozygosities(data,
                                        allele_freq
                                       )
    
    ########################################################
    ##               Calculate allele bar                 ##
    ########################################################
    allele_bar = p_allele_bar(data,
                              alleles_all_regions,
                              allele_freq,
                              allele_count
                             )
    
    ########################################################
    ##          Calculate heterozygosity indices          ##
    ########################################################
    Het_indices = heterozygosity_indices(data,
                                         expected_het,
                                         obs_het,
                                         allele_bar
                                        )

    ########################################################
    ##            Calculate global Fstatistic             ##
    ########################################################

    Fstatistics = global_Fstatistics(data,
                                     Het_indices
                                    )
    
    return (alleles_all_regions, allele_count, 
            obs_gen_count, allele_freq,counts_HWE,
            excess_sub,excess_total,obs_het,expected_het,
            allele_bar, Het_indices, Fstatistics)
    
    
    

In [32]:
(alleles_all_regions, allele_count, 
obs_gen_count, allele_freq,counts_HWE,
excess_sub,excess_total,obs_het,expected_het,
allele_bar, Het_indices, Fstatistics) = fst_pipeline(data)


In [37]:
########################################
## store allele_counts and gen_counts ##
########################################

results_dataframe = pd.concat([pd.DataFrame([ f"{key}*{element}"for element in alleles_all_regions[key]]) for key in alleles_all_regions.keys()])
for key in set(["_".join(values.split("_")[:-1]) for values in allele_count if not values.startswith("n_")]):
    results_dataframe['allele_' + key] = [allele_count[key + "_" + item.split("*")[0]][item.split("*")[1]] for item in list(results_dataframe.iloc[:,0].values)]
    results_dataframe['gen_' + key] = [obs_gen_count[key + "_" + item.split("*")[0]][item.split("*")[1]] for item in list(results_dataframe.iloc[:,0].values)]

for key in set(["_".join(values.split("_")[:-1]) for values in allele_freq]):
    results_dataframe[key] = [allele_freq[key + "_" + item.split("*")[0]][item.split("*")[1]] for item in list(results_dataframe.iloc[:,0].values)]
    
for key in set(["_".join(values.split("_")[:-1]) for values in counts_HWE]):
    results_dataframe['HWE_'+key] = [counts_HWE[key + "_" + item.split("*")[0]][item.split("*")[1]] for item in list(results_dataframe.iloc[:,0].values)]
    
for key in set(["_".join(values.split("_")[:-1]) for values in excess_sub]):
    results_dataframe[key] = [excess_sub[key + "_" + item.split("*")[0]][item.split("*")[1]] for item in list(results_dataframe.iloc[:,0].values)]

for key in set(["_".join(values.split("_")[:-1]) for values in allele_bar]):
    results_dataframe['allele_bar_'+key] = [allele_bar[key + "_" + item.split("*")[0]][item.split("*")[1]] for item in list(results_dataframe.iloc[:,0].values)]
 

In [40]:
results_dataframe

Unnamed: 0,0,allele_hom_total,gen_hom_total,allele_het_rest_total,gen_het_rest_total,allele_het_lasithi,gen_het_lasithi,allele_hom_rest_lasithi,gen_hom_rest_lasithi,allele_hom_rest_heraklion,...,allele_bar_square_chania_rethymno,allele_bar_rethymno_heraklion,allele_bar_chania_heraklion,allele_bar_rethymno_lasithi,allele_bar_heraklion_lasithi,allele_bar_square_chania_lasithi,allele_bar_chania_rethymno,allele_bar_square_rethymno_heraklion,allele_bar_square_chania_heraklion,allele_bar_chania_lasithi
0,A*01:01,48,24,2812,1249,56,56,74,37,96,...,9.200333e-03,0.114301,0.107679,0.098200,0.110505,8.508067e-03,0.095918,1.306463e-02,1.159471e-02,0.092239
1,A*01:02,0,0,3125,1561,0,0,80,40,118,...,3.188776e-06,0.001044,0.001324,0.000000,0.003021,2.281834e-06,0.001786,1.089605e-06,1.752761e-06,0.001511
2,A*01:03,0,0,3120,1558,4,3,80,40,118,...,4.164931e-06,0.001044,0.001324,0.004092,0.002478,1.456792e-05,0.002041,1.089605e-06,1.752761e-06,0.003817
3,A*02:01,144,72,2598,1107,109,95,44,22,70,...,3.225323e-02,0.180585,0.197705,0.184943,0.203171,4.353541e-02,0.179592,3.261078e-02,3.908735e-02,0.208651
4,A*02:02,0,0,3104,1545,1,1,80,40,118,...,1.184692e-04,0.004175,0.009709,0.001637,0.003964,1.035941e-04,0.010884,1.743368e-05,9.425959e-05,0.010178
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36,DPB1*701:01,0,0,2806,1404,1,0,112,56,290,...,0.000000e+00,0.000522,0.000441,0.000818,0.000991,4.046643e-07,0.000000,2.724012e-07,1.947512e-07,0.000636
37,DPB1*791:01,0,0,2807,1404,1,0,112,56,290,...,0.000000e+00,0.000000,0.000000,0.000818,0.000496,4.046643e-07,0.000000,0.000000e+00,0.000000e+00,0.000636
38,DPB1*80:01,0,0,2807,1404,0,0,112,56,290,...,3.188776e-06,0.000000,0.000737,0.000000,0.000000,2.281834e-06,0.001786,0.000000e+00,5.438519e-07,0.001511
39,DPB1*81:01,0,0,2807,1404,0,0,112,56,290,...,3.188776e-06,0.000000,0.000737,0.000000,0.000000,2.281834e-06,0.001786,0.000000e+00,5.438519e-07,0.001511
