Code to reproduce plots from: https://www.sciencedirect.com/science/article/pii/S1465324921007453 

In [2]:
fn = '/Users/admin/HLA/RUNS_haplomat/1835/frequencies.dat'

In [3]:
def haplomat_output_parce(hfs_dat, n):
    
    with open(hfs_dat) as f:
        cretan_haplotypes = [x.replace('\n', '').replace('g', '').split() for x in f.readlines()]

    cretan_haplotypes = [( '-'.join(sorted(x[0].split('~'))  ), float(x[1])) for x in cretan_haplotypes]

    cretan_haplotypes = {x[0]:x[1] for x in cretan_haplotypes}
    cretan_haplotypes = {k:v for k,v in cretan_haplotypes.items() if v >= 1/(2*n)}
    
    return cretan_haplotypes

In [14]:
### After we run the Hapl-o-Mat with the g grouping without missing data we calculate the allele frequencies

def af_estimation_from_HaploMat_g_group(hfs_dat, hla_gene, num_of_people, alleles=None, counts=None):
    
    '''
    !!!README!!!
    
    This function calculates the allele frequency of the HLA genes A, B, C, DRB1, DQB1 and DPB1 
    if the haplotypes have them. 
    
    Hapl-o-Mat orders the genes in the haplotypes alphabetically.
    
    The haplotypes with 4 genes should have the HLA genes below in this order only:
    A~B~C~DRB1
    
    The haplotypes with 5 genes should have the HLA genes below in this order only:
    A~B~C~DQB1~DRB1
    
    The haplotypes with 6 genes should have the HLA genes below in this order only:
    A~B~C~DPB1~DQB1~DRB1
    
    Other combinations of the HLA genes can not be used as input.    
    
    This function returns the estimated allele frequencies/counts and alleles of the sample from Hapl-o-Mat.
    '''
    
    if hla_gene not in ('A', 'B', 'C', 'DRB1', 'DQB1', 'DPB1'):
        raise Exception("The hla_gene can only be 'A', 'B', 'C', 'DRB1', 'DQB1' or 'DPB1'")
        
    if counts and alleles:
        raise Exception('Only 1 output can come from this function.')
    
    with open(hfs_dat, 'r') as f:
        f = f.readlines()
        
    f = [x.strip('\n').split('\t') for x in f]
    allele_freq = [(x[0].split('~'), float(x[1])) for x in f]
    allele_freq = [([y.strip('g') for y in x[0]],x[1]) for x in allele_freq]
    
    if hla_gene == 'A':
        n = 0        
    elif hla_gene == 'B':
        n = 1        
    elif hla_gene == 'C':
        n = 2        
    
    elif hla_gene == 'DRB1': # Hapl-o-Mat orders the genes in the haplotypes alphabetically.
        if len(allele_freq[0][0]) == 4:     
            n = 3
        elif len(allele_freq[0][0]) == 5:     
            n = 4    
        elif len(allele_freq[0][0]) == 6:
            n = 5
        else:
            raise Exception('The haplotypes should have 4, 5 or 6 genes in a specific order. Read the documentation.')
    
    elif hla_gene == 'DQB1':
        if len(allele_freq[0][0]) == 4:     
            raise Exception('DQB1 should not be in a haplotype with only 4 genes. Read the documentation.')
        elif len(allele_freq[0][0]) == 5:     
            n = 3    
        elif len(allele_freq[0][0]) == 6:
            n = 4
        else:
            raise Exception('The haplotypes should have 4, 5 or 6 genes in a specific order. Read the documentation.')                 
    
    elif hla_gene == 'DPB1':
        if len(allele_freq[0][0]) == 4:     
            raise Exception('DPB1 should not be in a haplotype with only 4 genes. Read the documentation.')
        elif len(allele_freq[0][0]) == 5:     
            raise Exception('DPB1 should not be in a haplotype with only 5 genes. Read the documentation.')    
        elif len(allele_freq[0][0]) == 6:
            n = 3
        else:
            raise Exception('The haplotypes should have 4, 5 or 6 genes in a specific order. Read the documentation.')
    
    HLA_gene = {el:0 for el in [x[0][n] for x in allele_freq]}
    
    for x in allele_freq:
        HLA_gene[x[0][n]] += x[1]
    
    HLA_gene = {k:v for k, v in sorted(HLA_gene.items(), key=lambda item: item[1], reverse=True)}
    
    if alleles:
        return list(HLA_gene.keys())
    
    elif counts:        
        return {k:round(v*num_of_people) for k,v in HLA_gene.items()} 
        # No need to multiply by 2, because of the 2 genotypes on MAC input
    
    else:
        return HLA_gene

In [16]:
def chi2(gene_final_cretan_study, gene_final_freqnet, common_alleles, people_in_study, population, sig_afs=None, counts=None):
    
    if not len(gene_final_cretan_study) == len(gene_final_freqnet) == len(common_alleles):
        raise Exception('Not the same length of common genes')
    
    gene = common_alleles[0].split('*')[0]
    
    cretan_counts = [x*people_in_study*2 for x in gene_final_cretan_study]
    cretan_counts = [np.round(x) for x in cretan_counts]

    af_pop = population['pop'].to_list()[0]
    freqnet_num = population['sample_size'].to_list()[0]   
    
    pop_counts = [x*freqnet_num*2 for x in gene_final_freqnet]
    
    comparison_list = [list(a) for a in zip(cretan_counts, pop_counts)]
    
    if counts:   
        obs = []
        for x in range(len(comparison_list)):
            obs.append(np.round(np.array([comparison_list[x], [people_in_study*2-comparison_list[x][0],freqnet_num*2-comparison_list[x][1]]])))
        
        obs = [np.flip(obs[x], 0).T for x in range(len(obs))]
        
        return obs
    
    comparison_list_af = [list(a) for a in zip(gene_final_cretan_study, gene_final_freqnet)]
        
    pvalues = []

    for x in range(len(comparison_list)):
        obs = np.round(np.array([comparison_list[x], [people_in_study*2-comparison_list[x][0],freqnet_num*2-comparison_list[x][1]]]))
        chi2, p, df, exp = chi2_contingency(obs)
        pvalues.append(p)

    pval_dict = dict(zip(common_alleles, pvalues))

    wanted_alleles = {k:v for k,v in pval_dict.items() if v<0.05}

    initial_dict = dict(zip(common_alleles, zip(gene_final_cretan_study, gene_final_freqnet)))
    significant_dict = {k:v for z,w in wanted_alleles.items() for k,v in initial_dict.items() if z in k}

    if sig_afs:            
        significant_dict = {k:v for k,v in significant_dict.items() if v[0] > round(1/(2*people_in_study), 6) and v[1] > round(1/(2*freqnet_num), 4)}

    common_alleles_sig = list(significant_dict.keys())
    gene_final_cretan_study_sig = [x[0] for x in significant_dict.values()]
    gene_final_freqnet_sig = [x[1] for x in significant_dict.values()]

    return gene_final_cretan_study_sig, gene_final_freqnet_sig, common_alleles_sig 

In [17]:
def get_common_g_alleles(cretans, population, HLA_gene):
    
    alleles_allelefreq_net = population[population['gene']==HLA_gene]['allele'].tolist()  
    common_alleles = list(set(alleles_allelefreq_net).intersection(list(cretans.keys())))
    
    alleles_freqnet = dict(zip(population[population['gene']==HLA_gene].sort_values('af', ascending=False)['allele'].tolist(), population[population['gene']==HLA_gene].sort_values('af', ascending=False)['af'].tolist()))

    alleles_final_Cretans = []

    for x in common_alleles:
        alleles_final_Cretans.append(cretans[x])

    tmp_hla_alleles_Cretans = dict(zip(common_alleles, alleles_final_Cretans))
    tmp_hla_alleles_Cretans = {k: v for k, v in sorted(tmp_hla_alleles_Cretans.items(), key=lambda item: item[1], reverse=True)}

    tmp_alleles_freqnet = {}

    for x in tmp_hla_alleles_Cretans.keys():
        tmp_alleles_freqnet[x] = alleles_freqnet[x]

    common_alleles = list(tmp_hla_alleles_Cretans.keys())
    alleles_final_freqnet = list(tmp_alleles_freqnet.values())
    alleles_final_Cretans = list(tmp_hla_alleles_Cretans.values())
    
    return alleles_final_Cretans, alleles_final_freqnet, common_alleles

In [13]:
_ = haplomat_output_parce(fn, 1835)

In [11]:
_ = af_estimation_from_HaploMat_g_group(fn, 'A', 1835)

In [19]:
A_g_grouped_1835_Cretans = af_estimation_from_HaploMat_g_group(g4_path, 'A', 1835)
B_g_grouped_1835_Cretans = af_estimation_from_HaploMat_g_group(g4_path, 'B', 1835)
C_g_grouped_1835_Cretans = af_estimation_from_HaploMat_g_group(g4_path, 'C', 1835)
DRB1_g_grouped_1835_Cretans = af_estimation_from_HaploMat_g_group(g4_path, 'DRB1', 1835)
DQB1_g_grouped_1835_Cretans = af_estimation_from_HaploMat_g_group(g5_path, 'DQB1', 1835)

NameError: name 'g4_path' is not defined

In [18]:
# ### Let's compare the HLA allelic frequencies of all minorities in Germany with the 1835 Cretans ###

A_final_1835_DKMSALL_GER, A_final_freqnet_1835_DKMSALL_GER, common_A_1835_DKMSALL_GER = get_common_g_alleles(A_g_grouped_1835_Cretans, DKMS_all_final, 'A')
B_final_1835_DKMSALL_GER, B_final_freqnet_1835_DKMSALL_GER, common_B_1835_DKMSALL_GER = get_common_g_alleles(B_g_grouped_1835_Cretans, DKMS_all_final, 'B')
C_final_1835_DKMSALL_GER, C_final_freqnet_1835_DKMSALL_GER, common_C_1835_DKMSALL_GER = get_common_g_alleles(C_g_grouped_1835_Cretans, DKMS_all_final, 'C')
DRB1_final_1835_DKMSALL_GER, DRB1_final_freqnet_1835_DKMSALL_GER, common_DRB1_1835_DKMSALL_GER = get_common_g_alleles(DRB1_g_grouped_1835_Cretans, DKMS_all_final, 'DRB1')

# ### Let's compare the HLA allelic frequencies of all minorities without the Greeks in Germany with the 1835 Cretans ###

A_final_1835_DKMSNOGR_GER, A_final_freqnet_1835_DKMSNOGR_GER, common_A_1835_DKMSNOGR_GER = get_common_g_alleles(A_g_grouped_1835_Cretans, DKMS_final, 'A')
B_final_1835_DKMSNOGR_GER, B_final_freqnet_1835_DKMSNOGR_GER, common_B_1835_DKMSNOGR_GER = get_common_g_alleles(B_g_grouped_1835_Cretans, DKMS_final, 'B')
C_final_1835_DKMSNOGR_GER, C_final_freqnet_1835_DKMSNOGR_GER, common_C_1835_DKMSNOGR_GER = get_common_g_alleles(C_g_grouped_1835_Cretans, DKMS_final, 'C')
DRB1_final_1835_DKMSNOGR_GER, DRB1_final_freqnet_1835_DKMSNOGR_GER, common_DRB1_1835_DKMSNOGR_GER = get_common_g_alleles(DRB1_g_grouped_1835_Cretans, DKMS_final, 'DRB1')

NameError: name 'A_g_grouped_1835_Cretans' is not defined

In [None]:
e,f,g=chi2(A_final_1835_DKMSALL_GER, A_final_freqnet_1835_DKMSALL_GER, common_A_1835_DKMSALL_GER, 1835, DKMS_all_final, sig_afs=True)

In [12]:
def get_comparison_diagram(gene_final_cretan_study, gene_final_freqnet, common_alleles, people_in_study, population, sig_af=None, sig_p=None):

    if not len(gene_final_cretan_study) == len(gene_final_freqnet) == len(common_alleles):
        raise Exception('Not the same length of common genes')
    
    af_pop = population['pop'].to_list()[0]
    freqnet_num = population['sample_size'].to_list()[0]
    
    if sig_af and not sig_p:
        tmp = dict(zip(common_alleles, zip(gene_final_cretan_study, gene_final_freqnet)))
        tmp = {k:v for k,v in tmp.items() if v[0] > round(1/(2*people_in_study), 6) and v[1] > round(1/(2*freqnet_num), 4)}
        
        gene_final_cretan_study = [v[0] for k,v in tmp.items()]
        gene_final_freqnet = [v[1] for k,v in tmp.items()]
        common_alleles = list(tmp.keys()) 
        
    common_alleles_num = len(common_alleles)
    gene = common_alleles[0].split('*')[0]
    index = np.arange(common_alleles_num) 
    bar_width = 0.35
    
    fig, ax = plt.subplots()

    if people_in_study == 94:
        Cretans_94 = ax.bar(index, gene_final_cretan_study, bar_width, label="94 Cretans", color = 'orange')

    if 675 < people_in_study < 690:    
        Cretans_689 = ax.bar(index, gene_final_cretan_study, bar_width, label=f"{people_in_study} Cretans", color = 'green')
     
    if 1100 < people_in_study < 1215:    
        Cretans_1204 = ax.bar(index, gene_final_cretan_study, bar_width, label=f"{people_in_study} Cretans", color = 'red')
    
    if 1700 < people_in_study < 2001:    
#         Cretans_1835 = ax.bar(index, gene_final_cretan_study, bar_width, label=f"{people_in_study} Cretans", color = 'goldenrod')
        Cretans_1835 = ax.bar(index, gene_final_cretan_study, bar_width, label=f"{people_in_study} Cretans", color = 'dimgrey')
    
#     freq_net = ax.bar(index+bar_width, gene_final_freqnet, bar_width, label=f"{freqnet_num} {af_pop}", color = 'blue')
    freq_net = ax.bar(index+bar_width, gene_final_freqnet, bar_width, label=f"{freqnet_num} {af_pop}", color = 'darkgrey')

    ax.set_ylabel('Allele Frequency', fontsize=13)
    
#     if sig_p and not sig_af:
#         ax.set_title(f'Comparison of HLA-{gene} frequencies with p < 0.05', fontsize=15)
#     elif sig_af and not sig_p:
#         ax.set_title(f'Comparison of HLA-{gene} frequencies with af > 1/2n', fontsize=15)
#     elif sig_af and sig_p:
#         ax.set_title(f'Comparison of HLA-{gene} frequencies with p < 0.05 and af > 1/2n', fontsize=11)
#     else:
#         ax.set_title(f'Comparison of HLA-{gene} frequencies', fontsize=15)
        
    ax.set_xticks(index + bar_width / 2)
    
    if gene == 'B':
        font = 7
    else:
        font = 11
        
    ax.set_xticklabels(common_alleles, rotation=90, fontsize=font)
    ax.legend()
    
#     fig.tight_layout()    
#     fig.set_dpi(300)
    
#     if sig_p and not sig_af:
#         if af_pop == 'DKMS with Greeks' or af_pop == 'DKMS without Greeks':      
#             plt.savefig(f'{af_pop}_comparison_{people_in_study}_freqnet/HLA-{gene}/Comparison_of_significant_HLA-{gene}_frequencies_{people_in_study}_{af_pop}.png')
#         else:
#             plt.savefig(f"{af_pop.split(' ')[3]}_in_Germany_comparison_{people_in_study}_freqnet/HLA-{gene}/Comparison_of_significant_HLA-{gene}_frequencies_{people_in_study}_{af_pop}.png")
#     elif sig_af and not sig_p:
#         if af_pop == 'DKMS with Greeks' or af_pop == 'DKMS without Greeks':      
#             plt.savefig(f'{af_pop}_comparison_{people_in_study}_freqnet/HLA-{gene}/Comparison_of_HLA-{gene}_frequencies_{people_in_study}_{af_pop}_with_af_greater_than_1_to_2n.png')
#         else:
#             plt.savefig(f"{af_pop.split(' ')[3]}_in_Germany_comparison_{people_in_study}_freqnet/HLA-{gene}/Comparison_of_HLA-{gene}_frequencies_{people_in_study}_{af_pop}_with_af_greater_than_1_to_2n.png")
#     elif sig_af and sig_p:
#         if af_pop == 'DKMS with Greeks' or af_pop == 'DKMS without Greeks':      
#             plt.savefig(f'{af_pop}_comparison_{people_in_study}_freqnet/HLA-{gene}/Comparison_of_significant_HLA-{gene}_frequencies_{people_in_study}_{af_pop}_with_af_greater_than_1_to_2n.png')
#         else:
#             plt.savefig(f"{af_pop.split(' ')[3]}_in_Germany_comparison_{people_in_study}_freqnet/HLA-{gene}/Comparison_of_significant_HLA-{gene}_frequencies_{people_in_study}_{af_pop}_with_af_greater_than_1_to_2n.png")
#     else:
#         if af_pop == 'DKMS with Greeks' or af_pop == 'DKMS without Greeks':      
#             plt.savefig(f'{af_pop}_comparison_{people_in_study}_freqnet/HLA-{gene}/Comparison_of_HLA-{gene}_frequencies_{people_in_study}_{af_pop}.png')
#         else:
#             plt.savefig(f"{af_pop.split(' ')[3]}_in_Germany_comparison_{people_in_study}_freqnet/HLA-{gene}/Comparison_of_HLA-{gene}_frequencies_{people_in_study}_{af_pop}.png")
    
    plt.show()

In [None]:
get_comparison_diagram(e,f,g,1835, DKMS_all_final, sig_p=True, sig_af=True)

In [None]:
###########################################################################################################################
###########################################################################################################################
###########################################################################################################################
###########################################################################################################################
###########################################################################################################################

In [194]:
from collections import Counter, defaultdict
import pandas as pd
import itertools
from itertools import chain
import numpy as np

In [112]:
###############################################################
##      Calculates the frequency of each allele              ##
###############################################################
def allele_freq(pop_txt, #txt file=> columns = [A,A,,B,B...], rows = [01:01,02:01,....]
                classII=None, #classII = [DRB1,DQB1,DPB1]
               ):
    
    '''
    The Cretan  HLA profile should include all the HLA genes (A,B,C,DRB1,DQB1,DPB1). 
    So we keep all the individuals (1835) who are fully genotyped on these 6 HLA genes.
    '''
    
    with open('/Users/vasou/Downloads/'+pop_txt, 'r') as f:
        af = f.readlines()
        
    results = defaultdict()
    af = [x.strip('\n').split('\t') for x in af]
    num_people=len(af)
    
    alleles = {'A':[1,2], #allele:indexes on x(for x in af)
               'B':[3,4],
               'C':[5,6],
               'DRB1':[7,8],
               'DQB1':[9,10],
               'DPB1':[11,12],
              }

    for key in alleles:
        results[f'N_{key}'] = dict(Counter([x[alleles[key][0]] for x in af[1:]] + [x[alleles[key][1]] for x in af[1:]]))
        results[f'N_{key}'] = {k:v for k,v in sorted(results[f'N_{key}'].items(), key=lambda item:item[1], reverse = True)}
        results[f'AF_{key}'] = {k:v/(2*num_people) for k,v in results[f'N_{key}'].items()}
        results[f'{key}'] = [f'{key}*'+x for x in list(results[f'N_{key}'].keys())]
    
    if not classII:   
        keys = ['A', 'N_A', 'AF_A', 'B', 'N_B', 'AF_B', 'C', 'N_C', 'AF_C']
        nest = [results[key].values() if '_' in key else results[key] for key in keys]
        target_class = pd.DataFrame((_ for _ in itertools.zip_longest(*nest,fillvalue = '0')), columns=keys)
        target_class[['N_A','N_B','N_C']] =  target_class[['N_A','N_B','N_C']].astype(int)
        return target_class
    else:
        keys = ['DRB1', 'N_DRB1', 'AF_DRB1', 'DQB1', 'N_DQB1', 'AF_DQB1', 'DPB1', 'N_DPB1', 'AF_DPB1']
        nest = [results[key].values() if '_' in key else results[key] for key in keys]
        target_class = pd.DataFrame((_ for _ in itertools.zip_longest(*nest,fillvalue = '0')), columns=keys)
        target_class[['N_DRB1','N_DQB1','N_DPB1']] =  target_class[['N_DRB1','N_DQB1','N_DPB1']].astype(int)
        return target_class


classI = allele_freq('pop.txt', classII=False).head(10)
classII = allele_freq('pop.txt', classII=True).head(10)

In [192]:
####################################################################################
##       Create dict: key = ID, value = dict(nomos:chania, A:01:01,B....)         ##
####################################################################################
def get_cretan_population(excel_file,#data with necessary columns: ID, REGIONAL UNIT OF DESCENT 2ND GENERATION_75%, alleles
                          sheet_name = None,#read sheets
                          Crete_75=None, #take the samples that the 75% belongs to crete 
                          Crete_50_and_75=None,
                          municipality_75=None,
                          municipality_50_and_75=None):
    
    if sum(x for x in [Crete_75, Crete_50_and_75, municipality_75, municipality_50_and_75] if x) > 1:
        raise Exception('Only 1 value must be True.')

    if sheet_name:
        df = pd.read_excel(excel_file, sheet_name=sheet_name)
    else:
        df = pd.read_excel(excel_file)
    df.fillna('', inplace=True)  
    print(f'The number of unique samples in {excel_file}: {len(pd.unique(df["ID"]))}')

    targets = ['REGIONAL UNIT OF DESCENT 2ND GENERATION_75%',\
             'A','B','C',\
             'DRB1','DQB1','DPB1']

    data = defaultdict(lambda:defaultdict())
    
    for row in df.iterrows():
        for target in targets:
            if target in ['REGIONAL UNIT OF DESCENT 2ND GENERATION_75%']:
                data[row[1]['ID']][target] = row[1]['REGIONAL UNIT OF DESCENT 2ND GENERATION_75%']
            else:
                data[row[1]['ID']][target] = (row[1][target+'1'], row[1][target+'2']) \
                if 'D' not in target else (row[1][target+'_1'], row[1][target+'_2'])
                
    if Crete_50_and_75:
        return data
    
    elif Crete_75:
        return {k:v for k,v in data.items() if not any(
                x in v['REGIONAL UNIT OF DESCENT 2ND GENERATION_75%']  
                for x in ['','not found','ABROAD']
                )
               }
    
    elif municipality_50_and_75: #pali ta abroad vgazei?
        return {k:v for k,v in data.items() if v['REGIONAL UNIT OF DESCENT 2ND GENERATION_75%'] not in (
           '50% ΕΚΤΟΣ ΚΡΗΤΗΣ/25% ΗΡΑΚΛΕΙΟΥ & 25% ΡΕΘΥΜΝΟΥ',
           '50% ΕΚΤΟΣ ΚΡΗΤΗΣ/25% ΛΑΣΙΘΙΟΥ & 25% ΗΡΑΚΛΕΙΟΥ',
           '50% ΕΚΤΟΣ ΚΡΗΤΗΣ/25% ΡΕΘΥΜΝΟΥ & 25% ΗΡΑΚΛΕΙΟΥ',
           '50% ΕΚΤΟΣ ΚΡΗΤΗΣ/25% ΡΕΘΥΜΝΟΥ & 25% ΛΑΣΙΘΙΟΥ',
           '50% ΕΚΤΟΣ ΚΡΗΤΗΣ/25% ΧΑΝΙΩΝ & 25% ΗΡΑΚΛΕΙΟΥ',
           '50% ΕΚΤΟΣ ΚΡΗΤΗΣ/25% ΧΑΝΙΩΝ & 25% ΛΑΣΙΘΙΟΥ', 
           '50% ΕΚΤΟΣ ΚΡΗΤΗΣ/25% ΧΑΝΙΩΝ & 25% ΡΕΘΥΜΝΟΥ',
           '25% ΧΑΝΙΩΝ & 25% ΛΑΣΙΘΙΟΥ/50% ΕΚΤΟΣ ΚΡΗΤΗΣ',
           '25% ΡΕΘΥΜΝΟΥ & 25% ΗΡΑΚΛΕΙΟΥ/50% ΕΚΤΟΣ ΚΡΗΤΗΣ',
           '50% ΕΚΤΟΣ ΚΡΗΤΗΣ/25% ΗΡΑΚΛΕΙΟΥ & 25% ΑΝΩΓΕΙΩΝ',
           '25% ΡΕΘΥΜΝΟΥ & 25% ΛΑΣΙΘΙΟΥ/50% ΕΚΤΟΣ ΚΡΗΤΗΣ')}
    
    elif municipality_75:                
        return {k:v for k,v in data.items() if v['REGIONAL UNIT OF DESCENT 2ND GENERATION_75%'] in ('ΗΡΑΚΛΕΙΟΥ', 'ΛΑΣΙΘΙΟΥ', 'ΡΕΘΥΜΝΟΥ', 'ΧΑΝΙΩΝ')}

Crete_50_and_75 = get_cretan_population('Crete_428CBUs_120723.xlsx', Crete_50_and_75=True)
Crete_75 = get_cretan_population('Crete_428CBUs_120723.xlsx', Crete_75=True)
#municipality_50_and_75 = get_cretan_population('final samples_prefecture analyses_111120.xlsx',  '1926_excl_relations', municipality_50_and_75=True)
municipality_75 = get_cretan_population('Crete_428CBUs_120723.xlsx', municipality_75=True)#n_Cretans = len(Crete_75)
#n_Prefectures = len(municipality_75)

#Chania_75 = {k:v for k,v in municipality_75.items() if v['ΝΟΜΟΣ_ΚΑΤΑΓΩΓΗΣ_min75%']=='ΧΑΝΙΩΝ'}
#Rethymno_75 = {k:v for k,v in municipality_75.items() if v['ΝΟΜΟΣ_ΚΑΤΑΓΩΓΗΣ_min75%']=='ΡΕΘΥΜΝΟΥ'}
#Heraklion_75 = {k:v for k,v in municipality_75.items() if v['ΝΟΜΟΣ_ΚΑΤΑΓΩΓΗΣ_min75%']=='ΗΡΑΚΛΕΙΟΥ'}
Lasithi_75 = {k:v for k,v in municipality_75.items() if v['REGIONAL UNIT OF DESCENT 2ND GENERATION_75%']=='ΛΑΣΙΘΙΟΥ'}
#n_Chania = len(Chania_75)
#n_Rethymno = len(Rethymno_75)
#n_Heraklion = len(Heraklion_75)
#n_Lasithi = len(Lasithi_75)



The number of unique samples in Crete_428CBUs_120723.xlsx: 384
The number of unique samples in Crete_428CBUs_120723.xlsx: 384
The number of unique samples in Crete_428CBUs_120723.xlsx: 384


In [211]:
###########################################################################################
##    Return a dict with key:allelle or nan and value:[homozygosity, heterozygosity]     ##
###########################################################################################
def find_allele_homozygosity(pop_dict, #dict key = ID, value = dict(nomos:chania, A:01:01,B....)
                             hla_gene # A,B,C....
                            ):
    
    if hla_gene not in ('A', 'B', 'C', 'DRB1', 'DQB1', 'DPB1'):
        raise Exception('The HLA gene can only be A, B, C, DRB1, DQB1 or DPB1.')
    
    n_Cretans = len(pop_dict)
    alleles = sorted(list(set(chain(*[v[hla_gene] for k,v in pop_dict.items()]))))
    allele_homozygosity = {x:0 for x in alleles}
    allele_heterozygosity = {x:0 for x in alleles}
    homozygosity_check = [v[hla_gene] for k,v in pop_dict.items()]
    
    for x in alleles:
        for y in homozygosity_check:
            if y[0]==y[1]==x:
                allele_homozygosity[x] += 1
            if y[0]==x and y[0]!=y[1] or y[1]==x and y[0]!=y[1]:
                allele_heterozygosity[x] += 1
                
    result = {}
    
    for key in (allele_homozygosity.keys() | allele_heterozygosity.keys()):
        if key in allele_homozygosity: result.setdefault(key, []).append(allele_homozygosity[key])
        if key in allele_heterozygosity: result.setdefault(key, []).append(allele_heterozygosity[key])
    result = {k:v for k,v in sorted(result.items(), key = lambda item:item[0])}
    
    return result

find_allele_homozygosity(Lasithi_75,#created by get_cretan_population()
                         'A'
                        )

{'': [4, 1],
 '01:01': [0, 2],
 '02:01': [0, 2],
 '02:05': [0, 1],
 '11:01': [1, 1],
 '23:01': [0, 1],
 '30:01': [0, 1],
 '32:01': [0, 1]}

In [215]:
###########################################################################################
##    Return a dict with key:allelle or nan and value:[homozygosity, heterozygosity]     ##
###########################################################################################
def get_homozygosity(pop_dict, #dict key = ID, value = dict(nomos:chania, A:01:01,B....)
                     get_ids=None#=1, if you want to take the IDs of homozygotes
                    ):

    homozygotes_ids = []
    homozygotes = defaultdict(lambda:0)
    genes = ['A', 'B', 'C', 'DRB1', 'DQB1', 'DPB1']
    n_Cretans = len(pop_dict)

    for x in list(pop_dict.keys()):
        for gene in genes:
            
            if pop_dict[x][gene][0] == pop_dict[x][gene][1]:
                homozygotes[gene] += 1
                homozygotes_ids.append(x)
    if get_ids:
        return {x:pop_dict[x] for x in list(set(homozygotes_ids))}
    
    print(f'Total Homozygotes on {n_Cretans} Cretans:', len(set(homozygotes_ids)), f'({round((len(set(homozygotes_ids))/n_Cretans)*100,2)}%)')
    for gene in genes:
        print(f'Homozygotes on {gene}: {homozygotes[gene]} ({round((homozygotes[gene]/n_Cretans)*100,2)}%)')


get_homozygosity(Lasithi_75, get_ids=0)

Total Homozygotes on 10 Cretans: 6 (60.0%)
Homozygotes on A: 5 (50.0%)
Homozygotes on B: 5 (50.0%)
Homozygotes on C: 5 (50.0%)
Homozygotes on DRB1: 5 (50.0%)
Homozygotes on DQB1: 4 (40.0%)
Homozygotes on DPB1: 3 (30.0%)


In [None]:
#########################################################################################################################
## den pianw akoma:                                                                                                    ##
## create_pop_excel => download excels                                                                                 ##
## find_high_homozygosity => ??????????????                                                                            ##
## preprocess_af => This function returns the allelic frequencies of the populations of allelefrequencies.net          ##
## get_diagram_af_net => This function creates the plots for the allele frequencies of the                             ##
##                       DKMS minorities as a whole and 1 by 1.                                                        ##
##                                                                                                                     ##
#########################################################################################################################

In [217]:
### After we run the Hapl-o-Mat with the g grouping without missing data we calculate the allele frequencies

def af_estimation_from_HaploMat_g_group(hfs_dat, hla_gene, num_of_people, alleles=None, counts=None):
    
    '''
    !!!README!!!
    
    This function calculates the allele frequency of the HLA genes A, B, C, DRB1, DQB1 and DPB1 
    if the haplotypes have them. 
    
    Hapl-o-Mat orders the genes in the haplotypes alphabetically.
    
    The haplotypes with 4 genes should have the HLA genes below in this order only:
    A~B~C~DRB1
    
    The haplotypes with 5 genes should have the HLA genes below in this order only:
    A~B~C~DQB1~DRB1
    
    The haplotypes with 6 genes should have the HLA genes below in this order only:
    A~B~C~DPB1~DQB1~DRB1
    
    Other combinations of the HLA genes can not be used as input.    
    
    This function returns the estimated allele frequencies/counts and alleles of the sample from Hapl-o-Mat.
    '''
    
    if hla_gene not in ('A', 'B', 'C', 'DRB1', 'DQB1', 'DPB1'):
        raise Exception("The hla_gene can only be 'A', 'B', 'C', 'DRB1', 'DQB1' or 'DPB1'")
        
    if counts and alleles:
        raise Exception('Only 1 output can come from this function.')
    
    with open(hfs_dat, 'r') as f:
        f = f.readlines()
        
    f = [x.strip('\n').split('\t') for x in f]
    allele_freq = [(x[0].split('~'), float(x[1])) for x in f]
    allele_freq = [([y.strip('g') for y in x[0]],x[1]) for x in allele_freq]
    
    if hla_gene == 'A':
        n = 0        
    elif hla_gene == 'B':
        n = 1        
    elif hla_gene == 'C':
        n = 2        
    
    elif hla_gene == 'DRB1': # Hapl-o-Mat orders the genes in the haplotypes alphabetically.
        if len(allele_freq[0][0]) == 4:     
            n = 3
        elif len(allele_freq[0][0]) == 5:     
            n = 4    
        elif len(allele_freq[0][0]) == 6:
            n = 5
        else:
            raise Exception('The haplotypes should have 4, 5 or 6 genes in a specific order. Read the documentation.')
    
    elif hla_gene == 'DQB1':
        if len(allele_freq[0][0]) == 4:     
            raise Exception('DQB1 should not be in a haplotype with only 4 genes. Read the documentation.')
        elif len(allele_freq[0][0]) == 5:     
            n = 3    
        elif len(allele_freq[0][0]) == 6:
            n = 4
        else:
            raise Exception('The haplotypes should have 4, 5 or 6 genes in a specific order. Read the documentation.')                 
    
    elif hla_gene == 'DPB1':
        if len(allele_freq[0][0]) == 4:     
            raise Exception('DPB1 should not be in a haplotype with only 4 genes. Read the documentation.')
        elif len(allele_freq[0][0]) == 5:     
            raise Exception('DPB1 should not be in a haplotype with only 5 genes. Read the documentation.')    
        elif len(allele_freq[0][0]) == 6:
            n = 3
        else:
            raise Exception('The haplotypes should have 4, 5 or 6 genes in a specific order. Read the documentation.')
    
    HLA_gene = {el:0 for el in [x[0][n] for x in allele_freq]}
    
    for x in allele_freq:
        HLA_gene[x[0][n]] += x[1]
    
    HLA_gene = {k:v for k, v in sorted(HLA_gene.items(), key=lambda item: item[1], reverse=True)}
    
    if alleles:
        return list(HLA_gene.keys())
    
    elif counts:        
        return {k:round(v*num_of_people) for k,v in HLA_gene.items()} 
        # No need to multiply by 2, because of the 2 genotypes on MAC input
    
    else:
        return HLA_gene

In [216]:
g4_path = '/Users/vasou/Downloads/test.dat'

In [218]:
test_results = af_estimation_from_HaploMat_g_group(g4_path, 'A', 1835) #g4_path = dat file

In [219]:
test_results

{'A*24:02': 0.08969907407407002,
 'A*29:25': 0.06770833333333,
 'A*11:01': 0.06770833333333,
 'A*02:77': 0.06250000000001,
 'A*03:217': 0.05729166666666,
 'A*23:06': 0.04687500000003,
 'A*68:70': 0.046875,
 'A*23:01': 0.04600694444447,
 'A*02:454': 0.04166666666667,
 'A*29:67': 0.036458333333340004,
 'A*30:13': 0.03645833333333,
 'A*03:02': 0.03125,
 'A*66:10': 0.03125,
 'A*30:73N': 0.031249999999999993,
 'A*33:27': 0.026041666666670002,
 'A*32:33': 0.02604166666667,
 'A*26:04': 0.026041666666660003,
 'A*03:93': 0.020833333333330005,
 'A*02:570': 0.02083333333333,
 'A*32:73': 0.02083333333333,
 'A*02:163': 0.020833333333329998,
 'A*68:94N': 0.02083333333332,
 'A*02:258': 0.015625,
 'A*03:239': 0.01562499999992,
 'A*11:109N': 0.01041666666667,
 'A*02:259': 0.01041666666667,
 'A*02:158': 0.01041666666667,
 'A*29:36': 0.01041666666666,
 'A*29:01': 0.00520833333333,
 'A*23:03': 0.00520833333333,
 'A*68:01': 0.00520833333333,
 'A*24:215': 0.00520833333333,
 'A*24:05': 0.00520833333333,
 'A*