Code to reproduce plots from: https://www.sciencedirect.com/science/article/pii/S1465324921007453 

In [2]:
# Haplotype frequency file that the example calls to haplomat_output_parce and
# af_estimation_from_HaploMat_g_group further down read from.
# NOTE(review): absolute path on the original author's machine - point it at
# your own frequencies file before running the notebook.
fn = '/Users/admin/HLA/RUNS_haplomat/1835/frequencies.dat'

In [3]:
def haplomat_output_parce(hfs_dat, n):
    """Parse a haplotype frequency file and drop the rare haplotypes.

    Every non-blank line must contain a haplotype (alleles joined by ``~``)
    followed by its frequency, separated by whitespace.  All ``g`` characters
    are removed from the line before it is split.  The alleles of each
    haplotype are sorted and re-joined with ``-`` to form the dictionary key.

    Parameters
    ----------
    hfs_dat : str or path-like
        Path of the frequency file to read.
    n : int
        Used for the frequency cut-off ``1 / (2 * n)``
        (presumably the number of individuals in the sample - TODO confirm).

    Returns
    -------
    dict
        ``{haplotype_key: frequency}`` for every haplotype whose frequency is
        at least ``1 / (2 * n)``.  If the same key occurs more than once the
        last frequency read wins.

    Raises
    ------
    IndexError
        If a non-blank line has no frequency column.
    ValueError
        If the frequency column is not a number.
    """
    haplotypes = {}

    with open(hfs_dat) as handle:
        # Stream the file line by line instead of loading it all at once.
        for line in handle:
            fields = line.replace('g', '').split()
            if not fields:
                # Blank (e.g. trailing) lines carry no data; skip them
                # instead of failing on the missing columns.
                continue
            key = '-'.join(sorted(fields[0].split('~')))
            haplotypes[key] = float(fields[1])

    return {key: freq for key, freq in haplotypes.items() if freq >= 1 / (2 * n)}

In [14]:
### After running Hapl-o-Mat with g-grouping on data without missing values, we calculate the allele frequencies

# Gene order inside a haplotype, keyed by the number of genes it contains.
# Hapl-o-Mat orders the genes in the haplotypes alphabetically.
_HAPLOMAT_GENE_ORDER = {
    4: ('A', 'B', 'C', 'DRB1'),
    5: ('A', 'B', 'C', 'DQB1', 'DRB1'),
    6: ('A', 'B', 'C', 'DPB1', 'DQB1', 'DRB1'),
}


def _hla_gene_position(hla_gene, genes_per_haplotype):
    """Return the index of ``hla_gene`` inside a haplotype of the given size."""
    first_three = ('A', 'B', 'C')
    if hla_gene in first_three:
        # A, B and C sit in the first three slots whatever the haplotype size,
        # so no size check is applied to them.
        return first_three.index(hla_gene)

    gene_order = _HAPLOMAT_GENE_ORDER.get(genes_per_haplotype)
    if gene_order is None:
        raise ValueError('The haplotypes should have 4, 5 or 6 genes in a specific order. Read the documentation.')
    if hla_gene not in gene_order:
        raise ValueError(f'{hla_gene} should not be in a haplotype with only {genes_per_haplotype} genes. Read the documentation.')
    return gene_order.index(hla_gene)


def af_estimation_from_HaploMat_g_group(hfs_dat, hla_gene, num_of_people, alleles=None, counts=None):
    """Estimate the allele frequencies of one HLA gene from haplotype frequencies.

    The input file must have one haplotype per line: the alleles joined by
    ``~``, a TAB, then the haplotype frequency.  Blank lines are ignored.
    A leading/trailing ``g`` is stripped from every allele name.  The
    frequency of an allele is the sum of the frequencies of all haplotypes
    that carry it.

    Hapl-o-Mat orders the genes in the haplotypes alphabetically, so only
    these layouts are supported:

    * 4 genes: ``A~B~C~DRB1``
    * 5 genes: ``A~B~C~DQB1~DRB1``
    * 6 genes: ``A~B~C~DPB1~DQB1~DRB1``

    The number of genes is taken from the first haplotype in the file.
    Other combinations of the HLA genes can not be used as input.

    Parameters
    ----------
    hfs_dat : str or path-like
        Path of the haplotype frequency file.
    hla_gene : str
        One of ``'A'``, ``'B'``, ``'C'``, ``'DRB1'``, ``'DQB1'``, ``'DPB1'``.
    num_of_people : int
        Multiplier used to turn frequencies into counts (only with ``counts``).
    alleles : bool, optional
        If truthy, return only the allele names.
    counts : bool, optional
        If truthy, return rounded allele counts instead of frequencies.

    Returns
    -------
    dict or list
        ``{allele: frequency}`` by default, ``{allele: count}`` with
        ``counts``, or a list of allele names with ``alleles``; always ordered
        by decreasing frequency.  An input without any haplotype yields an
        empty result.

    Raises
    ------
    ValueError
        If ``hla_gene`` is not supported, if both ``alleles`` and ``counts``
        are requested, or if the gene cannot be located in haplotypes of the
        size found in the file.
    """
    if hla_gene not in ('A', 'B', 'C', 'DRB1', 'DQB1', 'DPB1'):
        raise ValueError("The hla_gene can only be 'A', 'B', 'C', 'DRB1', 'DQB1' or 'DPB1'")

    if counts and alleles:
        raise ValueError('Only 1 output can come from this function.')

    haplotypes = []
    with open(hfs_dat, 'r') as handle:
        for line in handle:
            if not line.strip():
                # Blank (e.g. trailing) lines carry no haplotype.
                continue
            fields = line.strip('\n').split('\t')
            genes = [allele.strip('g') for allele in fields[0].split('~')]
            haplotypes.append((genes, float(fields[1])))

    allele_totals = {}
    if haplotypes:
        position = _hla_gene_position(hla_gene, len(haplotypes[0][0]))
        for genes, frequency in haplotypes:
            allele = genes[position]
            allele_totals[allele] = allele_totals.get(allele, 0) + frequency

    # Stable sort: alleles with equal frequency keep their order of first appearance.
    allele_totals = dict(sorted(allele_totals.items(), key=lambda item: item[1], reverse=True))

    if alleles:
        return list(allele_totals)

    if counts:
        # No need to multiply by 2, because of the 2 genotypes on MAC input
        # (original author's note).
        return {allele: round(freq * num_of_people) for allele, freq in allele_totals.items()}

    return allele_totals

In [16]:
def _allele_contingency_table(study_count, pop_count, study_chromosomes, pop_chromosomes):
    """Build the rounded 2x2 table: rows (with allele, without allele) x columns (study, population)."""
    return np.round(np.array([
        [study_count, pop_count],
        [study_chromosomes - study_count, pop_chromosomes - pop_count],
    ]))


def chi2(gene_final_cretan_study, gene_final_freqnet, common_alleles, people_in_study, population, sig_afs=None, counts=None):
    """Test, allele by allele, whether two samples differ in allele frequency.

    Frequencies are converted to chromosome counts (2 chromosomes per
    person), a 2x2 contingency table is built for every allele and tested
    with ``chi2_contingency``.  Alleles with ``p < 0.05`` are reported.

    Parameters
    ----------
    gene_final_cretan_study : sequence of float
        Allele frequencies in the study sample, aligned with ``common_alleles``.
    gene_final_freqnet : sequence of float
        Allele frequencies in the reference population, same alignment.
    common_alleles : sequence of str
        Allele names.
    people_in_study : int
        Number of individuals in the study sample.
    population : mapping of columns (e.g. a pandas DataFrame)
        Only ``population['sample_size'].to_list()[0]`` is read: the number
        of individuals in the reference population.
    sig_afs : bool, optional
        If truthy, additionally keep an allele only when its study frequency
        exceeds ``round(1/(2*people_in_study), 6)`` and its reference
        frequency exceeds ``round(1/(2*sample_size), 4)``.
    counts : bool, optional
        If truthy, skip the test and return one 2x2 array per allele with
        rows (study, reference) and columns (chromosomes without the allele,
        chromosomes with the allele).

    Returns
    -------
    tuple of three lists, or list of arrays
        ``(study_frequencies, reference_frequencies, allele_names)`` of the
        significant alleles, in the order of ``common_alleles``; or the list
        of contingency tables when ``counts`` is truthy.

    Raises
    ------
    ValueError
        If the three input sequences differ in length.
    """
    if not len(gene_final_cretan_study) == len(gene_final_freqnet) == len(common_alleles):
        raise ValueError('Not the same length of common genes')

    freqnet_num = population['sample_size'].to_list()[0]
    study_chromosomes = people_in_study * 2
    pop_chromosomes = freqnet_num * 2

    # The study count is rounded before the complement is taken; the whole
    # table is rounded once more when it is built.
    tables = [
        _allele_contingency_table(
            np.round(study_af * people_in_study * 2),
            pop_af * freqnet_num * 2,
            study_chromosomes,
            pop_chromosomes,
        )
        for study_af, pop_af in zip(gene_final_cretan_study, gene_final_freqnet)
    ]

    if counts:
        return [np.flip(table, 0).T for table in tables]

    # chi2_contingency returns (statistic, p-value, dof, expected); only p is needed.
    pvalues = [chi2_contingency(table)[1] for table in tables]

    pval_dict = dict(zip(common_alleles, pvalues))
    af_pairs = dict(zip(common_alleles, zip(gene_final_cretan_study, gene_final_freqnet)))

    # Exact allele-name lookup: a substring test would also select e.g.
    # 'A*02:101' whenever 'A*02:10' is significant.
    significant = {allele: af_pairs[allele] for allele, p in pval_dict.items() if p < 0.05}

    if sig_afs:
        min_study_af = round(1/(2*people_in_study), 6)
        min_pop_af = round(1/(2*freqnet_num), 4)
        significant = {
            allele: afs for allele, afs in significant.items()
            if afs[0] > min_study_af and afs[1] > min_pop_af
        }

    common_alleles_sig = list(significant)
    gene_final_cretan_study_sig = [afs[0] for afs in significant.values()]
    gene_final_freqnet_sig = [afs[1] for afs in significant.values()]

    return gene_final_cretan_study_sig, gene_final_freqnet_sig, common_alleles_sig

In [17]:
def get_common_g_alleles(cretans, population, HLA_gene):
    """Align the allele frequencies of two samples on the alleles they share.

    Parameters
    ----------
    cretans : dict
        ``{allele: frequency}`` of the study sample.
    population : pandas.DataFrame
        Reference data with the columns ``'gene'``, ``'allele'`` and ``'af'``.
    HLA_gene : str
        Value of the ``'gene'`` column selecting the reference rows to use.

    Returns
    -------
    tuple of three lists
        ``(study_frequencies, reference_frequencies, allele_names)`` for the
        alleles present in both inputs, ordered by decreasing study
        frequency.  Alleles with equal study frequency keep the order they
        have in ``cretans``, so the result is reproducible between runs.
    """
    # Filter and sort the reference rows once.  If an allele is listed several
    # times the last row after sorting (the lowest 'af') wins.
    gene_rows = population[population['gene'] == HLA_gene].sort_values('af', ascending=False)
    population_af = dict(zip(gene_rows['allele'].tolist(), gene_rows['af'].tolist()))

    # Walk the study alleles in their own order rather than through a set,
    # whose iteration order changes with string hash randomisation.
    shared = [(allele, freq) for allele, freq in cretans.items() if allele in population_af]
    shared.sort(key=lambda pair: pair[1], reverse=True)  # stable sort keeps tie order

    common_alleles = [allele for allele, _ in shared]
    alleles_final_Cretans = [freq for _, freq in shared]
    alleles_final_freqnet = [population_af[allele] for allele in common_alleles]

    return alleles_final_Cretans, alleles_final_freqnet, common_alleles

In [13]:
# Run the haplotype parser on the file in `fn` with n = 1835 (cut-off 1/(2*1835));
# the result is bound to `_`, i.e. deliberately not kept.
_ = haplomat_output_parce(fn, 1835)

In [11]:
# Run the HLA-A allele-frequency estimation on the file in `fn`;
# the result is bound to `_`, i.e. deliberately not kept.
_ = af_estimation_from_HaploMat_g_group(fn, 'A', 1835)

In [19]:
# Allele frequencies per HLA gene for the 1835-person study sample.
# A, B, C and DRB1 are read from the file in `g4_path`, DQB1 from `g5_path`
# (presumably the 4-gene and 5-gene Hapl-o-Mat runs - TODO confirm).
# NOTE(review): neither `g4_path` nor `g5_path` is defined in the visible cells;
# the recorded output of this cell is a NameError. Define both before running.
A_g_grouped_1835_Cretans = af_estimation_from_HaploMat_g_group(g4_path, 'A', 1835)
B_g_grouped_1835_Cretans = af_estimation_from_HaploMat_g_group(g4_path, 'B', 1835)
C_g_grouped_1835_Cretans = af_estimation_from_HaploMat_g_group(g4_path, 'C', 1835)
DRB1_g_grouped_1835_Cretans = af_estimation_from_HaploMat_g_group(g4_path, 'DRB1', 1835)
DQB1_g_grouped_1835_Cretans = af_estimation_from_HaploMat_g_group(g5_path, 'DQB1', 1835)

NameError: name 'g4_path' is not defined

In [18]:
# ### Let's compare the HLA allelic frequencies of all minorities in Germany with the 1835 Cretans ###
# Each call returns (study frequencies, reference frequencies, shared allele names)
# for one gene, using `DKMS_all_final` as the reference table.
# NOTE(review): `DKMS_all_final` and `DKMS_final` are not defined in the visible cells,
# and this cell needs the *_g_grouped_1835_Cretans results of the previous cell.

A_final_1835_DKMSALL_GER, A_final_freqnet_1835_DKMSALL_GER, common_A_1835_DKMSALL_GER = get_common_g_alleles(A_g_grouped_1835_Cretans, DKMS_all_final, 'A')
B_final_1835_DKMSALL_GER, B_final_freqnet_1835_DKMSALL_GER, common_B_1835_DKMSALL_GER = get_common_g_alleles(B_g_grouped_1835_Cretans, DKMS_all_final, 'B')
C_final_1835_DKMSALL_GER, C_final_freqnet_1835_DKMSALL_GER, common_C_1835_DKMSALL_GER = get_common_g_alleles(C_g_grouped_1835_Cretans, DKMS_all_final, 'C')
DRB1_final_1835_DKMSALL_GER, DRB1_final_freqnet_1835_DKMSALL_GER, common_DRB1_1835_DKMSALL_GER = get_common_g_alleles(DRB1_g_grouped_1835_Cretans, DKMS_all_final, 'DRB1')

# ### Let's compare the HLA allelic frequencies of all minorities without the Greeks in Germany with the 1835 Cretans ###
# Same four genes, with `DKMS_final` as the reference table.

A_final_1835_DKMSNOGR_GER, A_final_freqnet_1835_DKMSNOGR_GER, common_A_1835_DKMSNOGR_GER = get_common_g_alleles(A_g_grouped_1835_Cretans, DKMS_final, 'A')
B_final_1835_DKMSNOGR_GER, B_final_freqnet_1835_DKMSNOGR_GER, common_B_1835_DKMSNOGR_GER = get_common_g_alleles(B_g_grouped_1835_Cretans, DKMS_final, 'B')
C_final_1835_DKMSNOGR_GER, C_final_freqnet_1835_DKMSNOGR_GER, common_C_1835_DKMSNOGR_GER = get_common_g_alleles(C_g_grouped_1835_Cretans, DKMS_final, 'C')
DRB1_final_1835_DKMSNOGR_GER, DRB1_final_freqnet_1835_DKMSNOGR_GER, common_DRB1_1835_DKMSNOGR_GER = get_common_g_alleles(DRB1_g_grouped_1835_Cretans, DKMS_final, 'DRB1')

NameError: name 'A_g_grouped_1835_Cretans' is not defined

In [None]:
# Select the HLA-A alleles that chi2 reports for the 1835-person sample against
# `DKMS_all_final`, with the extra frequency thresholds enabled (sig_afs=True).
# e = study frequencies, f = reference frequencies, g = allele names.
e,f,g=chi2(A_final_1835_DKMSALL_GER, A_final_freqnet_1835_DKMSALL_GER, common_A_1835_DKMSALL_GER, 1835, DKMS_all_final, sig_afs=True)

In [12]:
def _study_bar_colour(people_in_study):
    """Return the bar colour used for a study sample of the given size."""
    if people_in_study == 94:
        return 'orange'
    if 675 < people_in_study < 690:
        return 'green'
    if 1100 < people_in_study < 1215:
        return 'red'
    # Samples of 1701-2000 people use 'dimgrey'; the same colour is the
    # fallback so that a sample of any other size still gets its bars drawn.
    return 'dimgrey'


def get_comparison_diagram(gene_final_cretan_study, gene_final_freqnet, common_alleles, people_in_study, population, sig_af=None, sig_p=None):
    """Show a grouped bar chart comparing allele frequencies of two samples.

    One pair of bars is drawn per allele: the study sample and, next to it,
    the reference population.  The figure is displayed with ``plt.show()``;
    it is neither titled nor saved to disk.

    Parameters
    ----------
    gene_final_cretan_study : sequence of float
        Allele frequencies in the study sample, aligned with ``common_alleles``.
    gene_final_freqnet : sequence of float
        Allele frequencies in the reference population, same alignment.
    common_alleles : sequence of str
        Allele names used as x tick labels.  The text before ``*`` of the
        first name is taken as the gene; gene ``'B'`` gets a smaller tick font.
    people_in_study : int
        Number of individuals in the study sample; shown in the legend and
        used to pick the bar colour.
    population : mapping of columns (e.g. a pandas DataFrame)
        ``population['pop'].to_list()[0]`` is the reference name and
        ``population['sample_size'].to_list()[0]`` its size; both appear in
        the legend.
    sig_af : bool, optional
    sig_p : bool, optional
        When ``sig_af`` is truthy and ``sig_p`` is not, alleles are plotted
        only if the study frequency exceeds ``round(1/(2*people_in_study), 6)``
        and the reference frequency exceeds ``round(1/(2*sample_size), 4)``.
        Any other combination plots the input unchanged.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the three input sequences differ in length.
    """
    if not len(gene_final_cretan_study) == len(gene_final_freqnet) == len(common_alleles):
        raise ValueError('Not the same length of common genes')

    af_pop = population['pop'].to_list()[0]
    freqnet_num = population['sample_size'].to_list()[0]

    if sig_af and not sig_p:
        min_study_af = round(1/(2*people_in_study), 6)
        min_pop_af = round(1/(2*freqnet_num), 4)
        af_pairs = dict(zip(common_alleles, zip(gene_final_cretan_study, gene_final_freqnet)))
        af_pairs = {
            allele: afs for allele, afs in af_pairs.items()
            if afs[0] > min_study_af and afs[1] > min_pop_af
        }

        gene_final_cretan_study = [afs[0] for afs in af_pairs.values()]
        gene_final_freqnet = [afs[1] for afs in af_pairs.values()]
        common_alleles = list(af_pairs)

    allele_count = len(common_alleles)
    # With nothing to plot there is no gene name to read; an empty chart is
    # shown instead of failing on common_alleles[0].
    gene = common_alleles[0].split('*')[0] if allele_count else ''
    positions = np.arange(allele_count)
    bar_width = 0.35

    _, ax = plt.subplots()

    ax.bar(positions, gene_final_cretan_study, bar_width,
           label=f"{people_in_study} Cretans", color=_study_bar_colour(people_in_study))
    ax.bar(positions + bar_width, gene_final_freqnet, bar_width,
           label=f"{freqnet_num} {af_pop}", color='darkgrey')

    ax.set_ylabel('Allele Frequency', fontsize=13)

    # Centre each tick between the two bars of its allele.
    ax.set_xticks(positions + bar_width / 2)
    ax.set_xticklabels(common_alleles, rotation=90, fontsize=7 if gene == 'B' else 11)
    ax.legend()

    plt.show()

In [None]:
# Plot the alleles selected by the chi2 cell (e, f, g) against `DKMS_all_final`.
# With both sig_p and sig_af set, get_comparison_diagram applies no frequency
# filter of its own (it only filters when sig_af is set and sig_p is not).
get_comparison_diagram(e,f,g,1835, DKMS_all_final, sig_p=True, sig_af=True)