In [1]:
import pandas as pd
import  math
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
import re

## Pipeline for Matching Coverage

In [2]:
###############################################################################################################
##                                               PIPELINE                                                    ##
###############################################################################################################

In [2]:
'''
Clean the estimated haplotypes (haplotypes from haplomat=>drop ~ and g/G)
'''
def clean_data(
                filename, #name (without '.dat') of the Hapl-o-Mat haplotype-frequency file, relative to data_dir
                n, #number of samples that the haplotypes derived
                data_dir = '/Users/vasou/Documents/HLA/Matching Coverage/Estimated_Haplotypes', #folder holding the .dat files
              ):
    '''
    Read estimated haplotypes from a Hapl-o-Mat output file and clean them.

    The file ``{data_dir}/{filename}.dat`` is read as a headerless, tab-separated
    table with two columns (haplotype string, frequency). When the file name
    contains both 'DKMS' and '3loci' the frequencies are parsed with ',' as the
    decimal separator.

    Two filters are applied in order:
      1. keep rows with frequency >= 1/(2*n);
      2. keep rows whose count ``round(frequency * n)`` is >= 2.

    Each surviving haplotype is split on '~', a single trailing letter
    (e.g. the g/G group suffix) is removed from every allele, and the alleles
    are re-joined with single spaces.

    Parameters
    ----------
    filename : str
        File name without the '.dat' extension; may include sub-folders.
    n : int
        Number of samples the haplotypes were estimated from.
    data_dir : str, optional
        Directory containing the '.dat' files. Defaults to the location the
        notebook was originally written against.

    Returns
    -------
    list of str
        Cleaned haplotypes, e.g. 'A*01:01 B*08:01 DRB1*03:01', in file order.
    '''
    read_options = {'sep': '\t', 'header': None}
    if 'DKMS' in filename and '3loci' in filename:
        # These files write frequencies with a decimal comma.
        read_options['decimal'] = ','
    hap_estimated = pd.read_csv(f'{data_dir}/{filename}.dat', **read_options)
    hap_estimated.columns = ['Haplotypes', 'Frequency']
    print(f'previous len: {hap_estimated.shape[0]}')

    # First correction: drop haplotypes rarer than one chromosome in 2n.
    correction = 1/(2*n)
    # .copy() so that adding 'Counts' below writes to an independent frame
    # instead of a slice of the original (avoids SettingWithCopyWarning).
    hap_estimated = hap_estimated[hap_estimated['Frequency']>=correction].copy()

    # Second correction: the estimated haplotypes must have >= 2 counts.
    # NOTE(review): counts use freq*n while the frequency cut-off uses 2*n -
    # confirm this asymmetry is intended.
    hap_estimated['Counts'] = [round(freq*n) for freq in hap_estimated['Frequency'].values]
    counts_correction = 2
    hap_estimated = hap_estimated[hap_estimated['Counts']>=counts_correction]
    print(f'samples: {sum(hap_estimated["Counts"].values)}')

    haplotypes = []
    for haplotype in hap_estimated['Haplotypes'].values:
        # Strip one trailing letter per allele, then switch the separator to a space.
        alleles = [allele[:-1] if allele[-1].isalpha() else allele
                   for allele in haplotype.split('~')]
        haplotypes.append(' '.join(alleles))
    print(f'After len: {hap_estimated.shape[0]}')
    return haplotypes

In [3]:
'''
Calculate matching coverage
'''

def matching_coverage(
                        dat_data,
                        haplotypes, #estimated haplotypes from haplomat or haplotypes from homozygous
                        genes,
                        not_found = False,
                        verbose = True
                    ):
    '''
    Greedily assign samples to haplotypes and report the share covered by each.

    Samples are first screened for the literal value 'not found':
      * not_found=False - a sample is discarded if ANY allele is 'not found';
      * not_found=True  - a sample is discarded only if BOTH alleles of some
        locus are 'not found'.
    The remaining alleles are truncated to two fields and prefixed with their
    locus ('01:01:01' in column 'A1' becomes 'A*01:01').

    Haplotypes are then processed in the given order. A sample carries a
    haplotype when, for every locus, one of its two alleles equals the
    haplotype allele (a single trailing suffix letter on the sample allele,
    e.g. 'g' or 'N', is ignored). Matching is by equality, not by substring,
    so 'A*02:10' does not match a sample typed 'A*02:101'. A matched sample is
    removed from the pool, so each sample is counted for at most one haplotype.

    Parameters
    ----------
    dat_data : pandas.DataFrame
        Genotype table with an 'ID' column and the allele columns
        A1, A2, B1, B2, DRB1_1, DRB1_2 (plus C1, C2, DQB1_1, DQB1_2 unless
        genes == 3). It is NOT modified.
    haplotypes : iterable of str
        Space-separated haplotypes. Locus order is A, B, DRB1 for genes == 3
        and A, B, C, DQB1, DRB1 for genes == 5.
    genes : int
        Number of loci per haplotype (3 or 5). Any other value selects the
        five-locus columns but compares only A and B.
    not_found : bool, optional
        Tolerate a single 'not found' allele per locus (see above).
    verbose : bool, optional
        When True (default) show the prepared table and print the pool shape
        before and after each haplotype and the final sample count.

    Returns
    -------
    coverage : dict
        haplotype -> matched samples / number of screened samples.
    ids : defaultdict(list)
        haplotype -> list of (sample ID, row label); contains an (empty) entry
        for every haplotype that was processed.
    count_hap : defaultdict(int)
        haplotype -> number of matched samples; only haplotypes with at least
        one match are present.

    Raises
    ------
    IndexError
        If a haplotype has fewer alleles than the number of loci compared.
    '''
    locus_a, locus_b, locus_c = ('A1', 'A2'), ('B1', 'B2'), ('C1', 'C2')
    locus_drb1, locus_dqb1 = ('DRB1_1', 'DRB1_2'), ('DQB1_1', 'DQB1_2')

    if genes == 3:
        case = ['ID','A1', 'A2','B1','B2','DRB1_1', 'DRB1_2']
        hap_loci = [locus_a, locus_b, locus_drb1]
    else:
        case = ['ID','A1', 'A2','B1','B2','C1','C2','DRB1_1', 'DRB1_2','DQB1_1', 'DQB1_2']
        # Position i of a haplotype is compared against hap_loci[i].
        hap_loci = [locus_a, locus_b, locus_c, locus_dqb1, locus_drb1] if genes == 5 else [locus_a, locus_b]
    allele_columns = case[1:]  # the first column is the ID of each sample

    # --- screen out samples with missing typing --------------------------------
    missing = dat_data[allele_columns] == 'not found'
    if not_found:
        unusable = np.zeros(len(dat_data), dtype=bool)
        for first, second in zip(allele_columns[0::2], allele_columns[1::2]):
            unusable |= (missing[first] & missing[second]).to_numpy(dtype=bool)
    else:
        unusable = missing.any(axis=1).to_numpy(dtype=bool)
    # Work on a filtered copy; the caller's frame is left untouched.
    data = dat_data.loc[~unusable, case].copy()

    # --- add the locus before each allele: A*01:01 -----------------------------
    exact, bare = {}, {}
    for column in allele_columns:
        locus = column.split('_')[0] if '_' in column else column[0]
        labelled = [locus + '*' + ':'.join(value.split(':')[:2]) for value in data[column].values]
        data[column] = labelled
        exact[column] = np.array(labelled, dtype=object)
        # Same normalisation clean_data applies to haplotypes: drop one trailing letter.
        bare[column] = np.array([allele[:-1] if allele[-1].isalpha() else allele for allele in labelled],
                                dtype=object)

    if verbose:
        try:
            show = display  # provided by IPython/Jupyter
        except NameError:
            show = print    # plain Python fallback
        show(data)

    # --- greedy assignment -----------------------------------------------------
    count_hap = defaultdict(lambda:0)
    ids = defaultdict(lambda:[])
    sample_ids = data['ID'].to_numpy()
    row_labels = data.index
    n_columns = data.shape[1]
    available = np.ones(len(data), dtype=bool)  # samples not yet assigned

    for top in haplotypes:
        unique_hap = top.split(' ')
        if len(unique_hap) < len(hap_loci):
            raise IndexError(
                f'haplotype {top!r} has {len(unique_hap)} alleles but {len(hap_loci)} loci are compared (genes={genes})'
            )
        if len(unique_hap) == len(hap_loci):
            matched = available.copy()
            for token, (first, second) in zip(unique_hap, hap_loci):
                matched &= ((exact[first] == token) | (bare[first] == token)
                            | (exact[second] == token) | (bare[second] == token))
        else:
            # Extra alleles can never be satisfied by the loci compared.
            matched = np.zeros(len(data), dtype=bool)

        if verbose:
            print((int(available.sum()), n_columns))
        n_matched = int(matched.sum())
        if n_matched:
            count_hap[top] += n_matched
        ids[top].extend(zip(sample_ids[matched], row_labels[matched].tolist()))
        # Mask-based removal: safe even when the same haplotype appears twice.
        available &= ~matched
        if verbose:
            print((int(available.sum()), n_columns))

    n = data.shape[0]
    if verbose:
        print(n)
    # count_hap is empty whenever n == 0, so the division below is never by zero.
    coverage = {hap: count / n for hap, count in count_hap.items()}

    return coverage, ids, count_hap

#matching_coverage,ids, count_hap = matching_coverage(data_4451, est_hap, genes = 3)

In [42]:
#############################
##          DATA           ##
#############################
# Genotype workbook for the Greek CBUs and BMDs; both sheets live in the same file.
CBUS_BMDS_WORKBOOK = '/Users/vasou/Documents/HLA/Matching Coverage/MC_for_paper/Data/CBUs_BMDs_for_Hap_from_80804.xlsx'
SHEET_2FIELDS_3LOCI = 'CBUs&BMDs_2fields_3loci_78611'
SHEET_2FIELDS_5LOCI = 'CBUs&BMDs_2fields_5loci_70222'

# Passing a list of sheet names makes read_excel open the workbook once and
# return a dict {sheet name: DataFrame}, instead of parsing the file twice.
cbus_bmds_sheets = pd.read_excel(CBUS_BMDS_WORKBOOK,
                                 sheet_name = [SHEET_2FIELDS_3LOCI, SHEET_2FIELDS_5LOCI],
                                 header = [0])
greece_CBUs_BMDs_2fields_3loci = cbus_bmds_sheets[SHEET_2FIELDS_3LOCI]
greece_CBUs_BMDs_2fields_5loci = cbus_bmds_sheets[SHEET_2FIELDS_5LOCI]

In [None]:
#########################
##         RUN         ##
#########################
# One entry per locus count:
#   (haplotype file passed to clean_data, sample size of that run,
#    genotype table to search, allele column names in sheet order)
RUN_SETTINGS = {
    3: ('Cyprus/RunX_htf_cyprus_cbus_2fields_2843_3loci', 2843,
        greece_CBUs_BMDs_2fields_3loci,
        ['A1', 'A2','B1','B2','DRB1_1', 'DRB1_2']),
    5: ('Cyprus/RunX_htf_cyprus_cbus_2fields_2702_5loci', 2702,
        greece_CBUs_BMDs_2fields_5loci,
        ['A1', 'A2','B1','B2','C1','C2','DRB1_1','DRB1_2','DQB1_1','DQB1_2']),
}

with pd.ExcelWriter('MC_2_2fields_Greece_12Countries_BMDs_to_Greece_all.xlsx', engine = "openpyxl", mode = "w") as writer:
    for n_gene in [3,5]:
        hap_file, hap_samples, genotypes, allele_columns = RUN_SETTINGS[n_gene]
        est_hap = clean_data(hap_file, hap_samples)

        # Work on a copy so the loaded sheets keep their original headers and
        # rows; re-running this cell then always starts from the same input.
        data = genotypes.copy()
        # Sheets may carry an optional BIRTH column right after the ID.
        leading_columns = ['ID','BIRTH'] if 'BIRTH' in data.columns else ['ID']
        data.columns = leading_columns + allele_columns

        matching_cov, ids, count_hap = matching_coverage(data, est_hap, genes = n_gene, not_found = False)
        #not_found = True if allow not found

        # Build the sheet column-wise so Counts/Coverage keep numeric dtypes.
        a = pd.DataFrame({
            'Haplotype': list(count_hap.keys()),
            'Counts': list(count_hap.values()),
            'Coverage': [matching_cov[haplotype] for haplotype in count_hap],
        })
        a.to_excel(writer, sheet_name = f'{n_gene}_genes', index = False)