In [2]:
import pandas as pd
import os
import json
from collections import defaultdict
os.getcwd()

'D:\\drug KG\\pgkb'

In [3]:
# Folder that is scanned for the per-gene haplotype workbooks.
haplotype_folder = "haplotype"

# Genes whose haplotypes are mentioned in clinical annotations
# ('HLA-B' and 'HLA-A' are mentioned as well but are not listed here).
clinical_genes = [
    'TPMT',
    'NAT2',
    'G6PD',
    'CYP3A5',
    'CYP2A6',
    'CYP3A4',
    'CYP2C19',
    'UGT1A1',
    'CYP2D6',
    'NUDT15',
    'CYP2B6',
    'CYP2C9',
]

# Genes whose haplotypes are mentioned in guideline annotations
# ('HLA-B' and 'HLA-A' are mentioned as well but are not listed here).
guideline_genes = [
    'CACNA1S',
    'CFTR',
    'CYP2C9',
    'CYP2B6',
    'CYP2D6',
    'SLCO1B1',
    'UGT1A1',
    'DPYD',
    'NUDT15',
    'MT-RNR1',
    'CYP3A5',
    'TPMT',
    'RYR1',
    'CYP2C19',
    'G6PD',
    'IFNL3',
]

# Every gene that appears in either list, without duplicates.
genes = set(clinical_genes) | set(guideline_genes)

# The position mapping for haplotype HLA-A and HLA-B is complex, skip these first.

# Map every gene to the path of its "<gene>_haplotypes" workbook inside
# haplotype_folder. A gene whose workbook is not found keeps the "" placeholder.
haplotype_path_dict = {g: "" for g in genes}
for path in os.listdir(haplotype_folder):
    # While a workbook is open, Excel keeps an owner/lock file named
    # "~$<workbook name>" next to it. That name also contains
    # "<gene>_haplotypes", so without this guard it could replace the real
    # workbook path (os.listdir returns entries in arbitrary order).
    if path.startswith("~$"):
        continue
    for gene in genes:
        if "{}_haplotypes".format(gene) in path:
            haplotype_path_dict[gene] = os.path.join(haplotype_folder, path)

# Show the resolved path of every gene, sorted by gene name.
for key, value in sorted(haplotype_path_dict.items(), key=lambda x: x[0]):
    print(key, value)

CACNA1S haplotype\CACNA1S_haplotypes.xlsx
CFTR haplotype\CFTR_haplotypes.xlsx
CYP2A6 haplotype\CYP2A6_haplotypes.xlsx
CYP2B6 haplotype\CYP2B6_haplotypes.xlsx
CYP2C19 haplotype\CYP2C19_haplotypes.xlsx
CYP2C9 haplotype\CYP2C9_haplotypes.xlsx
CYP2D6 haplotype\CYP2D6_haplotypes.xlsx
CYP3A4 haplotype\CYP3A4_haplotypes.xlsx
CYP3A5 haplotype\CYP3A5_haplotypes.xlsx
DPYD haplotype\DPYD_haplotypes.xlsx
G6PD haplotype\G6PD_haplotypes.xlsx
IFNL3 haplotype\IFNL3_haplotypes.xlsx
MT-RNR1 haplotype\MT-RNR1_haplotypes.xlsx
NAT2 haplotype\NAT2_haplotypes.xlsx
NUDT15 haplotype\NUDT15_haplotypes.xlsx
RYR1 haplotype\RYR1_haplotypes.xlsx
SLCO1B1 haplotype\SLCO1B1_haplotypes.xlsx
TPMT haplotype\TPMT_haplotypes.xlsx
UGT1A1 haplotype\UGT1A1_haplotypes.xlsx


In [4]:
# Label of the standard haplotype of each gene. haplotype_mapping returns an
# empty mapping when it is asked about one of these labels.
standard_haplotype_dict = dict([
    ("CACNA1S", "Reference"),
    ("CFTR", "standard"),
    ("CYP2A6", "*1A"),
    ("CYP2B6", "*1"),
    ("CYP2C19", "*38"),
    ("CYP2C9", "*1"),
    ("CYP2D6", "*1"),
    ("CYP3A4", "*1"),
    ("CYP3A5", "*1"),
    ("DPYD", "Reference"),
    ("G6PD", "B (wildtype)"),
    ("IFNL3", ""),  # no standard label is recorded for this gene
    ("MT-RNR1", "Reference"),
    ("NAT2", "*4"),
    ("NUDT15", "*1"),
    ("RYR1", "Reference"),
    ("SLCO1B1", "*1A"),
    ("TPMT", "*1"),
    ("UGT1A1", "*1"),
])

### 翻转 dataframe，让第一列做column

In [5]:
def df_T_convert(df):
    """Transpose ``df`` so that the values of its first column become headers.

    The original column names (all but the first) end up as the values of the
    leading column of the result, and that leading column is named after the
    original first column. The result uses a fresh 0..n-1 row index.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose first column holds the labels to use as new column names.

    Returns
    -------
    pandas.DataFrame
        The transposed table described above.
    """
    transposed = df.T
    row_labels = transposed.index
    cells = transposed.values

    # Row 0 of the transposed matrix is the original first column: the new header.
    header = cells[0, :]
    body = cells[1:, :]

    label_column = pd.DataFrame({row_labels[0]: row_labels[1:]})
    data_columns = pd.DataFrame(body, index=range(len(cells) - 1), columns=header)
    return pd.concat([label_column, data_columns], axis=1)

In [6]:
# Process the TPMT gene: read the "modified" sheet of its workbook, blank out
# missing cells, then transpose the table with df_T_convert.
df_tpmt = pd.read_excel(haplotype_path_dict["TPMT"], sheet_name="modified", engine="openpyxl")
df_tpmt = df_tpmt.fillna("")
df_tpmt_T = df_T_convert(df_tpmt)

In [7]:
# Process the NAT2 gene: read the "modified" sheet of its workbook, blank out
# missing cells, then transpose the table with df_T_convert.
df_nat2 = pd.read_excel(haplotype_path_dict["NAT2"], sheet_name="modified", engine="openpyxl")
df_nat2 = df_nat2.fillna("")
df_nat2_T = df_T_convert(df_nat2)

In [8]:
# Process the G6PD gene: read the "modified" sheet of its workbook, blank out
# missing cells, then transpose the table with df_T_convert.
df_G6PD = pd.read_excel(haplotype_path_dict["G6PD"], sheet_name="modified", engine="openpyxl")
df_G6PD = df_G6PD.fillna("")
df_G6PD_T = df_T_convert(df_G6PD)


In [9]:
# Build the transposed haplotype table of every gene, keyed by gene name.
gene_df_T_dict = {}
for gene, path in haplotype_path_dict.items():
    # Read the "modified" sheet and replace missing cells with "".
    df_tmp = pd.read_excel(path, sheet_name="modified", engine="openpyxl")
    df_tmp = df_tmp.fillna("")
    df_tmp_T = df_T_convert(df_tmp)
    gene_df_T_dict[gene] = df_tmp_T

In [10]:
# Display the genes that now have a transposed haplotype table.
gene_df_T_dict.keys()

dict_keys(['IFNL3', 'TPMT', 'CACNA1S', 'CYP2B6', 'CYP2C9', 'NUDT15', 'MT-RNR1', 'CYP2C19', 'SLCO1B1', 'CYP3A4', 'CYP2D6', 'NAT2', 'RYR1', 'CFTR', 'DPYD', 'CYP2A6', 'UGT1A1', 'CYP3A5', 'G6PD'])

In [11]:
# Print the column labels of every transposed table, in gene-name order.
for key in sorted(gene_df_T_dict):
    value = gene_df_T_dict[key]
    print("{}:".format(key))
    print(value.columns)

CACNA1S:
Index(['Free Text', 'Protein Change NP_000060.2',
       'Chromosome Position Change NC_000001.11',
       'Gene position Change NG_009816.1 (minus)', 'rsID', 'Reference',
       'c.520C>T', 'c.3257G>A'],
      dtype='object')
CFTR:
Index(['Free Text', 'Protein Change NP_000483.3',
       'Chromosome Position Change NC_000007.14',
       'Gene Position Change NG_016465.4 (plus)', 'rsID', 'standard', 'E56K',
       'P67L', 'R74W', 'D110E', 'D110H', 'R117C', 'R117H', 'G178R', 'E193K',
       '711+3A->G', 'L206W', 'R347H', 'R352Q', 'A455E', 'S549N', 'S549R(A>C)',
       'S549R(T>G)', 'G551S', 'G551D', 'D579G', 'E831X', '2789+5G->A', 'S945L',
       'S977F', '3272-26A->G', 'F1052V', 'K1060T', 'A1067T', 'G1069R',
       'R1070Q', 'R1070W', 'F1074L', 'D1152H', '3849+10kbC->T', 'G1244E',
       'S1251N', 'S1255P', 'D1270N', 'G1349D'],
      dtype='object')
CYP2A6:
Index(['Haplotype Set ID', 'Haplotype Set Name', 'rsID', '*1A', '*1B1', '*1B2',
       '*1B3', '*1B4', '*1B5', '*1B6', '*

In [14]:
# input haplotype, return mapping dict
def haplotype_mapping(gene, h_type):
    """Return the annotations of the rows that are filled in for a haplotype.

    The table of ``gene`` is looked up in ``gene_df_T_dict``. A row counts as
    belonging to ``h_type`` when its cell in the ``h_type`` column is not "".

    Parameters
    ----------
    gene : str
        Gene name; must be a member of ``genes``.
    h_type : str
        Haplotype label; must be a column of the gene's table.

    Returns
    -------
    dict
        Maps an annotation name to the list of stripped, non-empty annotation
        values of the selected rows:

        * ``"rsID"`` - values of the ``rsID`` column that contain "rs";
        * ``"nucleotide change"`` - values of a column whose name contains
          "nucleotide" (case-insensitive);
        * the column name itself - for columns whose name contains "NG" or
          "NC" together with "position" (case-insensitive), or contains
          "protein" (case-insensitive).

        Annotations without any selected value are left out. An empty dict is
        returned when ``gene`` is unknown, when ``h_type`` is the gene's
        standard haplotype, or when ``h_type`` is not a column of the table.
    """
    if gene not in genes:
        return {}

    # The standard haplotype of a gene maps to nothing.
    if standard_haplotype_dict.get(gene) == h_type:
        return {}

    table = gene_df_T_dict[gene]

    # Unknown haplotype label for this gene.
    if h_type not in table.columns:
        return {}

    h_type_list = list(table[h_type].values)
    mapping_dict = {}

    # rsID mapping (skipped when the table has no rsID column).
    if "rsID" in table.columns:
        rs_ids = _marked_row_labels(table["rsID"].values, h_type_list, rs_only=True)
        if rs_ids:
            mapping_dict["rsID"] = rs_ids

    for column in table.columns:
        # Column labels come from spreadsheet cells; a non-text label cannot be
        # one of the annotation columns, so skip it instead of failing.
        if not isinstance(column, str):
            continue

        lowered = column.lower()
        is_position = "position" in lowered

        # Output keys this column feeds, in the order they must be written.
        output_keys = []
        if "nucleotide" in lowered:
            output_keys.append("nucleotide change")
        if (is_position and ("NG" in column or "NC" in column)) or "protein" in lowered:
            output_keys.append(column)
        if not output_keys:
            continue

        selected = _marked_row_labels(table[column], h_type_list)
        if selected:
            for output_key in output_keys:
                # Each key gets its own list object.
                mapping_dict[output_key] = list(selected)

    return mapping_dict


def _marked_row_labels(labels, haplotype_cells, rs_only=False):
    """Return the stripped labels of the rows whose haplotype cell is not "".

    ``labels`` and ``haplotype_cells`` are parallel, one entry per table row.
    With ``rs_only`` a label is kept only if it contains "rs"; otherwise it is
    kept only if it is not "".
    """
    kept = []
    for label, cell in zip(labels, haplotype_cells):
        if cell == "":
            continue
        wanted = ("rs" in label) if rs_only else (label != "")
        if wanted:
            kept.append(label.strip())
    return kept
            
    
# Spot-check haplotype_mapping on a handful of gene / haplotype pairs.
for demo_gene, demo_haplotype in (
    ("NUDT15", "*16"),
    ("TPMT", "*2"),
    ("TPMT", "*21"),
    ("CYP2B6", "*17"),
    ("CYP3A4", "*1D"),  # no rsID
    ("G6PD", "Mediterranean, Dallas, Panama‚ Sassari, Cagliari, Birmingham"),
):
    print(haplotype_mapping(demo_gene, demo_haplotype))

{'nucleotide change': ['88C>T'], 'Effect on protein (NP_060753.1)': ['L30V'], 'Position at NC_000013.11 (Homo sapiens chromosome 6, GRCh38.p7)': ['g.48037834C>T'], 'Position at NG_047021.1 (NUDT15 RefSeqGene)': ['g.5268C>T']}
{'rsID': ['rs1800462'], 'nucleotide change': ['238G>C'], 'Effect on protein (NP_000358.1)': ['A80P'], 'Position at NC_000006.12 (Homo sapiens chromosome 6, GRCh38.p2)': ['g.18143724C>G'], 'Position at NG_012137.2 (TPMT RefSeqGene)': ['g.16420G>C']}
{'rsID': ['rs200591577'], 'nucleotide change': ['205C>G'], 'Effect on protein (NP_000358.1)': ['L69V'], 'Position at NC_000006.12 (Homo sapiens chromosome 6, GRCh38.p2)': ['g.18147851G>C'], 'Position at NG_012137.2 (TPMT RefSeqGene)': ['g.12293C>G']}
{'rsID': ['rs33973337', 'rs33980385', 'rs33926104', 'rs34284776'], 'nucleotide change': ['76A>T', '83A>G', '85C>A', '86G>C'], 'Effect on protein (NP_000758.1)': ['T26S', 'D28G', 'p.R29S', 'p.R29P'], 'Position at NC_000019.10 (Homo sapiens chromosome 19, GRCh38.p2)': ['g.409

In [15]:
# Read the selected-variants JSON file into select_dict.
with open("json/select_variants.json", "r") as f:
    select_dict = json.loads(f.read())