## 生成临床clinical annotation的关系数据

In [2]:
import pandas as pd
import os
import json
import re
os.getcwd()

'D:\\drug KG\\pgkb'

In [3]:
df_clinical_annotation = pd.read_csv('clinical_annotation/clinical_annotations.tsv', sep='\t').fillna("")
df_clinical_annotation = df_clinical_annotation[(df_clinical_annotation["Level of Evidence"] == "1A") | 
                                              (df_clinical_annotation["Level of Evidence"] == "1B")]
df_clinical_annotation.index = range(len(df_clinical_annotation))
print(df_clinical_annotation.shape)
df_clinical_annotation[:5]

(265, 15)


Unnamed: 0,Clinical Annotation ID,Variant/Haplotypes,Gene,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID Count,Evidence Count,Drug(s),Phenotype(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population
0,981755803,rs75527207,CFTR,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
1,1449191690,rs141033578,CFTR,1A,,Rare Variant; Tier 1 VIP,200.0,Efficacy,1,3,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,
2,1449191746,rs78769542,CFTR,1A,,Rare Variant; Tier 1 VIP,200.0,Efficacy,1,3,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,
3,981419266,HLA-B*15:02:01,HLA-B,1A,,Tier 1 VIP,217.25,Toxicity,17,21,phenytoin,drug reaction with eosinophilia and systemic s...,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
4,1451259580,"CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*3, CYP2...",CYP2D6,1A,,Tier 1 VIP,211.375,Toxicity,6,9,amitriptyline,Depressive Disorder,2021-04-23,https://www.pharmgkb.org/clinicalAnnotation/14...,


In [4]:
set(df_clinical_annotation["Phenotype Category"].values)

{'Dosage',
 'Efficacy',
 'Efficacy;Toxicity',
 'Metabolism/PK',
 'Other',
 'Toxicity'}

In [5]:
# use to fix wrong comma split
special_haplotype_name_list = ["G6PD Mediterranean, Dallas, Panama, Sassari, Cagliari, Birmingham"]

dosage_variant_drug_list = []
efficacy_variant_drug_list = []
toxicity_variant_drug_list = []
metabolism_variant_drug_list = []
other_variant_drug_list = []

all_variant_list = []

for index, row in df_clinical_annotation.iterrows():
    variant = row["Variant/Haplotypes"]
    variant_list = []
    for shn in special_haplotype_name_list:
        if shn in variant:
            variant_list.append(shn)
            variant = variant.replace(shn, "")
    variant_list.extend([x.strip() for x in variant.split(",")])
    variant_list = list(filter(lambda x: x!= "", variant_list))
    all_variant_list.extend(variant_list)
    
    level = row["Level of Evidence"]
    phenotype_category = row["Phenotype Category"]
    drug = row["Drug(s)"]
    drug_list = [x.strip() for x in re.split(r";|,|/", drug)]
    
    for v in variant_list:
        for d in drug_list:
            if "Dosage" in phenotype_category:
                    dosage_variant_drug_list.append((v, level, d))
            elif "Efficacy" in phenotype_category:
                    efficacy_variant_drug_list.append((v, level, d))
            elif "Metabolism" in phenotype_category:
                    metabolism_variant_drug_list.append((v, level, d))
            elif "Toxicity" in phenotype_category:
                    toxicity_variant_drug_list.append((v, level, d))
            elif "Other" in phenotype_category:
                    other_variant_drug_list.append((v, level, d))

dosage_variant_drug_list = list(set(dosage_variant_drug_list))
efficacy_variant_drug_list = list(set(efficacy_variant_drug_list))
toxicity_variant_drug_list = list(set(toxicity_variant_drug_list))
metabolism_variant_drug_list = list(set(metabolism_variant_drug_list))
other_variant_drug_list = list(set(other_variant_drug_list))

In [6]:
print(len(dosage_variant_drug_list))
print(len(efficacy_variant_drug_list))
print(len(toxicity_variant_drug_list))
print(len(metabolism_variant_drug_list))
print(len(other_variant_drug_list))

50
107
516
327
15


In [7]:
# all variant / haplotype in clinical data
len(set(all_variant_list))

246

### 过滤一下有哪些Gene出现在了clinical annotation当中

In [8]:
# get gene symbols from variant table
gene_symbol_list = []
for x in set(pd.read_csv('variants/variants.tsv', sep='\t', 
                         error_bad_lines=False).fillna("")["Gene Symbols"].values):
    gene_symbol_list.extend(x.split(","))
gene_symbol_list = list(filter(lambda x: x!= "", set(gene_symbol_list)))
len(gene_symbol_list)

1949

In [9]:
# filter all genes mentioned on clinical annotation, 
# help to reduce workload for generating position <-> haplotype mapping.
mentioned_gene_list = []
for gene in gene_symbol_list:
    for vh in all_variant_list:
        if gene in vh:
            mentioned_gene_list.append(gene)
            break

# mentioned genes:
print(list(set(df_clinical_annotation["Gene"].values)))

['', 'F5', 'CACNA1S', 'UGT1A1', 'CYP2D6', 'IFNL3;IFNL4', 'SLCO1B1', 'MT-ND1;MT-RNR1', 'HLA-A', 'VKORC1', 'HLA-B', 'IFNL4', 'G6PD', 'NUDT15', 'CYP3A4', 'NAT2', 'CYP2A6', 'CFTR', 'TPMT', 'IFNL3', 'DPYD', 'RYR1', 'CYP2C9', 'CYP2B6', 'CYP4F2', 'CYP3A5', 'MT-RNR1', 'EGFR', 'CYP2C19']


In [10]:
# mentioned genes, this one is correct
print(mentioned_gene_list)

['HLA-A', 'CYP2C9', 'CYP2A6', 'G6PD', 'UGT1A1', 'UGT1A', 'CYP3A4', 'NUDT15', 'HLA-B', 'CYP2B6', 'TPMT', 'NAT2', 'CYP3A', 'CYP2C19', 'CYP2D6', 'CYP3A5']


In [11]:
dosage_variant_drug_list[:2]

[('CYP2D6*5', '1A', 'imipramine'), ('NUDT15*3', '1A', 'mercaptopurine')]

In [16]:
df_clinical_drug_variant_annotation = pd.concat([
    pd.DataFrame(dosage_variant_drug_list, 
                 columns=["variant", "evidence", "drug"]),
    pd.DataFrame(efficacy_variant_drug_list,
                 columns=["variant", "evidence", "drug"]),
    pd.DataFrame(toxicity_variant_drug_list,
                 columns=["variant", "evidence", "drug"]),
    pd.DataFrame(metabolism_variant_drug_list,
                 columns=["variant", "evidence", "drug"]),
    pd.DataFrame(other_variant_drug_list,
                 columns=["variant", "evidence", "drug"]),
], axis=0, ignore_index=True)

phenotype_list = ["dosage"] * len(dosage_variant_drug_list) + ["efficacy"] * len(efficacy_variant_drug_list) + \
                 ["toxicity"] * len(toxicity_variant_drug_list) + ["metabolism"] * len(metabolism_variant_drug_list) + \
                 ["other"] * len(other_variant_drug_list)

df_clinical_drug_variant_annotation = df_clinical_drug_variant_annotation.assign(
    phenotype=phenotype_list
).assign(
    data_source=["clinical_annotation"] * len(phenotype_list))

In [17]:
df_clinical_drug_variant_annotation.to_csv("processed/clinical_drug_variant_annotation.csv", index=False)