In [1]:
import os
import sys

import numpy as np
import pandas as pd

In [2]:
# Load the raw knowledge-graph tables. Every column is read as a string so
# identifiers keep their exact textual form.
csv_options = {"dtype": str}
df_node = pd.read_csv("train/node.csv", **csv_options)
df_relation = pd.read_csv("train/relation.csv", **csv_options)
df_triple = pd.read_csv("train/triple.csv", **csv_options)

In [7]:
# Row count of the triple table, plus the same rows as plain Python lists so
# they can be looked up by position when the split files are written.
triple_count = df_triple.shape[0]
triple_list = df_triple.values.tolist()
print("triple count: {}".format(triple_count))

triple count: 262019


In [16]:
# Split the triple row positions into train / valid / test index lists.
#
# The permutation comes from a dedicated, fixed-seed generator so that
# re-running the notebook reproduces exactly the same split (the previous
# unseeded shuffle produced a different split on every run).
SPLIT_SEED = 42
TRAIN_FRACTION = 0.9
VALID_FRACTION = 0.05

# `seed` is a random permutation of 0 .. triple_count - 1.
seed = np.random.RandomState(SPLIT_SEED).permutation(triple_count)

train_count = int(triple_count * TRAIN_FRACTION)
valid_count = int(triple_count * VALID_FRACTION)
valid_end = train_count + valid_count

train_index = seed[:train_count].tolist()
valid_index = seed[train_count:valid_end].tolist()
# The test split takes whatever remains, so every triple lands in exactly one split.
test_index = seed[valid_end:].tolist()

In [17]:
def _write_triples(path, indices):
    """Write the triples selected by `indices` to `path` as tab-separated lines.

    Args:
        path: Destination file; it is created or truncated.
        indices: Iterable of positions into the notebook-level `triple_list`.

    Each output line holds the first three fields of the selected row,
    separated by tabs and terminated by a newline.
    """
    with open(path, "w") as f:
        for index in indices:
            first, second, third = triple_list[index][:3]
            f.write("{}\t{}\t{}\n".format(first, second, third))


# One file per split, each written through the same helper.
_write_triples("train/dglke_train.tsv", train_index)
_write_triples("train/dglke_valid.tsv", valid_index)
_write_triples("train/dglke_test.tsv", test_index)


dglke_train --dataset pgkb --data_path ./train --data_files dglke_train.tsv dglke_valid.tsv dglke_test.tsv --format raw_udd_hrt --model_name TransE_l2 --batch_size 2048 --neg_sample_size 256 --hidden_dim 400 --gamma 12.0 --lr 0.1 --max_step 100000 --log_interval 1000 --batch_size_eval 16 -adv --regularization_coef 1.00E-07 --test --num_thread 1 --num_proc 1 --neg_sample_size_eval 10000 --async_update

In [2]:
# Load the saved entity and relation embedding matrices from the checkpoint
# directory.
entity_embedding_path = "ckpts/TransE_l2_pgkb_5/pgkb_TransE_l2_entity.npy"
relation_embedding_path = "ckpts/TransE_l2_pgkb_5/pgkb_TransE_l2_relation.npy"
node_embedding = np.load(entity_embedding_path)
relation_embedding = np.load(relation_embedding_path)

In [3]:
# Show the shape of the entity matrix first, then the relation matrix.
for embedding_matrix in (node_embedding, relation_embedding):
    print(embedding_matrix.shape)

(81510, 400)
(14, 400)


In [5]:
# Entity index table: a header-less, tab-separated file whose two columns are
# named here as the index ("idx") and the entity id ("entity_id").
entity_index_columns = ["idx", "entity_id"]
df_entity_idx = pd.read_csv(
    "train/entities.tsv",
    sep="\t",
    header=None,
    names=entity_index_columns,
    dtype=str,
)
df_entity_idx = df_entity_idx.fillna("")

# Node attribute table, re-read with missing values replaced by empty strings.
df_node = pd.read_csv("train/node.csv", dtype=str)
df_node = df_node.fillna("")

In [6]:
# Attach node attributes to every indexed entity. This is a left join, so
# entities without a matching node row are kept.
df_entity = df_entity_idx.merge(
    df_node,
    how="left",
    left_on=["entity_id"],
    right_on=["node_id"],
)

In [8]:
# Collect every entity labelled "chemical".
#   drug_id_entity_dict: entity idx -> [entity_id, name, labels]
#   drug_idx_id_dict:    running position (0, 1, 2, ...) -> entity idx
# The running position is the row/column order later used for the similarity
# matrix, so it must follow the row order of df_entity.
chemical_rows = df_entity[df_entity["labels"] == "chemical"]

drug_id_entity_dict = {}
drug_idx_id_dict = {}
chemical_fields = zip(
    chemical_rows["idx"],
    chemical_rows["entity_id"],
    chemical_rows["name"],
    chemical_rows["labels"],
)
for position, (entity_idx, entity_id, entity_name, entity_label) in enumerate(chemical_fields):
    drug_id_entity_dict[entity_idx] = [entity_id, entity_name, entity_label]
    drug_idx_id_dict[position] = entity_idx

# Number of chemical rows visited.
count = len(drug_idx_id_dict)

In [4]:
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
# Number of distinct entity indices labelled "chemical".
len(drug_id_entity_dict)

947

In [10]:
# Pull the embedding row of every chemical, in dictionary insertion order.
# The keys are string indices, hence the int() conversion.
select_list = [node_embedding[int(entity_idx)] for entity_idx in drug_id_entity_dict]
select_embedding = np.array(select_list)

In [11]:
# Pairwise cosine similarity between all selected chemical embeddings
# (square matrix: one row and one column per chemical).
drug_similarity = cosine_similarity(select_embedding)

In [12]:
# Sanity check: the matrix should be square, one row/column per chemical.
drug_similarity.shape

(947, 947)

In [13]:
# For every chemical keep its three most similar *other* chemicals.
#   result_dict: entity idx -> [[neighbour entity idx, similarity], ...]
result_dict = {}
for row_position, similarity_row in enumerate(drug_similarity):
    ranked = sorted(
        ([column_position, score] for column_position, score in enumerate(similarity_row)),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Drop the chemical itself, then translate matrix positions to entity indices.
    neighbours = [
        [drug_idx_id_dict[column_position], score]
        for column_position, score in ranked
        if column_position != row_position
    ]
    result_dict[drug_idx_id_dict[row_position]] = neighbours[:3]

In [14]:
# Flatten result_dict into three parallel lists, one entry per
# (chemical, neighbour) pair: source name, neighbour name, similarity score.
drug_list, drug_match_list, score_match_list = [], [], []
for source_idx, neighbours in result_dict.items():
    source_name = drug_id_entity_dict[source_idx][1]
    for neighbour_idx, similarity in neighbours:
        drug_list.append(source_name)
        drug_match_list.append(drug_id_entity_dict[neighbour_idx][1])
        score_match_list.append(similarity)

In [19]:
# Variant for chemicals: the match table uses the "chemical" /
# "matched_chemical" column names that the gene-link cell reads.
# The frame is only built here; nothing is written to disk.
df_match = pd.DataFrame(dict(
    chemical=drug_list,
    matched_chemical=drug_match_list,
    score=score_match_list,
))

In [89]:
# Variant for drugs: the match table uses the "drug" / "matched_drug" column
# names that the drug-link cell reads.
# NOTE(review): this rebinds df_match. Cells that read the "chemical" /
# "matched_chemical" columns cannot run on this frame, so run only the
# variant that matches the analysis being performed.
# The frame is only built here; nothing is written to disk.
df_match = pd.DataFrame(dict(
    drug=drug_list,
    matched_drug=drug_match_list,
    score=score_match_list,
))

In [21]:
from py2neo import Graph

# Connection settings are taken from the environment so credentials do not
# have to be edited into the notebook. The fallbacks reproduce the values that
# used to be hard-coded, so the cell still runs when nothing is set.
# NOTE(review): drop the password fallback once NEO4J_PASSWORD is configured
# wherever this notebook is run.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j://172.16.227.27:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "123456")

session = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

In [95]:
# Drug link paths for one hand-picked pair of drugs: the first branch returns
# disease nodes both drugs point to, the second returns chemical nodes that
# point to both drugs. `union` merges the two result sets.
link_template = """
match (d1:drug {display: "氟哌啶醇注射液"})-[r]->(n:disease)<-[]-(d2:drug {display: "氟哌啶醇片"})
return d1.display as drug_name_1, n.display as link_node_name, labels(n)[0] as link_node_label, d2.display as drug_name_2
union
match (d1:drug {display: "氟哌啶醇注射液"})<-[]-(n:chemical)-[]->(d2:drug {display: "氟哌啶醇片"})
return d1.display as drug_name_1, n.display as link_node_name, labels(n)[0] as link_node_label, d2.display as drug_name_2
"""

# One DataFrame row per returned link node.
result_list = session.run(link_template).data()
df_link = pd.DataFrame(result_list)

In [103]:
# Count the link nodes per label and store the counts under descriptive keys.
# Labels other than "chemical" and "disease" are ignored.
stat_key_by_label = {
    "chemical": "common_chemical_count",
    "disease": "common_indication_count",
}
label_counts = df_link.groupby(["link_node_label"])["link_node_name"].count().reset_index()

stats_dict = {}
for node_label, link_count in zip(label_counts["link_node_label"], label_counts["link_node_name"]):
    if node_label in stat_key_by_label:
        stats_dict[stat_key_by_label[node_label]] = link_count

In [104]:
# Display the per-label link counts computed for the example pair.
stats_dict

{'common_chemical': 1, 'common_indication': 22}

In [115]:
# Render every shared disease node as a readable
# "drug -> indication <- drug" path string.
disease_rows = df_link[df_link["link_node_label"] == "disease"]
indication_link_list = [
    "{}-[适应症]->{}<-[适应症]-{}".format(first_drug, link_node, second_drug)
    for first_drug, link_node, second_drug in zip(
        disease_rows["drug_name_1"],
        disease_rows["link_node_name"],
        disease_rows["drug_name_2"],
    )
]

In [113]:
# Render every shared chemical node as a readable
# "drug -> chemical <- drug" path string.
chemical_rows = df_link[df_link["link_node_label"] == "chemical"]
chemical_link_list = [
    "{}-[化合物]->{}<-[化合物]-{}".format(first_drug, link_node, second_drug)
    for first_drug, link_node, second_drug in zip(
        chemical_rows["drug_name_1"],
        chemical_rows["link_node_name"],
        chemical_rows["drug_name_2"],
    )
]

In [114]:
# Display the shared-chemical path strings for the example pair.
chemical_link_list

['氟哌啶醇注射液-[化合物]->haloperidol<-[化合物]-氟哌啶醇片']

In [124]:
# Cypher for the nodes two drugs have in common: disease nodes both drugs point
# to, and chemical nodes that point to both drugs. The drug names are supplied
# as query parameters ($d1 / $d2) rather than formatted into the text, so a
# name containing quotes or braces can neither break nor alter the query.
link_template = """
match (d1:drug {display: $d1})-[]->(n:disease)<-[]-(d2:drug {display: $d2})
return d1.display as drug_name_1, n.display as link_node_name, labels(n)[0] as link_node_label, d2.display as drug_name_2
union
match (d1:drug {display: $d1})<-[]-(n:chemical)-[]->(d2:drug {display: $d2})
return d1.display as drug_name_1, n.display as link_node_name, labels(n)[0] as link_node_label, d2.display as drug_name_2
"""

INDICATION_LINK_FORMAT = "{}-[适应症]->{}<-[适应症]-{}"
CHEMICAL_LINK_FORMAT = "{}-[化合物]->{}<-[化合物]-{}"


def _summarize_links(records, label, link_format):
    """Summarize the query records whose link node carries `label`.

    Args:
        records: List of dicts with the keys "drug_name_1", "link_node_name",
            "link_node_label" and "drug_name_2" (one dict per returned row).
        label: Link-node label to select, e.g. "chemical" or "disease".
        link_format: Format string with three positional slots
            (first drug, link node, second drug).

    Returns:
        (count, paths): `count` is the number of selected records that have a
        non-null link-node name; `paths` is one formatted path per selected
        record, joined with newlines. An empty `records` list yields (0, "").
    """
    selected = [record for record in records if record["link_node_label"] == label]
    count = sum(1 for record in selected if record["link_node_name"] is not None)
    paths = [
        link_format.format(record["drug_name_1"], record["link_node_name"], record["drug_name_2"])
        for record in selected
    ]
    return count, "\n".join(paths)


# This cell needs the drug variant of df_match; fail with a clear message
# instead of an unexplained column lookup error.
missing_columns = {"drug", "matched_drug"} - set(df_match.columns)
if missing_columns:
    raise KeyError(
        "df_match lacks column(s) {}; build the drug variant of df_match first".format(
            sorted(missing_columns)
        )
    )

# Parallel to the rows of df_match: one entry per (drug, matched drug) pair.
common_chemical_count_list = []
common_indication_count_list = []
indication_links = []
chemical_links = []

for drug_name, matched_name in zip(df_match["drug"], df_match["matched_drug"]):
    records = session.run(link_template, {"d1": drug_name, "d2": matched_name}).data()

    chemical_count, chemical_paths = _summarize_links(records, "chemical", CHEMICAL_LINK_FORMAT)
    indication_count, indication_paths = _summarize_links(records, "disease", INDICATION_LINK_FORMAT)

    common_chemical_count_list.append(chemical_count)
    common_indication_count_list.append(indication_count)
    indication_links.append(indication_paths)
    chemical_links.append(chemical_paths)

In [128]:
# Put each pair's paths on a single line (" | " instead of newlines) so they
# fit in one CSV cell, then attach counts and paths to the match table.
chemical_links = [paths.replace("\n", " | ") for paths in chemical_links]
indication_links = [paths.replace("\n", " | ") for paths in indication_links]
df_match = df_match.assign(
    common_chemical_count=common_chemical_count_list,
    common_indication_count=common_indication_count_list,
    chemical_link=chemical_links,
    indication_link=indication_links,
)

In [129]:
# Persist the drug matches together with their shared-chemical and
# shared-indication counts and paths (row index is not written).
df_match.to_csv("similar_drug_and_link.csv", index=False)

In [70]:
# Link paths between similar chemicals: gene nodes that point to both
# chemicals of a matched pair.
# The chemical names are supplied as query parameters ($c1 / $c2) rather than
# formatted into the text, so a name containing quotes or braces can neither
# break nor alter the query.
link_template = """
match (c1:chemical {display: $c1})<-[]-(ge:gene)-[]->(c2:chemical {display: $c2})
return c1.display as chemical_name_1, ge.display as gene_name, c2.display as chemical_name_2
"""

# This cell needs the chemical variant of df_match; fail with a clear message
# instead of an unexplained column lookup error.
missing_columns = {"chemical", "matched_chemical"} - set(df_match.columns)
if missing_columns:
    raise KeyError(
        "df_match lacks column(s) {}; build the chemical variant of df_match first".format(
            sorted(missing_columns)
        )
    )

# Parallel to the rows of df_match: one entry per (chemical, matched chemical) pair.
common_gene_count_list = []
gene_links = []

for chemical_name, matched_name in zip(df_match["chemical"], df_match["matched_chemical"]):
    records = session.run(link_template, {"c1": chemical_name, "c2": matched_name}).data()

    # Distinct gene names only; null names are skipped because they cannot be
    # joined into the text column. Sorting makes the written order stable from
    # run to run (plain set iteration order is not).
    shared_genes = sorted(
        {record["gene_name"] for record in records if record["gene_name"] is not None}
    )

    common_gene_count_list.append(len(shared_genes))
    gene_links.append(" | ".join(shared_genes))

In [71]:
# Attach the shared-gene count and the joined gene names to the match table.
df_match = df_match.assign(
    common_gene_count=common_gene_count_list,
    gene_link=gene_links,
)

In [72]:
# Save the chemical matches, then list the drugs whose gene field is empty in
# the processed drug/gene label file.
df_match.to_csv("similar_chemical_and_link.csv", index=False)

df = pd.read_csv("processed/drug_gene_label.csv").fillna("")
has_no_gene = df["gene"] == ""
print(set(df.loc[has_no_gene, "drug"].values))

{'isatuximab-irfc', 'lenalidomide', 'duvelisib', 'vardenafil', 'ibrutinib', 'desloratadine', 'lenvatinib', 'sildenafil', 'crizanlizumab-tmca'}


In [73]:
# Names excluded from the final file (the drugs printed by the empty-gene check).
empty_drug = [
    'isatuximab-irfc', 'lenalidomide', 'duvelisib',
    'vardenafil', 'ibrutinib', 'desloratadine',
    'lenvatinib', 'sildenafil', 'crizanlizumab-tmca',
]

# Drop every pair in which either side is one of those names, then overwrite
# the CSV with the remaining pairs.
involves_empty_drug = (
    df_match["chemical"].isin(empty_drug)
    | df_match["matched_chemical"].isin(empty_drug)
)
df_match[~involves_empty_drug].to_csv("similar_chemical_and_link.csv", index=False)

In [74]:
# Reload the filtered CSV to inspect what was written; missing values are
# shown as empty strings.
pd.read_csv("similar_chemical_and_link.csv").fillna("")

Unnamed: 0,chemical,matched_chemical,score,common_gene_count,gene_link
0,amikacin,paromomycin,0.639231,1,MT-RNR1
1,amikacin,plazomicin,0.628835,1,MT-RNR1
2,amikacin,streptomycin,0.565314,1,MT-RNR1
3,nitrofurantoin,mefloquine,0.610967,1,G6PD
4,nitrofurantoin,phenazopyridine,0.502346,1,G6PD
...,...,...,...,...,...
2809,risdiplam,olaratumab,0.328720,0,
2810,risdiplam,eletriptan,0.313001,0,
2811,acetaminophen; tramadol,olaratumab,0.484772,0,
2812,acetaminophen; tramadol,eletriptan,0.399291,1,CYP2D6
