In [1]:
import pandas as pd
import numpy as np
import os



########## Set the PrimeKG root directory ##########

# Directory that holds the PrimeKG CSV files read by this notebook.
# Export PRIMEKG_ROOT to point somewhere else; the fallback is the original
# hard-coded location, so existing setups keep working unchanged.
root_dir = os.environ.get("PRIMEKG_ROOT", "/playpen/hongxuan/drug_repurpose/PrimeKG")

# The CSV files are opened by bare file name, so the working directory must be
# root_dir before any of them is read.
os.chdir(root_dir)
edges = pd.read_csv("edges.csv")
nodes = pd.read_csv("nodes.csv")

# Lookup tables keyed by node_index.
# Original author's note: the "node id" is the ontology identifier — presumably
# why node_index rather than node_id is used as the key (TODO confirm).
node_id_to_name = dict(zip(nodes['node_index'], nodes['node_name']))
node_id_to_type = dict(zip(nodes['node_index'], nodes['node_type']))
node_id_to_source = dict(zip(nodes['node_index'], nodes['node_source']))


# display_relation label found in edges.csv -> relation name used in this notebook.
direct_relation_map = {
    'indication': 'treats',
    'contraindication': 'contraindicated_for',
    'off-label use': 'treats_off_label',
    'side effect': 'side_effect',
    'synergistic interaction': 'synergistic_interaction',
    'phenotype absent': 'does_not_present',
    'phenotype present': 'presents_with',
}

# Keep only edges whose relation is listed above, then only those that start at
# a drug node and end at a drug, disease or effect/phenotype node.
direct_edges = (
    edges.loc[edges['display_relation'].isin(list(direct_relation_map))]
    .copy()
    .loc[
        lambda kept: kept['x_index'].map(node_id_to_type).eq('drug')
        & kept['y_index'].map(node_id_to_type).isin(['drug', 'disease', 'effect/phenotype'])
    ]
)

# One row per kept edge: the drug (x side), the target (y side) and the relation
# under both its mapped name ('effect') and its original label ('effect_type').
drug_effect_direct = (
    direct_edges[['x_index', 'y_index', 'display_relation']]
    .rename(columns={
        'x_index': 'drug_id',
        'y_index': 'target_id',
        'display_relation': 'effect_type',
    })
    .assign(
        drug_name=lambda rows: rows['drug_id'].map(node_id_to_name),
        target_name=lambda rows: rows['target_id'].map(node_id_to_name),
        target_type=lambda rows: rows['target_id'].map(node_id_to_type),
        effect=lambda rows: rows['effect_type'].map(direct_relation_map),
    )
    .loc[:, [
        'drug_id', 'drug_name', 'target_id', 'target_name',
        'target_type', 'effect', 'effect_type',
    ]]
)


In [2]:
drug_feature = pd.read_csv("drug_features.csv")
disease_feature = pd.read_csv("disease_features.csv")

# Drug feature columns to attach (node_index is the join key).
drug_features_to_merge = [
    'node_index', 'indication', 'mechanism_of_action', 'pharmacodynamics',
    'atc_1', 'atc_2', 'atc_3', 'atc_4', 'description', 'category', 'group'
]

# Disease feature columns to attach (node_index is the join key).
disease_features_to_merge = [
    'node_index', 'mondo_definition', 'umls_description', 'orphanet_clinical_description',
    'mayo_symptoms', 'mayo_causes', 'mayo_risk_factors'
]

# Attach the drug features to every row through the drug's node_index.
drug_effect_direct = drug_effect_direct.merge(
    drug_feature[drug_features_to_merge],
    left_on='drug_id',
    right_on='node_index',
    how='left'
).drop(columns=['node_index'])

# Attach the disease features, only to rows whose target_type is 'disease'.
#
# The feature table is joined straight onto target_id. The previous version
# first built a disease-only frame and then merged it back on
# (drug_id, target_id); that pair is not a unique row key, so a pair appearing
# in m rows was multiplied to m * m rows. Restricting the feature table to the
# ids of disease targets keeps every non-disease row at NaN for these columns.
# NOTE(review): if disease_features.csv holds several rows for one node_index,
# each matching row is still repeated once per feature row, as before —
# confirm whether that is wanted.
is_disease_target = drug_effect_direct['target_type'] == 'disease'
disease_target_ids = drug_effect_direct.loc[is_disease_target, 'target_id'].unique()
drug_effect_direct = drug_effect_direct.merge(
    disease_feature.loc[
        disease_feature['node_index'].isin(disease_target_ids),
        disease_features_to_merge,
    ],
    left_on='target_id',
    right_on='node_index',
    how='left'
).drop(columns=['node_index'])

# Disease-target rows together with their disease features, kept under the
# original name in case later cells still refer to it.
disease_targets = drug_effect_direct[
    drug_effect_direct['target_type'] == 'disease'
].reset_index(drop=True)

# Drug group labels, one-hot encoded: one 0/1 column per label.
group_types = ['approved', 'investigational', 'experimental', 'vet_approved',
               'nutraceutical', 'illicit', 'withdrawn']

# One int8 indicator vector per label, stacked as the columns of a 2-D array.
# The label is matched as a whole word, so 'approved' does not fire on
# 'vet_approved' (the underscore is a word character); missing groups give 0.
result_array = np.column_stack([
    drug_effect_direct['group']
    .str.contains(r'\b' + drug_type + r'\b', regex=True, na=False)
    .astype(np.int8)
    .values
    for drug_type in group_types
])

result_df = pd.DataFrame(
    result_array,
    index=drug_effect_direct.index,
    columns=['drug_' + label for label in group_types],
)

# Append the indicator columns to the right of the existing ones.
drug_effect_direct = pd.concat([drug_effect_direct, result_df], axis=1)

In [3]:
# Separator text looked for in each atc_<n> column; group_atc_cols keeps only
# the part of the sentence that follows it.
# NOTE(review): entry 4 contains a doubled space ("of  is"). It looks like a
# typo, but the displayed output shows the prefix being stripped from
# drug_chemical_and_functional_group, so it appears to match the data as-is —
# confirm against drug_features.csv before "fixing" it.
GROUP_STR = {
    1: ' is anatomically related to ',
    2: ' is in the therapeutic group of ',
    3: ' is pharmacologically related to ',
    4: 'The chemical and functional group of  is '
}

# Name of the column that replaces atc_<n> once it has been converted.
GROUP_NAME = {
    1: 'Anatomical Group',
    2: 'Therapeutic Group',
    3: 'Pharmacological Group',
    4: 'Chemical and Functional Group'
}

def group_category(df):
    """Replace the 'category' column of ``df`` with a cleaned 'drug_category' column.

    For every non-null value, the text up to and including the last
    " is part of " is removed and trailing '.' characters are stripped.
    Null values stay null.

    The frame is modified in place: 'category' is deleted and 'drug_category'
    is appended as the last column. Nothing happens when 'category' is absent.

    Unlike the earlier version, the column is replaced even when every value is
    null, so the resulting column set no longer depends on the data.

    Args:
        df: DataFrame that may contain a 'category' column of strings.

    Returns:
        None.
    """
    if 'category' not in df.columns:
        return

    mask = df['category'].notna()

    if mask.any():
        # A value without the separator splits into a single piece, so taking
        # the last piece leaves it unchanged.
        cleaned = (
            df.loc[mask, 'category']
            .str.split(" is part of ").str[-1]
            .str.rstrip('.')
        )
        del df['category']
        df.loc[mask, 'drug_category'] = cleaned
    else:
        # Nothing to clean: still swap the column so the schema stays stable.
        del df['category']
        df['drug_category'] = pd.Series(np.nan, index=df.index, dtype=object)
def _simplify_atc_label(text):
    """Reduce one cleaned ATC label to its group name(s).

    "X and other Y" becomes "Y"; otherwise the parts joined by " and " are
    re-joined with " ; ". Null values are returned unchanged.
    """
    if pd.isna(text):
        return text
    if " and other " in text:
        return text.split(" and other ")[-1]
    return " ; ".join(part.strip() for part in text.split(" and "))


def group_atc_cols(df, atc_num):
    """Replace column ``atc_<atc_num>`` of ``df`` with its named group column.

    For every non-null value, the text up to and including the last occurrence
    of ``GROUP_STR[atc_num]`` is removed, trailing '.' characters are stripped,
    and the "and other" / "and" special cases are collapsed (see
    ``_simplify_atc_label``). Null values stay null.

    The frame is modified in place: ``atc_<atc_num>`` is deleted and
    ``GROUP_NAME[atc_num]`` is appended as the last column. Nothing happens
    when the source column is absent.

    Unlike the earlier version, the column is replaced even when every value is
    null, so the resulting column set no longer depends on the data.

    Args:
        df: DataFrame that may contain the ``atc_<atc_num>`` column of strings.
        atc_num: ATC level; must be a key of GROUP_NAME and GROUP_STR.

    Returns:
        None.

    Raises:
        KeyError: if ``atc_num`` is not a key of GROUP_NAME / GROUP_STR.
    """
    col_name = f'atc_{atc_num}'
    new_col_name = GROUP_NAME[atc_num]
    atc_string = GROUP_STR[atc_num]

    if col_name not in df.columns:
        return

    mask = df[col_name].notna()

    if mask.any():
        # A value without the separator splits into a single piece, so taking
        # the last piece leaves it unchanged.
        labels = (
            df.loc[mask, col_name]
            .str.split(atc_string).str[-1]
            .str.rstrip('.')
            .apply(_simplify_atc_label)
        )
        del df[col_name]
        df.loc[mask, new_col_name] = labels
    else:
        # Nothing to convert: still swap the column so the schema stays stable.
        del df[col_name]
        df[new_col_name] = pd.Series(np.nan, index=df.index, dtype=object)
            
# Convert every ATC level (atc_1 through atc_4), then the category column.
# Both helpers modify drug_effect_direct in place.
for atc_num in (1, 2, 3, 4):
    group_atc_cols(drug_effect_direct, atc_num)

group_category(drug_effect_direct)

In [16]:
# Final column names: current column name -> exported column name.
# Entries that are commented out are disabled, so those columns are not renamed.
column_mapping = {
    # Drug: basic information
    'drug_id': 'drug_id', 
    'drug_name': 'drug_name',  
    
    # Disease / target: basic information
    # 'target_id': 'disease_id',
    # 'target_name': 'disease_name', 
    # 'target_type': 'target_type',  
    
    # Relation information
    # 'effect': 'relation_type',
    'effect_type': 'relation_subtype',
    
    # Drug features - free-text descriptions
    'indication': 'drug_indication',
    'mechanism_of_action': 'drug_mechanism',
    'pharmacodynamics': 'drug_pharmacodynamics',
    'description': 'drug_description',
    # 'group': 'drug_group_raw',
    
    # Drug features - classification
    'Anatomical Group': 'drug_anatomical_group',
    'Therapeutic Group': 'drug_therapeutic_group',
    'Pharmacological Group': 'drug_pharmacological_group',
    'Chemical and Functional Group': 'drug_chemical_and_functional_group',
    'drug_category': 'drug_category',
    
    # Drug features - drug group (one-hot encoded)
    'drug_approved': 'drug_is_approved',
    'drug_investigational': 'drug_is_investigational',
    'drug_experimental': 'drug_is_experimental',
    'drug_vet_approved': 'drug_is_vet_approved',
    'drug_nutraceutical': 'drug_is_nutraceutical',
    'drug_illicit': 'drug_is_illicit',
    'drug_withdrawn': 'drug_is_withdrawn',
    
    # Disease features
    'mondo_definition': 'disease_mondo_definition',
    'umls_description': 'disease_umls_description',
    'orphanet_clinical_description': 'disease_clinical_description',
    'mayo_symptoms': 'disease_symptoms',
    'mayo_causes': 'disease_causes',
    'mayo_risk_factors': 'disease_risk_factors'
}

In [18]:
# Show the current column names of the table.
drug_effect_direct.columns

Index(['drug_id', 'drug_name', 'target_id', 'target_name', 'target_type',
       'effect', 'effect_type', 'indication', 'mechanism_of_action',
       'pharmacodynamics', 'description', 'group', 'mondo_definition',
       'umls_description', 'orphanet_clinical_description', 'mayo_symptoms',
       'mayo_causes', 'mayo_risk_factors', 'drug_approved',
       'drug_investigational', 'drug_experimental', 'drug_vet_approved',
       'drug_nutraceutical', 'drug_illicit', 'drug_withdrawn',
       'Anatomical Group', 'Therapeutic Group', 'Pharmacological Group',
       'Chemical and Functional Group', 'drug_category'],
      dtype='object')

In [19]:
# Drop the two columns that are not exported, then apply the final column names.
# Written as one chained, non-in-place expression with errors='ignore' so the
# cell can be re-run: the in-place drop raised KeyError on a second execution
# because 'effect' and 'group' were already gone. Renaming is a no-op for
# columns that already carry their final name.
drug_effect_direct = (
    drug_effect_direct
    .drop(columns=['effect', 'group'], errors='ignore')
    .rename(columns=column_mapping)
)

In [21]:
# Preview the last five rows of the table.
drug_effect_direct.tail(5)

Unnamed: 0,drug_id,drug_name,target_id,target_name,target_type,relation_subtype,drug_indication,drug_mechanism,drug_pharmacodynamics,drug_description,...,drug_is_experimental,drug_is_vet_approved,drug_is_nutraceutical,drug_is_illicit,drug_is_withdrawn,drug_anatomical_group,drug_therapeutic_group,drug_pharmacological_group,drug_chemical_and_functional_group,drug_category
3262389,15288,Interferon alfa-2b,25235,Ileus,effect/phenotype,side effect,"For the treatment of hairy cell leukemia, mali...",Interferon alpha binds to type I interferon re...,"Upregulates the expression of MHC I proteins, ...",Interferon alpha 2b (human leukocyte clone hif...,...,0,0,0,0,0,antineoplastic ; immunomodulating agents,immunostimulants,immunostimulants,interferons,"Adjuvants, Immunologic ; Alfa Interferons ; An..."
3262390,15288,Interferon alfa-2b,24289,Dysphonia,effect/phenotype,side effect,"For the treatment of hairy cell leukemia, mali...",Interferon alpha binds to type I interferon re...,"Upregulates the expression of MHC I proteins, ...",Interferon alpha 2b (human leukocyte clone hif...,...,0,0,0,0,0,antineoplastic ; immunomodulating agents,immunostimulants,immunostimulants,interferons,"Adjuvants, Immunologic ; Alfa Interferons ; An..."
3262391,15288,Interferon alfa-2b,23834,Coronary artery atherosclerosis,effect/phenotype,side effect,"For the treatment of hairy cell leukemia, mali...",Interferon alpha binds to type I interferon re...,"Upregulates the expression of MHC I proteins, ...",Interferon alpha 2b (human leukocyte clone hif...,...,0,0,0,0,0,antineoplastic ; immunomodulating agents,immunostimulants,immunostimulants,interferons,"Adjuvants, Immunologic ; Alfa Interferons ; An..."
3262392,15288,Interferon alfa-2b,84853,Excessive daytime somnolence,effect/phenotype,side effect,"For the treatment of hairy cell leukemia, mali...",Interferon alpha binds to type I interferon re...,"Upregulates the expression of MHC I proteins, ...",Interferon alpha 2b (human leukocyte clone hif...,...,0,0,0,0,0,antineoplastic ; immunomodulating agents,immunostimulants,immunostimulants,interferons,"Adjuvants, Immunologic ; Alfa Interferons ; An..."
3262393,15288,Interferon alfa-2b,23268,Hypoventilation,effect/phenotype,side effect,"For the treatment of hairy cell leukemia, mali...",Interferon alpha binds to type I interferon re...,"Upregulates the expression of MHC I proteins, ...",Interferon alpha 2b (human leukocyte clone hif...,...,0,0,0,0,0,antineoplastic ; immunomodulating agents,immunostimulants,immunostimulants,interferons,"Adjuvants, Immunologic ; Alfa Interferons ; An..."


In [None]:
# Write the table to CSV without the index column. The path is relative, so the
# file is created in the current working directory (set earlier with
# os.chdir(root_dir)).
drug_effect_direct.to_csv("drug_effect_direct.csv", index=False)