## Init Folder

In [6]:
import os
import json
import pandas as pd


In [None]:
# Root folder of the PrimeKG CSV files. It can be overridden with the
# PRIMEKG_DIR environment variable so the notebook runs on other machines;
# the default is the original location, so existing setups are unaffected.
PRIMEKG_DIR = os.environ.get("PRIMEKG_DIR", "/playpen/jesse/drug_repurpose/PrimeKG")

# ---- Nodes: load the table and summarise the node types ----
NODE_FILE = os.path.join(PRIMEKG_DIR, "nodes.csv")
df_node = pd.read_csv(NODE_FILE)

unique_node_types = df_node['node_type'].nunique()
print(f"Number of unique node types: {unique_node_types}")

print("Unique node types:")
print(df_node['node_type'].unique())

print("\nCount of each node type:")
print(df_node['node_type'].value_counts())

# pandas truncates the printed frame, so this shows only the head and tail.
print(df_node)

# Number of nodes per (node_type, node_source) combination.
source_counts = df_node.groupby(['node_type', 'node_source']).size().reset_index(name='count')
print(source_counts)

# ---- Edges: load the table and summarise the relation types ----
EDGE_FILE = os.path.join(PRIMEKG_DIR, "edges.csv")
df_edges = pd.read_csv(EDGE_FILE)

unique_relations = df_edges['relation'].nunique()
print(f"Number of unique relation types: {unique_relations}")

print("\nUnique relation types:")
print(df_edges['relation'].unique())

print("\nRelation counts:")
print(df_edges['relation'].value_counts())

Number of unique node types: 10
Unique node types:
['gene/protein' 'drug' 'effect/phenotype' 'disease' 'biological_process'
 'molecular_function' 'cellular_component' 'exposure' 'pathway' 'anatomy']

Count of each node type:
node_type
biological_process    28642
gene/protein          27671
disease               17080
effect/phenotype      15311
anatomy               14035
molecular_function    11169
drug                   7957
cellular_component     4176
pathway                2516
exposure                818
Name: count, dtype: int64
        node_index       node_id     node_type  \
0                0          9796  gene/protein   
1                1          7918  gene/protein   
2                2          8233  gene/protein   
3                3          4899  gene/protein   
4                4          5297  gene/protein   
...            ...           ...           ...   
129370      129370  R-HSA-936837       pathway   
129371      129371  R-HSA-997272       pathway   
129372   

## Clean all edges (there are some duplicate items -> Indication & Contraindication)
1. Get all indication edges: indication_edges
2. Get all contraindication edges: contraindication_edges


In [8]:
# Peek at the first five edge rows to check the column layout before filtering.
print(df_edges.head(n=5))

          relation display_relation  x_index  y_index
0  protein_protein              ppi        0     8889
1  protein_protein              ppi        1     2798
2  protein_protein              ppi        2     5646
3  protein_protein              ppi        3    11592
4  protein_protein              ppi        4     2122


### Indication

In [9]:
# Value of the `relation` column that identifies the edges kept in this cell.
indication_relation = "indication"

# Select the matching rows and keep only the two endpoint columns; the
# original row labels from df_edges are preserved.
indication_edges = df_edges.loc[
    df_edges['relation'].eq(indication_relation),
    ['x_index', 'y_index'],
]
print(indication_edges)

         x_index  y_index
346730     16687    33577
346731     16687    36035
346764     20297    33577
346765     20297    36035
346768     16693    33577
...          ...      ...
5776153    84333    14471
5776154    27527    16634
5776155    38622    16634
5776156    28673    16634
5776158    39497    17237

[18776 rows x 2 columns]


In [12]:
# Attach the name and type of both endpoints to every indication edge.
# Each left join looks the endpoint up by node_index in df_node; the join key
# copied from df_node is dropped again and the looked-up columns are
# prefixed with the endpoint they describe (x_* / y_*).
merged = (
    indication_edges
    .merge(
        df_node[['node_index', 'node_name', 'node_type']],
        how='left',
        left_on='x_index',
        right_on='node_index',
    )
    .drop(columns=['node_index'])
    .rename(columns={'node_name': 'x_name', 'node_type': 'x_type'})
    .merge(
        df_node[['node_index', 'node_name', 'node_type']],
        how='left',
        left_on='y_index',
        right_on='node_index',
    )
    .drop(columns=['node_index'])
    .rename(columns={'node_name': 'y_name', 'node_type': 'y_type'})
)

def clarify_roles(row):
    """Re-express one edge row as an explicit (drug, disease) record.

    Args:
        row: Mapping/Series with the keys ``x_index``, ``x_name``, ``x_type``,
            ``y_index`` and ``y_name``.

    Returns:
        pd.Series with the entries ``drug_index``, ``drug_name``,
        ``disease_index`` and ``disease_name`` (in that order).

    The x endpoint is the drug when ``x_type`` equals ``'drug'``. In every
    other case the y endpoint is taken as the drug without checking its type.
    """
    # Decide which endpoint plays which role, then read both through one path.
    if row['x_type'] == 'drug':
        drug_side, disease_side = 'x', 'y'
    else:
        drug_side, disease_side = 'y', 'x'

    return pd.Series({
        'drug_index': row[drug_side + '_index'],
        'drug_name': row[drug_side + '_name'],
        'disease_index': row[disease_side + '_index'],
        'disease_name': row[disease_side + '_name'],
    })

# Put every edge into (drug_index, disease_index) order.
# This is the vectorised equivalent of applying clarify_roles row by row:
# where x_type is 'drug' the x endpoint is the drug, otherwise the y endpoint
# is. It avoids building one pd.Series per row and skips the name columns,
# which were discarded immediately afterwards anyway.
x_is_drug = merged['x_type'] == 'drug'
indication_edges_ordered = (
    pd.DataFrame({
        'drug_index': merged['x_index'].where(x_is_drug, merged['y_index']),
        'disease_index': merged['y_index'].where(x_is_drug, merged['x_index']),
    })
    # After ordering, the same pair can occur more than once; keep the first.
    .drop_duplicates()
    .assign(relationship='indication')
)
indication_edges_ordered

Unnamed: 0,drug_index,disease_index,relationship
0,16687,33577,indication
1,16687,36035,indication
2,20297,33577,indication
3,20297,36035,indication
4,16693,33577,indication
...,...,...,...
9383,14471,84333,indication
9384,16634,27527,indication
9385,16634,38622,indication
9386,16634,28673,indication


### Contraindication

In [13]:
# Value of the `relation` column that identifies the edges kept in this cell.
contraindication_relation = "contraindication"

# Select the matching rows and keep only the two endpoint columns; the
# original row labels from df_edges are preserved.
contraindication_edges = df_edges.loc[
    df_edges['relation'].eq(contraindication_relation),
    ['x_index', 'y_index'],
]
print(contraindication_edges)

         x_index  y_index
346728     15193    33577
346729     15193    36035
346732     14483    33577
346733     14483    36035
346734     16476    33577
...          ...      ...
5776145    35751    14251
5776146    35846    20456
5776147    35751    20456
5776148    27446    17286
5776157    84334    18277

[61350 rows x 2 columns]


In [15]:
# Attach the name and type of both endpoints to every contraindication edge.
# Each left join looks the endpoint up by node_index in df_node; the join key
# copied from df_node is dropped again and the looked-up columns are
# prefixed with the endpoint they describe (x_* / y_*).
# Note: this rebinds `merged`, replacing the frame built for indications.
merged = (
    contraindication_edges
    .merge(
        df_node[['node_index', 'node_name', 'node_type']],
        how='left',
        left_on='x_index',
        right_on='node_index',
    )
    .drop(columns=['node_index'])
    .rename(columns={'node_name': 'x_name', 'node_type': 'x_type'})
    .merge(
        df_node[['node_index', 'node_name', 'node_type']],
        how='left',
        left_on='y_index',
        right_on='node_index',
    )
    .drop(columns=['node_index'])
    .rename(columns={'node_name': 'y_name', 'node_type': 'y_type'})
)

def clarify_roles(row):
    """Re-express one edge row as an explicit (drug, disease) record.

    Args:
        row: Mapping/Series with the keys ``x_index``, ``x_name``, ``x_type``,
            ``y_index`` and ``y_name``.

    Returns:
        pd.Series with the entries ``drug_index``, ``drug_name``,
        ``disease_index`` and ``disease_name`` (in that order).

    The x endpoint is the drug when ``x_type`` equals ``'drug'``. In every
    other case the y endpoint is taken as the drug without checking its type.
    """
    # NOTE: this re-defines the function of the same name from the indication
    # cell, with the same behaviour, so this cell can be run on its own.
    if row['x_type'] == 'drug':
        drug_side, disease_side = 'x', 'y'
    else:
        drug_side, disease_side = 'y', 'x'

    return pd.Series({
        'drug_index': row[drug_side + '_index'],
        'drug_name': row[drug_side + '_name'],
        'disease_index': row[disease_side + '_index'],
        'disease_name': row[disease_side + '_name'],
    })

# Put every edge into (drug_index, disease_index) order.
# This is the vectorised equivalent of applying clarify_roles row by row:
# where x_type is 'drug' the x endpoint is the drug, otherwise the y endpoint
# is. It avoids building one pd.Series per row and skips the name columns,
# which were discarded immediately afterwards anyway.
x_is_drug = merged['x_type'] == 'drug'
contraindication_edges_ordered = (
    pd.DataFrame({
        'drug_index': merged['x_index'].where(x_is_drug, merged['y_index']),
        'disease_index': merged['y_index'].where(x_is_drug, merged['x_index']),
    })
    # After ordering, the same pair can occur more than once; keep the first.
    .drop_duplicates()
    .assign(relationship='contraindication')
)
contraindication_edges_ordered

Unnamed: 0,drug_index,disease_index,relationship
0,15193,33577,contraindication
1,15193,36035,contraindication
2,14483,33577,contraindication
3,14483,36035,contraindication
4,16476,33577,contraindication
...,...,...,...
30670,14251,35751,contraindication
30671,20456,35846,contraindication
30672,20456,35751,contraindication
30673,17286,27446,contraindication


### Combine All DATA

In [17]:
# Stack both labelled edge sets, then keep a single row per
# (drug_index, disease_index) pair. Indications are listed first and
# keep='first' is used, so a pair present in both sets keeps the
# 'indication' label and its contraindication row is dropped.
combined_dd_edges = (
    pd.concat(
        [indication_edges_ordered, contraindication_edges_ordered],
        ignore_index=True,
    )
    .drop_duplicates(subset=['drug_index', 'disease_index'], keep='first')
)
combined_dd_edges

Unnamed: 0,drug_index,disease_index,relationship
0,16687,33577,indication
1,16687,36035,indication
2,20297,33577,indication
3,20297,36035,indication
4,16693,33577,indication
...,...,...,...
40057,14251,35846,contraindication
40058,14251,35751,contraindication
40059,20456,35846,contraindication
40060,20456,35751,contraindication
