In [1]:
import pandas as pd
import os
import numpy as np
from tqdm import tqdm

In [2]:
# Locate the data folder relative to the parent of the current working directory.
# NOTE(review): this assumes the notebook is run from a folder that sits directly
# under the project root -- confirm.
# os.path.join is used instead of hard-coded "\\" separators so the path also
# resolves on non-Windows systems.
DATA_DIR = os.path.join(os.path.dirname(os.getcwd()), "data", "Structured data")

# index=False is forwarded by pandas to the fastparquet reader.
df = pd.read_parquet(os.path.join(DATA_DIR, "final_merged_drugs.parquet"), engine="fastparquet", index=False)

In [3]:
# Function to clean, explode, and create a DataFrame for a specific edge
def explode_and_create_df(col1, col2, edge_name, data=None):
    """Build an edge list linking ``col1`` to every '|'-separated item of ``col2``.

    Rows where either column is null, or where ``col2`` holds the literal text
    'nan' (in any letter case), are discarded. Runs of two or more '|' are
    collapsed into one before splitting, so both '|' and '||' act as separators.
    Empty or whitespace-only values are removed and duplicate edges are dropped.

    Parameters
    ----------
    col1 : str
        Column whose values become the ``source`` of each edge.
    col2 : str
        Column of '|'-separated strings; each item becomes a ``target``.
    edge_name : str
        Label written to the ``edge`` column of every returned row.
    data : pandas.DataFrame, optional
        Frame to read from. When omitted, the notebook-level ``df`` is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``source``, ``target`` and ``edge``. The index keeps the labels
        of the originating rows, so labels may repeat.
    """
    # Fall back to the notebook-level frame so existing three-argument calls keep working.
    frame = df if data is None else data

    # Keep only rows where both endpoints are present.
    pairs = frame[[col1, col2]].dropna(subset=[col1, col2])

    # Discard the literal text 'nan'. Real NaN values are already gone after
    # dropna; a comparison against np.nan is always True and would filter nothing.
    pairs = pairs[pairs[col2].str.lower() != 'nan'].drop_duplicates()

    # Collapse runs of '|' into a single separator, then split into lists of items.
    items = pairs[col2].str.replace(r'\|\|+', '|', regex=True).str.split('|')
    pairs = pairs.assign(**{col2: items})

    # One row per (col1, item) pair.
    pairs = pairs.explode(col1).explode(col2)

    # Remove empty or whitespace-only values left over from leading/trailing separators.
    pairs = pairs[pairs[col1].str.strip() != '']
    pairs = pairs[pairs[col2].str.strip() != '']

    # Exploding can produce the same (col1, item) pair from different rows,
    # so duplicates must be removed again at this point.
    pairs = pairs.drop_duplicates()

    return pairs.rename(columns={col1: 'source', col2: 'target'}).assign(edge=edge_name)


In [4]:
# Function to clean, match, explode, and create a DataFrame
def match_explode_and_create_df(source_col, target_col, edge_name, data=None):
    """Pair the i-th '||'-separated item of ``source_col`` with the i-th item of ``target_col``.

    Rows where either column is null, or where ``target_col`` holds the literal
    text 'nan' (in any letter case), are discarded. Both columns are converted
    to strings, stripped of surrounding whitespace and split on '||'. Items are
    then matched by position within each row.

    Parameters
    ----------
    source_col : str
        Column whose items become the ``source`` of each edge.
    target_col : str
        Column whose items become the ``target`` of each edge.
    edge_name : str
        Label written to the ``edge`` column of every returned row.
    data : pandas.DataFrame, optional
        Frame to read from. When omitted, the notebook-level ``df`` is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``source``, ``target`` and ``edge`` with a fresh integer index.
        The columns are present even when no pairs were produced.
    """
    # Fall back to the notebook-level frame so existing three-argument calls keep working.
    frame = df if data is None else data

    temp_df = frame[[source_col, target_col]].dropna(subset=[source_col, target_col])

    # Discard the literal text 'nan'. Real NaN values are already gone after
    # dropna; a comparison against np.nan is always True and would filter nothing.
    temp_df = temp_df[temp_df[target_col].str.lower() != 'nan'].drop_duplicates()

    # Raw string: '\|' is not a valid escape sequence in a normal string literal.
    # The multi-character pattern is treated as a regex that matches a literal '||'.
    separator = r'\|\|'
    source_lists = temp_df[source_col].astype(str).str.strip().str.split(separator)
    target_lists = temp_df[target_col].astype(str).str.strip().str.split(separator)

    new_rows = []
    for sources, targets in tqdm(zip(source_lists, target_lists), total=len(temp_df)):
        # zip stops at the shorter list, so unmatched trailing items are ignored
        # when the two columns hold a different number of items.
        for source, target in zip(sources, targets):
            new_rows.append({'source': source, 'target': target, 'edge': edge_name})

    # Explicit columns keep the schema stable when new_rows is empty.
    exploded_df = pd.DataFrame(new_rows, columns=['source', 'target', 'edge'])

    # The same pair can be produced by several input rows; keep one edge each.
    return exploded_df.drop_duplicates(ignore_index=True)


In [5]:
def create_genes_df(df, source_col, target_col, edge_name, gene_info_cols=None):
    """Turn each row of ``df`` into one edge from ``source_col`` to ``target_col``.

    Rows where the source or the target is missing are skipped. When
    ``gene_info_cols`` is given, every non-missing value from those columns is
    rendered as ``"<column>:- <value>"`` and the pieces are joined with
    ``"<br>"``; otherwise the info string is empty.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to iterate over.
    source_col : str
        Column providing the ``source`` value.
    target_col : str
        Column providing the ``target`` value.
    edge_name : str
        Label written to the ``edge`` column of every returned row.
    gene_info_cols : list of str, optional
        Columns summarised into the ``Full Gene Name`` field.

    Returns
    -------
    pandas.DataFrame
        Columns ``source``, ``target``, ``edge`` and ``Full Gene Name``
        (no columns at all when no row qualifies).
    """
    records = []

    for _, record in df.iterrows():
        start, end = record[source_col], record[target_col]

        # An edge needs both endpoints; ignore rows missing either one.
        if pd.isna(start) or pd.isna(end):
            continue

        # Join "column:- value" for every info column that has a value in this row.
        details = "<br>".join(
            f"{column}:- {record[column]}"
            for column in (gene_info_cols or ())
            if pd.notna(record[column])
        )

        records.append(
            {
                'source': start,
                'target': end,
                'edge': edge_name,
                'Full Gene Name': details,
            }
        )

    return pd.DataFrame(records)


In [6]:
# Build one edge DataFrame per relationship type.

# Drug -> attribute edges: each '|'-separated value becomes its own edge.
synonyms_df = explode_and_create_df(col1='name', col2='synonym', edge_name='has synonym')
diseases_df = explode_and_create_df(col1='name', col2='Treated Diseases/Conditions', edge_name='used for')
side_effects_df = explode_and_create_df(col1='name', col2='Side effects', edge_name='has side effect')

# Drug -> taxonomy edges.
kingdom_df = explode_and_create_df(col1='name', col2='kingdom', edge_name='Kingdom')
supclass_df = explode_and_create_df(col1='name', col2='subclass', edge_name='Subclass')
superclass_df = explode_and_create_df(col1='name', col2='superclass', edge_name='Superclass')
class_df = explode_and_create_df(col1='name', col2='class', edge_name='Class')

# Drug -> marketed product edges.
marketed_name_df = explode_and_create_df(col1='name', col2='medicine name', edge_name='marketed name')

# Position-matched edges: the i-th source item is paired with the i-th target item.
manufacturer_df = match_explode_and_create_df(
    source_col='medicine name', target_col='manufacturer', edge_name='manufacturer'
)
# Disabled: medicine_source_df = match_explode_and_create_df('medicine name', 'medicine source', 'source')
country_df = match_explode_and_create_df(
    source_col='manufacturer', target_col='Country of manufacture', edge_name='country'
)

# Drug -> gene edges, annotated with the gene detail columns listed here.
gene_info_columns = [
    'Name',
    'GenBank Protein ID',
    'GenBank Gene ID',
    'UniProt ID',
    'Uniprot Title',
    'GenAtlas ID',
]
gene_name_df = create_genes_df(
    df, source_col='name', target_col='Gene Name', edge_name='Targets gene', gene_info_cols=gene_info_columns
)

# Drug -> species edges carry no extra gene details.
species_df = create_genes_df(
    df, source_col='name', target_col='Species', edge_name='Effective in species', gene_info_cols=None
)

3915it [00:00, 15233.56it/s]
3645it [00:00, 15338.72it/s]


In [7]:
# Concatenate all edge DataFrames into a single edge list
knowledge_graph = pd.concat([synonyms_df, diseases_df, side_effects_df, kingdom_df, supclass_df, superclass_df, class_df, marketed_name_df, manufacturer_df, country_df, gene_name_df, species_df])

# Attach the description of each edge's source by matching 'source' against 'name'.
# The lookup is de-duplicated first: a left join against a frame in which the same
# (name, description) pair appears on several rows would repeat every edge once per
# matching row.
descriptions = df[['name', 'description']].drop_duplicates()
knowledge_graph = knowledge_graph.merge(descriptions, left_on='source', right_on='name', how='left')

# Drop the extra 'name' column brought in by the merge
knowledge_graph = knowledge_graph.drop(columns='name').reset_index(drop=True)

# Drop rows where source equals target for 'has synonym' relationship
is_self_synonym = (knowledge_graph['source'] == knowledge_graph['target']) & (knowledge_graph['edge'] == 'has synonym')
knowledge_graph = knowledge_graph[~is_self_synonym].reset_index(drop=True)

# Save the knowledge graph to a parquet file next to the input data.
# os.path.join is used instead of hard-coded "\\" separators so the path also
# resolves on non-Windows systems; the base folder is the parent of the current
# working directory.
output_path = os.path.join(os.path.dirname(os.getcwd()), "data", "Structured data", "knowledge_graph.parquet")
knowledge_graph.to_parquet(output_path, engine="fastparquet", index=False)
