# Biomedical Knowledge Graph Analysis

This notebook contains exploratory data analysis and visualization for the biomedical knowledge graph thesis project.

## Project Overview
- **Objective**: Build and analyze biomedical knowledge graphs from PubTator data
- **Data Sources**: PubTator XML files, Neo4j graph database
- **Technologies**: Python, MongoDB, Neo4j, OpenAI API

## Notebook Contents
1. Data exploration and preprocessing
2. Entity and relationship analysis  
3. Knowledge graph statistics
4. Visualization experiments

In [None]:
# Install required packages
!pip install pymongo requests pandas plotly networkx neo4j

# Import required libraries
import pandas as pd
import json
import os
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

print("✅ All packages installed and imported successfully!")



Note: you may need to restart the kernel to use updated packages.


Note: you may need to restart the kernel to use updated packages.


In [None]:
import json
import os
import pandas as pd

# Data Loading and Processing Example
# This section demonstrates how to load and process extracted drug data

def load_json_file(file_path):
    """
    Load structured data from a text file that holds JSON or a Python-style literal.

    The content is parsed in three attempts, stopping at the first that succeeds:

    1. strict JSON (``json.loads``) on the unmodified text, so valid JSON that
       contains apostrophes inside strings is never corrupted;
    2. a Python literal (``ast.literal_eval``), which covers single-quoted
       dumps such as ``[{'NAME': 'Clofibrate'}]`` including ``None``/``True``;
    3. the legacy fallback: replace every ``'`` with ``"`` and parse as JSON.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        The parsed object, or ``None`` when the file does not exist or none of
        the parsing attempts succeeds (a diagnostic message is printed).
    """
    import ast  # local import: only needed for the literal fallback

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            file_content = file.read()
    except FileNotFoundError:
        print(f"❌ File not found: {file_path}")
        return None

    try:
        # Attempt 1: the file is already valid JSON.
        data = json.loads(file_content)
    except json.JSONDecodeError:
        try:
            # Attempt 2: Python repr-style content (single quotes, None, True...).
            # literal_eval only evaluates literals, never arbitrary code.
            data = ast.literal_eval(file_content)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            try:
                # Attempt 3: legacy behaviour, kept for backward compatibility.
                data = json.loads(file_content.replace("'", "\""))
            except json.JSONDecodeError as e:
                print(f"❌ Error decoding JSON: {e}")
                print("File content snippet:", file_content[:200])
                return None

    print(f"✅ Successfully loaded data from {file_path}")
    return data

# Example: Load drug properties data
file_path = './labels DRUG properties.txt'

if os.path.exists(file_path):
    data = load_json_file(file_path)

    if data:
        # Extract NAME and drug_id safely; malformed items are reported and skipped.
        extracted_data = []
        for item in data:
            try:
                if 'properties' in item and 'NAME' in item['properties']:
                    properties = item['properties']
                    # NAME is indexed with [0], i.e. treated as a sequence of names;
                    # an empty value falls back to the 'Unknown' placeholder.
                    name = properties['NAME'][0] if properties['NAME'] else 'Unknown'
                    drug_id = properties.get('drug_id', 'Unknown')
                    extracted_data.append({'NAME': name, 'drug_id': drug_id})
            except (KeyError, IndexError, TypeError) as e:
                print(f"⚠️ Skipping malformed item: {e}")

        # Create DataFrame. Columns are named explicitly so the statistics below
        # also work when no item could be extracted (empty frame, no KeyError).
        df = pd.DataFrame(extracted_data, columns=['NAME', 'drug_id'])
        print(f"📊 Created DataFrame with {len(df)} drug entries")
        print("\nFirst 5 entries:")
        print(df.head())

        # Basic statistics
        # Missing ids were filled with the 'Unknown' placeholder above, so a plain
        # notna() would count every row; exclude the placeholder explicitly.
        has_drug_id = df['drug_id'].notna() & df['drug_id'].ne('Unknown')
        print(f"\n📈 Dataset Statistics:")
        print(f"- Total drugs: {len(df)}")
        print(f"- Unique names: {df['NAME'].nunique()}")
        print(f"- Records with drug_id: {has_drug_id.sum()}")
    else:
        print("❌ Failed to load data")
else:
    print(f"⚠️ Sample file not found: {file_path}")
    print("This is expected if running the notebook independently")


[{'NAME': 'Clofibrate', 'drug_id': 'DB00636'}, {'NAME': 'Astemizole', 'drug_id': 'DB00637'}, {'NAME': 'Butoconazole', 'drug_id': 'DB00639'}, {'NAME': 'Adenosine', 'drug_id': 'DB00640'}, {'NAME': 'Simvastatin', 'drug_id': 'DB00641'}, {'NAME': 'Pemetrexed', 'drug_id': 'DB00642'}, {'NAME': 'Mebendazole', 'drug_id': 'DB00643'}, {'NAME': 'Gonadorelin', 'drug_id': 'DB00644'}, {'NAME': 'Dyclonine', 'drug_id': 'DB00645'}, {'NAME': 'Dextropropoxyphene', 'drug_id': 'DB00647'}, {'NAME': 'Mitotane', 'drug_id': 'DB00648'}, {'NAME': 'Stavudine', 'drug_id': 'DB00649'}, {'NAME': 'Leucovorin', 'drug_id': 'DB00650'}, {'NAME': 'Dyphylline', 'drug_id': 'DB00651'}, {'NAME': 'Magnesium_sulfate', 'drug_id': 'DB00653'}, {'NAME': 'Latanoprost', 'drug_id': 'DB00654'}, {'NAME': 'Estrone', 'drug_id': 'DB00655'}, {'NAME': 'Trazodone', 'drug_id': 'DB00656'}, {'NAME': 'Mecamylamine', 'drug_id': 'DB00657'}, {'NAME': 'Sevelamer', 'drug_id': 'DB00658'}, {'NAME': 'Acamprosate', 'drug_id': 'DB00659'}, {'NAME': 'Metaxalon

Unnamed: 0,NAME,drug_id
0,Clofibrate,DB00636
1,Astemizole,DB00637
2,Butoconazole,DB00639
3,Adenosine,DB00640
4,Simvastatin,DB00641
5,Pemetrexed,DB00642
6,Mebendazole,DB00643
7,Gonadorelin,DB00644
8,Dyclonine,DB00645
9,Dextropropoxyphene,DB00647


In [None]:
import requests
import json
import pandas as pd
import time

# Create a new column to store PMIDs (one list of PMIDs per drug)
df['pmids'] = None

# PubTator3 search endpoint; the search text is passed as a query parameter so
# that names containing spaces or reserved characters are URL-encoded correctly.
url = "https://www.ncbi.nlm.nih.gov/research/pubtator3-api/search/"

# Iterate through each row in the DataFrame. `position` is a 1-based counter used
# only for the progress message, so it stays correct for any index labels.
for position, (index, row) in enumerate(df.iterrows(), start=1):
    drug_name = row['NAME']

    print(f"Processing {drug_name} ({position}/{len(df)})")

    try:
        # A timeout keeps one stalled request from hanging the whole cell.
        response = requests.get(url, params={"text": drug_name}, timeout=30)
        response.raise_for_status()  # Raise an exception for HTTP errors

        data = response.json()

        # Extract all pmid values; results without a "pmid" key are skipped.
        pmids = [result["pmid"] for result in data.get("results", []) if "pmid" in result]
        print(f"Extracted {len(pmids)} PMIDs for {drug_name}")

        # Store PMIDs in the DataFrame
        df.at[index, 'pmids'] = pmids

    except (requests.RequestException, json.JSONDecodeError) as e:
        print(f"Error processing {drug_name}: {e}")
        df.at[index, 'pmids'] = []

    finally:
        # Pause after every request, including failed ones, to avoid
        # overwhelming the API.
        time.sleep(0.5)


df

# Knowledge Graph Analysis and Statistics

def analyze_knowledge_graph(uri=None, user=None, password=None, database=None):
    """
    Print node, relationship and density statistics for a Neo4j knowledge graph.

    Connection settings are resolved in this order: explicit argument, then the
    environment variable (``NEO4J_URI``, ``NEO4J_USER``, ``NEO4J_PASSWORD``,
    ``NEO4J_DATABASE``), then the notebook's historical local default.

    Args:
        uri: Bolt URI of the Neo4j server.
        user: Neo4j user name.
        password: Neo4j password.
        database: Name of the database to open the session against.

    Returns:
        True when all statistics were printed, False when anything failed
        (missing ``neo4j`` package, connection problem, query error). The
        failure reason is printed rather than raised.
    """
    import os  # local import keeps the function self-contained

    uri = uri or os.environ.get("NEO4J_URI", "bolt://localhost:7690")
    user = user or os.environ.get("NEO4J_USER", "neo4j")
    # NOTE(review): the literal fallback is kept only so existing local setups
    # keep working; prefer supplying NEO4J_PASSWORD instead of relying on it.
    password = password or os.environ.get("NEO4J_PASSWORD", "12345678")
    database = database or os.environ.get("NEO4J_DATABASE", "expansion")

    driver = None
    try:
        from neo4j import GraphDatabase

        driver = GraphDatabase.driver(uri, auth=(user, password))

        with driver.session(database=database) as session:
            print("🔗 Connected to Neo4j Knowledge Graph")

            # Node statistics by label
            node_stats = session.run("""
                MATCH (n)
                RETURN labels(n) as label, count(n) as count
                ORDER BY count DESC
            """)

            print("\n📊 Node Statistics by Type:")
            print("-" * 40)
            for record in node_stats:
                labels = record["label"]
                count = record["count"]
                # A node can carry several labels; show them joined with ':'.
                label_str = ":".join(labels) if labels else "Unlabeled"
                print(f"{label_str:20} {count:>8}")

            # Relationship statistics
            rel_stats = session.run("""
                MATCH ()-[r]->()
                RETURN type(r) as relationship_type, count(r) as count
                ORDER BY count DESC
                LIMIT 15
            """)

            print("\n🔗 Top Relationship Types:")
            print("-" * 50)
            for record in rel_stats:
                rel_type = record["relationship_type"]
                count = record["count"]
                print(f"{rel_type:35} {count:>8}")

            # Total counts
            total_nodes = session.run("MATCH (n) RETURN count(n) as count").single()["count"]
            total_rels = session.run("MATCH ()-[r]->() RETURN count(r) as count").single()["count"]

            print(f"\n🎯 Overall Statistics:")
            print(f"Total Nodes: {total_nodes:,}")
            print(f"Total Relationships: {total_rels:,}")

            # Density = relationships / ordered node pairs. With fewer than two
            # nodes there are no pairs, so avoid dividing by zero.
            possible_edges = total_nodes * (total_nodes - 1)
            if possible_edges > 0:
                print(f"Graph Density: {total_rels/possible_edges:.6f}")
            else:
                print("Graph Density: n/a (fewer than 2 nodes)")

        return True

    except Exception as e:
        print(f"❌ Error connecting to knowledge graph: {e}")
        print("💡 Make sure Neo4j is running and credentials are correct")
        return False

    finally:
        # Release the driver on every path, including failed queries.
        if driver is not None:
            driver.close()

# Run the analysis. analyze_knowledge_graph() catches its own exceptions, prints
# the failure reason and returns False, so this call does not raise when Neo4j
# is unavailable.
print("🧬 Starting Knowledge Graph Analysis...")
analyze_knowledge_graph()

Processing Clofibrate (1/25)
Extracted 10 PMIDs for Clofibrate
Processing Astemizole (2/25)
Extracted 10 PMIDs for Astemizole
Processing Butoconazole (3/25)
Extracted 10 PMIDs for Butoconazole
Processing Adenosine (4/25)
Extracted 10 PMIDs for Adenosine
Processing Simvastatin (5/25)
Extracted 10 PMIDs for Simvastatin
Processing Pemetrexed (6/25)
Extracted 10 PMIDs for Pemetrexed
Processing Mebendazole (7/25)
Extracted 10 PMIDs for Mebendazole
Processing Gonadorelin (8/25)
Extracted 10 PMIDs for Gonadorelin
Processing Dyclonine (9/25)
Extracted 10 PMIDs for Dyclonine
Processing Dextropropoxyphene (10/25)
Extracted 10 PMIDs for Dextropropoxyphene
Processing Mitotane (11/25)
Extracted 10 PMIDs for Mitotane
Processing Stavudine (12/25)
Extracted 10 PMIDs for Stavudine
Processing Leucovorin (13/25)
Extracted 10 PMIDs for Leucovorin
Processing Dyphylline (14/25)
Extracted 10 PMIDs for Dyphylline
Processing Magnesium_sulfate (15/25)
Extracted 0 PMIDs for Magnesium_sulfate
Processing Latanopro

Unnamed: 0,NAME,drug_id,pmids
0,Clofibrate,DB00636,"[39633493, 37441564, 36982395, 35820666, 35281..."
1,Astemizole,DB00637,"[35721972, 33932547, 39929397, 31973216, 32515..."
2,Butoconazole,DB00639,"[36440171, 38499054, 38217387, 36430288, 33990..."
3,Adenosine,DB00640,"[38336455, 38994361, 32012688, 32823628, 36428..."
4,Simvastatin,DB00641,"[33240647, 39313951, 37864028, 34606062, 33824..."
5,Pemetrexed,DB00642,"[39886208, 33420970, 32726929, 33042801, 31935..."
6,Mebendazole,DB00643,"[36135907, 36983553, 33603987, 36830117, 34327..."
7,Gonadorelin,DB00644,"[34215488, 32368280, 31731249, 33423109, 35067..."
8,Dyclonine,DB00645,"[33671285, 32932603, 38396706, 36499751, 37904..."
9,Dextropropoxyphene,DB00647,"[37864449, 31651743, 32082150, 35611671, 31918..."


In [None]:
import requests
import json
import pandas as pd
import time

# Create a DataFrame holding the single drug to query.
# Replace 'Astemizole' with another drug name if needed.
df = pd.DataFrame({'NAME': ['Astemizole']})

# Create a list to store PMIDs
pmid_list = []

# PubTator3 search endpoint; the search text is passed as a query parameter so
# that names containing spaces or reserved characters are URL-encoded correctly.
url = "https://www.ncbi.nlm.nih.gov/research/pubtator3-api/search/"

# Process the drug(s) in the DataFrame. `position` is a 1-based counter used only
# for the progress message.
for position, (index, row) in enumerate(df.iterrows(), start=1):
    drug_name = row['NAME']

    print(f"Processing {drug_name} ({position}/{len(df)})")

    try:
        # A timeout keeps a stalled request from hanging the cell.
        response = requests.get(url, params={"text": drug_name}, timeout=30)
        response.raise_for_status()  # Raise an exception for HTTP errors

        data = response.json()

        # Extract all pmid values; results without a "pmid" key are skipped.
        pmids = [result["pmid"] for result in data.get("results", []) if "pmid" in result]
        print(f"Extracted {len(pmids)} PMIDs for {drug_name}")

        # Add PMIDs to the list
        pmid_list.extend(pmids)

    except (requests.RequestException, json.JSONDecodeError) as e:
        print(f"Error processing {drug_name}: {e}")

    finally:
        # Pause after every request, including failed ones, to avoid
        # overwhelming the API.
        time.sleep(0.5)

# Print the list of PMIDs
print("PMID List:", pmid_list)

# Knowledge Graph Visualization Examples

import plotly.graph_objects as go
import plotly.express as px
import networkx as nx

def create_sample_visualization(seed=42):
    """
    Build a small, hard-coded drug/disease/gene/protein network as a Plotly figure.

    Args:
        seed: Seed passed to the spring layout so node positions are the same
            on every run. Pass ``None`` for a fresh random layout.

    Returns:
        A ``plotly.graph_objects.Figure`` with one line trace for the edges and
        one marker+text trace for the nodes.
    """

    # Sample network data
    sample_nodes = [
        {'id': 'Drug1', 'type': 'DRUG', 'name': 'Aspirin'},
        {'id': 'Disease1', 'type': 'DISEASE', 'name': 'Heart Disease'},
        {'id': 'Gene1', 'type': 'GENE', 'name': 'COX1'},
        {'id': 'Protein1', 'type': 'PROTEIN', 'name': 'COX-1'}
    ]

    sample_edges = [
        {'source': 'Drug1', 'target': 'Disease1', 'relation': 'TREATS'},
        {'source': 'Drug1', 'target': 'Protein1', 'relation': 'INHIBITS'},
        {'source': 'Gene1', 'target': 'Protein1', 'relation': 'ENCODES'},
        {'source': 'Protein1', 'target': 'Disease1', 'relation': 'ASSOCIATED_WITH'}
    ]

    # Create NetworkX graph; every node dict is stored as node attributes.
    G = nx.Graph()
    for node in sample_nodes:
        G.add_node(node['id'], **node)
    for edge in sample_edges:
        G.add_edge(edge['source'], edge['target'], relation=edge['relation'])

    # Generate layout (seeded for reproducible figures)
    pos = nx.spring_layout(G, k=3, iterations=50, seed=seed)

    # Collect edge coordinates; the None entries break the line between edges
    # so all edges can be drawn by a single trace.
    edge_x, edge_y = [], []
    for source, target in G.edges():
        x0, y0 = pos[source]
        x1, y1 = pos[target]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=2, color='gray'),
        hoverinfo='none',
        mode='lines'
    )

    # Map each node type to a small stable integer for the colorscale.
    # hash() of a str changes between interpreter runs, so it is not used here.
    node_ids = list(G.nodes())
    node_types = sorted({G.nodes[node]['type'] for node in node_ids})
    type_index = {node_type: i for i, node_type in enumerate(node_types)}

    # Create node trace
    node_trace = go.Scatter(
        x=[pos[node][0] for node in node_ids],
        y=[pos[node][1] for node in node_ids],
        mode='markers+text',
        hoverinfo='text',
        text=[G.nodes[node]['name'] for node in node_ids],
        textposition="middle center",
        marker=dict(
            size=50,
            color=[type_index[G.nodes[node]['type']] for node in node_ids],
            colorscale='viridis',
            line=dict(width=2, color='white')
        )
    )

    # Create figure
    fig = go.Figure(
        data=[edge_trace, node_trace],
        layout=go.Layout(
            # The title font is set through layout.title.font instead of the
            # deprecated `titlefont_size` shorthand.
            title=dict(text='Sample Biomedical Knowledge Graph', font=dict(size=16)),
            showlegend=False,
            hovermode='closest',
            margin=dict(b=20, l=5, r=5, t=40),
            annotations=[
                dict(
                    text="Sample visualization of drug-disease-gene relationships",
                    showarrow=False,
                    xref="paper", yref="paper",
                    x=0.005, y=-0.002,
                    xanchor='left', yanchor='bottom',
                    font=dict(size=12)
                )
            ],
            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
        )
    )

    return fig

# Build the demo figure and render it
print("🎨 Creating sample knowledge graph visualization...")
fig = create_sample_visualization()
fig.show()

# Emit both closing status lines with a single call (one line each)
print(
    "✅ Sample visualization created!",
    "💡 This demonstrates the type of interactive graphs you can create with your data",
    sep="\n",
)

Processing Astemizole (1/1)
Extracted 10 PMIDs for Astemizole
PMID List: [35721972, 33932547, 39929397, 31973216, 32515409, 33450182, 37419511, 34049561, 34413963, 38628794]


In [3]:
import pymongo

# Open the local MongoDB instance and select the PubTator3 collection
client = pymongo.MongoClient("mongodb://localhost:28017/")
db = client["pubtator"]
collection = db["PubTator3"]

# PMIDs to look up, grouped by medication name
pmids = {
    "Astemizole": [
        35721972, 33932547, 39929397, 31973216, 32515409,
        33450182, 37419511, 34049561, 34413963, 38628794,
    ],
}

# For every medication, report which of its PMIDs are present in the collection
for medication, ids in pmids.items():
    print(f"Checking PMIDs for {medication}:")

    # Documents are looked up by "_id", built as "<pmid>|None"
    query_ids = [str(pmid) + "|None" for pmid in ids]
    print(f"Querying for PMIDs: {query_ids}")

    # Fetch the matching documents and collect their ids
    result = collection.find({"_id": {"$in": query_ids}})
    existing_pmids = []
    for doc in result:
        existing_pmids.append(doc["_id"])
    print(f"Existing PMIDs: {existing_pmids}")

Checking PMIDs for Astemizole:
Querying for PMIDs: ['35721972|None', '33932547|None', '39929397|None', '31973216|None', '32515409|None', '33450182|None', '37419511|None', '34049561|None', '34413963|None', '38628794|None']
Existing PMIDs: ['31973216|None', '33932547|None', '34049561|None', '37419511|None', '38628794|None']


In [4]:
import pymongo
import json
from pprint import pprint

# Open the local MongoDB instance and select the PubTator3 collection
client = pymongo.MongoClient("mongodb://localhost:28017/")
db = client["pubtator"]
collection = db["PubTator3"]

# PMIDs to retrieve, grouped by medication name
pmids = {
    "Astemizole": [
        35721972, 33932547, 39929397, 31973216, 32515409,
        33450182, 37419511, 34049561, 34413963, 38628794,
    ],
}

# medication name -> list of full documents found for it
medication_data = {}

# Pull the complete document for every PMID that exists in the collection
for medication, ids in pmids.items():
    print(f"\nRetrieving complete information for {medication}:")
    medication_data[medication] = []

    # Documents are looked up by "_id", built as "<pmid>|None"
    query_ids = [str(pmid) + "|None" for pmid in ids]

    # Query the database for complete documents
    result = collection.find({"_id": {"$in": query_ids}})

    # Keep each document and log its id as it arrives
    for doc in result:
        medication_data[medication].append(doc)
        print(f"Retrieved data for PMID: {doc['_id']}")

    print(f"Total documents retrieved for {medication}: {len(medication_data[medication])}")

# Show the first document per medication and dump all of its documents to JSON
for medication, documents in medication_data.items():
    if not documents:
        # Nothing was found for this medication: nothing to show or save
        continue

    print(f"\nSample data for {medication} (first document):")
    pprint(documents[0])

    # default=str serialises any value json cannot encode natively
    output_file = f"{medication}_pubtator_data.json"
    with open(output_file, 'w') as f:
        json.dump(documents, f, default=str, indent=2)
    print(f"Saved all {len(documents)} documents for {medication} to {output_file}")


Retrieving complete information for Astemizole:
Retrieved data for PMID: 31973216|None
Retrieved data for PMID: 33932547|None
Retrieved data for PMID: 34049561|None
Retrieved data for PMID: 37419511|None
Retrieved data for PMID: 38628794|None
Total documents retrieved for Astemizole: 5

Sample data for Astemizole (first document):
{'_id': '31973216|None',
 'id': '31973216',
 'infons': {},
 'passages': [{'annotations': [{'id': '30147',
                                'infons': {'NCBI Homologene': '68242',
                                           'biotype': 'gene',
                                           'database': None,
                                           'identifier': '3756',
                                           'normalized': ['3756'],
                                           'normalized_id': '3756',
                                           'type': 'Gene',
                                           'valid': True},
                                'locations': [{'