In [7]:
# Install the OpenAI and LangChain libraries.
# - `openai`: client for the OpenAI chat-completion and embedding endpoints called later in this notebook.
# - `langchain`: framework for building applications on large language models (LLMs);
#                this notebook imports its graph QA chain and prompt template utilities.
!pip install -q openai langchain
# Install the LangChain community integrations.
# - `langchain-community`: provides the `langchain_community.graphs.networkx_graph`
#   module that the import cell of this notebook pulls in.
!pip install -q langchain-community

This script initializes the OpenAI API client and defines a function to interact with the GPT model. The get_chat_response function sends a user-provided text input to the GPT model (gpt-3.5-turbo) and returns the model's response.

In [8]:
import os
from openai import OpenAI

# Obtain the API key without storing it in the notebook.
# The key is read from the OPENAI_API_KEY environment variable; if it is not set,
# it is requested interactively (input is not echoed), so it never lands in the file
# or in the saved outputs.
# SECURITY: a literal key used to be embedded here. Any key that has been saved in a
# notebook must be treated as leaked and revoked.
import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# The client picks the key up from the environment variable set above.
client = OpenAI()

def get_chat_response(text):
    """
    Ask the chat model a single-turn question and return the reply text.

    `text` is sent as the only user message to "gpt-3.5-turbo" through the
    module-level `client`; the message content of the first returned choice
    is handed back to the caller.
    """
    user_message = {"role": "user", "content": text}
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[user_message],
    )
    first_choice = response.choices[0]
    return first_choice.message.content


In [None]:
# Install and update required libraries
!pip install -q langchain langchain-community rdflib SPARQLWrapper
!pip install -q langchain-openai  # Updated package for OpenAI integration
!pip install -q openai  # Ensure openai library is installed

# Import libraries
import os
import re
import json
import random
import textwrap
import urllib.parse
import unicodedata
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import networkx as nx

from collections import defaultdict
from google.colab import drive

# LangChain & OpenAI
from langchain.graphs.networkx_graph import NetworkxEntityGraph, KnowledgeTriple
from langchain_community.graphs.networkx_graph import NetworkxEntityGraph as CommunityNetworkxEntityGraph
from langchain.chains import GraphQAChain
from langchain_openai import OpenAI as LangChainOpenAI  # Updated import for LangChain
from openai import OpenAI  # Import for OpenAI client
from langchain.prompts import PromptTemplate  # To define templates for LLM prompts

# Similarity & Distance Metrics
from scipy.spatial.distance import cosine
from scipy.stats import wasserstein_distance, rankdata
from sklearn.metrics.pairwise import cosine_similarity

# Machine Learning Models & Metrics
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    roc_curve,
    auc,
    roc_auc_score
)


In [None]:
# NOTE: rdflib and SPARQLWrapper are already installed by the install/import cell above;
# this cell repeats that install without -q, so pip's output is shown.
!pip install  rdflib  SPARQLWrapper

This cell mounts Google Drive, configures the OpenAI client, and builds a diabetes knowledge graph from PrimeKG data. Triples representing entities (nodes) and their relationships (edges) are loaded from a JSON file, filtered to the diabetes-related ones, and grouped into parts, one part per connected component of the filtered graph. The knowledge graph is then constructed programmatically by adding these triples to a `NetworkxEntityGraph`, which allows for efficient querying and analysis.

In [None]:
# Mount Google Drive so the PrimeKG files under MyDrive are accessible
drive.mount('/content/drive')

# Obtain the API key without storing it in the notebook, then initialize the client.
# The key is read from the OPENAI_API_KEY environment variable; if it is not set,
# it is requested interactively (input is not echoed).
# SECURITY: a literal key used to be embedded here. Any key that has been saved in a
# notebook must be treated as leaked and revoked.
import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])  # Initialize OpenAI client

# Paths: input JSON with the PrimeKG triples, and the folder that receives outputs
file_path = "/content/drive/MyDrive/PrimeKG_Data/final_test.json"
output_dir = "/content/drive/MyDrive/PrimeKG_Data/"

# Embedding cache (normalized text -> vector) and the embedding model name
embedding_cache = {}
EMBEDDING_MODEL = "text-embedding-ada-002"

def get_chat_response(text):
    """Return the chat model's reply to `text`, sent as a single user message."""
    conversation = [{"role": "user", "content": text}]
    reply = client.chat.completions.create(model="gpt-3.5-turbo", messages=conversation)
    return reply.choices[0].message.content

def load_primekg_data(file_path):
    """Read the PrimeKG JSON file at `file_path` and return its parsed content.

    A short status line is printed either way. Any exception raised while
    opening, parsing or measuring the data is reported with print, and an
    empty list is returned in its place.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            entries = json.load(handle)
        print(f"Successfully loaded {len(entries)} entries from {file_path}")
    except Exception as err:
        print(f"Error loading data: {err}")
        entries = []
    return entries


def is_diabetes_related(text):
    """Score how strongly `text` relates to diabetes.

    Matching is a case-insensitive substring search. Returns 2 when any core
    term occurs in the text, otherwise 1 when any secondary term occurs,
    otherwise 0.
    """
    core_diabetes_terms = ('diabetes', 'diabetic', 'insulin', 'glucose', 'type 1 diabetes', 'type 2 diabetes', 't1d', 't2d')
    secondary_terms = ('hyperglycemia', 'hypoglycemia', 'glycemic', 'a1c', 'hemoglobin a1c', 'gestational diabetes', 'prediabetes', 'metabolic syndrome', 'pancreatic', 'islet', 'beta cell', 'metformin', 'glucagon', 'diabetic retinopathy', 'diabetic nephropathy', 'diabetic neuropathy')
    haystack = text.lower()
    if any(term in haystack for term in core_diabetes_terms):
        return 2
    if any(term in haystack for term in secondary_terms):
        return 1
    return 0

def extract_diabetes_knowledge_graph(data, max_parts=10):
    """Build a part-structured diabetes knowledge graph from PrimeKG entries.

    Every entry's 'value' list is scanned for 3-element triples. A triple is
    kept when the summed `is_diabetes_related` score of its URL-decoded
    subject, predicate and object is positive. Kept triples are ordered by
    descending score and linked into an undirected graph; only triples lying
    inside the `max_parts` largest connected components survive. Survivors are
    grouped by component, each group contributes at most 20 triples, and each
    group becomes one "Part N". Parts are printed as they are emitted.

    Returns:
        (kg, portion_indices): `kg` is a list of decoded
        (subject, predicate, object) tuples; `portion_indices` maps "Part N"
        to the `range` of positions that part occupies in `kg`.
    """
    unquote = urllib.parse.unquote

    # Pass 1: decode each well-formed triple once and score it.
    scored = []  # items: (raw predicate, decoded triple, score)
    for entry in data:
        raw_triples = entry.get('value', [])
        if not raw_triples:
            continue
        for raw in raw_triples:
            if len(raw) != 3:
                continue
            raw_subject, raw_predicate, raw_object = raw
            decoded = (unquote(raw_subject), unquote(raw_predicate), unquote(raw_object))
            relevance = sum(is_diabetes_related(term) for term in decoded)
            if relevance > 0:
                scored.append((raw_predicate, decoded, relevance))

    # Highest-scoring triples first; the sort is stable, so ties keep input order.
    ranked = sorted(scored, key=lambda item: item[2], reverse=True)

    # Pass 2: undirected graph over decoded endpoints, used only to find components.
    G = nx.Graph()
    for raw_predicate, (head, _, tail), _ in ranked:
        G.add_edge(head, tail, relation=raw_predicate)

    components_to_use = sorted(nx.connected_components(G), key=len, reverse=True)[:max_parts]
    nodes_to_keep = set().union(*components_to_use)

    # Keep only triples whose two endpoints both lie in a retained component.
    surviving = [
        decoded
        for _, decoded, _ in ranked
        if decoded[0] in nodes_to_keep and decoded[2] in nodes_to_keep
    ]

    print("\nStructured Diabetes Knowledge Graph (Focused on Strong Connections):\n")

    # Group survivors by the first retained component holding both endpoints.
    grouped = defaultdict(list)
    for decoded in surviving:
        head, tail = decoded[0], decoded[2]
        for position, component in enumerate(components_to_use):
            if head in component and tail in component:
                grouped[position].append(decoded)
                break

    # Emit the groups as numbered parts, in the order the groups were first seen.
    kg = []
    portion_indices = {}
    for part_number, members in enumerate(grouped.values(), start=1):
        if part_number > max_parts:
            break
        selected = members[:20]  # Limit to 20 triples per part
        first = len(kg)
        portion_indices[f"Part {part_number}"] = range(first, first + len(selected))
        print(f"\n# Part {part_number}")
        for head, relation, tail in selected:
            print(f"({head}) → ({relation}) → ({tail})")
            kg.append((head, relation, tail))
        print("-" * 80)

    return kg, portion_indices

def save_knowledge_graph(kg, output_file=os.path.join(output_dir, "focused_diabetes_kg_accuracy.txt")):
    """Write every triple of `kg` to `output_file`, one "( s , p , o )" line plus a blank line each.

    The default path is computed once, when this function is defined, from the
    module-level `output_dir`. Any exception raised while writing is printed
    instead of being propagated.
    """
    try:
        with open(output_file, "w", encoding="utf-8") as sink:
            sink.writelines(f"( {row[0]} , {row[1]} , {row[2]} )\n\n" for row in kg)
        print(f"\nFocused Diabetes Knowledge Graph saved as '{output_file}'.")
    except Exception as err:
        print(f"Error saving knowledge graph: {err}")


# Main execution
# Loads the PrimeKG file, extracts the diabetes sub-graph, saves and prints it, and
# builds the `graph` object queried by later cells. Defines the notebook-level names
# `primekg_data`, `kg`, `portion_indices` and `graph`.
if __name__ == "__main__":
    # Load and process PrimeKG data
    primekg_data = load_primekg_data(file_path)
    if not primekg_data:
        # Raise instead of calling exit(): in a notebook kernel exit() shuts the
        # kernel down and discards all state, whereas an exception only stops this cell.
        raise RuntimeError("No data loaded.")
    kg, portion_indices = extract_diabetes_knowledge_graph(primekg_data, max_parts=10)
    if not kg:
        raise RuntimeError("No diabetes-related information found.")
    print(f"\nFound {len(kg)} strongly connected diabetes-related triples in {len(portion_indices)} parts.")
    print("\nPortion Indices:\n")
    for part, index_range in portion_indices.items():
        print(f"{part}: {index_range}")
    save_knowledge_graph(kg)
    print("\nFinal Focused Diabetes Knowledge Graph List:\n")
    for triple in kg:
        print("(", triple[0], ",", triple[1], ", ", triple[2], ")")
    # Distinct subjects and objects appearing in the extracted triples
    print("Original KG node count:", len(set(node for triple in kg for node in (triple[0], triple[2]))))

    # Build graph: one KnowledgeTriple per extracted triple
    graph = NetworkxEntityGraph()
    for node1, relation, node2 in kg:
        graph.add_triple(KnowledgeTriple(node1, relation, node2))

 

In [None]:
# Hand-written part boundaries: "Part N" -> range of triple positions in the kg list.
# Part 1 spans 21 positions (20 original + 1 augmentation); the remaining parts
# hold 3, 1, 4, 2, 1, 1, 2, 2 and 2 triples respectively.
part_indices = {
    f"Part {number}": range(start, stop)
    for number, (start, stop) in enumerate(
        [
            (0, 21), (21, 24), (24, 25), (25, 29), (29, 31),
            (31, 32), (32, 33), (33, 35), (35, 37), (37, 39),
        ],
        start=1,
    )
}
part_names_added = [*part_indices]

Visualizes the knowledge graph as a directed graph using NetworkX and Matplotlib. Nodes represent entities, and edges depict relationships with labels for clarity. The layout uses spring positioning with increased spacing for readability. Custom node colors and labeled edges enhance the visualization, displayed without axes.

In [None]:
def wrap_text(text, max_words=8):
    """Return `text` unchanged unless it is too long to use as a label.

    When `text` has more than `max_words` whitespace-separated words, the
    placeholder "Explanation" is returned instead of the text itself.
    """
    word_count = len(text.split())
    if word_count > max_words:
        return "Explanation"
    return text

def visualize_graph_with_chains(kg, part_indices):
    """
    Draw the knowledge graph twice, side by side: plain, and coloured by chain membership.

    Parameters:
        kg (list of tuples): The knowledge graph as a list of (node1, relation, node2).
        part_indices (dict): Maps each chain name to an iterable of positions into `kg`
                             (for example a `range`) listing the triples of that chain.
    Returns:
        None. The figure is shown and the node-to-chain mapping is printed.

    When a node or an edge appears in several chains, the chain that comes last in
    `part_indices` determines its colour. An index outside `kg` raises IndexError.
    """
    # Create graph
    G = nx.DiGraph()
    for node1, relation, node2 in kg:
        G.add_edge(node1, node2, label=relation)

    # Generate positions for the graph (fixed seed keeps the layout reproducible)
    pos = nx.spring_layout(G, k=8, iterations=100, seed=0)

    # Define color maps
    chain_cmap = mcolors.LinearSegmentedColormap.from_list('chain_colors', ['#66c2a5', '#fc8d62', '#8da0cb', '#e78ac3', '#a6d854'])
    chain_norm = mcolors.Normalize(vmin=0, vmax=len(part_indices) - 1)

    # Step 1: Assign a distinct colour to each chain, in dictionary order
    chain_color_map = {
        chain_name: chain_cmap(chain_norm(i))
        for i, chain_name in enumerate(part_indices)
    }

    # Step 2: Record, per node and per (source, target) pair, the colour of the chain
    # it belongs to. Dictionaries keep every lookup constant-time instead of searching
    # the node list / rescanning all chains for each element.
    node_chain_map = {}
    node_color_by_name = {}
    edge_color_by_pair = {}
    for chain_name, indices in part_indices.items():
        color = chain_color_map[chain_name]
        for idx in indices:
            node1, _, node2 = kg[idx]
            for node in (node1, node2):
                node_chain_map[node] = chain_name
                node_color_by_name[node] = color
            edge_color_by_pair[(node1, node2)] = color

    # Step 3: Materialise colour lists in the graph's own node / edge order;
    # anything not covered by a chain keeps the default colour.
    node_colors = [node_color_by_name.get(node, 'lightblue') for node in G.nodes()]
    edge_colors = [edge_color_by_pair.get(edge, 'gray') for edge in G.edges()]

    # Apply label filtering
    wrapped_labels = {node: wrap_text(node) for node in G.nodes()}

    # Create the figure with subplots
    fig, axs = plt.subplots(1, 2, figsize=(20, 8), dpi=600)

    # Left: Original Knowledge Graph
    nx.draw_networkx_nodes(G, pos, node_color='lightblue', node_size=1200, ax=axs[0])
    nx.draw_networkx_edges(G, pos, edge_color='gray', width=1.2, ax=axs[0])
    nx.draw_networkx_labels(G, pos, labels=wrapped_labels, font_size=6, ax=axs[0])
    edge_labels = nx.get_edge_attributes(G, 'label')
    wrapped_edge_labels = {edge: wrap_text(label) for edge, label in edge_labels.items()}
    nx.draw_networkx_edge_labels(G, pos, edge_labels=wrapped_edge_labels, font_size=6, ax=axs[0])
    axs[0].set_title("Original Knowledge Graph", fontsize=10)
    axs[0].axis('off')

    # Right: Highlighted Nodes Based on Chains
    nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=1200, ax=axs[1], edgecolors='black')
    nx.draw_networkx_edges(G, pos, edge_color=edge_colors, width=1.5, ax=axs[1])
    nx.draw_networkx_labels(G, pos, labels=wrapped_labels, font_size=6, ax=axs[1])
    nx.draw_networkx_edge_labels(G, pos, edge_labels=wrapped_edge_labels, font_size=6, ax=axs[1])
    axs[1].set_title("Graph Highlighted by Chain Membership", fontsize=10)
    axs[1].axis('off')

    # Create a legend for chain colors
    handles = [plt.Line2D([0], [0], marker='o', color=color, markersize=10, linestyle='', label=chain_name)
               for chain_name, color in chain_color_map.items()]
    axs[1].legend(handles=handles, title="Chains", loc='upper right', fontsize=8)

    # Display the plot
    plt.show()

    # Print which nodes belong to which chain
    print("\n--- Node Chain Mapping ---")
    for node, chain in node_chain_map.items():
        print(f"Node '{node}' belongs to chain '{chain}'.")


In [None]:
# Draw the graph coloured by the parts returned by extract_diabetes_knowledge_graph
visualize_graph_with_chains(kg, portion_indices)

Defines a function to perturb the knowledge graph by selectively removing triples belonging to specified parts. This allows testing the impact of missing information on downstream tasks or analysis. The function filters out triples associated with the indices of the parts to be removed and returns the modified knowledge graph.

In [None]:
def perturb_kg_by_removing_parts(kg, parts_to_remove, part_index_map=None):
    """
    Perturbs the knowledge graph by removing triples from the specified parts.

    Parameters:
    - kg: The full knowledge graph triples list
    - parts_to_remove: List of part names to remove
    - part_index_map: Optional mapping of part name -> iterable of positions in `kg`.
      When omitted, the notebook-level `part_indices` dictionary is used, which is
      what this function always read before the parameter existed.

    Returns:
    - perturbed_kg: A new list with the triples of the specified parts left out;
      `kg` itself is not modified.

    Raises:
    - KeyError: if a name in `parts_to_remove` is missing from the mapping.
    """
    if part_index_map is None:
        # Backward-compatible default: fall back to the global mapping.
        part_index_map = part_indices

    # Collect the positions of every triple that belongs to a removed part
    indices_to_remove = set()
    for part in parts_to_remove:
        indices_to_remove.update(part_index_map[part])

    # Keep the triples whose position was not marked for removal
    return [triple for i, triple in enumerate(kg) if i not in indices_to_remove]

This function computes the embedding for a given text using a specified model. It processes the text by removing newline characters and queries the OpenAI embeddings API to generate a vector representation, useful for similarity comparisons and downstream tasks.

In [None]:
# Start from an empty cache and switch the embedding model: this replaces the
# "text-embedding-ada-002" value assigned in the setup cell, so get_embedding
# calls made after this cell use "text-embedding-3-small".
embedding_cache = {}
EMBEDDING_MODEL = "text-embedding-3-small"
def normalize_text(text):
    """
    Canonicalise text so equivalent strings map to the same value.

    Steps, in order: newlines become spaces and both ends are stripped, the
    text is NFKC-normalised, every whitespace run collapses to a single space,
    and the result is lower-cased.
    """
    stripped = text.replace("\n", " ").strip()
    canonical = unicodedata.normalize("NFKC", stripped)
    return re.sub(r"\s+", " ", canonical).lower()

def get_embedding(text):
    """Return the embedding vector for `text`.

    The text is normalised with `normalize_text` first. Results are memoised in
    the module-level `embedding_cache`, keyed by the normalised text, so the
    embeddings endpoint is called only for texts not seen before. The model is
    taken from the module-level `EMBEDDING_MODEL`.
    """
    key = normalize_text(text)
    try:
        return embedding_cache[key]  # cache hit
    except KeyError:
        pass
    response = client.embeddings.create(input=[key], model=EMBEDDING_MODEL)
    vector = response.data[0].embedding
    embedding_cache[key] = vector
    return vector

Defines a function to query a GraphQAChain with a question and temperature setting, returning the answer and its embedding. The function initializes the chain with a specified graph and temperature, processes the question, and computes the embedding for the returned answer, facilitating downstream analysis or comparison

In [None]:
def get_answer_and_embedding(question: str, temp: float, graph):
    """
    Sends a question and temperature to the GraphQAChain and returns the original answer string
    and its embedding as separate outputs.

    Args:
        question (str): The question to ask the chain.
        temp (float): The temperature setting for the OpenAI model.
        graph: The graph object for the GraphQAChain.

    Returns:
        Tuple[str, list]: The original answer as a string and its embedding as a list.
    """
    # Use the LangChain LLM wrapper explicitly. The bare name `OpenAI` is bound to the
    # OpenAI SDK client class by the notebook's import cell and only refers to the
    # LangChain class if another cell re-imports it, so relying on it makes this
    # function depend on cell execution order.
    llm = LangChainOpenAI(temperature=temp)
    chain = GraphQAChain.from_llm(llm, graph=graph, verbose=False)

    # Run the question through the chain and keep the answer in string form
    original_answer_str = str(chain.run(question))

    # Embed the string form, since get_embedding applies string normalisation
    original_answer_embedding = get_embedding(original_answer_str)

    # Return both answer and embedding separately
    return original_answer_str, original_answer_embedding


This function visualizes the explainability of a knowledge graph by displaying the original graph and an enhanced graph with nodes and edges colored based on their importance coefficients. It leverages a directed graph structure, wraps node labels for readability, adjusts node sizes based on connectivity, and applies a custom colormap to represent the significance of graph components. The visualization is presented in a two-panel layout, highlighting both the original structure and the explainability features derived from Simple SMILE GraphRAG analysis. A color bar provides a reference for importance coefficients

In [None]:
def plot_knowledge_graph_explainability(kg, part_indices, coeff):
    """Visualize knowledge graph with explainability features.

    Draws two panels: the plain graph, and the same graph with nodes and edges
    coloured by the importance coefficient of the part they belong to
    (blue = -1, grey = 0, red = +1). The figure is saved under `output_dir`
    as 'knowledge_graph_explainability_improved.png' and then shown.

    Parameters:
        kg: list of (node1, relation, node2) triples.
        part_indices: dict mapping "Part N" to an iterable of positions into `kg`.
            The trailing number N selects coefficient `coeff[N - 1]`.
        coeff: sequence of importance coefficients, one per part.

    Parts whose coefficient index is out of range are reported once and skipped;
    positions beyond the end of `kg` are ignored. When a node or edge appears in
    several parts, the first part in `part_indices` order determines its colour.
    """
    # Create graph
    G = nx.DiGraph()
    for node1, relation, node2 in kg:
        G.add_edge(node1, node2, label=relation)

    # Generate positions for the graph with increased spacing
    pos = nx.spring_layout(G, k=8, iterations=100, seed=0)

    # Create color map (blue for negative, red for positive)
    cmap = mcolors.LinearSegmentedColormap.from_list('red_blue', ['blue', '#d3d3d3', 'red'])
    norm = mcolors.Normalize(vmin=-1, vmax=1)

    # Adjust node size based on degree (number of connections)
    node_sizes = [1500 + 100 * G.degree(node) for node in G.nodes()]

    # Resolve each part's colour once and record it per node and per (source, target)
    # pair. Edges are keyed by their endpoints because the order of G.edges() is not
    # the order of `kg`, so a position in one cannot be used as an index into the other.
    node_color_by_name = {}
    edge_color_by_pair = {}
    for part_name, indices in part_indices.items():
        part_idx = int(part_name.split()[-1]) - 1
        if not part_idx < len(coeff):  # Ensure coeff index is valid
            print(f"Warning: Invalid part_idx {part_idx} for {part_name}, coeff length {len(coeff)}")
            continue
        color = cmap(norm(coeff[part_idx]))
        for i in indices:
            if not i < len(kg):  # Skip positions that do not exist in kg
                continue
            head, _, tail = kg[i]
            # setdefault keeps the colour of the first part that claimed the element
            node_color_by_name.setdefault(head, color)
            node_color_by_name.setdefault(tail, color)
            edge_color_by_pair.setdefault((head, tail), color)

    # Colour lists in the graph's own node / edge order, with defaults for unmatched items
    node_colors = [node_color_by_name.get(node, '#8da0cb') for node in G.nodes()]
    edge_colors = [edge_color_by_pair.get(edge, 'gray') for edge in G.edges()]

    # Wrap the text labels
    wrapped_labels = {node: wrap_text(node) for node in G.nodes()}

    # Create the figure with subplots
    fig, axs = plt.subplots(1, 2, figsize=(24, 10), dpi=600, gridspec_kw={'width_ratios': [1, 1.3]})

    # Left: Original Knowledge Graph
    nx.draw_networkx_nodes(G, pos, node_color='#d3d3d3', node_size=node_sizes, ax=axs[0])
    nx.draw_networkx_edges(G, pos, edge_color='gray', width=1.5, ax=axs[0])
    nx.draw_networkx_labels(G, pos, labels=wrapped_labels, font_size=8, ax=axs[0])
    edge_labels = nx.get_edge_attributes(G, 'label')
    wrapped_edge_labels = {edge: wrap_text(label) for edge, label in edge_labels.items()}
    nx.draw_networkx_edge_labels(G, pos, edge_labels=wrapped_edge_labels, font_size=8, ax=axs[0])
    axs[0].set_title("Original Diabetes Knowledge Graph", fontsize=12)
    axs[0].axis('off')

    # Right: SMILE Explainability with Node and Edge Colors
    nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=node_sizes, ax=axs[1])
    nx.draw_networkx_edges(G, pos, edge_color=edge_colors, width=1.8, ax=axs[1])
    nx.draw_networkx_labels(G, pos, labels=wrapped_labels, font_size=8, ax=axs[1])
    nx.draw_networkx_edge_labels(G, pos, edge_labels=wrapped_edge_labels, font_size=8, ax=axs[1])
    axs[1].set_title("Simple SMILE GraphRAG Explainability", fontsize=12)
    axs[1].axis('off')

    # Show the color bar
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
    sm.set_array([])
    fig.colorbar(sm, ax=axs[1], label='Importance Coefficients')

    # Save the figure
    plt.savefig(os.path.join(output_dir, 'knowledge_graph_explainability_improved.png'), bbox_inches='tight')

    # Display the plot
    plt.show()

Defines the question to query the GraphQAChain or knowledge retrieval system. Here, the question asks about the key biological and medical factors involved in diabetes.

In [None]:
# Question sent to the GraphQAChain by the next code cell
question = "What are the key biological and medical factors involved in diabetes?"
# NOTE(review): "Portion 3" is an unexplained label left by the author — confirm its meaning or remove it.

This snippet sets the temperature parameter to 0 for deterministic response generation and queries the GraphQAChain with the question. The function get_answer_and_embedding returns the original answer as a string along with its embedding. The answer is then printed for review.


In [None]:
# Initialize the LLM; the graph object (instead of the raw kg list) is passed to the chain.
# NOTE: the import below rebinds the notebook-level name `OpenAI` from the OpenAI SDK client
# class (imported in the import cell) to LangChain's LLM wrapper for every cell run after this one.

from langchain_openai import OpenAI
# NOTE(review): `llm` is not referenced by any other cell visible here — confirm it is still needed.
llm = OpenAI(temperature=0, api_key=os.environ["OPENAI_API_KEY"])
temp = 0
# Baseline answer from the unperturbed graph, together with its embedding
original_answer_str, original_answer_embedding = get_answer_and_embedding(question, temp, graph)
print(original_answer_str)

In [None]:
# Baseline perturbation vector: one flag per part, every flag set to 1 (all parts present).
# Built directly as a single row of ten ones, i.e. shape (1, 10).
original = np.ones((1, 10), dtype=int)

This function calculates the importance coefficients for perturbations on a knowledge graph while providing
detailed logging at each iteration. It removes random parts of the knowledge graph, generates perturbed
responses, computes similarities to the original answer, and fits a linear regression model to calculate
coefficients. The function includes:

- Temperature (`temp`) parameter to adjust the behavior of the GraphQAChain.
- Iterative logs showing the removed parts, perturbed responses, and calculated similarities.
- A summary of all similarities and weights after processing.
The coefficients provide insights into the contribution of each part of the knowledge graph to response fidelity.

In [None]:
def calculate_coefficients_print_Temerature(temp, original, kg, part_names, question, original_answer_embedding, original_answer_str, num_perturbations=20):
    """Calculate coefficients for perturbations on the knowledge graph.

    For each perturbation a random, non-empty subset of parts is removed from
    `kg`, the question is answered again on the reduced graph, and the
    Wasserstein distance between the embedding of the new answer and
    `original_answer_embedding` is recorded. A weighted linear regression is
    then fitted from the 0/1 perturbation vectors to the min-max scaled inverse
    distances; its coefficients are returned.

    Parameters:
        temp: temperature passed to the LLM behind the GraphQAChain.
        original: array of shape (1, n_parts) filled with ones (all parts present).
        kg: list of (node1, relation, node2) triples.
        part_names: list of part names; position i corresponds to column i of `original`.
        question: question asked on every perturbed graph.
        original_answer_embedding: embedding of the answer on the full graph.
        original_answer_str: answer on the full graph (used for logging only).
        num_perturbations: number of random perturbations to evaluate (default 20).

    Returns:
        numpy array with the fitted regression coefficients (`LinearRegression.coef_`).

    Raises:
        ValueError: if `num_perturbations` is smaller than 1.
    """
    # Imported locally: the notebook only imports names *from* sklearn submodules,
    # so the bare module name `sklearn` is not defined at notebook level.
    from sklearn.metrics import pairwise_distances

    if num_perturbations < 1:
        raise ValueError("num_perturbations must be at least 1")

    similarities_wd = []
    perturbations_vect2 = []
    epsilon = 1e-6  # keeps the inverse distance finite when a distance is 0

    # The LLM configuration is the same for every perturbation, so build it once.
    llm = LangChainOpenAI(temperature=temp, api_key=os.environ["OPENAI_API_KEY"])

    for i in range(num_perturbations):
        # Draw a random non-empty subset of parts and zero their flags
        perturbation_vector = original.copy().flatten()
        num_parts_to_remove = random.randint(1, len(part_names))
        parts_to_remove_indices = random.sample(range(len(part_names)), num_parts_to_remove)
        for part_idx in parts_to_remove_indices:
            perturbation_vector[part_idx] = 0
        perturbations_vect2.append(perturbation_vector)

        # Rebuild the graph without the removed parts and ask the question again
        parts_to_remove = [part_names[idx] for idx in parts_to_remove_indices]
        perturbed_kg = perturb_kg_by_removing_parts(kg, parts_to_remove)
        graph_temp = NetworkxEntityGraph()
        for (node1, relation, node2) in perturbed_kg:
            graph_temp.add_triple(KnowledgeTriple(node1, relation, node2))
        chain = GraphQAChain.from_llm(llm, graph=graph_temp, verbose=False)
        temp_response = chain.run(question)

        # Distance between the perturbed and the original answer embeddings
        temp_response_embedding = get_embedding(temp_response)
        similarity_wd = wasserstein_distance(original_answer_embedding, temp_response_embedding)
        similarities_wd.append(similarity_wd)
        print(f"Iteration {i + 1}")
        print(f"Parts removed: {parts_to_remove}")
        print(f"Original answer response: {original_answer_str}")
        print(f"Perturbed response: {temp_response}")
        print(f"Wasserstein distance with original answer: {similarity_wd}\n")

    # Sample weights: perturbations closer (cosine distance) to the all-ones vector count more
    perturbations_vect2 = np.array(perturbations_vect2)
    distances = pairwise_distances(perturbations_vect2, original, metric='cosine').ravel()
    kernel_width = 0.25
    weights = np.sqrt(np.exp(-(distances**2) / kernel_width**2))
    print(f"similarities_wd: {similarities_wd}")
    print(f"Weights: {weights}")

    # Regression target: inverse distance, min-max scaled to [0, 1]
    inverse_similarities_wd = [1.0 / (dist + epsilon) for dist in similarities_wd]
    min_value = min(inverse_similarities_wd)
    max_value = max(inverse_similarities_wd)
    if min_value == max_value:
        print("Warning: min_value and max_value are equal. Avoiding division by zero.")
        scaled_inverse_similarities_wd = [1.0 for _ in inverse_similarities_wd]
    else:
        scaled_inverse_similarities_wd = [(value - min_value) / (max_value - min_value) for value in inverse_similarities_wd]

    simpler_model = LinearRegression()
    simpler_model.fit(X=perturbations_vect2, y=scaled_inverse_similarities_wd, sample_weight=weights)
    coeff = simpler_model.coef_
    return coeff

# Accuracy: How well the explanation matches the model's actual decision-making process.

This function computes and visualizes the Receiver Operating Characteristic (ROC) curve for binary classification models.
It calculates the False Positive Rate (FPR), True Positive Rate (TPR), and Area Under the Curve (AUC) to evaluate the
discriminative power of the model. The ROC curve is plotted along with a diagonal line indicating random guessing,
providing an intuitive visual representation of model performance.

In [None]:
def plot_roc_curve(y_true, y_scores):
    """
    Plot the ROC curve for the given labels and scores and report its AUC.

    Parameters:
    - y_true: Ground truth binary labels (array-like).
    - y_scores: Scores to rank the samples by (array-like).

    Returns:
    - roc_auc: The computed Area Under the Curve (AUC) value.
    """
    # Points of the curve; the thresholds returned alongside are not needed
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_scores)
    roc_auc = auc(false_pos_rate, true_pos_rate)
    print(f"AUC: {roc_auc:.3f}")

    # Draw the curve together with the diagonal that represents random guessing
    _, ax = plt.subplots()
    ax.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic (ROC) Curve')
    ax.legend(loc="lower right")
    plt.show()

    return roc_auc


## Accuracy: Evaluate how well the explanation for "What is insulin-like growth factor receptor binding associated with?" (Temp 0) aligns with the ground truth using similarity metrics. The `original_answer_embedding` and `original_answer_str` used here are computed for this question in the cells below.

In [None]:
# Overrides the `question` assigned earlier in the notebook
question = "What is insulin-like growth factor receptor binding associated with?"

This script runs the `calculate_coefficients_print_Temerature` function with a temperature (`temp`) of 0,
which ensures deterministic outputs from the GraphQAChain. It processes the original knowledge graph,
perturbs it, and calculates coefficients through a linear regression model. The function provides detailed
iteration-wise logs, including removed parts, perturbed responses, similarities, and weights, offering a
comprehensive understanding of the model's behavior and the impact of perturbations on the response fidelity.

In [None]:
# Baseline answer (full graph) and its embedding for the current question at temperature 0
temp = 0
original_answer_str, original_answer_embedding = get_answer_and_embedding(question, temp, graph)
print(original_answer_str)

In [None]:
# Part names come from the mapping returned by extract_diabetes_knowledge_graph
part_names = list(portion_indices.keys())
temp= 0
# Fit the weighted linear surrogate on random part removals; returns its coefficients
coeff = calculate_coefficients_print_Temerature(
    temp= temp,
    original=original,
    kg=kg,
    part_names=part_names,
    question=question,
    original_answer_embedding=original_answer_embedding,
    original_answer_str = original_answer_str,
)

In [None]:
print(coeff)
formatted_coeff = np.round(coeff, 16)  # Rounds to 16 decimal places
formatted_coeff

This script uses the `plot_roc_curve` function to compute and visualize the ROC curve for a binary classification problem.
The `y_true` array contains the ground truth labels, while `y_scores` holds the predicted probabilities or scores
(e.g., coefficients). The function calculates the AUC value and plots the ROC curve, providing a quantitative
and visual assessment of model performance. The computed AUC value is also printed for reference.

In [None]:
#  Ground truth (true labels) and the scores to rank against them
y_true = np.array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0]) # Ground truth (0: negative, 1: positive)
y_scores = formatted_coeff # Rounded regression coefficients used as ranking scores (not probabilities)

# Call the function to compute and plot ROC curve
roc_auc_value = plot_roc_curve(y_true, y_scores)
print(f"Computed AUC: {roc_auc_value:.3f}")

This code defines a set of importance coefficients (`coeff`) representing the contribution of each part
of the knowledge graph to response fidelity. Using the `plot_knowledge_graph_explainability` function,
the graph is visualized with nodes and edges color-coded based on these coefficients. The visualization
highlights the most and least influential components, aiding in the interpretability of the knowledge graph.


In [None]:
# Colour the graph by the fitted coefficients.
# NOTE(review): this passes the hand-written `part_indices`, whereas the coefficients were
# fitted with part names taken from `portion_indices` — confirm the two mappings agree.
plot_knowledge_graph_explainability(kg, part_indices, coeff)

Question 1: What is insulin-like growth factor receptor binding associated with?
Answer: Neonatal insulin-dependent diabetes mellitus, pancreatic A cell fate commitment, and pancreatic serous cystadenocarcinoma.
Nodes and Importance:
1.	Neonatal insulin-dependent diabetes mellitus (5): Directly associated via associated_with and ppi, part of the answer.
2.	pancreatic A cell fate commitment (5): Directly associated via associated_with and ppi, part of the answer.
3.	pancreatic serous cystadenocarcinoma (5): Directly associated via associated_with and interacts_with, part of the answer.
4.	glucose binding (2): Related to diabetes context but not directly connected.
5.	diabetic peripheral angiopathy (1): Related to diabetes but not directly connected to the queried node.


# Ordinary Accuracy

In [None]:
def evaluate_ranking(expert_ranks, coeff):
    """Score how well the coefficient ordering agrees with expert importance ranks.

    Nodes are grouped by expert rank, highest rank first. The first group
    "owns" the first ``k`` positions of the model ordering (coefficients sorted
    in descending order, ``k`` being the group size), the next group owns the
    positions that follow, and so on. A node counts as correct when its index
    falls inside the positions owned by its own group, so nodes that share an
    expert rank may appear in any order within their block.

    Args:
        expert_ranks: 1-D array-like of expert importance scores, one per
            node; a larger value means more important.
        coeff: 1-D array-like of model coefficients aligned with
            ``expert_ranks``; a larger value means more important.

    Returns:
        dict with the keys
            "expert_ranks": the expert ranks as a NumPy array,
            "model_ranks": ``len(coeff) - dense_rank + 1`` per node, where the
                largest coefficient has dense rank 1,
            "correct_nodes": boolean array flagging correctly placed nodes,
            "accuracy": fraction of correctly placed nodes (NaN for empty
                input).

    Raises:
        ValueError: if the inputs are not one-dimensional or differ in length.
    """
    # Accept plain lists/tuples as well as arrays; `-coeff` fails on a list.
    expert_ranks = np.asarray(expert_ranks)
    coeff = np.asarray(coeff, dtype=float)

    if expert_ranks.ndim != 1 or coeff.ndim != 1 or expert_ranks.shape != coeff.shape:
        raise ValueError(
            "expert_ranks and coeff must be one-dimensional and of equal length, "
            f"got shapes {expert_ranks.shape} and {coeff.shape}"
        )

    n_nodes = expert_ranks.size
    if n_nodes == 0:
        # Nothing to rank: report an undefined accuracy instead of dividing by zero.
        return {
            "expert_ranks": expert_ranks,
            "model_ranks": np.zeros(0, dtype=int),
            "correct_nodes": np.zeros(0, dtype=bool),
            "accuracy": float("nan"),
        }

    # Dense rank 1 = largest coefficient; flipped so that larger means more important.
    model_ranks = n_nodes - rankdata(-coeff, method='dense') + 1

    # Model ordering, most important first. A stable sort makes tie-breaking
    # deterministic (the lower index wins) instead of depending on the sort backend.
    model_order = np.argsort(-coeff, kind="stable")

    correct_nodes = np.zeros(n_nodes, dtype=bool)
    next_free = 0  # start of the block of model positions not yet handed out

    for er in np.unique(expert_ranks)[::-1]:
        group = np.flatnonzero(expert_ranks == er)
        # Earlier groups consumed a prefix of the ordering, so this group's
        # candidates are simply the next `group.size` positions.
        owned = model_order[next_free:next_free + group.size]
        correct_nodes[group] = np.isin(group, owned)
        next_free += group.size

    accuracy = correct_nodes.sum() / n_nodes

    return {
        "expert_ranks": expert_ranks,
        "model_ranks": model_ranks,
        "correct_nodes": correct_nodes,
        "accuracy": accuracy
    }


In [None]:
# Expert importance scores, one entry per coefficient below (higher = more important).
expert_ranks = np.array([5,2,0,0,0,0,0,0,0,1])
# Coefficients written out as literals rather than read from the computed `coeff`.
# NOTE(review): every entry except the first is ~1e-16, i.e. floating-point noise
# around zero, so the ordering among those entries is probably not meaningful — confirm.
coeff = np.array([
 1.00000000e+00, -1.25267496e-16, -4.10518858e-16, -4.22998089e-16,
 -5.08328256e-16, -3.61100060e-17,  2.78943227e-16,  8.04963084e-16,
 -2.78443998e-16,  3.64695408e-16])


# Compare the coefficient ordering with the expert scores
results = evaluate_ranking(expert_ranks, coeff)

print("Expert ranks:", results["expert_ranks"])
print("Model ranks: ", results["model_ranks"])
print("Correct nodes (relaxed match with repeated ranks):", results["correct_nodes"])
print("Accuracy:", results["accuracy"])


## Accuracy: Assess the explanation for "What is insulin-like growth factor receptor binding associated with?" (Temp 1) by comparing it to the ground truth, accounting for potential variability due to higher randomness.

In [None]:
question = " What is insulin-like growth factor receptor binding associated with?"

In [None]:
temp = 1
original_answer_str, original_answer_embedding = get_answer_and_embedding(question, temp, graph)
print(original_answer_str)

In [None]:
# Compute the importance coefficients for this question at the chosen temperature,
# using the answer text and embedding obtained in the previous cell as the reference.
coeff = calculate_coefficients_print_Temerature(
    temp= temp,
    original=original,
    kg=kg,
    part_names=part_names,
    question=question,
    original_answer_embedding=original_answer_embedding,
    original_answer_str=original_answer_str,
)

In [None]:
# Show the raw coefficients, then keep a rounded copy; the rounded copy is what
# the ROC cell below uses as scores.
print(coeff)
formatted_coeff = np.round(coeff, 3)  # Rounds to 3 decimal places
formatted_coeff

In [None]:
# Ground-truth labels and the scores that are ranked against them
# NOTE(review): the positive label is at index 2 here, whereas the preceding ROC
# cell (apparently Temp 0 for this question) marks index 0, and the expert_ranks
# used for this question also peak at index 0. The ground truth should not depend
# on the temperature — confirm which index is correct.
y_true = np.array([0, 0, 1, 0, 0,0,0,0,0,0])  # Ground truth (0: negative, 1: positive)
y_scores = formatted_coeff # Rounded coefficients used as ranking scores (not probabilities)

# Call the function to compute and plot ROC curve
roc_auc_value = plot_roc_curve(y_true, y_scores)
print(f"Computed AUC: {roc_auc_value:.3f}")

In [None]:
# Expert importance scores and the literal coefficients they are compared with
expert_ranks = np.array([5, 2, 0, 0, 0, 0, 0, 0, 0, 1])
coeff = np.array([
    -0.41909715, -0.12444222, 0.25893775, 0.22546366, 0.05523701,
    0.33234721, -0.21500425, -0.32306153, -0.17291924, 0.73665088,
])

results = evaluate_ranking(expert_ranks, coeff)

# Report every field of the result under its label, in a fixed order
report_layout = (
    ("Expert ranks:", "expert_ranks"),
    ("Model ranks: ", "model_ranks"),
    ("Correct nodes (relaxed match with repeated ranks):", "correct_nodes"),
    ("Accuracy:", "accuracy"),
)
for label, field in report_layout:
    print(label, results[field])


In [None]:
# Call the function
plot_knowledge_graph_explainability(kg, part_indices, coeff)

## Accuracy: Evaluate the explanation for "What is pancreatic serous cystadenocarcinoma associated with?" (Temp 0) by comparing it with the verified ground truth for alignment and correctness.

Question 2: What is pancreatic serous cystadenocarcinoma associated with?
Answer: Insulin-like growth factor receptor binding.
Nodes and Importance:
1.	insulin-like growth factor receptor binding (5): Directly associated via associated_with and interacts_with, part of the answer.
2.	Neonatal insulin-dependent diabetes mellitus (3): Indirectly related via insulin-like growth factor receptor binding.
3.	pancreatic A cell fate commitment (3): Indirectly related via insulin-like growth factor receptor binding.
4.	glucose binding (2): Related to pancreatic context but not directly connected.
5.	diabetic peripheral angiopathy (1): Related to disease context but not directly connected.


In [None]:
question = "What is pancreatic serous cystadenocarcinoma associated with?"

In [None]:
temp = 0
original_answer_str, original_answer_embedding = get_answer_and_embedding(question, temp, graph)
print(original_answer_str)

In [None]:
coeff = calculate_coefficients_print_Temerature(
    temp= temp,
    original=original,
    kg=kg,
    part_names=part_names,
    question=question,
    original_answer_embedding=original_answer_embedding,
    original_answer_str=original_answer_str,
)

In [None]:
print(coeff)
formatted_coeff = np.round(coeff, 3)  # Rounds to 3 decimal places
formatted_coeff

In [None]:
# Ground truth (true labels) and predicted probabilities
y_true = np.array([1, 0, 0, 0, 0,0, 0, 0, 0, 0])  # Ground truth (0: negative, 1: positive)
y_scores = formatted_coeff # Predicted probabilities

# Call the function to compute and plot ROC curve
roc_auc_value = plot_roc_curve(y_true, y_scores)
print(f"Computed AUC: {roc_auc_value:.3f}")

In [None]:
# Compare expert importance scores with the literal coefficients for this question.
# NOTE(review): this expert_ranks array is identical to the one used for Question 1;
# confirm it reflects Question 2's ground truth rather than a copy-paste carry-over.
expert_ranks = np.array([5,2,0,0,0,0,0,0,0,1])
coeff = np.array([ 1.00000000e+00,  4.32460370e-16, -4.65193875e-16,  7.98897405e-16,
  7.39498566e-17, -1.66026081e-16,  2.32506595e-16,  1.58584564e-16,
  1.85803433e-16,  2.48250970e-16])


results = evaluate_ranking(expert_ranks, coeff)

print("Expert ranks:", results["expert_ranks"])
print("Model ranks: ", results["model_ranks"])
print("Correct nodes (relaxed match with repeated ranks):", results["correct_nodes"])
print("Accuracy:", results["accuracy"])


In [None]:
# Call the function
plot_knowledge_graph_explainability(kg, part_indices, coeff)

## Accuracy: Assess the explanation for "What is pancreatic serous cystadenocarcinoma associated with?" (Temp 1) by comparing it with the ground truth, considering increased variability due to higher randomness.

In [None]:
question =  "What is pancreatic serous cystadenocarcinoma associated with?"

In [None]:
temp = 1
original_answer_str, original_answer_embedding = get_answer_and_embedding(question, temp, graph)
print(f"Original answer: {original_answer_str}")

In [None]:
coeff = calculate_coefficients_print_Temerature(
    temp= temp,
    original=original,
    kg=kg,
    part_names=part_names,
    question=question,
    original_answer_embedding=original_answer_embedding,
    original_answer_str=original_answer_str,
)

In [None]:
print(coeff)
formatted_coeff = np.round(coeff, 3)  # Rounds to 3 decimal places
formatted_coeff


In [None]:
#  Ground truth (true labels) and predicted probabilities
y_true = np.array([1, 0, 0, 0, 0,0, 0, 0, 0, 0])  # Ground truth (0: negative, 1: positive)
y_scores = formatted_coeff # Predicted probabilities

# Call the function to compute and plot ROC curve
roc_auc_value = plot_roc_curve(y_true, y_scores)
print(f"Computed AUC: {roc_auc_value:.3f}")

In [None]:
# Compare expert importance scores with the literal coefficients for this question.
# NOTE(review): this expert_ranks array is identical to the one used for Question 1;
# confirm it reflects Question 2's ground truth rather than a copy-paste carry-over.
expert_ranks = np.array([5,2,0,0,0,0,0,0,0,1])
coeff = np.array([ 0.5915153,  -0.23995439,  0.53461353, -0.41884679, -0.47465784, -1.0540477,
 -0.13933652,  0.04903404,  0.17690602,  0.15272251])


results = evaluate_ranking(expert_ranks, coeff)

print("Expert ranks:", results["expert_ranks"])
print("Model ranks: ", results["model_ranks"])
print("Correct nodes (relaxed match with repeated ranks):", results["correct_nodes"])
print("Accuracy:", results["accuracy"])


In [None]:
coeff=np.array([ 0.5915153,  -0.23995439,  0.53461353, -0.41884679, -0.47465784, -1.0540477,
 -0.13933652,  0.04903404,  0.17690602,  0.15272251])

# Call the function
plot_knowledge_graph_explainability(kg, part_indices, coeff)

## Accuracy: Evaluate the explanation for "What is glucagon receptor activity associated with and how it interacts through ppi with low-affinity glucose:proton symporter activity?" (Temp 0) by comparing it to the ground truth for precision and alignment.


In [None]:
question = "What is glucagon receptor activity associated with and how it interacts through ppi with low-affinity glucose:proton symporter activity?"

In [None]:
temp = 0
original_answer_str, original_answer_embedding = get_answer_and_embedding(question, temp, graph)
print(f"Original answer: {original_answer_str}")

In [None]:
coeff = calculate_coefficients_print_Temerature(
    temp= temp,
    original=original,
    kg=kg,
    part_names=part_names,
    question=question,
    original_answer_embedding=original_answer_embedding,
    original_answer_str=original_answer_str,
)

In [None]:
print(coeff)
formatted_coeff = np.round(coeff, 3)  # Rounds to 3 decimal places
formatted_coeff

In [None]:
# Ground truth (true labels) and predicted probabilities
y_true = np.array([1, 0, 0, 0, 0,0, 0, 0, 0, 0])  # Ground truth (0: negative, 1: positive)
y_scores = formatted_coeff # Predicted probabilities

# Call the function to compute and plot ROC curve
roc_auc_value = plot_roc_curve(y_true, y_scores)
print(f"Computed AUC: {roc_auc_value:.3f}")

In [None]:
expert_ranks = np.array([5,3,0,0,0,0,0,0,0,0])
coeff = np.array([ 1.00000000e+00 , 4.15616452e-16, -3.49388780e-17, -9.77839168e-17,
 -4.54138047e-16, -7.10951638e-17,  2.86195152e-16,  1.70500011e-16,
  5.84566019e-17, -1.37756010e-16])


results = evaluate_ranking(expert_ranks, coeff)

print("Expert ranks:", results["expert_ranks"])
print("Model ranks: ", results["model_ranks"])
print("Correct nodes (relaxed match with repeated ranks):", results["correct_nodes"])
print("Accuracy:", results["accuracy"])


In [None]:
coeff = np.array([ 1.00000000e+00 , 4.15616452e-16, -3.49388780e-17, -9.77839168e-17,
 -4.54138047e-16, -7.10951638e-17,  2.86195152e-16,  1.70500011e-16,
  5.84566019e-17, -1.37756010e-16])
# Call the function
plot_knowledge_graph_explainability(kg, part_indices, coeff)

## Accuracy: Assess the explanation for "What is glucagon receptor activity associated with and how it interacts through ppi with low-affinity glucose:proton symporter activity?" (Temp 1) by comparing it with the ground truth, accounting for variability due to higher randomness.


In [None]:
question = "What is glucagon receptor activity associated with and how it interacts through ppi with low-affinity glucose:proton symporter activity?"

In [None]:
temp = 1
original_answer_str, original_answer_embedding = get_answer_and_embedding(question, temp, graph)
print(f"Original answer: {original_answer_str}")

In [None]:
coeff = calculate_coefficients_print_Temerature(
    temp= temp,
    original=original,
    kg=kg,
    part_names=part_names,
    question=question,
    original_answer_embedding=original_answer_embedding,
    original_answer_str=original_answer_str,
)

In [None]:
print(coeff)
formatted_coeff = np.round(coeff, 3)  # Rounds to 3 decimal places
formatted_coeff

In [None]:
# Ground-truth labels and the scores that are ranked against them
# NOTE(review): the positive label is at index 1 here, whereas the Temp 0 cell for
# this same question marks index 0, and the expert_ranks used for this question
# ([5,3,0,...]) also peak at index 0. The ground truth should not depend on the
# temperature — confirm which index is correct.
y_true = np.array([0, 1, 0, 0, 0,0, 0, 0, 0, 0])  # Ground truth (0: negative, 1: positive)
y_scores = formatted_coeff # Rounded coefficients used as ranking scores (not probabilities)
# Call the function to compute and plot ROC curve
roc_auc_value = plot_roc_curve(y_true, y_scores)
print(f"Computed AUC: {roc_auc_value:.3f}")

In [None]:
expert_ranks = np.array([5,3,0,0,0,0,0,0,0,0])
coeff = np.array([ 0.14268646 , 1.25637341 , 0.26232485, -0.41469921, -0.31810483,  0.28340172,
 -0.62466318, -0.00314635, -1.14613428 , 0.39074818])


results = evaluate_ranking(expert_ranks, coeff)

print("Expert ranks:", results["expert_ranks"])
print("Model ranks: ", results["model_ranks"])
print("Correct nodes (relaxed match with repeated ranks):", results["correct_nodes"])
print("Accuracy:", results["accuracy"])


In [None]:
# Overwrite `coeff` with the rounded coefficients from the current run, so the plot
# shows those values rather than the literals evaluated in the previous cell.
coeff= formatted_coeff
# Draw the knowledge graph with its parts colour-coded by these coefficients
plot_knowledge_graph_explainability(kg, part_indices, coeff)

## Accuracy: Evaluate the explanation for "What is glucose binding?" (Temp 0) by comparing it to the ground truth for precision and alignment.


In [None]:
question = "What is glucose binding?"

In [None]:
temp = 0
original_answer_str, original_answer_embedding = get_answer_and_embedding(question, temp, graph)
print(original_answer_str)

In [None]:
coeff = calculate_coefficients_print_Temerature(
    temp= temp,
    original=original,
    kg=kg,
    part_names=part_names,
    question=question,
    original_answer_embedding=original_answer_embedding,
    original_answer_str=original_answer_str,
)

In [None]:
print(coeff)
formatted_coeff = np.round(coeff, 3)  # Rounds to 3 decimal places
formatted_coeff

In [None]:
# Ground truth (true labels) and predicted probabilities
y_true = np.array([0, 1, 0, 0, 0,0, 0, 0, 0, 0])  # Ground truth (0: negative, 1: positive)
y_scores = formatted_coeff # Predicted probabilities

# Call the function to compute and plot ROC curve
roc_auc_value = plot_roc_curve(y_true, y_scores)
print(f"Computed AUC: {roc_auc_value:.3f}")


In [None]:
expert_ranks = np.array([3,5,0,0,0,0,0,0,0,0])
coeff = np.array([-0.71047292 , 0.63579153 , 0.33207805 , 0.1340547 , -0.14763288 , 0.00159955,
  0.09107587 ,-0.16028238 ,-0.34478889 , 0.0487201 ])


results = evaluate_ranking(expert_ranks, coeff)

print("Expert ranks:", results["expert_ranks"])
print("Model ranks: ", results["model_ranks"])
print("Correct nodes (relaxed match with repeated ranks):", results["correct_nodes"])
print("Accuracy:", results["accuracy"])


In [None]:
coeff=formatted_coeff
# Call the function
plot_knowledge_graph_explainability(kg, part_indices, coeff)

## Accuracy: Assess the explanation for "What is glucose binding?" (Temp 1) by comparing it with the ground truth, considering variability due to higher randomness.


In [None]:
question = "What is glucose binding?"

In [None]:
temp = 1
original_answer_str, original_answer_embedding = get_answer_and_embedding(question, temp, graph)
print(f"Original answer: {original_answer_str}")

In [None]:
coeff = calculate_coefficients_print_Temerature(
    temp= temp,
    original=original,
    kg=kg,
    part_names=part_names,
    question=question,
    original_answer_embedding=original_answer_embedding,
    original_answer_str=original_answer_str,
)

In [None]:
print(coeff)
formatted_coeff = np.round(coeff, 3)  # Rounds to 3 decimal places
formatted_coeff

In [None]:
# Ground truth (true labels) and predicted probabilities
y_true = np.array([0, 1, 0, 0,0,0, 0, 0, 0,0])  # Ground truth (0: negative, 1: positive)
y_scores = formatted_coeff # Predicted probabilities

# Call the function to compute and plot ROC curve
roc_auc_value = plot_roc_curve(y_true, y_scores)
print(f"Computed AUC: {roc_auc_value:.3f}")

In [None]:
expert_ranks = np.array([3,5,0,0,0,0,0,0,0,0])
coeff = np.array([ 0.099,  0.359 ,-0.139 ,-0.405, -0.031 , 0.409 , 0.016 ,-0.133 ,-0.015,  0.032])


results = evaluate_ranking(expert_ranks, coeff)

print("Expert ranks:", results["expert_ranks"])
print("Model ranks: ", results["model_ranks"])
print("Correct nodes (relaxed match with repeated ranks):", results["correct_nodes"])
print("Accuracy:", results["accuracy"])


In [None]:
coeff = np.array([ 0.099,  0.359 ,-0.139 ,-0.405, -0.031 , 0.409 , 0.016 ,-0.133 ,-0.015,  0.032])
plot_knowledge_graph_explainability(kg, part_indices, coeff)

## Accuracy: Evaluate the explanation for "What has a synergistic interaction with Flurandrenolide drug?" (Temp 0) by comparing it to the ground truth for precision and alignment.


In [None]:
question = " What has a synergistic interaction with Flurandrenolide drug?"

In [None]:
temp = 0
original_answer_str, original_answer_embedding = get_answer_and_embedding(question, temp, graph)
print(original_answer_str)

In [None]:
temp= 0
coeff = calculate_coefficients_print_Temerature(
    temp= temp,
    original=original,
    kg=kg,
    part_names=part_names,
    question=question,
    original_answer_embedding=original_answer_embedding,
    original_answer_str=original_answer_str,
)

In [None]:
print(coeff)
formatted_coeff = np.round(coeff, 3)  # Rounds to 3 decimal places
formatted_coeff

In [None]:
# Ground truth (true labels) and predicted probabilities
y_true = np.array([0, 0, 1,0,0,0, 0, 0, 0,0])  # Ground truth (0: negative, 1: positive)
y_scores = formatted_coeff # Predicted probabilities
# Call the function to compute and plot ROC curve
roc_auc_value = plot_roc_curve(y_true, y_scores)
print(f"Computed AUC: {roc_auc_value:.3f}")

In [None]:
expert_ranks = np.array([2,0,5,0,0,3,0,0,0,0])
coeff = np.array([-1.69704774e-16,  1.00000000e+00 ,-2.35922393e-16, -1.11022302e-16,
 -3.05311332e-16 , 1.38777878e-16 ,-1.21430643e-16,  2.77555756e-16,
  6.93889390e-17 ,-6.66133815e-16])


results = evaluate_ranking(expert_ranks, coeff)

print("Expert ranks:", results["expert_ranks"])
print("Model ranks: ", results["model_ranks"])
print("Correct nodes (relaxed match with repeated ranks):", results["correct_nodes"])
print("Accuracy:", results["accuracy"])


In [None]:
coeff=formatted_coeff
# Call the function
plot_knowledge_graph_explainability(kg, part_indices, coeff)

## Accuracy: Assess the explanation for "What has a synergistic interaction with Flurandrenolide drug?" (Temp 1) by comparing it with the ground truth, considering variability due to higher randomness.


In [None]:
question = " What has a synergistic interaction with Flurandrenolide drug?"

In [None]:
temp = 1
original_answer_str, original_answer_embedding = get_answer_and_embedding(question, temp, graph)
print(f"Original answer: {original_answer_str}")

In [None]:
coeff = calculate_coefficients_print_Temerature(
    temp= temp,
    original=original,
    kg=kg,
    part_names=part_names,
    question=question,
    original_answer_embedding=original_answer_embedding,
    original_answer_str=original_answer_str,
)

In [None]:
print(coeff)
formatted_coeff = np.round(coeff, 3)  # Rounds to 3 decimal places
formatted_coeff

In [None]:
# Ground truth (true labels) and predicted probabilities
y_true = np.array([0, 0, 1, 0,0,0, 0, 0, 0,0])  # Ground truth (0: negative, 1: positive)
y_scores = formatted_coeff # Predicted probabilities
# Call the function to compute and plot ROC curve
roc_auc_value = plot_roc_curve(y_true, y_scores)
print(f"Computed AUC: {roc_auc_value:.3f}")

In [None]:
expert_ranks = np.array([2,0,5,0,0,3,0,0,0,0])
coeff = np.array([-0.45619093 , 0.80383945,  0.6125232 , -0.47941525 ,-0.25911835 , 0.10919634,
 -0.04772877 ,-0.27216957 , 0.44748652 , 0.0988529 ])


results = evaluate_ranking(expert_ranks, coeff)

print("Expert ranks:", results["expert_ranks"])
print("Model ranks: ", results["model_ranks"])
print("Correct nodes (relaxed match with repeated ranks):", results["correct_nodes"])
print("Accuracy:", results["accuracy"])


In [None]:
coeff=formatted_coeff
# Call the function
plot_knowledge_graph_explainability(kg, part_indices, coeff)

## Accuracy: Evaluate the explanation for "What is glucose 1-phosphate phosphorylation and mitochondrial genome maintenance, how do they interact through ppi?" (Temp 0) by comparing it to the ground truth for precision and alignment.


In [None]:
question = "What is glucose 1-phosphate phosphorylation and mitochondrial genome maintenance, how do they interact through ppi?"

In [None]:
temp = 0
original_answer_str, original_answer_embedding = get_answer_and_embedding(question, temp, graph)
print(original_answer_str)

In [None]:
coeff = calculate_coefficients_print_Temerature(
    temp= temp,
    original=original,
    kg=kg,
    part_names=part_names,
    question=question,
    original_answer_embedding=original_answer_embedding,
    original_answer_str=original_answer_str,
)

In [None]:
print(coeff)
formatted_coeff = np.round(coeff, 3)  # Rounds to 3 decimal places
formatted_coeff

In [None]:
#  Ground truth (true labels) and predicted probabilities
y_true = np.array([0, 0, 0, 1,0,0, 0, 0, 0,0])  # Ground truth (0: negative, 1: positive)
y_scores = formatted_coeff # Predicted probabilities
# Call the function to compute and plot ROC curve
roc_auc_value = plot_roc_curve(y_true, y_scores)
print(f"Computed AUC: {roc_auc_value:.3f}")

In [None]:
expert_ranks = np.array([2,3,0,5,0,0,0,3,0,0])
coeff = np.array([-1.99993048e-04 ,-2.19806295e-04 ,-7.75542612e-04 , 9.99665807e-01,
  7.46909364e-05, -3.12252069e-04 , 1.34243048e-05, -5.66323581e-05,
  2.64236108e-04 ,-3.19075465e-05])


results = evaluate_ranking(expert_ranks, coeff)

print("Expert ranks:", results["expert_ranks"])
print("Model ranks: ", results["model_ranks"])
print("Correct nodes (relaxed match with repeated ranks):", results["correct_nodes"])
print("Accuracy:", results["accuracy"])


In [None]:
coeff = np.array([-1.99993048e-04 ,-2.19806295e-04 ,-7.75542612e-04 , 9.99665807e-01,
  7.46909364e-05, -3.12252069e-04 , 1.34243048e-05, -5.66323581e-05,
  2.64236108e-04 ,-3.19075465e-05])
# Call the function
plot_knowledge_graph_explainability(kg, part_indices, coeff)

## Accuracy: Assess the explanation for "Is there a ppi between glucose 1-phosphate phosphorylation and mitochondrial genome maintenance?" (Temp 1) by comparing it with the ground truth, considering variability due to higher randomness.


In [None]:
question = "Is there a ppi between glucose 1-phosphate phosphorylation and mitochondrial genome maintenance?"

In [None]:
temp = 1
original_answer_str, original_answer_embedding = get_answer_and_embedding(question, temp, graph)
print(f"Original answer: {original_answer_str}")

In [None]:
coeff = calculate_coefficients_print_Temerature(
    temp= temp,
    original=original,
    kg=kg,
    part_names=part_names,
    question=question,
    original_answer_embedding=original_answer_embedding,
    original_answer_str=original_answer_str,
)

In [None]:
print(coeff)
formatted_coeff = np.round(coeff, 3)  # Rounds to 3 decimal places
formatted_coeff

In [None]:
#  Ground truth (true labels) and predicted probabilities
y_true = np.array([0, 0, 0, 1,0,0, 0, 0, 0,0])  # Ground truth (0: negative, 1: positive)
y_scores = formatted_coeff # Predicted probabilities
# Call the function to compute and plot ROC curve
roc_auc_value = plot_roc_curve(y_true, y_scores)
print(f"Computed AUC: {roc_auc_value:.3f}")

In [None]:
expert_ranks = np.array([2,3,0,5,0,0,0,3,0,0])
coeff = np.array([-0.09415032 ,-0.01702873, -0.04560394 , 0.04941769 ,-0.31326786 , 0.08990631,
 -0.1761379,  -0.00540235 , 0.0698538 , -0.09797872])


results = evaluate_ranking(expert_ranks, coeff)

print("Expert ranks:", results["expert_ranks"])
print("Model ranks: ", results["model_ranks"])
print("Correct nodes (relaxed match with repeated ranks):", results["correct_nodes"])
print("Accuracy:", results["accuracy"])


In [None]:
coeff = np.array([-0.09415032 ,-0.01702873, -0.04560394 , 0.04941769 ,-0.31326786 , 0.08990631,
 -0.1761379,  -0.00540235 , 0.0698538 , -0.09797872])

# Call the function
plot_knowledge_graph_explainability(kg, part_indices, coeff)

## Accuracy: Evaluate the explanation for "What is UDP-glucose:glycoprotein glucosyltransferase activity associated with and how it interacts with serotonin:sodium symporter activity through ppi?" (Temp 0) by comparing it to the ground truth for precision and alignment.


In [None]:
question = "What is UDP-glucose:glycoprotein glucosyltransferase activity associated with and how it interacts with serotonin:sodium symporter activity through ppi?"

In [None]:
temp = 0
original_answer_str, original_answer_embedding = get_answer_and_embedding(question, temp, graph)
print(f"Original answer: {original_answer_str}")

In [None]:
coeff = calculate_coefficients_print_Temerature(
    temp= temp,
    original=original,
    kg=kg,
    part_names=part_names,
    question=question,
    original_answer_embedding=original_answer_embedding,
    original_answer_str=original_answer_str,
)

In [None]:
print(coeff)
formatted_coeff = np.round(coeff, 3)  # Rounds to 3 decimal places
formatted_coeff


In [None]:
# Ground truth (true labels) and predicted probabilities
y_true = np.array([0, 0, 0, 0,1,0, 0, 0, 0,0])  # Ground truth (0: negative, 1: positive)
y_scores = formatted_coeff # Predicted probabilities

# Call the function to compute and plot ROC curve
roc_auc_value = plot_roc_curve(y_true, y_scores)
print(f"Computed AUC: {roc_auc_value:.3f}")

In [None]:
expert_ranks = np.array([1,2,0,0,5,0,3,0,0,0])
coeff = np.array([ 0.01952165, -0.17876836, -0.20753339,  0.90628884 , 0.48769493, -0.147138,
  0.2049407 , -0.26831029 ,-0.03702298, -0.06081371])


results = evaluate_ranking(expert_ranks, coeff)

print("Expert ranks:", results["expert_ranks"])
print("Model ranks: ", results["model_ranks"])
print("Correct nodes (relaxed match with repeated ranks):", results["correct_nodes"])
print("Accuracy:", results["accuracy"])


In [None]:
# Call the function
plot_knowledge_graph_explainability(kg, part_indices, coeff)

## Accuracy: Assess the explanation for "What is UDP-glucose:glycoprotein glucosyltransferase activity associated with and how it interacts with serotonin:sodium symporter activity through ppi?" (Temp 1) by comparing it with the ground truth, considering variability due to higher randomness.


In [None]:
question = "What is UDP-glucose:glycoprotein glucosyltransferase activity associated with and how it interacts with serotonin:sodium symporter activity through ppi?"

In [None]:
temp = 1
original_answer_str, original_answer_embedding = get_answer_and_embedding(question, temp, graph)
print(f"Original answer: {original_answer_str}")

In [None]:
coeff = calculate_coefficients_print_Temerature(
    temp= temp,
    original=original,
    kg=kg,
    part_names=part_names,
    question=question,
    original_answer_embedding=original_answer_embedding,
    original_answer_str=original_answer_str,
)

In [None]:
print(coeff)
formatted_coeff = np.round(coeff, 3)  # Rounds to 3 decimal places
formatted_coeff


In [None]:
# Ground truth (true labels) and predicted probabilities
y_true = np.array([0, 0, 0, 0,1,0, 0, 0, 0,0])  # Ground truth (0: negative, 1: positive)
y_scores = formatted_coeff # Predicted probabilities
# Call the function to compute and plot ROC curve
roc_auc_value = plot_roc_curve(y_true, y_scores)
print(f"Computed AUC: {roc_auc_value:.3f}")

In [None]:
expert_ranks = np.array([1,2,0,0,5,0,3,0,0,0])
coeff = np.array([-0.00293262, -0.08940879 , 0.17559307 ,-0.14630958 , 0.09643477 ,-0.06099023,
  0.09733975 ,-0.06348241  ,0.06569268, -0.01448966])


results = evaluate_ranking(expert_ranks, coeff)

print("Expert ranks:", results["expert_ranks"])
print("Model ranks: ", results["model_ranks"])
print("Correct nodes (relaxed match with repeated ranks):", results["correct_nodes"])
print("Accuracy:", results["accuracy"])


In [None]:
coeff=formatted_coeff
# Call the function
plot_knowledge_graph_explainability(kg, part_indices, coeff)

## Accuracy: Evaluate the explanation for "Where is SFT2D2 expressed in the context of diabetes?" (Temp 0) by comparing it to the ground truth for precision and alignment.


In [None]:
question ="Where is SFT2D2 expressed in the context of diabetes?"

In [None]:
temp = 0
original_answer_str, original_answer_embedding = get_answer_and_embedding(question, temp, graph)
print("Original answer:", original_answer_str)

In [None]:
temp= 0
coeff = calculate_coefficients_print_Temerature(
    temp= temp,
    original=original,
    kg=kg,
    part_names=part_names,
    question=question,
    original_answer_embedding=original_answer_embedding,
    original_answer_str=original_answer_str,
)

In [None]:
print(coeff)
formatted_coeff = np.round(coeff, 3)  # Rounds to 3 decimal places
formatted_coeff


In [None]:
# Ground truth (true labels) and predicted probabilities
y_true = np.array([0, 0, 0, 0,0,0, 0, 0, 1,0])  # Ground truth (0: negative, 1: positive)
y_scores = formatted_coeff # Predicted probabilities

# Call the function to compute and plot ROC curve
roc_auc_value = plot_roc_curve(y_true, y_scores)
print(f"Computed AUC: {roc_auc_value:.3f}")

In [None]:
expert_ranks = np.array([1,2,0,0,0,0,0,0,5,0])
coeff = np.array([ 0.27131229,0.23315656, -0.51126837  ,0.12686812 ,-0.01043048 , 0.09524099,
 -0.06514419 , 0.56561928 , 0.55597409  ,0.23164353])


results = evaluate_ranking(expert_ranks, coeff)

print("Expert ranks:", results["expert_ranks"])
print("Model ranks: ", results["model_ranks"])
print("Correct nodes (relaxed match with repeated ranks):", results["correct_nodes"])
print("Accuracy:", results["accuracy"])


In [None]:
coeff=formatted_coeff
# Call the function
plot_knowledge_graph_explainability(kg, part_indices, coeff)

## Accuracy: Assess the explanation for "Where is SFT2D2 expressed in the context of diabetes?" (Temp 1) by comparing it with the ground truth, considering variability due to higher randomness.


In [None]:
question = "Where is SFT2D2 expressed in the context of diabetes?"

In [None]:
temp = 1
original_answer_str, original_answer_embedding = get_answer_and_embedding(question, temp, graph)
print(f"Original answer: {original_answer_str}")

In [None]:
temp= 1
coeff = calculate_coefficients_print_Temerature(
    temp= temp,
    original=original,
    kg=kg,
    part_names=part_names,
    question=question,
    original_answer_embedding=original_answer_embedding,
    original_answer_str=original_answer_str,
)

In [None]:
print(coeff)
formatted_coeff = np.round(coeff, 3)  # Rounds to 3 decimal places
formatted_coeff


In [None]:
# Ground truth (true labels) and model scores
y_true = np.array([0, 0, 0, 0,0,0, 0, 0, 1,0])  # Ground truth (0: negative, 1: positive)
y_scores = formatted_coeff # Scores are the rounded coefficients (may be negative), not probabilities

# Call the function to compute and plot ROC curve
roc_auc_value = plot_roc_curve(y_true, y_scores)
print(f"Computed AUC: {roc_auc_value:.3f}")

In [None]:
# NOTE(review): `coeff` is overwritten here with pasted literal values instead of
# the array computed above — presumably a snapshot of one temperature-1 run; confirm.
expert_ranks = np.array([1,2,0,0,0,0,0,0,5,0])
coeff = np.array([-0.04822793, -0.29208269, -0.23251313, -0.05444959 ,-0.06727032 ,-0.26086002,
 -0.09458866,  0.81293898  ,0.04877018  ,0.2129207 ])


# Score the coefficients against the expert ranks (evaluate_ranking is defined elsewhere in the notebook).
results = evaluate_ranking(expert_ranks, coeff)

print("Expert ranks:", results["expert_ranks"])
print("Model ranks: ", results["model_ranks"])
print("Correct nodes (relaxed match with repeated ranks):", results["correct_nodes"])
print("Accuracy:", results["accuracy"])


In [None]:
# Switch back from the pasted literals to the rounded computed coefficients.
coeff=formatted_coeff
# Call the function
plot_knowledge_graph_explainability(kg, part_indices, coeff)

## Accuracy: Evaluate the explanation for "What is diabetic peripheral angiopathy associated with?" (Temp 0) by comparing it to the ground truth for precision and alignment.


In [None]:
# Question evaluated in this section; the following cells read it as a global.
question = "What is diabetic peripheral angiopathy associated with?"

In [None]:
# Fetch the reference answer text and its embedding at temperature 0.
temp = 0
original_answer_str, original_answer_embedding = get_answer_and_embedding(question, temp, graph)
print(f"Original answer: {original_answer_str}")

In [None]:
# Compute the coefficients for this question (helper defined elsewhere in the notebook).
# `temp` is re-assigned so the cell does not depend on the previous one for it.
temp= 0
coeff = calculate_coefficients_print_Temerature(
    temp= temp,
    original=original,
    kg=kg,
    part_names=part_names,
    question=question,
    original_answer_embedding=original_answer_embedding,
    original_answer_str=original_answer_str,
)

In [None]:
# Show the raw coefficients, then keep a rounded copy for the cells below.
print(coeff)

formatted_coeff = np.round(coeff, 3)  # Rounds to 3 decimal places
# Bare last expression so the notebook displays the rounded values.
formatted_coeff

In [None]:
# Ground truth (true labels) and model scores
y_true = np.array([0, 0, 0, 0,0,0,0,0,0,1])  # Ground truth (0: negative, 1: positive)
y_scores = formatted_coeff # Scores are the rounded coefficients (may be negative), not probabilities

# Call the function to compute and plot ROC curve
roc_auc_value = plot_roc_curve(y_true, y_scores)
print(f"Computed AUC: {roc_auc_value:.3f}")

In [None]:
# NOTE(review): `coeff` is overwritten here with pasted literal values instead of
# the array computed above — presumably a snapshot of one run; confirm.
expert_ranks = np.array([3,1,0,0,0,0,0,0,0,5])
coeff = np.array([-0.19119336, -0.25409155 ,-0.01440644, -0.2132483 , -0.08588113 , 0.05283582,
  0.40574537 , 0.11966471,  0.21763478 , 0.78327978])


# Score the coefficients against the expert ranks (evaluate_ranking is defined elsewhere in the notebook).
results = evaluate_ranking(expert_ranks, coeff)

print("Expert ranks:", results["expert_ranks"])
print("Model ranks: ", results["model_ranks"])
print("Correct nodes (relaxed match with repeated ranks):", results["correct_nodes"])
print("Accuracy:", results["accuracy"])


In [None]:
# Plot the rounded coefficients computed for this question, as the other
# sections do. The previous pasted one-hot vector marked index 8, while this
# question's ground truth and largest coefficient are both at index 9.
coeff = formatted_coeff
# Call the function
plot_knowledge_graph_explainability(kg, part_indices, coeff)

## Accuracy: Assess the explanation for "What is diabetic peripheral angiopathy associated with?" (Temp 1) by comparing it with the ground truth, considering variability due to higher randomness.


In [None]:
# Question evaluated in this section; the following cells read it as a global.
question = "What is diabetic peripheral angiopathy associated with?"

In [None]:
# Fetch the reference answer text and its embedding at temperature 1.
temp = 1
original_answer_str, original_answer_embedding = get_answer_and_embedding(question, temp, graph)
print(f"Original answer: {original_answer_str}")

In [None]:
# Compute the coefficients for this question (helper defined elsewhere in the notebook).
# `temp` is re-assigned so the cell does not depend on the previous one for it.
temp= 1
coeff = calculate_coefficients_print_Temerature(
    temp= temp,
    original=original,
    kg=kg,
    part_names=part_names,
    question=question,
    original_answer_embedding=original_answer_embedding,
    original_answer_str=original_answer_str,
)

In [None]:
# Show the raw coefficients, then keep a rounded copy for the cells below.
print(coeff)
formatted_coeff = np.round(coeff, 3)  # Rounds to 3 decimal places
# Bare last expression so the notebook displays the rounded values.
formatted_coeff

In [None]:
# Ground truth (true labels) and model scores
y_true = np.array([0,0,0,0,0,0,0,0,0,1])  # Ground truth (0: negative, 1: positive)
y_scores = formatted_coeff # Scores are the rounded coefficients (may be negative), not probabilities

# Call the function to compute and plot ROC curve
roc_auc_value = plot_roc_curve(y_true, y_scores)
print(f"Computed AUC: {roc_auc_value:.3f}")

In [None]:
# NOTE(review): `coeff` is overwritten with pasted literals. These values equal,
# to 3 decimals, the array pasted in the "Insulin tregopil" (Temp 1) evaluation
# cell further down — confirm they really belong to this question.
expert_ranks = np.array([3,1,0,0,0,0,0,0,0,5])
coeff = np.array([ 0.011 , 0.388, -0.487,  0.028 , 0.228 ,-0.27  ,-0.217 ,-0.461, -0.385 , 0.238])


# Score the coefficients against the expert ranks (evaluate_ranking is defined elsewhere in the notebook).
results = evaluate_ranking(expert_ranks, coeff)

print("Expert ranks:", results["expert_ranks"])
print("Model ranks: ", results["model_ranks"])
print("Correct nodes (relaxed match with repeated ranks):", results["correct_nodes"])
print("Accuracy:", results["accuracy"])


In [None]:
# NOTE(review): the plot uses pasted literals rather than `formatted_coeff`, and
# the values equal (to 3 decimals) the array pasted in the "Insulin tregopil"
# (Temp 1) evaluation cell further down — confirm they belong to this question.
coeff = np.array([ 0.011 , 0.388, -0.487,  0.028 , 0.228 ,-0.27  ,-0.217 ,-0.461, -0.385 , 0.238])
# Call the function
plot_knowledge_graph_explainability(kg, part_indices, coeff)

## Accuracy: Assess the explanation for "what is Insulin tregopil and with which drug it does synergistic interaction?" (Temp 1) by comparing it with the ground truth, considering variability due to higher randomness.


In [None]:
# Question evaluated in this section; the following cells read it as a global.
question = "what is Insulin tregopil and with which drug it does synergistic interaction?"

In [None]:
# Fetch the reference answer text and its embedding at temperature 1.
temp = 1
original_answer_str, original_answer_embedding = get_answer_and_embedding(question, temp, graph)
print(f"Original answer: {original_answer_str}")

In [None]:
# Compute the coefficients for this question (helper defined elsewhere in the notebook).
# Unlike the sibling sections, `temp` is not re-assigned here: it is whatever the
# previous cell left in the kernel (1 when the cells are run in order).
coeff = calculate_coefficients_print_Temerature(
    temp= temp,
    original=original,
    kg=kg,
    part_names=part_names,
    question=question,
    original_answer_embedding=original_answer_embedding,
    original_answer_str=original_answer_str,
)

In [None]:
# Show the raw coefficients, then keep a rounded copy for the cells below.
print(coeff)
formatted_coeff = np.round(coeff, 3)  # Rounds to 3 decimal places
# Bare last expression so the notebook displays the rounded values.
formatted_coeff


In [None]:
# Ground truth (true labels) and model scores
y_true = np.array([0, 0, 0, 0,0,1,0,0,0,0])  # Ground truth (0: negative, 1: positive)
y_scores = formatted_coeff # Scores are the rounded coefficients (may be negative), not probabilities

# Call the function to compute and plot ROC curve
roc_auc_value = plot_roc_curve(y_true, y_scores)
print(f"Computed AUC: {roc_auc_value:.3f}")

In [None]:
# NOTE(review): `coeff` is overwritten with pasted literals. Rounded to 3
# decimals they equal the array pasted in the earlier "diabetic peripheral
# angiopathy" (Temp 1) cells — confirm which question they belong to.
expert_ranks = np.array([2,0,3,0,0,5,0,0,0,0])
coeff = np.array([ 0.01090008 , 0.38841307 ,-0.48697776 , 0.02760995 , 0.22780567 ,-0.26982554,
 -0.21744315 ,-0.46074367 ,-0.38511049,  0.23776261])


# Score the coefficients against the expert ranks (evaluate_ranking is defined elsewhere in the notebook).
results = evaluate_ranking(expert_ranks, coeff)

print("Expert ranks:", results["expert_ranks"])
print("Model ranks: ", results["model_ranks"])
print("Correct nodes (relaxed match with repeated ranks):", results["correct_nodes"])
print("Accuracy:", results["accuracy"])


In [None]:
# Switch back from the pasted literals to the rounded computed coefficients.
coeff=formatted_coeff
# Call the function
plot_knowledge_graph_explainability(kg, part_indices, coeff)

## Accuracy: Evaluate the explanation for "what is Insulin tregopil and with which drug it does synergistic interaction?" (Temp 0) by comparing it to the ground truth for precision and alignment.


In [None]:
# Question evaluated in this section; the following cells read it as a global.
question = "what is Insulin tregopil and with which drug it does synergistic interaction?"

In [None]:
# Fetch the reference answer text and its embedding at temperature 0.
temp = 0
original_answer_str, original_answer_embedding = get_answer_and_embedding(question, temp, graph)
print(f"Original answer: {original_answer_str}")

In [None]:
# Compute the coefficients for this question (helper defined elsewhere in the notebook).
# `temp` is re-assigned so the cell does not depend on the previous one for it.
temp= 0
coeff = calculate_coefficients_print_Temerature(
    temp= temp,
    original=original,
    kg=kg,
    part_names=part_names,
    question=question,
    original_answer_embedding=original_answer_embedding,
    original_answer_str=original_answer_str,
)

In [None]:
# Show the raw coefficients first.
print(coeff)
# Keep a copy rounded to 3 decimal places for the ROC and plotting cells below
formatted_coeff = np.round(coeff, 3)  # Rounds to 3 decimal places
# Bare last expression so the notebook displays the rounded values.
formatted_coeff


In [None]:
# Ground truth (true labels) and model scores
y_true = np.array([0, 0, 0, 0,0,1,0,0,0,0])  # Ground truth (0: negative, 1: positive)
y_scores = formatted_coeff # Scores are the rounded coefficients (may be negative), not probabilities
# Call the function to compute and plot ROC curve
roc_auc_value = plot_roc_curve(y_true, y_scores)
print(f"Computed AUC: {roc_auc_value:.3f}")

In [None]:
# NOTE(review): `coeff` is overwritten here with pasted literal values instead of
# the array computed above — presumably a snapshot of one run; confirm.
expert_ranks = np.array([2,0,3,0,0,5,0,0,0,0])
coeff = np.array([ 0.027 ,-0.043 ,-0.371 , 0.574 , 0.913 ,-0.034 ,-0.24  ,-0.397 ,-0.382 , 0.399])


# Score the coefficients against the expert ranks (evaluate_ranking is defined elsewhere in the notebook).
results = evaluate_ranking(expert_ranks, coeff)

print("Expert ranks:", results["expert_ranks"])
print("Model ranks: ", results["model_ranks"])
print("Correct nodes (relaxed match with repeated ranks):", results["correct_nodes"])
print("Accuracy:", results["accuracy"])


In [None]:
# Switch back from the pasted literals to the rounded computed coefficients.
coeff=formatted_coeff
# Call the function
plot_knowledge_graph_explainability(kg, part_indices, coeff)

In [None]:
# Define part_indices for 24 triples
# Ten consecutive index ranges ('Part 1' .. 'Part 10') that together cover
# 0..23; each range is built from its (start, stop) pair.
part_indices = {
    f'Part {number}': range(start, stop)
    for number, (start, stop) in enumerate(
        zip(
            (0, 3, 6, 8, 11, 13, 15, 17, 20, 22),
            (3, 6, 8, 11, 13, 15, 17, 20, 22, 24),
        ),
        start=1,
    )
}

# Build the NetworkxEntityGraph used by the batch evaluation below
def convert_to_networkx_entity_graph(kg):
    """Build a ``NetworkxEntityGraph`` from the triples hard-coded below.

    Args:
        kg: Accepted for call compatibility only; it is never read. The graph
            is always rebuilt from the literal triple list in this function.

    Returns:
        A ``NetworkxEntityGraph`` wrapping a ``networkx.DiGraph`` whose edges
        carry the predicate in their ``relation`` attribute.

    Note:
        A DiGraph keeps one edge per (subject, object) pair, so where the list
        repeats a pair, the relation listed last is the one that is kept.
    """
    triples = [
        ('Neonatal insulin-dependent diabetes mellitus', 'associated_with', 'insulin-like growth factor receptor binding'),
        ('insulin-like growth factor receptor binding', 'ppi', 'Neonatal insulin-dependent diabetes mellitus'),
        ('Neonatal insulin-dependent diabetes mellitus', 'associated_with', 'insulin-like growth factor receptor binding'),
        ('insulin-like growth factor receptor binding', 'ppi', 'Neonatal insulin-dependent diabetes mellitus'),
        ('Neonatal insulin-dependent diabetes mellitus', 'ppi', 'insulin-like growth factor receptor binding'),
        ('insulin-like growth factor receptor binding', 'associated_with', 'Neonatal insulin-dependent diabetes mellitus'),
        ('Neonatal insulin-dependent diabetes mellitus', 'ppi', 'insulin-like growth factor receptor binding'),
        ('insulin-like growth factor receptor binding', 'associated_with', 'Neonatal insulin-dependent diabetes mellitus'),
        ('pancreatic A cell fate commitment', 'ppi', 'insulin-like growth factor receptor binding'),
        ('insulin-like growth factor receptor binding', 'associated_with', 'pancreatic A cell fate commitment'),
        ('pancreatic serous cystadenocarcinoma', 'interacts_with', 'insulin-like growth factor receptor binding'),
        ('insulin-like growth factor receptor binding', 'associated_with', 'pancreatic serous cystadenocarcinoma'),
        ('pancreatic A cell fate commitment', 'associated_with', 'insulin-like growth factor receptor binding'),
        ('insulin-like growth factor receptor binding', 'ppi', 'pancreatic A cell fate commitment'),
        ('pancreatic serous cystadenocarcinoma', 'interacts_with', 'insulin-like growth factor receptor binding'),
        ('insulin-like growth factor receptor binding', 'associated_with', 'pancreatic serous cystadenocarcinoma'),
        ('pancreatic serous cystadenocarcinoma', 'associated_with', 'insulin-like growth factor receptor binding'),
        ('insulin-like growth factor receptor binding', 'interacts_with', 'pancreatic serous cystadenocarcinoma'),
        ('glucagon receptor activity', 'ppi', 'low-affinity glucose:proton symporter activity'),
        ('low-affinity glucose:proton symporter activity', 'associated_with', 'glucagon receptor activity'),
        ('vasoconstriction of artery involved in baroreceptor response to lowering of systemic arterial blood pressure', 'expression_present', 'glucose binding'),
        ('glucose binding', 'expression_present', 'Neurofibrillary tangles'),
        ('Neurofibrillary tangles', 'ppi', 'glucose binding'),
        ('Flurandrenolide', 'synergistic_interaction', 'Insulin peglispro'),
        ('glucose 1-phosphate phosphorylation', 'ppi', 'mitochondrial genome maintenance'),
        ('mitochondrial genome maintenance', 'expression_present', 'glucose 1-phosphate phosphorylation'),
        ('glucose 1-phosphate phosphorylation', 'expression_present', 'mitochondrial genome maintenance'),
        ('mitochondrial genome maintenance', 'ppi', 'glucose 1-phosphate phosphorylation'),
        ('UDP-glucose:glycoprotein glucosyltransferase activity', 'ppi', 'serotonin:sodium symporter activity'),
        ('serotonin:sodium symporter activity', 'associated_with', 'UDP-glucose:glycoprotein glucosyltransferase activity'),
        ('Insulin tregopil', 'synergistic_interaction', 'Cyclothiazide'),
        ('UDP-glucose transmembrane transporter activity', 'associated_with', 'protein-DNA-RNA complex remodeling'),
        ('Severe intrauterine growth retardation', 'phenotype_present', 'glucose-1-phosphate thymidylyltransferase activity'),
        ('glucose-1-phosphate thymidylyltransferase activity', 'ppi', 'CCL4L1'),
        ('SFT2D2', 'expression_present', 'adrenal gland'),
        ('SFT2D2', 'expression_present', 'deltoid'),
        ('diabetic peripheral angiopathy', 'associated_with', 'calcium-release channel activity'),
        ('diabetic peripheral angiopathy', 'associated_with', 'acute myeloid leukemia with t(8;21)(q22;q22) translocation')
    ]
    G = nx.DiGraph()
    for subject, predicate, obj in triples:
        # add_edge creates any missing endpoint itself, so no add_node calls are needed.
        G.add_edge(subject, obj, relation=predicate)
    entity_graph = NetworkxEntityGraph(G)
    print(f"Converted to NetworkxEntityGraph with {len(G.nodes)} nodes and {len(G.edges)} edges")
    # DiGraph edges are already unique; listing them directly (instead of via a
    # set) makes the preview below print the same triples on every run.
    unique_triples = list(G.edges(data='relation'))
    print(f"Unique triples: {len(unique_triples)}")
    for i, triple in enumerate(unique_triples[:5]):
        print(f"Triple {i+1}: {triple}")
    return entity_graph

# Wrapper to handle errors in calculate_coefficients_print_Temerature
def safe_calculate_coefficients(temp, original, kg, part_names, question, original_answer_embedding, original_answer_str):
    """Run ``calculate_coefficients_print_Temerature`` without letting it raise.

    Args:
        temp: Temperature forwarded to the coefficient routine.
        original: Graph wrapper; its ``_graph`` attribute is what gets forwarded.
        kg: Graph wrapper; its ``_graph`` attribute is what gets forwarded.
        part_names: Forwarded unchanged.
        question: Forwarded unchanged.
        original_answer_embedding: Forwarded unchanged.
        original_answer_str: Forwarded unchanged.

    Returns:
        Whatever the coefficient routine returns, or ``np.zeros(10)`` when the
        routine (or unwrapping ``_graph``) raises.
    """
    try:
        return calculate_coefficients_print_Temerature(
            temp=temp,
            original=original._graph,
            kg=kg._graph,
            part_names=part_names,
            question=question,
            original_answer_embedding=original_answer_embedding,
            original_answer_str=original_answer_str
        )
    except Exception as e:
        print(f"Error in calculate_coefficients_print_Temerature: {e}")
        print("Inspecting kg relations:")
        # The diagnostics are best-effort: a failure while inspecting `kg`
        # (e.g. it has no `_graph`) must not escape and defeat the fallback.
        try:
            if hasattr(kg, 'get_relations'):
                relations = list(kg.get_relations())
            else:
                relations = [(u, v, d.get('relation')) for u, v, d in kg._graph.edges(data=True)]
        except Exception as inspect_error:
            print(f"Could not inspect kg relations: {inspect_error}")
            relations = []
        for i, rel in enumerate(relations[:5]):
            print(f"Relation {i+1}: {rel}")
        return np.zeros(10)  # Fallback

# Define questions and ground truths
# Each entry pairs a question with the index of its single positive slot; the
# comprehension expands that index into a 10-element 0/1 ground-truth list.
questions_data = [
    {
        "question": prompt,
        "ground_truth": [int(slot == positive_slot) for slot in range(10)],
    }
    for prompt, positive_slot in (
        ("What is insulin-like growth factor receptor binding associated with?", 0),
        ("What is pancreatic serous cystadenocarcinoma associated with?", 0),
        ("What is glucagon receptor activity associated with and how it interacts through ppi with low-affinity glucose:proton symporter activity?", 0),
        ("What is glucose binding?", 1),
        ("What has a synergistic interaction with Flurandrenolide drug?", 2),
        ("What is glucose 1-phosphate phosphorylation and mitochondrial genome maintenance, how do they interact through ppi?", 3),
        ("What is UDP-glucose:glycoprotein glucosyltransferase activity associated with and how it interacts with serotonin:sodium symporter activity through ppi?", 4),
        ("Where is SFT2D2 expressed?", 8),
        ("What is diabetic peripheral angiopathy associated with?", 9),
        ("What is Insulin tregopil and with which drug it does synergistic interaction?", 5),
    )
]

# Build the entity graph for the batch run; stop the cell if that is impossible
try:
    kg = convert_to_networkx_entity_graph(kg)
    original = kg
except NameError as err:
    print("Error: kg not defined in the environment")
    # Inside a Jupyter kernel, exit() only requests a shutdown and does not
    # stop the cell, so the loop below would still run and write an all-zero
    # table. Raising aborts the cell and keeps the original NameError attached.
    raise RuntimeError(f"Cannot build the entity graph: {err}") from err

# Debug part_indices
print(f"part_indices: {part_indices}")

# Initialize table data
table_data = []

# Process each question for temp 0 and temp 1
for data in questions_data:
    question = data["question"]
    y_true = np.array(data["ground_truth"])

    # Both temperatures go through exactly the same steps, so they share one
    # inner loop; each run's (rounded coefficients, AUC) pair is stored by temp.
    outcome_by_temp = {}
    for temp in (0, 1):
        try:
            print(f"\nProcessing '{question}' at temp {temp}")
            original_answer_str, original_answer_embedding = get_answer_and_embedding(question, temp, graph=kg)
            print(f"Original answer: {original_answer_str}")
            print(f"Inputs to calculate_coefficients_print_Temerature (temp {temp}):")
            print(f"  temp: {temp}")
            print(f"  original type: {type(original._graph)}")
            print(f"  kg type: {type(kg._graph)}")
            print(f"  part_names: {part_names}")
            print(f"  question: {question}")
            print(f"  original_answer_embedding type: {type(original_answer_embedding)}")
            print(f"  original_answer_str: {original_answer_str}")
            coeff = safe_calculate_coefficients(
                temp=temp,
                original=original,
                kg=kg,
                part_names=part_names,
                question=question,
                original_answer_embedding=original_answer_embedding,
                original_answer_str=original_answer_str
            )
            # Anything that is not a 10-element array is replaced by zeros.
            if not isinstance(coeff, np.ndarray) or len(coeff) != 10:
                print(f"Invalid coeff for {question} at temp {temp}: {coeff}")
                coeff = np.zeros(10)
            formatted_coeff = np.round(coeff, 3)
            y_scores = formatted_coeff
            roc_auc_value = plot_roc_curve(y_true, y_scores)
            print(f"Computed AUC (Temp {temp}): {roc_auc_value:.3f}")
            plot_knowledge_graph_explainability(kg, part_indices, formatted_coeff)
        except Exception as e:
            # Best-effort batch run: a failed run is reported and recorded as
            # all-zero coefficients with an AUC of 0.0 so the table stays complete.
            print(f"Error processing question '{question}' at temp {temp}: {e}")
            formatted_coeff = np.zeros(10)
            roc_auc_value = 0.0
        outcome_by_temp[temp] = (formatted_coeff, roc_auc_value)

    y_predict_temp0, auc_temp0 = outcome_by_temp[0]
    y_predict_temp1, auc_temp1 = outcome_by_temp[1]

    table_data.append({
        "Questions": question,
        "Ground Truth": str(y_true.tolist()),
        "Y-Predict (Temp 0)": str(y_predict_temp0.tolist()),
        "AUC (Temp 0)": auc_temp0,
        "Y-Predict (Temp 1)": str(y_predict_temp1.tolist()),
        "AUC (Temp 1)": auc_temp1
    })

# Create DataFrame (one row per question, columns taken from the dict keys)
df = pd.DataFrame(table_data)

# Save to Excel
# NOTE(review): absolute Google Drive path — presumably requires Drive to be
# mounted in Colab before this cell runs; confirm.
output_file = '/content/drive/MyDrive/PrimeKG_Data/primekgqa_table.xlsx'
try:
    df.to_excel(output_file, index=False, engine='openpyxl')
    print(f"\nTable saved to {output_file}")
except Exception as e:
    # A failed save is only reported; the table is still printed below.
    print(f"Error saving Excel file: {e}")

# Print the table for verification
print("\nFinal Table:")
print(df.to_string(index=False))