In [None]:
# Install the OpenAI and LangChain libraries
# - `openai`: Provides access to OpenAI's GPT models for tasks like text generation, embeddings, and completions.
# - `langchain`: A framework for building applications using large language models (LLMs).
#                Includes tools for chaining prompts, memory, and integrations like knowledge graphs.

!pip install -q openai langchain


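# Install the LangChain Community package
# - `langchain-community`: the separately distributed package of community-maintained LangChain integrations.
#   NOTE(review): no version is pinned here, and later cells import from `langchain.graphs`, `langchain.llms`,
#   and `langchain.chains` — confirm the installed LangChain release still exposes those import paths.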

!pip install -q langchain-community

In [5]:
!pip install  rdflib SPARQLWrapper  matplotlib

In [6]:
from SPARQLWrapper import SPARQLWrapper, JSON

# Set up the DBpedia SPARQL endpoint
sparql = SPARQLWrapper("https://dbpedia.org/sparql")

# SPARQL Query: Retrieve cybersecurity-related concepts and their triples
# The query selects resources typed dbo:Software whose English label contains
# "cyber", "security", or "malware", and returns each resource together with its
# English label and English abstract (at most 10 rows).
# NOTE(review): the query has no ORDER BY, so which 10 rows are returned (and in
# what order) is presumably up to the endpoint. Later cells hard-code ten parts
# and ask about one specific entity, which assumes a stable result set — confirm.
query = """
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?concept ?label ?abstract
WHERE {
  ?concept rdf:type dbo:Software .
  ?concept rdfs:label ?label .
  ?concept dbo:abstract ?abstract .
  FILTER (LANG(?label) = 'en' && LANG(?abstract) = 'en')
  FILTER (CONTAINS(LCASE(?label), "cyber") || CONTAINS(LCASE(?label), "security") || CONTAINS(LCASE(?label), "malware"))
}
LIMIT 10
"""

# Execute the query
sparql.setQuery(query)
sparql.setReturnFormat(JSON)
# `results` holds the converted JSON response; the rows are read from results["results"]["bindings"].
results = sparql.query().convert()

# Containers for the knowledge graph (KG) and for the index range of every part
kg = []
portion_indices = {}
portion_counter = 1  # parts are numbered from 1
triple_index = 0  # position in `kg` where the next part begins

print("\nStructured Knowledge Graph:\n")

for result in results["results"]["bindings"]:
    # The entity name is the last path segment of the resource URI
    concept = result["concept"]["value"].rsplit("/", 1)[-1]
    label = result["label"]["value"]
    abstract = result["abstract"]["value"]  # kept in full, never truncated

    # Every part owns exactly three consecutive triples
    start_index = triple_index
    portion_indices[f"Part {portion_counter}"] = range(start_index, start_index + 3)

    # Echo the part in a structured, one-line-per-triple form
    print(
        f"\n# Part {portion_counter}",
        f"({concept}) → (type) → (Software)",
        f"({concept}) → (label) → ({label})",
        f"({concept}) → (abstract) →",
        abstract,
        "-" * 80,
        sep="\n",
    )

    # Append the three triples of this part to the KG
    kg.extend(
        [
            (concept, "type", "Software"),
            (concept, "label", label),
            (concept, "abstract", abstract),
        ]
    )

    # Advance to the next part
    triple_index += 3
    portion_counter += 1

# Report the index range of each part
print("\nPortion Indices:\n")
for part, index_range in portion_indices.items():
    print(f"{part}: {index_range}")

# Persist the KG as text, each triple followed by an empty line
with open("knowledge_graph_output.txt", "w", encoding="utf-8") as f:
    f.writelines(f"( {triple[0]} , {triple[1]} , {triple[2]})\n\n" for triple in kg)

print("\nFinal Knowledge Graph saved as 'knowledge_graph_output.txt'.")

# Echo the complete KG, one untruncated triple per line
print("\nFinal Knowledge Graph List:\n")
for triple in kg:
    print("(", triple[0], ",", triple[1], ", ", triple[2], ")")


This script initializes the OpenAI API client and defines a function to interact with the GPT model.
The `get_chat_response` function sends a user-provided text input to the GPT model (gpt-3.5-turbo)
and returns the model's response.


In [7]:
import os
from openai import OpenAI

# Take the API key from the environment; never hard-code it in the notebook.
# SECURITY: notebooks are shared and committed together with their source, so any
# key that has ever appeared literally in this file must be considered leaked and
# should be revoked.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass  # local import: only needed for the interactive fallback

    # Prompt without echoing the key, so it does not end up in the cell output
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

# OpenAI() reads OPENAI_API_KEY from the environment
client = OpenAI()

def get_chat_response(text):
    """
    Send `text` as a single user message to the chat model and return the reply text.

    The request goes through the module-level `client` using the "gpt-3.5-turbo"
    model; the message content of the first returned choice is handed back.
    """
    user_message = {"role": "user", "content": text}
    response = client.chat.completions.create(
        messages=[user_message],
        model="gpt-3.5-turbo",
    )
    first_choice = response.choices[0]
    return first_choice.message.content


In [None]:
import networkx as nx  # For creating and analyzing graphs/networks.

import matplotlib.pyplot as plt  # For data visualization and plotting.

import numpy as np  # For numerical operations and array handling.

import random  # For generating random numbers.

from langchain.graphs.networkx_graph import NetworkxEntityGraph, KnowledgeTriple # Represents (subject, predicate, object) triples.

from scipy.spatial.distance import cosine  # For cosine similarity/distance between vectors.

from scipy.stats import wasserstein_distance  # For Wasserstein distance (probability distribution comparison).

from sklearn.metrics.pairwise import cosine_similarity

from sklearn.linear_model import LinearRegression, BayesianRidge  # Regression models.

from sklearn.datasets import fetch_20newsgroups  # Fetch the 20 Newsgroups text dataset.

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score  # Model performance metrics.

import matplotlib.colors as mcolors  # For handling and customizing colors in visualizations.

import sklearn.metrics  # For evaluation metrics like accuracy, precision, recall, etc.

import matplotlib.colors as mcolors  # For handling color schemes in plots

import textwrap  # For wrapping text into fixed-width lines

from langchain.llms import OpenAI


If `GraphIndexCreator` is unavailable in your LangChain version, the code can be adapted in one of two ways: build and query the graph with an LLM directly through `NetworkxEntityGraph` (the approach used in this notebook), or replace `GraphIndexCreator` with a custom `CustomGraphIndexCreator` that integrates `NetworkxEntityGraph`.

In [None]:
from langchain.chains import GraphQAChain  # For question answering over knowledge graphs.

# Prompt Engineering
from langchain.prompts import PromptTemplate  # To define templates for LLM prompts.



This cell builds the knowledge graph from the triples collected above. Each triple connects two
entities (nodes) through a relationship (edge). The triples are grouped into ten parts of three
consecutive triples each (the type, label, and abstract of one retrieved concept); `part_indices`
records which triple indices belong to which part. The graph is constructed programmatically by
adding every triple in `kg` to a `NetworkxEntityGraph`, which allows for efficient querying and analysis.



In [None]:
# Each part covers three consecutive triples of `kg`:
# "Part 1" -> range(0, 3), "Part 2" -> range(3, 6), ..., "Part 10" -> range(27, 30).
part_indices = {
    f"Part {number}": range(3 * (number - 1), 3 * number)
    for number in range(1, 11)
}

# Part names in definition order
part_names = [*part_indices]

# Start from an empty entity graph ...
graph = NetworkxEntityGraph()

# ... and load every (node1, relation, node2) triple of the knowledge graph into it
for triple in kg:
    node1, relation, node2 = triple
    graph.add_triple(KnowledgeTriple(node1, relation, node2))


Visualizes the knowledge graph as a directed graph using NetworkX and Matplotlib.
Nodes represent entities, and edges depict relationships with labels for clarity.
The layout uses spring positioning with increased spacing for readability. Custom
node colors and labeled edges enhance the visualization, displayed without axes.


In [None]:
def wrap_text(text, max_words=8):
    """Return `text` unchanged, or the placeholder "Explanation" when it has more than `max_words` words."""
    if len(text.split()) > max_words:
        return "Explanation"
    return text

def visualize_graph_with_chains(kg, part_indices):
    """
    Visualize a directed graph highlighting nodes and edges by chain membership.

    Two panels are drawn side by side with the same layout: the plain knowledge
    graph on the left, and on the right the graph with every node and edge colored
    by the chain (part) it belongs to. After the figure is shown, the node-to-chain
    assignment is printed.

    Parameters:
        kg (list of tuples): The knowledge graph as a list of (node1, relation, node2).
        part_indices (dict): Maps chain names to iterables of indices into `kg`.
            Indices past the end of `kg` are skipped, so a graph with fewer triples
            than the index map expects can still be drawn. When a node or edge
            occurs in several chains, the chain listed last determines its color.
    Returns:
        None
    """
    # Create graph
    G = nx.DiGraph()
    for node1, relation, node2 in kg:
        G.add_edge(node1, node2, label=relation)

    # Generate positions for the graph (fixed seed keeps the layout reproducible)
    pos = nx.spring_layout(G, k=8, iterations=100, seed=0)

    # Define color maps
    chain_cmap = mcolors.LinearSegmentedColormap.from_list('chain_colors', ['#66c2a5', '#fc8d62', '#8da0cb', '#e78ac3', '#a6d854'])
    chain_norm = mcolors.Normalize(vmin=0, vmax=len(part_indices) - 1)

    # Step 1: Assign a distinct color to each chain
    chain_color_map = {
        chain_name: chain_cmap(chain_norm(i))
        for i, chain_name in enumerate(part_indices)
    }

    # Step 2: Walk the chains once and record the color of every node and edge they touch.
    # A position lookup replaces the repeated list(G.nodes).index(...) scans.
    node_position = {node: i for i, node in enumerate(G.nodes())}
    node_colors = ['lightblue'] * len(node_position)
    node_chain_map = {}
    edge_color_lookup = {}
    for chain_name, indices in part_indices.items():
        color = chain_color_map[chain_name]
        for idx in indices:
            if idx >= len(kg):
                # The index map may describe more triples than the KG actually holds
                continue
            node1, _, node2 = kg[idx]
            for node in (node1, node2):
                if node in node_position:
                    node_chain_map[node] = chain_name
                    node_colors[node_position[node]] = color
            edge_color_lookup[(node1, node2)] = color

    # Step 3: Edges that belong to no chain stay gray
    edge_colors = [edge_color_lookup.get(edge, 'gray') for edge in G.edges()]

    # Apply label filtering (long labels are replaced by a short placeholder)
    wrapped_labels = {node: wrap_text(node) for node in G.nodes()}

    # Create the figure with subplots
    fig, axs = plt.subplots(1, 2, figsize=(20, 8), dpi=600)

    # Left: Original Knowledge Graph
    nx.draw_networkx_nodes(G, pos, node_color='lightblue', node_size=1200, ax=axs[0])
    nx.draw_networkx_edges(G, pos, edge_color='gray', width=1.2, ax=axs[0])
    nx.draw_networkx_labels(G, pos, labels=wrapped_labels, font_size=6, ax=axs[0])
    edge_labels = nx.get_edge_attributes(G, 'label')
    wrapped_edge_labels = {edge: wrap_text(label) for edge, label in edge_labels.items()}
    nx.draw_networkx_edge_labels(G, pos, edge_labels=wrapped_edge_labels, font_size=6, ax=axs[0])
    axs[0].set_title("Original Knowledge Graph", fontsize=10)
    axs[0].axis('off')

    # Right: Highlighted Nodes Based on Chains
    nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=1200, ax=axs[1], edgecolors='black')
    nx.draw_networkx_edges(G, pos, edge_color=edge_colors, width=1.5, ax=axs[1])
    nx.draw_networkx_labels(G, pos, labels=wrapped_labels, font_size=6, ax=axs[1])
    nx.draw_networkx_edge_labels(G, pos, edge_labels=wrapped_edge_labels, font_size=6, ax=axs[1])
    axs[1].set_title("Graph Highlighted by Chain Membership", fontsize=10)
    axs[1].axis('off')

    # Create a legend for chain colors
    handles = [plt.Line2D([0], [0], marker='o', color=color, markersize=10, linestyle='', label=chain_name)
               for chain_name, color in chain_color_map.items()]
    axs[1].legend(handles=handles, title="Chains", loc='upper right', fontsize=8)

    # Display the plot
    plt.show()

    # Print which nodes belong to which chain
    print("\n--- Node Chain Mapping ---")
    for node, chain in node_chain_map.items():
        print(f"Node '{node}' belongs to chain '{chain}'.")


In [None]:
visualize_graph_with_chains(kg, part_indices)


Defines a function to perturb the knowledge graph by selectively removing triples
belonging to specified parts. This allows testing the impact of missing information
on downstream tasks or analysis. The function filters out triples associated with
the indices of the parts to be removed and returns the modified knowledge graph.

In [None]:
def perturb_kg_by_removing_parts(kg, parts_to_remove, index_map=None):
    """
    Perturbs the knowledge graph by removing triples from the specified parts.

    Parameters:
    - kg: The full knowledge graph triples list
    - parts_to_remove: List of part names to remove
    - index_map: Optional mapping of part name -> iterable of indices into `kg`.
      When omitted, the notebook-level `part_indices` mapping is used.

    Returns:
    - perturbed_kg: A new list with the triples of the specified parts left out;
      `kg` itself is not modified

    Raises:
    - KeyError: if a name in `parts_to_remove` is not a key of the index mapping
    """
    if index_map is None:
        # Fall back to the global mapping only when the caller did not supply one
        index_map = part_indices

    # Collect the indices of every triple that belongs to a removed part
    indices_to_remove = set()
    for part in parts_to_remove:
        indices_to_remove.update(index_map[part])

    # Keep the remaining triples in their original order
    return [triple for i, triple in enumerate(kg) if i not in indices_to_remove]



Defines a function to query a GraphQAChain with a question and temperature setting,
returning the answer and its embedding. The function initializes the chain with a
specified graph and temperature, processes the question, and computes the embedding
for the returned answer, facilitating downstream analysis or comparison.


In [None]:
def get_answer_and_embedding(question: str, temp: float, graph):
    """
    Ask the GraphQAChain a question and return the answer together with its embedding.

    Args:
        question (str): The question to ask the chain.
        temp (float): The temperature passed to the OpenAI LLM wrapper.
        graph: The graph object the GraphQAChain is built on.

    Returns:
        Tuple[str, list]: The answer converted to a string, and the embedding that
        `get_embedding` computes for the answer.
    """
    # A fresh chain is built per call so the requested temperature takes effect
    llm = OpenAI(temperature=temp)
    qa_chain = GraphQAChain.from_llm(llm, graph=graph, verbose=False)

    # Query the chain, then derive the string form and the embedding of its answer
    raw_answer = qa_chain.run(question)
    answer_text = str(raw_answer)
    answer_embedding = get_embedding(raw_answer)

    return answer_text, answer_embedding


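This cell defines two helpers for computing text embeddings. `normalize_text` replaces newline
characters, normalizes Unicode (NFKC), collapses runs of whitespace, and lowercases the text.
`get_embedding` normalizes its input, returns the cached vector when the same text was embedded before,
and otherwise queries the OpenAI embeddings API (`text-embedding-3-small`) to generate a vector
representation, useful for similarity comparisons and downstream tasks.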

In [None]:
import random
import json
import unicodedata
import re
embedding_cache = {}
EMBEDDING_MODEL = "text-embedding-3-small"
def normalize_text(text):
    """
    Return a canonical form of `text`, used as the key for embedding lookups.

    Steps, in order: newlines become spaces and the ends are stripped, Unicode is
    normalized to NFKC, every whitespace run collapses to one space, and the
    result is lowercased.
    """
    flattened = text.replace("\n", " ").strip()
    canonical = unicodedata.normalize("NFKC", flattened)
    single_spaced = re.sub(r"\s+", " ", canonical)
    return single_spaced.lower()

def get_embedding(text):
    """
    Return the embedding of `text`, computed at most once per normalized text.

    The text is normalized with `normalize_text`; the normalized form is both the
    cache key in `embedding_cache` and the input sent to the embeddings endpoint
    (model `EMBEDDING_MODEL`) on a cache miss.
    """
    cache_key = normalize_text(text)
    try:
        return embedding_cache[cache_key]  # cache hit: no API call
    except KeyError:
        pass

    response = client.embeddings.create(input=[cache_key], model=EMBEDDING_MODEL)
    vector = response.data[0].embedding
    embedding_cache[cache_key] = vector  # remember for later calls
    return vector


This cell visualizes the explainability of the knowledge graph. Nodes and edges are colored according to
the importance coefficient of the part they belong to, using a custom blue–gray–red colormap over the range
[-1, 1]. The graph is directed; node labels are wrapped for readability, long abstracts are replaced by a
short `<concept>_explanation` placeholder, and node sizes grow with connectivity (degree). The result is
drawn as a single panel, saved to `knowledge_graph_explainability.png`, and accompanied by a color bar that
serves as a reference for the importance coefficients derived from the Simple SMILE GraphRAG analysis.


In [None]:
def wrap_label(label, width=15):
    """Break `label` into lines of at most `width` characters, joined by newlines."""
    return textwrap.fill(label, width)

def wrap_text(node1, relation=None, node2=None, max_words=8):
    """
    Shorten over-long text for display in the graph.

    Two call forms are supported:

    - `wrap_text(node1, relation, node2)`: returns `node2`, except that an
      "abstract" object with more than `max_words` words is replaced by
      `"<node1>_explanation"`.
    - `wrap_text(text)`: returns `text`, or the placeholder "Explanation" when it
      has more than `max_words` words. This is the behavior of the one-argument
      `wrap_text` defined earlier in the notebook, which this definition replaces
      once the cell has run; keeping the form working means
      `visualize_graph_with_chains` can still be re-run afterwards.
    """
    if relation is None and node2 is None:
        # One-argument form: `node1` is the text itself
        return "Explanation" if len(node1.split()) > max_words else node1

    if relation == "abstract" and len(node2.split()) > max_words:
        return f"{node1}_explanation"
    return node2

def build_graph(kg, coeff, part_indices):
    """
    Build the display graph and derive per-node and per-edge colors and sizes.

    Parameters:
        kg (list): Knowledge graph triples (node1, relation, node2).
        coeff (sequence): Importance coefficients, indexed by part.
        part_indices (dict): Maps part names to iterables of indices into `kg`.
            A name ending in a number ("Part 3") selects `coeff[number - 1]`;
            any other name falls back to the part's position in the mapping.

    Returns:
        tuple: (G, pos, node_sizes, node_colors, edge_colors), where the color and
        size lists follow the iteration order of `G.nodes()` / `G.edges()`.

    Nodes and edges take the color of the first part (in mapping order) that
    contains them; those in no part get '#8da0cb' (nodes) or 'gray' (edges).
    """
    G = nx.DiGraph()
    # Wrapped (source, target) of every triple, aligned with the indices of `kg`.
    # Colors are resolved through this list rather than through the position of an
    # edge in G.edges(), which stops matching `kg` once duplicate edges collapse.
    wrapped_edges = []
    for node1, relation, node2 in kg:
        wrapped_node1 = wrap_label(node1)
        wrapped_node2 = wrap_label(wrap_text(node1, relation, node2))
        wrapped_relation = wrap_label(relation)
        G.add_edge(wrapped_node1, wrapped_node2, label=wrapped_relation)
        wrapped_edges.append((wrapped_node1, wrapped_node2))

    pos = nx.spring_layout(G, k=8, iterations=200, seed=0)
    cmap = mcolors.LinearSegmentedColormap.from_list('red_blue', ['blue', '#d3d3d3', 'red'])
    norm = mcolors.Normalize(vmin=-1, vmax=1)
    # Better connected nodes are drawn larger
    node_sizes = [1500 + 100 * G.degree(node) for node in G.nodes()]

    node_color_lookup = {}
    edge_color_lookup = {}
    for position, (part_name, indices) in enumerate(part_indices.items()):
        try:
            part_idx = int(part_name.split()[-1]) - 1
        except (ValueError, IndexError):
            # Name without a numeric suffix: use the part's position instead
            part_idx = position
        color = cmap(norm(coeff[part_idx]))
        for i in indices:
            if i >= len(kg):
                # The index map may describe more triples than the KG holds
                continue
            source, target = wrapped_edges[i]
            # setdefault keeps the color of the first part that claimed the item
            node_color_lookup.setdefault(source, color)
            node_color_lookup.setdefault(target, color)
            edge_color_lookup.setdefault((source, target), color)

    node_colors = [node_color_lookup.get(node, '#8da0cb') for node in G.nodes()]
    edge_colors = [edge_color_lookup.get(edge, 'gray') for edge in G.edges()]

    return G, pos, node_sizes, node_colors, edge_colors

def plot_knowledge_graph_explainability(kg, part_indices, coeff):
    """
    Draw the knowledge graph colored by part importance, then save and show it.

    Parameters:
        kg (list): Knowledge graph triplets (node1, relation, node2).
        part_indices (dict): Mapping of part names to indices.
        coeff (list): Importance coefficients for each part.

    The figure is written to 'knowledge_graph_explainability.png' before it is
    displayed; nothing is returned.
    """
    # Graph, layout, and styling all come from build_graph
    graph, layout, sizes, node_palette, edge_palette = build_graph(kg, coeff, part_indices)

    fig, ax = plt.subplots(figsize=(10, 10), dpi=100)

    # Nodes, edges, node labels, then relation labels
    nx.draw_networkx_nodes(graph, layout, node_color=node_palette, node_size=sizes, ax=ax)
    nx.draw_networkx_edges(graph, layout, edge_color=edge_palette, width=1.5, ax=ax)
    nx.draw_networkx_labels(graph, layout, font_size=6, ax=ax)
    relation_labels = nx.get_edge_attributes(graph, 'label')
    nx.draw_networkx_edge_labels(graph, layout, edge_labels=relation_labels, font_size=6, ax=ax)

    ax.set_title("Knowledge Graph Explainability", fontsize=14)
    ax.axis('off')

    # Color bar on the same blue-gray-red scale over [-1, 1] that build_graph uses
    importance_scale = plt.cm.ScalarMappable(
        cmap=mcolors.LinearSegmentedColormap.from_list('red_blue', ['blue', '#d3d3d3', 'red']),
        norm=mcolors.Normalize(vmin=-1, vmax=1),
    )
    importance_scale.set_array([])
    fig.colorbar(
        importance_scale,
        ax=ax,
        orientation='horizontal',
        label='Importance Coefficients',
        fraction=0.03,
        pad=0.05,
    )

    plt.savefig('knowledge_graph_explainability.png', bbox_inches='tight')
    plt.show()


This function computes and displays fidelity metrics for evaluating regression model performance,
including standard measures like Mean Squared Error (MSE), R-squared (R²), and Mean Absolute Error (MAE).
It also calculates advanced metrics such as weighted R², weighted adjusted R², mean losses (L1 and L2),
and weighted losses. These metrics provide a comprehensive assessment of model fidelity, considering
weights and coefficients for a nuanced evaluation of the regression results.

In [None]:
def calculate_fidelity_metrics(y_true, y_pred, weights, coeff):
    """
    Calculate and print various fidelity metrics for a regression model.

    Parameters:
        y_true (array-like): True values (ground truth).
        y_pred (array-like): Predicted values.
        weights (array-like): Sample weights.
        coeff (array-like): Coefficients of the regression model (used for adjusted R²).

    Returns:
        None. All metrics are printed.

    Notes:
        Inputs are converted to float arrays first, so plain Python lists work as
        well as NumPy arrays. Weighted R² is reported as NaN when the weighted
        total sum of squares is zero, and weighted adjusted R² as NaN when
        n - p - 1 is not positive (not enough samples for the number of coefficients).
    """
    # Element-wise arithmetic below needs arrays, not lists
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    weights = np.asarray(weights, dtype=float)
    residuals = y_true - y_pred

    # Calculate regression metrics
    mse = mean_squared_error(y_true, y_pred, sample_weight=weights)
    r2 = r2_score(y_true, y_pred, sample_weight=weights)
    mae = mean_absolute_error(y_true, y_pred, sample_weight=weights)

    # Mean loss (Lm): gap between the mean true and the mean predicted value
    mean_loss_f = np.mean(y_true)
    mean_loss_g = np.mean(y_pred)
    mean_loss = abs(mean_loss_f - mean_loss_g)

    # Mean L1 and L2 loss
    mean_l1 = np.mean(np.abs(residuals))
    mean_l2 = np.mean(residuals ** 2)

    # Weighted L1 and L2 loss (divided by the sample count, not by the weight sum)
    n = len(y_true)
    weighted_l1 = np.sum(weights * np.abs(residuals)) / n
    weighted_l2 = np.sum(weights * residuals ** 2) / n

    # Weighted R²
    f_mean = np.average(y_true, weights=weights)
    ss_tot = np.sum(weights * (y_true - f_mean) ** 2)
    ss_res = np.sum(weights * residuals ** 2)
    weighted_r2 = 1 - ss_res / ss_tot if ss_tot != 0 else float("nan")

    # Weighted adjusted R²
    p = len(coeff)
    degrees_of_freedom = n - p - 1
    if degrees_of_freedom > 0:
        weighted_adj_r2 = 1 - (1 - weighted_r2) * (n - 1) / degrees_of_freedom
    else:
        weighted_adj_r2 = float("nan")

    # Print fidelity metrics
    print(100 * '-')
    print('Fidelity:')
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"R-squared (R²): {r2}")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Mean Loss (Lm): {mean_loss}")
    print(f"Mean L1 Loss: {mean_l1}")
    print(f"Mean L2 Loss: {mean_l2}")
    print(f"Weighted L1 Loss: {weighted_l1}")
    print(f"Weighted L2 Loss: {weighted_l2}")
    print(f"Weighted R-squared (R²ω): {weighted_r2}")
    print(f"Weighted Adjusted R-squared (Rˆ²ω): {weighted_adj_r2}")
    print(100 * '-')


This function generates a scatter plot comparing actual (true) values to predicted values,
with an optional weighting mechanism to scale point sizes. It includes a perfect prediction
line (y = x) for reference and displays the R² score as a measure of model performance. The
plot provides an intuitive visual assessment of the alignment between predictions and ground
truth, with customizable point size based on weights for better visualization.

In [None]:
def plot_actual_vs_predicted(y_true, y_pred, weights=None):
    """
    Plot actual vs. predicted values with an optional weight normalization for point sizes.
    Displays the R² score and a perfect prediction line (y = x).

    Parameters:
        y_true (array-like): True values (ground truth).
        y_pred (array-like): Predicted values.
        weights (array-like, optional): Weights for scaling point sizes in the scatter plot;
            they are also passed to the R² computation as sample weights.

    Notes:
        Inputs are converted to arrays first, so plain Python lists are accepted.
        When every weight is zero (or none are given) all points are drawn at a
        fixed size.
    """
    # .min()/.max() below need arrays, not lists
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Calculate the R² score
    r2 = r2_score(y_true, y_pred, sample_weight=weights)

    # Normalize weights for better visualization
    normalized_weights = 50  # fixed marker size when weights cannot be used
    if weights is not None:
        largest_weight = np.max(weights)
        if largest_weight > 0:
            # Scale weights so the largest point has size 100
            normalized_weights = np.array(weights) / largest_weight * 100

    # Plotting
    fig, ax = plt.subplots()
    ax.scatter(y_true, y_pred, s=normalized_weights, label='Data points', alpha=0.6)  # Use weights for point sizes

    # Determine the range for the perfect prediction line
    min_val = min(y_true.min(), y_pred.min())
    max_val = max(y_true.max(), y_pred.max())

    # Plotting the Perfect Prediction Line (y = x)
    ax.plot([min_val, max_val], [min_val, max_val], 'k--', lw=2, label='Perfect Prediction Line')

    # Set the plot limits to better frame the data
    ax.set_xlim([min_val, max_val])
    ax.set_ylim([min_val, max_val])

    # Labeling the axes
    ax.set_xlabel('Actual')
    ax.set_ylabel('Predicted')

    # Title with R² score rounded to two decimal places
    ax.set_title(f'Actual vs. Predicted Values\nR²: {r2:.2f}')

    # Show legend
    ax.legend()

    # Show plot
    plt.show()


Defines the question to query the GraphQAChain or knowledge retrieval system.
Here, the question "what is Network_Security_Services?" asks about one of the entities in the
knowledge graph. The `#Portion 3` comment in the cell presumably marks the part of the graph that
is expected to hold the relevant triples.

In [None]:
question = "what is Network_Security_Services?"
#Portion 3

This snippet sets the temperature parameter to 0 for deterministic response generation and
queries the GraphQAChain with the question defined above. The function `get_answer_and_embedding`
returns the original answer as a string along with its embedding. The answer is then printed for review.

In [None]:
temp = 0
original_answer_str, original_answer_embedding = get_answer_and_embedding(question, temp, graph)
print(original_answer_str)
# The embedding of this answer is kept in `original_answer_embedding` for the comparisons below

In [None]:
# Part names (`part_names`) are defined together with `part_indices` in an earlier cell.

# Define the original vector (all parts present): one entry of 1 per part, shaped
# (1, n_parts) so it can be compared row-wise with the perturbation vectors.
# The length is derived from `part_names` instead of hard-coding ten entries, so
# the vector stays in sync with the parts.
original = np.ones((1, len(part_names)), dtype=int)

# Exploring Explainability: Fidelity Metrics Comparison for Text vs. Text and Graph vs. Graph Representations Using Cosine, wasserstein_distance, and Combined Approaches

This script evaluates the robustness of a knowledge graph system by perturbing the graph and measuring the
impact on the generated responses. It iteratively removes random parts of the graph, calculates similarity
metrics (Wasserstein distance and cosine similarity) between the original and perturbed response embeddings,
and stores the perturbation details for analysis. Additionally, it computes cosine distances and kernel-based
weights for further analysis of the perturbations' impact.

Key Steps:
1. Generate perturbed versions of the knowledge graph by randomly removing parts.
2. Create embeddings for responses from the perturbed graphs.
3. Calculate similarity metrics between original and perturbed responses.
4. Compute kernel-based weights using cosine distances for downstream tasks.

In [None]:
similarities_wd = []
similarities_cosine = []
perturbations_vect2 = []
perturbation_texts = []  # Store the perturbation texts

N_PERTURBATIONS = 20  # Number of perturbed graphs to sample
PERTURBATION_SEED = 0  # Fixed seed so the same parts are removed on every re-run
random.seed(PERTURBATION_SEED)

# One deterministic LLM serves every perturbation; it is created once here
# rather than on each loop iteration.
perturbation_llm = OpenAI(temperature=0)

# Loop for perturbations
for i in range(N_PERTURBATIONS):
    # Make a copy of the original vector for each iteration
    perturbation_vector = original.copy().flatten()

    # Randomly choose one or more parts to remove
    num_parts_to_remove = random.randint(1, len(part_names))
    parts_to_remove_indices = random.sample(range(len(part_names)), num_parts_to_remove)

    # Set the selected parts to 0 in the perturbation vector
    for part_idx in parts_to_remove_indices:
        perturbation_vector[part_idx] = 0

    # Append the perturbation vector to perturbations_vect2
    perturbations_vect2.append(perturbation_vector)

    # Perturb the KG by removing the selected parts
    parts_to_remove = [part_names[idx] for idx in parts_to_remove_indices]
    perturbed_kg = perturb_kg_by_removing_parts(kg, parts_to_remove)

    # Create a temporary graph for the perturbed KG
    graph_temp = NetworkxEntityGraph()
    for (node1, relation, node2) in perturbed_kg:
        graph_temp.add_triple(KnowledgeTriple(node1, relation, node2))

    # Generate response using GraphQAChain
    chain = GraphQAChain.from_llm(perturbation_llm, graph=graph_temp, verbose=False)
    temp_response = chain.run(question)

    # Store the perturbed response text
    perturbation_texts.append(temp_response)

    # Get embedding for the perturbed response
    temp_response_embedding = get_embedding(temp_response)

    # Calculate Wasserstein distance between the original and perturbed responses
    # (the two embedding vectors are passed as the two value samples)
    similarity_wd = wasserstein_distance(original_answer_embedding, temp_response_embedding)
    similarities_wd.append(similarity_wd)

    # Calculate cosine similarity between the original and perturbed responses
    # (scipy's `cosine` is a distance, hence 1 - distance)
    similarity_cosine = 1 - cosine(original_answer_embedding, temp_response_embedding)
    similarities_cosine.append(similarity_cosine)

    print(f"Iteration {i + 1}")
    print(f"Parts removed: {parts_to_remove}")
    print(f"original_answer response: {original_answer_str}")
    print(f"Perturbed response: {temp_response}")
    print(f"Wasserstein Distance with original answer: {similarity_wd}")
    print(f"Cosine Similarity with original answer: {similarity_cosine}\n")

# Convert perturbations_vect2 to a numpy array for pairwise distance calculation
perturbations_vect2 = np.array(perturbations_vect2)

# Calculate cosine distances between perturbation vectors and the original vector
distances = sklearn.metrics.pairwise_distances(perturbations_vect2, original, metric='cosine').ravel()

# Kernel weights: perturbations closer to the original vector get weights nearer to 1
kernel_width = 0.25
weights = np.sqrt(np.exp(-(distances**2)/kernel_width**2))

# Print all similarities and weights
print(f"Wasserstein Distances: {similarities_wd}")
print(f"Cosine Similarities: {similarities_cosine}")
print(f"Weights: {weights}")

# Optionally print all perturbation texts together for a consolidated view
print("\n--- Summary of Perturbations ---")
for i, text in enumerate(perturbation_texts):
    print(f"Perturbation {i + 1}: {text}")

# BayLIME: Dual Metrics: Blending Inverse Wasserstein Distance and Cosine Similarity for Text and Graph-to-Graph using Cosine
Cosine Similarity: Measuring Alignment and Fidelity Between Textual Representations
Wasserstein Distance: Evaluating Semantic Shifts Between Textual Representations


This script computes a combined similarity metric by scaling inverse Wasserstein distances
(to normalize between 0 and 1) and adding them to cosine similarities. The result, stored
in `Similarities_`, provides a composite measure of fidelity, highlighting the alignment
between perturbed and original responses in the knowledge graph system.

In [None]:
# bayLime_sum_inv_wd_cosine
epsilon = 1e-6

# Invert each Wasserstein distance so that a smaller distance gives a larger score;
# epsilon keeps the result finite when a distance is exactly zero.
inverse_similarities_wd = [1.0 / (dist + epsilon) for dist in similarities_wd]

# Extremes of the inverse distances (also reused by later cells for the same scaling).
min_value = min(inverse_similarities_wd)
max_value = max(inverse_similarities_wd)

# Min-max scale the inverse distances to [0, 1].
# If every perturbation produced the same distance the range is zero; score them all 0.0
# instead of dividing by zero (which would yield NaN targets or a ZeroDivisionError).
value_range = max_value - min_value
scaled_similarities_wd = [
    (value - min_value) / value_range if value_range > 0 else 0.0
    for value in inverse_similarities_wd
]

# Combined target: scaled inverse Wasserstein distance plus cosine similarity, per perturbation.
Similarities_ = [wd + cos for wd, cos in zip(scaled_similarities_wd, similarities_cosine)]
print(Similarities_)

This code trains a Bayesian Ridge regression model to learn the relationship between the perturbation vectors
(`perturbations_vect2`) and the combined similarity metric (`Similarities_`). The fit uses sample weights
(`weights`), so perturbations whose vectors are closer (in cosine distance) to the original vector count more.
After training, the model's coefficients (`coeff`) are extracted; each coefficient represents the importance of
the corresponding feature in predicting the similarity metric.

In [None]:
# Fit a Bayesian Ridge surrogate: perturbation vectors -> combined similarity score,
# weighted per perturbation. fit() returns the estimator itself, so the call is chained.
simpler_model = BayesianRidge().fit(
    X=perturbations_vect2,
    y=Similarities_,
    sample_weight=weights,
)

# One coefficient per feature of the perturbation vectors; shown as the cell output.
coeff = simpler_model.coef_
coeff

This code defines the importance coefficients (`coeff`) for different parts of the knowledge graph
and visualizes their explainability using the `plot_knowledge_graph_explainability` function. The
coefficients indicate the contribution of each part to the overall fidelity, and the visualization
highlights these contributions through node and edge color mappings in the graph.

In [None]:
# Take the coefficients from the surrogate fitted above rather than from a pasted snapshot of
# an earlier run: a re-run can produce a different perturbation sample, so hard-coded values go
# stale and would make this plot (and the fidelity metrics that also read `coeff`) disagree
# with `simpler_model`.
coeff = simpler_model.coef_

# Visualise the knowledge graph using the per-part coefficients.
plot_knowledge_graph_explainability(kg, part_indices, coeff)

This code calculates fidelity metrics to evaluate the performance of the Bayesian Ridge regression model.
The true similarity values (`y_true`) and predicted values (`y_pred`) are passed to the
`calculate_fidelity_metrics` function, along with sample weights (`weights`) and model coefficients (`coeff`).
The function outputs metrics like MSE, R², weighted R², and other advanced measures for a comprehensive
assessment of the model's fidelity.

In [None]:
# Targets the surrogate was fitted on, and its predictions for the same perturbation
# vectors, both flattened to 1-D.
y_true = np.ravel(Similarities_)
y_pred = np.ravel(simpler_model.predict(perturbations_vect2))

# Report the fidelity metrics for this surrogate.
calculate_fidelity_metrics(y_true, y_pred, weights, coeff)

This code generates a scatter plot to visually compare the true similarity values (`y_true`) against the
predicted values (`y_pred`). The function `plot_actual_vs_predicted` scales point sizes based on sample
weights (`weights`) and includes a reference line for perfect predictions (y = x). It also displays the
R² score to quantify the model's predictive accuracy.

In [None]:
# Plot the surrogate's predictions (y_pred) against the target values (y_true) computed in the
# cell above; `weights` (the per-perturbation sample weights) is passed through to the helper.
plot_actual_vs_predicted(y_true, y_pred, weights)

# BayLIME: Analyzing Fidelity: Comparing Text and Graph Representations Using Cosine Similarity.

In [None]:
# Fit a Bayesian Ridge surrogate: perturbation vectors -> cosine similarity of the responses,
# weighted per perturbation. fit() returns the estimator itself, so the call is chained.
simpler_model = BayesianRidge().fit(
    X=perturbations_vect2,
    y=similarities_cosine,
    sample_weight=weights,
)

# One coefficient per feature of the perturbation vectors; shown as the cell output.
coeff = simpler_model.coef_
coeff

In [None]:
# Take the coefficients from the surrogate fitted above rather than from a pasted snapshot of
# an earlier run: a re-run can produce a different perturbation sample, so hard-coded values go
# stale and would make this plot (and the fidelity metrics that also read `coeff`) disagree
# with `simpler_model`.
coeff = simpler_model.coef_

# Visualise the knowledge graph using the per-part coefficients.
plot_knowledge_graph_explainability(kg, part_indices, coeff)

In [None]:
# Cosine-similarity targets and the surrogate's predictions for the same perturbation
# vectors, both flattened to 1-D.
y_true = np.ravel(similarities_cosine)
y_pred = np.ravel(simpler_model.predict(perturbations_vect2))

# Report the fidelity metrics for this surrogate.
calculate_fidelity_metrics(y_true, y_pred, weights, coeff)

In [None]:
# Plot the surrogate's predictions (y_pred) against the cosine-similarity targets (y_true)
# from the cell above; `weights` is passed through to the plotting helper.
plot_actual_vs_predicted(y_true, y_pred, weights)

# BayLIME: Fidelity Analysis of Text-to-Text using Inverse Wasserstein Distance and Graph-to-Graph using Cosine


In [None]:
# Min-max scale the inverse Wasserstein distances to [0, 1] using the extremes computed earlier.
# A zero range (all distances identical) would make this a division by zero, so in that case
# every perturbation is scored 0.0.
inverse_wd_range = max_value - min_value
Similarities_ = [
    (value - min_value) / inverse_wd_range if inverse_wd_range > 0 else 0.0
    for value in inverse_similarities_wd
]
print(Similarities_)

In [None]:
# Fit a Bayesian Ridge surrogate: perturbation vectors -> scaled inverse Wasserstein score,
# weighted per perturbation. fit() returns the estimator itself, so the call is chained.
simpler_model = BayesianRidge().fit(
    X=perturbations_vect2,
    y=Similarities_,
    sample_weight=weights,
)

# One coefficient per feature of the perturbation vectors; shown as the cell output.
coeff = simpler_model.coef_
coeff

In [None]:
# Take the coefficients from the surrogate fitted above rather than from a pasted snapshot of
# an earlier run: a re-run can produce a different perturbation sample, so hard-coded values go
# stale and would make this plot (and the fidelity metrics that also read `coeff`) disagree
# with `simpler_model`.
coeff = simpler_model.coef_

# Visualise the knowledge graph using the per-part coefficients.
plot_knowledge_graph_explainability(kg, part_indices, coeff)


In [None]:
# Scaled inverse-Wasserstein targets and the surrogate's predictions for the same
# perturbation vectors, both flattened to 1-D.
y_true = np.ravel(Similarities_)
y_pred = np.ravel(simpler_model.predict(perturbations_vect2))

# Report the fidelity metrics for this surrogate.
calculate_fidelity_metrics(y_true, y_pred, weights, coeff)

In [None]:
# Plot the surrogate's predictions (y_pred) against the scaled inverse-Wasserstein targets
# (y_true) from the cell above; `weights` is passed through to the plotting helper.
plot_actual_vs_predicted(y_true, y_pred, weights)

# Linear: Analyzing Fidelity: Comparing Text and Graph Representations Using Cosine Similarity.

In [None]:
# Fit an ordinary least-squares surrogate: perturbation vectors -> cosine similarity of the
# responses, weighted per perturbation. fit() returns the estimator itself, so the call is chained.
simpler_model = LinearRegression().fit(
    X=perturbations_vect2,
    y=similarities_cosine,
    sample_weight=weights,
)

# One coefficient per feature of the perturbation vectors; shown as the cell output.
coeff = simpler_model.coef_
coeff

In [None]:
# Take the coefficients from the surrogate fitted above rather than from a pasted snapshot of
# an earlier run: a re-run can produce a different perturbation sample, so hard-coded values go
# stale and would make this plot (and the fidelity metrics that also read `coeff`) disagree
# with `simpler_model`.
coeff = simpler_model.coef_

# Visualise the knowledge graph using the per-part coefficients.
plot_knowledge_graph_explainability(kg, part_indices, coeff)


In [None]:
# Cosine-similarity targets and the linear surrogate's predictions for the same
# perturbation vectors, both flattened to 1-D.
y_true = np.ravel(similarities_cosine)
y_pred = np.ravel(simpler_model.predict(perturbations_vect2))

# Report the fidelity metrics for this surrogate.
calculate_fidelity_metrics(y_true, y_pred, weights, coeff)

In [None]:
# Plot the linear surrogate's predictions (y_pred) against the cosine-similarity targets
# (y_true) from the cell above; `weights` is passed through to the plotting helper.
plot_actual_vs_predicted(y_true, y_pred, weights)

# Linear: Fidelity Analysis of Text-to-Text using Inverse Wasserstein Distance and Graph-to-Graph using Cosine

In [None]:
# Min-max scale the inverse Wasserstein distances to [0, 1] using the extremes computed earlier.
# A zero range (all distances identical) would make this a division by zero, so in that case
# every perturbation is scored 0.0.
inverse_wd_range = max_value - min_value
Similarities_ = [
    (value - min_value) / inverse_wd_range if inverse_wd_range > 0 else 0.0
    for value in inverse_similarities_wd
]
print(Similarities_)

In [None]:
# Fit an ordinary least-squares surrogate: perturbation vectors -> scaled inverse Wasserstein
# score, weighted per perturbation. fit() returns the estimator itself, so the call is chained.
simpler_model = LinearRegression().fit(
    X=perturbations_vect2,
    y=Similarities_,
    sample_weight=weights,
)

# One coefficient per feature of the perturbation vectors; shown as the cell output.
coeff = simpler_model.coef_
coeff

In [None]:
# Take the coefficients from the surrogate fitted above rather than from a pasted snapshot of
# an earlier run: a re-run can produce a different perturbation sample, so hard-coded values go
# stale and would make this plot (and the fidelity metrics that also read `coeff`) disagree
# with `simpler_model`.
coeff = simpler_model.coef_

# Visualise the knowledge graph using the per-part coefficients.
plot_knowledge_graph_explainability(kg, part_indices, coeff)


In [None]:
# Scaled inverse-Wasserstein targets and the linear surrogate's predictions for the same
# perturbation vectors, both flattened to 1-D.
y_true = np.ravel(Similarities_)
y_pred = np.ravel(simpler_model.predict(perturbations_vect2))

# Report the fidelity metrics for this surrogate.
calculate_fidelity_metrics(y_true, y_pred, weights, coeff)

In [None]:
# Plot the linear surrogate's predictions (y_pred) against the scaled inverse-Wasserstein
# targets (y_true) from the cell above; `weights` is passed through to the plotting helper.
plot_actual_vs_predicted(y_true, y_pred, weights)

# Linear: Dual Metrics — Blending Inverse Wasserstein Distance and Cosine Similarity for Text, and Graph-to-Graph Comparison Using Cosine

- Cosine Similarity: Measuring Alignment and Fidelity Between Textual Representations
- Wasserstein Distance: Evaluating Semantic Shifts Between Textual Representations


In [None]:
# Per perturbation: scaled inverse Wasserstein score (from the earlier scaling cell)
# plus the cosine similarity of the responses.
Similarities_ = [
    scaled_wd + cosine_sim
    for scaled_wd, cosine_sim in zip(scaled_similarities_wd, similarities_cosine)
]
# Show the combined targets.
print(Similarities_)

In [None]:
# Fit an ordinary least-squares surrogate: perturbation vectors -> combined similarity score,
# weighted per perturbation. fit() returns the estimator itself, so the call is chained.
simpler_model = LinearRegression().fit(
    X=perturbations_vect2,
    y=Similarities_,
    sample_weight=weights,
)

# One coefficient per feature of the perturbation vectors; shown as the cell output.
coeff = simpler_model.coef_
coeff

In [None]:
# Take the coefficients from the surrogate fitted above rather than from a pasted snapshot of
# an earlier run: a re-run can produce a different perturbation sample, so hard-coded values go
# stale and would make this plot (and the fidelity metrics that also read `coeff`) disagree
# with `simpler_model`.
coeff = simpler_model.coef_

# Visualise the knowledge graph using the per-part coefficients.
plot_knowledge_graph_explainability(kg, part_indices, coeff)


In [None]:
# Combined-score targets and the linear surrogate's predictions for the same
# perturbation vectors, both flattened to 1-D.
y_true = np.ravel(Similarities_)
y_pred = np.ravel(simpler_model.predict(perturbations_vect2))

# Report the fidelity metrics for this surrogate.
calculate_fidelity_metrics(y_true, y_pred, weights, coeff)

In [None]:
# Plot the linear surrogate's predictions (y_pred) against the combined-score targets
# (y_true) from the cell above; `weights` is passed through to the plotting helper.
plot_actual_vs_predicted(y_true, y_pred, weights)

# Linear: Fidelity Analysis of Text-to-Text using Wasserstein Distance and Graph-to-Graph using Cosine

In [None]:
# Extremes of the raw Wasserstein distances (reused by a later cell for the same scaling).
min_value_wd = min(similarities_wd)
max_value_wd = max(similarities_wd)

# Min-max scale the raw distances to [0, 1].
# A zero range (all distances identical) would make this a division by zero, so in that case
# every perturbation is scored 0.0.
wd_range = max_value_wd - min_value_wd
Similarities_ = [
    (value - min_value_wd) / wd_range if wd_range > 0 else 0.0
    for value in similarities_wd
]
print(Similarities_)

In [None]:
# Fit an ordinary least-squares surrogate: perturbation vectors -> scaled Wasserstein distance,
# weighted per perturbation. fit() returns the estimator itself, so the call is chained.
simpler_model = LinearRegression().fit(
    X=perturbations_vect2,
    y=Similarities_,
    sample_weight=weights,
)

# One coefficient per feature of the perturbation vectors; shown as the cell output.
coeff = simpler_model.coef_
coeff

In [None]:
# Take the coefficients from the surrogate fitted above rather than from a pasted snapshot of
# an earlier run: a re-run can produce a different perturbation sample, so hard-coded values go
# stale and would make this plot (and the fidelity metrics that also read `coeff`) disagree
# with `simpler_model`.
coeff = simpler_model.coef_

# Visualise the knowledge graph using the per-part coefficients.
plot_knowledge_graph_explainability(kg, part_indices, coeff)

In [None]:
# Scaled Wasserstein-distance targets and the linear surrogate's predictions for the same
# perturbation vectors, both flattened to 1-D.
y_true = np.ravel(Similarities_)
y_pred = np.ravel(simpler_model.predict(perturbations_vect2))

# Report the fidelity metrics for this surrogate.
calculate_fidelity_metrics(y_true, y_pred, weights, coeff)

In [None]:
# Plot the linear surrogate's predictions (y_pred) against the scaled Wasserstein-distance
# targets (y_true) from the cell above; `weights` is passed through to the plotting helper.
plot_actual_vs_predicted(y_true, y_pred, weights)

# BayLIME: Fidelity Analysis of Text-to-Text using Wasserstein Distance and Graph-to-Graph using Cosine

In [None]:
# Min-max scale the raw Wasserstein distances to [0, 1] using the extremes computed earlier.
# A zero range (all distances identical) would make this a division by zero, so in that case
# every perturbation is scored 0.0.
wd_range = max_value_wd - min_value_wd
Similarities_ = [
    (value - min_value_wd) / wd_range if wd_range > 0 else 0.0
    for value in similarities_wd
]

In [None]:
# Fit a Bayesian Ridge surrogate: perturbation vectors -> scaled Wasserstein distance,
# weighted per perturbation. fit() returns the estimator itself, so the call is chained.
simpler_model = BayesianRidge().fit(
    X=perturbations_vect2,
    y=Similarities_,
    sample_weight=weights,
)

# One coefficient per feature of the perturbation vectors; shown as the cell output.
coeff = simpler_model.coef_
coeff

In [None]:
# Take the coefficients from the surrogate fitted above rather than from a pasted snapshot of
# an earlier run: a re-run can produce a different perturbation sample, so hard-coded values go
# stale and would make this plot (and the fidelity metrics that also read `coeff`) disagree
# with `simpler_model`.
coeff = simpler_model.coef_

# Visualise the knowledge graph using the per-part coefficients.
plot_knowledge_graph_explainability(kg, part_indices, coeff)


In [None]:
# Scaled Wasserstein-distance targets and the surrogate's predictions for the same
# perturbation vectors, both flattened to 1-D.
y_true = np.ravel(Similarities_)
y_pred = np.ravel(simpler_model.predict(perturbations_vect2))

# Report the fidelity metrics for this surrogate.
calculate_fidelity_metrics(y_true, y_pred, weights, coeff)

In [None]:
# Plot the surrogate's predictions (y_pred) against the scaled Wasserstein-distance targets
# (y_true) from the cell above; `weights` is passed through to the plotting helper.
plot_actual_vs_predicted(y_true, y_pred, weights)

# BayLIME: Hybrid Text Metrics (Wasserstein Distance + Cosine) vs. Graph Metrics (Cosine) Fidelity Analysis.

In [None]:
# Per perturbation: raw (unscaled, non-inverted) Wasserstein distance plus cosine similarity.
# NOTE(review): this adds a distance (smaller = closer) to a similarity (larger = closer) and
# applies no scaling, unlike the other sections -- confirm that this is the intended hybrid metric.
Similarities = [
    raw_wd + cosine_sim
    for raw_wd, cosine_sim in zip(similarities_wd, similarities_cosine)
]
# Show the combined targets.
print(Similarities)

In [None]:
# Fit a Bayesian Ridge surrogate: perturbation vectors -> hybrid (Wasserstein + cosine) score,
# weighted per perturbation. fit() returns the estimator itself, so the call is chained.
simpler_model = BayesianRidge().fit(
    X=perturbations_vect2,
    y=Similarities,
    sample_weight=weights,
)

# One coefficient per feature of the perturbation vectors; shown as the cell output.
coeff = simpler_model.coef_
coeff

In [None]:
# Take the coefficients from the surrogate fitted above rather than from a pasted snapshot of
# an earlier run: a re-run can produce a different perturbation sample, so hard-coded values go
# stale and would make this plot (and the fidelity metrics that also read `coeff`) disagree
# with `simpler_model`.
coeff = simpler_model.coef_

# Visualise the knowledge graph using the per-part coefficients.
plot_knowledge_graph_explainability(kg, part_indices, coeff)


In [None]:
# Hybrid-score targets and the surrogate's predictions for the same perturbation
# vectors, both flattened to 1-D.
y_true = np.ravel(Similarities)
y_pred = np.ravel(simpler_model.predict(perturbations_vect2))

# Report the fidelity metrics for this surrogate.
calculate_fidelity_metrics(y_true, y_pred, weights, coeff)

In [None]:
# Plot the surrogate's predictions (y_pred) against the hybrid-score targets (y_true)
# from the cell above; `weights` is passed through to the plotting helper.
plot_actual_vs_predicted(y_true, y_pred, weights)

# Linear: Hybrid Text Metrics (Wasserstein Distance + Cosine) vs. Graph Metrics (Cosine) Fidelity Analysis.

In [None]:
# Per perturbation: raw (unscaled) Wasserstein distance plus cosine similarity.
# NOTE(review): this adds a distance (smaller = closer) to a similarity (larger = closer) and
# applies no scaling, unlike the other sections -- confirm that this is the intended hybrid metric.
Similarities_ = [
    raw_wd + cosine_sim
    for raw_wd, cosine_sim in zip(similarities_wd, similarities_cosine)
]
# Show the combined targets.
print(Similarities_)

In [None]:
# Fit an ordinary least-squares surrogate: perturbation vectors -> hybrid (Wasserstein + cosine)
# score, weighted per perturbation. fit() returns the estimator itself, so the call is chained.
simpler_model = LinearRegression().fit(
    X=perturbations_vect2,
    y=Similarities_,
    sample_weight=weights,
)

# One coefficient per feature of the perturbation vectors; shown as the cell output.
coeff = simpler_model.coef_
coeff

In [None]:
# Take the coefficients from the surrogate fitted above rather than from a pasted snapshot of
# an earlier run: a re-run can produce a different perturbation sample, so hard-coded values go
# stale and would make this plot (and the fidelity metrics that also read `coeff`) disagree
# with `simpler_model`.
coeff = simpler_model.coef_

# Visualise the knowledge graph using the per-part coefficients.
plot_knowledge_graph_explainability(kg, part_indices, coeff)


In [None]:
# Hybrid-score targets and the linear surrogate's predictions for the same perturbation
# vectors, both flattened to 1-D.
y_true = np.ravel(Similarities_)
y_pred = np.ravel(simpler_model.predict(perturbations_vect2))

# Report the fidelity metrics for this surrogate.
calculate_fidelity_metrics(y_true, y_pred, weights, coeff)

In [None]:
# Plot the linear surrogate's predictions (y_pred) against the hybrid-score targets (y_true)
# from the cell above; `weights` is passed through to the plotting helper.
plot_actual_vs_predicted(y_true, y_pred, weights)

# Number of Perturbations
Evaluating Robustness Across Perturbations: This analysis explores the impact of varying the number of perturbations on the fidelity of both text and graph representations. By systematically introducing perturbations, we assess how sensitive the representations are to changes and identify key patterns in their robustness. This approach provides valuable insights into the stability of explainability metrics and highlights the thresholds where fidelity begins to degrade, offering a comprehensive understanding of the model's behavior under varying conditions.

In [None]:
# Define the iteration counts
iteration_counts = [10, 30, 60, 120]
coefficients_list=[]
for iteration_count in iteration_counts:
    print(f"\nRunning for {iteration_count} iterations...\n")

    similarities_wd = []
    similarities_cosine = []
    perturbations_vect2 = []
    perturbation_texts = []  # Store the perturbation texts

    # Loop for perturbations
    for i in range(iteration_count):
        # Make a copy of the original vector for each iteration
        perturbation_vector = original.copy().flatten()

        # Randomly choose one or more parts to remove
        num_parts_to_remove = random.randint(1, len(part_names))
        parts_to_remove_indices = random.sample(range(len(part_names)), num_parts_to_remove)

        # Set the selected parts to 0 in the perturbation vector
        for part_idx in parts_to_remove_indices:
            perturbation_vector[part_idx] = 0

        # Append the perturbation vector to perturbations_vect2
        perturbations_vect2.append(perturbation_vector)

        # Perturb the KG by removing the selected parts
        parts_to_remove = [part_names[idx] for idx in parts_to_remove_indices]
        perturbed_kg = perturb_kg_by_removing_parts(kg, parts_to_remove)

        # Create a temporary graph for the perturbed KG
        graph_temp = NetworkxEntityGraph()
        for (node1, relation, node2) in perturbed_kg:
            graph_temp.add_triple(KnowledgeTriple(node1, relation, node2))

        # Generate response using GraphQAChain
        chain = GraphQAChain.from_llm(OpenAI(temperature=0), graph=graph_temp, verbose=False)
        temp_response = chain.run(question)

        # Store the perturbed response text
        perturbation_texts.append(temp_response)

        # Get embedding for the perturbed response
        temp_response_embedding = get_embedding(temp_response)

        # Calculate Wasserstein distance between the original and perturbed responses
        similarity_wd = wasserstein_distance(original_answer_embedding, temp_response_embedding)
        similarities_wd.append(similarity_wd)

        # Calculate cosine similarity between the original and perturbed responses
        similarity_cosine = 1 - cosine(original_answer_embedding, temp_response_embedding)
        similarities_cosine.append(similarity_cosine)

        # print(f"Iteration {i + 1}")
        # print(f"Parts removed: {parts_to_remove}")
        # print(f"original_answer response: {original_answer_str}")
        # print(f"Perturbed response: {temp_response}")
        # print(f"Wasserstein Distance with original answer: {similarity_wd}")
        # print(f"Cosine Similarity with original answer: {similarity_cosine}\n")

    # Convert perturbations_vect2 to a numpy array for pairwise distance calculation
    perturbations_vect2 = np.array(perturbations_vect2)

    # Calculate cosine distances between perturbation vectors and the original vector
    distances = sklearn.metrics.pairwise_distances(perturbations_vect2, original, metric='cosine').ravel()

    # Assuming you may use kernel width in further computations
    kernel_width = 0.25
    weights = np.sqrt(np.exp(-(distances**2) / kernel_width**2))

    # Print all similarities and weights
    # print(f"\nResults for {iteration_count} Iterations")
    # print(f"Wasserstein Distances: {similarities_wd}")
    # print(f"Cosine Similarities: {similarities_cosine}")
    # print(f"Weights: {weights}")

    # Optionally print all perturbation texts together for a consolidated view
    print("\n--- Summary of Perturbations ---")
    for i, text in enumerate(perturbation_texts):
        print(f"Perturbation {i + 1}: {text}")
    epsilon = 1e-6

    # Calculate the inverse of each Wasserstein distance, adding a small epsilon to avoid division by zero
    inverse_similarities_wd = [1.0 / (dist + epsilon) for dist in similarities_wd]

    # Find the minimum and maximum of the inverse Wasserstein distances
    min_value = min(inverse_similarities_wd)
    max_value = max(inverse_similarities_wd)

    Similarities_ = [(value - min_value) / (max_value - min_value) for value in inverse_similarities_wd]

    simpler_model = LinearRegression()
    simpler_model.fit(X=perturbations_vect2, y=Similarities_, sample_weight=weights)

    # Get coefficients and store them
    coeff = simpler_model.coef_
    coefficients_list.append(coeff)

    # 1. Visualize the knowledge graph with the current coefficients
    print("Plotting knowledge graph explainability...")
    plot_knowledge_graph_explainability(kg, part_indices, coeff)

    # 2. Calculate fidelity metrics
    print("Calculating fidelity metrics...")

    y_true = np.array(Similarities_).ravel()
    y_pred = simpler_model.predict(perturbations_vect2).ravel()

    # Call the function
    calculate_fidelity_metrics(y_true, y_pred, weights, coeff)