In [None]:
# Install the OpenAI and LangChain libraries
# - `openai`: Provides access to OpenAI's GPT models for tasks like text generation, embeddings, and completions.
# - `langchain`: A framework for building applications using large language models (LLMs).
#                Includes tools for chaining prompts, memory, and integrations like knowledge graphs.

!pip install -q openai langchain


# Install the LangChain Community library
# - `langchain-community`: community-maintained integrations and extensions for LangChain.
#   If installation fails, check that the package name and version are still current.

!pip install -q langchain-community

In [4]:
!pip install rdflib  SPARQLWrapper


This script initializes the OpenAI API client and defines a function to interact with the GPT model.
The `get_chat_response` function sends a user-provided text input to the GPT model (gpt-3.5-turbo)
and returns the model's response.


In [5]:
import os
from openai import OpenAI

# Take the API key from the environment instead of embedding it in the notebook.
# SECURITY: a literal key used to be written on this line. Treat that key as
# compromised and revoke it before sharing or committing this notebook.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass  # local import: only needed for the interactive fallback

    # Prompt without echoing, so the key never ends up in the saved notebook.
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

# No key is passed explicitly; the client is expected to read OPENAI_API_KEY.
client = OpenAI()

def get_chat_response(text):
    """
    Send `text` as a single user message to gpt-3.5-turbo and return the reply text.
    """
    user_message = {"role": "user", "content": text}
    completion = client.chat.completions.create(
        messages=[user_message],
        model="gpt-3.5-turbo",
    )
    # Only the first choice is used.
    first_choice = completion.choices[0]
    return first_choice.message.content


In [None]:
import networkx as nx  # For creating and analyzing graphs/networks.
import matplotlib.pyplot as plt  # For data visualization and plotting.
import numpy as np  # For numerical operations and array handling.
import seaborn as sns
import pandas as pd
import random  # For generating random numbers.
from langchain.llms import OpenAI
from langchain.graphs.networkx_graph import NetworkxEntityGraph, KnowledgeTriple  # Represents (subject, predicate, object) triples.
from scipy.spatial.distance import cosine  # For cosine similarity/distance between vectors.
from scipy.stats import wasserstein_distance  # For Wasserstein distance (probability distribution comparison).
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LinearRegression, BayesianRidge  # Regression models.
from sklearn.datasets import fetch_20newsgroups  # Fetch the 20 Newsgroups text dataset.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score  # Model performance metrics.
import matplotlib.colors as mcolors  # For handling and customizing colors in visualizations.
import sklearn.metrics  # For evaluation metrics like accuracy, precision, recall, etc.
import textwrap  # For wrapping text into fixed-width lines
from sklearn.metrics import roc_curve, auc
from langchain.chains import GraphQAChain  # For question answering over knowledge graphs.
from langchain.prompts import PromptTemplate  # To define templates for LLM prompts.
import os
from openai import OpenAI
from sentence_transformers import SentenceTransformer

In [None]:
!pip install -q spacy
!python -m spacy download en_core_web_sm
import spacy

If your code relied on OpenAI with `GraphIndexCreator` and that class is unavailable in the newer LangChain version, you can either use `NetworkxEntityGraph` directly to create and query a graph with an LLM, or replace it with a custom `CustomGraphIndexCreator` that integrates `NetworkxEntityGraph`.

In [None]:
from langchain.chains import GraphQAChain  # For question answering over knowledge graphs.

# Prompt Engineering
from langchain.prompts import PromptTemplate  # To define templates for LLM prompts.



This script defines a knowledge graph using a set of triples representing entities (nodes)
and their relationships (edges). The triples are retrieved from DBpedia with a SPARQL query
for software whose label mentions "cyber", "security", or "malware". Each returned concept
forms one part made of three triples (type, label, abstract), and the index range of every
part is recorded so that parts can later be removed and analysed individually.



In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON

# Point the wrapper at DBpedia's public SPARQL endpoint
sparql = SPARQLWrapper("https://dbpedia.org/sparql")

# Query: English label and abstract of dbo:Software resources whose label mentions
# "cyber", "security" or "malware" (at most 10 rows)
query = """
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?concept ?label ?abstract
WHERE {
  ?concept rdf:type dbo:Software .
  ?concept rdfs:label ?label .
  ?concept dbo:abstract ?abstract .
  FILTER (LANG(?label) = 'en' && LANG(?abstract) = 'en')
  FILTER (CONTAINS(LCASE(?label), "cyber") || CONTAINS(LCASE(?label), "security") || CONTAINS(LCASE(?label), "malware"))
}
LIMIT 10
"""

# Run the query and decode the JSON result set
sparql.setQuery(query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Flat triple list, plus a map from part name to the kg positions that part owns
kg = []
portion_indices = {}
portion_counter = 1  # parts are numbered from 1
triple_index = 0  # kg position at which the next part starts

print("\nStructured Knowledge Graph:\n")

for result in results["results"]["bindings"]:
    concept = result["concept"]["value"].rsplit("/", 1)[-1]  # last segment of the resource URI
    label = result["label"]["value"]
    abstract = result["abstract"]["value"]  # kept in full, never truncated

    # The three triples that make up this part
    part_triples = [
        (concept, "type", "Software"),
        (concept, "label", label),
        (concept, "abstract", abstract),
    ]

    # Remember which kg positions belong to this part
    start_index = triple_index
    portion_indices[f"Part {portion_counter}"] = range(start_index, start_index + len(part_triples))

    # Human-readable dump of the part, one item per line
    print(
        f"\n# Part {portion_counter}",
        f"({concept}) → (type) → (Software)",
        f"({concept}) → (label) → ({label})",
        f"({concept}) → (abstract) →",
        abstract,
        "-" * 80,
        sep="\n",
    )

    kg.extend(part_triples)

    # Advance both counters for the next part
    triple_index += len(part_triples)
    portion_counter += 1

# Show the index range recorded for every part
print("\nPortion Indices:\n")
for part, index_range in portion_indices.items():
    print(f"{part}: {index_range}")

# Persist the triples, one per paragraph, to a UTF-8 text file
with open("knowledge_graph_output.txt", "w", encoding="utf-8") as f:
    f.writelines(f"( {subj} , {pred} , {obj})\n\n" for subj, pred, obj in kg)

print("\nFinal Knowledge Graph saved as 'knowledge_graph_output.txt'.")

# Echo the complete triple list without truncation
print("\nFinal Knowledge Graph List:\n")
for triple in kg:
    subj, pred, obj = triple
    print(f"( {subj} , {pred} ,  {obj} )")


Defines a function to perturb the knowledge graph by selectively removing triples
belonging to specified parts. This allows testing the impact of missing information
on downstream tasks or analysis. The function filters out triples associated with
the indices of the parts to be removed and returns the modified knowledge graph.

In [None]:
def perturb_kg_by_removing_parts(kg, parts_to_remove, indices_by_part=None):
    """
    Perturbs the knowledge graph by removing triples from the specified parts.

    Parameters:
    - kg: The full knowledge graph triples list
    - parts_to_remove: List of part names to remove
    - indices_by_part: Optional mapping of part name -> iterable of positions in `kg`.
      When omitted, the notebook-level `part_indices` mapping is used, which keeps
      existing two-argument calls working.

    Returns:
    - perturbed_kg: A new list with the triples of the specified parts left out;
      `kg` itself is not modified.

    Raises:
    - KeyError: if a name in `parts_to_remove` is not a key of the mapping.
    """
    if indices_by_part is None:
        # Backward-compatible default: fall back to the global part map.
        indices_by_part = part_indices

    # Collect the positions of every triple that belongs to a removed part.
    indices_to_remove = set()
    for part in parts_to_remove:
        indices_to_remove.update(indices_by_part[part])

    # Keep every triple whose position was not marked for removal.
    return [triple for i, triple in enumerate(kg) if i not in indices_to_remove]


In [None]:
def calculate_fidelity_metrics(y_true, y_pred, weights, coeff):
    """
    Calculate and print various fidelity metrics for a regression model.

    Parameters:
        y_true (array-like): True values (ground truth).
        y_pred (array-like): Predicted values.
        weights (array-like): Sample weights.
        coeff (array-like): Coefficients of the regression model (used for adjusted R²).

    Returns:
        None. All metrics are printed.
    """
    # Coerce to arrays so plain Python lists work with the element-wise arithmetic below.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    weights = np.asarray(weights, dtype=float)
    residuals = y_true - y_pred

    # Calculate regression metrics
    mse = mean_squared_error(y_true, y_pred, sample_weight=weights)
    r2 = r2_score(y_true, y_pred, sample_weight=weights)
    mae = mean_absolute_error(y_true, y_pred, sample_weight=weights)

    # Mean loss (Lm): absolute difference between the two unweighted means
    mean_loss_f = np.mean(y_true)
    mean_loss_g = np.mean(y_pred)
    mean_loss = abs(mean_loss_f - mean_loss_g)

    # Mean L1 and L2 loss
    mean_l1 = np.mean(np.abs(residuals))
    mean_l2 = np.mean(residuals ** 2)

    # Weighted L1 and L2 loss (divided by the sample count, not by the weight sum)
    n = len(y_true)
    weighted_l1 = np.sum(weights * np.abs(residuals)) / n
    weighted_l2 = np.sum(weights * residuals ** 2) / n

    # Weighted R²
    f_mean = np.average(y_true, weights=weights)
    ss_tot = np.sum(weights * (y_true - f_mean) ** 2)
    ss_res = np.sum(weights * residuals ** 2)
    weighted_r2 = 1 - ss_res / ss_tot

    # Weighted adjusted R²
    p = len(coeff)
    degrees_of_freedom = n - p - 1
    if degrees_of_freedom == 0:
        # n == p + 1 would divide by zero; report the value as undefined instead of crashing.
        weighted_adj_r2 = float("nan")
    else:
        weighted_adj_r2 = 1 - (1 - weighted_r2) * (n - 1) / degrees_of_freedom

    # Print fidelity metrics
    print(100 * '-')
    print('Fidelity:')
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"R-squared (R²): {r2}")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Mean Loss (Lm): {mean_loss}")
    print(f"Mean L1 Loss: {mean_l1}")
    print(f"Mean L2 Loss: {mean_l2}")
    print(f"Weighted L1 Loss: {weighted_l1}")
    print(f"Weighted L2 Loss: {weighted_l2}")
    print(f"Weighted R-squared (R²ω): {weighted_r2}")
    print(f"Weighted Adjusted R-squared (Rˆ²ω): {weighted_adj_r2}")
    print(100 * '-')



Defines a function to query a GraphQAChain with a question and temperature setting,
returning the answer and its embedding. The function initializes the chain with a
specified graph and temperature, processes the question, and computes the embedding
for the returned answer, facilitating downstream analysis or comparison.


In [None]:
from langchain.chat_models import ChatOpenAI
def get_answer_and_embedding(question: str, temp: float, graph):
    """
    Sends a question and temperature to the GraphQAChain and returns the original answer string
    and its embedding as separate outputs.

    Args:
        question (str): The question to ask the chain.
        temp (float): The temperature setting for the OpenAI model.
        graph: The graph object for the GraphQAChain.

    Returns:
        Tuple[str, list]: The answer as a string and the embedding returned by
        `get_embedding` for that string.
    """
    # A fresh chat model is built per call so that `temp` takes effect.
    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=temp)

    # Wrap the model and the graph in a question-answering chain.
    chain = GraphQAChain.from_llm(llm, graph=graph, verbose=False)

    # Run the question through the chain and normalise the result to a string.
    original_answer_str = str(chain.run(question))

    # Embed the string form: get_embedding calls string methods on its input,
    # so the raw chain output is not passed directly.
    original_answer_embedding = get_embedding(original_answer_str)

    # Return both answer and embedding separately
    return original_answer_str, original_answer_embedding




This function computes the embedding for a given text using a specified model.
It processes the text by removing newline characters and queries the OpenAI
embeddings API to generate a vector representation, useful for similarity
comparisons and downstream tasks.

In [None]:
import random
import json
import unicodedata
import re
embedding_cache = {}
EMBEDDING_MODEL = "text-embedding-3-small"
def normalize_text(text):
    """
    Canonicalise text: newlines become spaces, outer whitespace is stripped,
    Unicode is NFKC-normalised, whitespace runs collapse to one space, and the
    result is lowercased.
    """
    flattened = text.replace("\n", " ").strip()
    canonical = unicodedata.normalize("NFKC", flattened)
    # Collapse whitespace only after normalisation, then lowercase.
    return re.sub(r"\s+", " ", canonical).lower()

def get_embedding(text):
    """Return the embedding of `text`, reusing a cached vector when the normalised text was seen before."""
    cache_key = normalize_text(text)
    try:
        return embedding_cache[cache_key]  # cache hit
    except KeyError:
        pass
    # Cache miss: request the embedding and remember it under the normalised text.
    response = client.embeddings.create(input=[cache_key], model=EMBEDDING_MODEL)
    vector = response.data[0].embedding
    embedding_cache[cache_key] = vector
    return vector

These functions visualize the explainability of a knowledge graph by drawing the graph with nodes and
edges colored according to the importance coefficient of the part they belong to. They build a directed
graph, wrap node labels for readability, adjust node sizes based on connectivity, and apply a
custom colormap to represent the significance of graph components. The visualization is presented in a
single panel that highlights the explainability features derived from
Simple SMILE GraphRAG analysis. A color bar provides a reference for importance coefficients.


In [None]:
def wrap_label(label, width=15):
    """Break a label into lines of at most `width` characters, joined by newlines."""
    return textwrap.fill(label, width)

def _abstract_placeholder(node1, relation, node2, max_words=8):
    """Return `node2`, or '<node1>_explanation' when it is an abstract of more than `max_words` words."""
    if relation == "abstract" and len(node2.split()) > max_words:
        return f"{node1}_explanation"
    return node2


def wrap_text(node1, relation, node2, max_words=8):
    """
    Return the display text for the object of a triple.

    Long abstracts (relation == "abstract" and more than `max_words` words) are
    replaced by '<node1>_explanation'; every other object is returned unchanged.
    """
    return _abstract_placeholder(node1, relation, node2, max_words)


def _build_explainability_graph(kg, coeff, part_indices, cmap, norm):
    """Build the directed graph and derive layout, node sizes and per-part node/edge colours."""
    G = nx.DiGraph()
    for triple_idx, (node1, relation, node2) in enumerate(kg):
        source = wrap_label(node1)
        target = wrap_label(_abstract_placeholder(node1, relation, node2))
        # Remember which triple produced the edge so the edge can be coloured by part.
        G.add_edge(source, target, label=wrap_label(relation), triple_index=triple_idx)

    pos = nx.spring_layout(G, k=8, iterations=200, seed=0)
    node_sizes = [1500 + 100 * G.degree(node) for node in G.nodes()]

    # Walk the parts once; the first part that claims a node label or a triple wins.
    node_color_by_label = {}
    edge_color_by_triple = {}
    for part_name, indices in part_indices.items():
        part_idx = int(part_name.split()[-1]) - 1  # "Part 3" -> coefficient position 2
        color = cmap(norm(coeff[part_idx]))
        for i in indices:
            edge_color_by_triple.setdefault(i, color)
            if i >= len(kg):
                continue  # the part map may reference triples that are not in this kg
            subject, relation, obj = kg[i][0], kg[i][1], kg[i][2]
            node_color_by_label.setdefault(wrap_label(subject), color)
            node_color_by_label.setdefault(wrap_label(_abstract_placeholder(subject, relation, obj)), color)

    node_colors = [node_color_by_label.get(wrap_label(node), '#8da0cb') for node in G.nodes()]

    # Colour edges by the triple they came from, not by their position in G.edges(),
    # which need not follow the order of `kg`.
    edge_colors = [
        edge_color_by_triple.get(data["triple_index"], 'gray')
        for _, _, data in G.edges(data=True)
    ]

    return G, pos, node_sizes, node_colors, edge_colors


def build_graph(kg, coeff, part_indices, cmap, norm):
    """Helper function to build graph, assign colors, and sizes."""
    return _build_explainability_graph(kg, coeff, part_indices, cmap, norm)


def plot_knowledge_graph_explainability(kg, part_indices, coeff):
    """
    Improved visualization of a knowledge graph with explainability features.

    Parameters:
        kg (list): Knowledge graph triplets (node1, relation, node2).
        part_indices (dict): Mapping of part names ("Part N") to indices into `kg`.
        coeff (list): Importance coefficients for each part; "Part N" uses coeff[N - 1].

    Side effects:
        Saves the figure as 'knowledge_graph_explainability.png' and shows it.
    """
    cmap = mcolors.LinearSegmentedColormap.from_list('red_blue', ['blue', '#d3d3d3', 'red'])
    norm = mcolors.Normalize(vmin=-1, vmax=1)

    # Use the private builder: a later cell of this notebook rebinds the names
    # `build_graph` and `wrap_text` to functions with different signatures.
    G, pos, node_sizes, node_colors, edge_colors = _build_explainability_graph(kg, coeff, part_indices, cmap, norm)
    fig, ax = plt.subplots(figsize=(10, 10), dpi=100)

    nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=node_sizes, ax=ax)
    nx.draw_networkx_edges(G, pos, edge_color=edge_colors, width=1.5, ax=ax)
    nx.draw_networkx_labels(G, pos, font_size=6, ax=ax)
    edge_labels = nx.get_edge_attributes(G, 'label')
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=6, ax=ax)

    ax.set_title("Knowledge Graph Explainability", fontsize=14)
    ax.axis('off')

    # Colour bar mapping the [-1, 1] coefficient range to the colormap.
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
    sm.set_array([])
    fig.colorbar(sm, ax=ax, orientation='horizontal', label='Importance Coefficients', fraction=0.03, pad=0.05)

    plt.savefig('knowledge_graph_explainability.png', bbox_inches='tight')
    plt.show()


In [None]:
def plot_actual_vs_predicted(y_true, y_pred, weights=None):
    """
    Plot actual vs. predicted values with an optional weight normalization for point sizes.
    Displays the R² score and a perfect prediction line (y = x).

    Parameters:
        y_true (array-like): True values (ground truth).
        y_pred (array-like): Predicted values.
        weights (array-like, optional): Weights for scaling point sizes in the scatter plot;
            also passed to the R² computation as sample weights.
    """
    # Coerce to arrays so plain lists support .min()/.max() below.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Calculate the R² score
    r2 = r2_score(y_true, y_pred, sample_weight=weights)

    # Scale the weights so the largest point has size 100.
    point_sizes = 50  # uniform size when no usable weights are given
    if weights is not None:
        weight_array = np.asarray(weights, dtype=float)
        largest = np.max(weight_array)
        if largest > 0:  # avoid dividing by zero when every weight is 0
            point_sizes = weight_array / largest * 100

    # Plotting
    fig, ax = plt.subplots()
    ax.scatter(y_true, y_pred, s=point_sizes, label='Data points', alpha=0.6)

    # Determine the range for the perfect prediction line
    min_val = min(y_true.min(), y_pred.min())
    max_val = max(y_true.max(), y_pred.max())

    # Plotting the Perfect Prediction Line (y = x)
    ax.plot([min_val, max_val], [min_val, max_val], 'k--', lw=2, label='Perfect Prediction Line')

    # Set the plot limits to better frame the data
    ax.set_xlim([min_val, max_val])
    ax.set_ylim([min_val, max_val])

    # Labeling the axes
    ax.set_xlabel('Actual')
    ax.set_ylabel('Predicted')

    # Title with R² score rounded to two decimal places
    ax.set_title(f'Actual vs. Predicted Values\nR²: {r2:.2f}')

    # Show legend
    ax.legend()

    # Show plot
    plt.show()


Defines the question used to query the GraphQAChain or knowledge retrieval system.
Here, the question seeks information about
a framework that integrates external knowledge bases to improve the accuracy and reliability
of AI-generated responses.

In [None]:
# Ten parts of three consecutive triples each: "Part k" covers kg positions 3*(k-1) .. 3*k - 1
part_indices = {f"Part {k}": range(3 * (k - 1), 3 * k) for k in range(1, 11)}
part_names = [*part_indices]

# Entity graph that the QA chain will query
graph = NetworkxEntityGraph()

# Load every (subject, predicate, object) triple of the knowledge graph into it
for subject, predicate, obj in kg:
    graph.add_triple(KnowledgeTriple(subject, predicate, obj))

In [None]:
# Baseline presence vector: a 1 for each of the ten parts, stored as a single row of shape (1, 10)
original = np.ones((1, 10), dtype=int)

In [None]:
question = "Analyze the cybersecurity risks posed by malware such as Mirai and MS Antivirus in comparison to legitimate security software like Network Security Services (NSS). Consider the historical impact, attack vectors, mitigation strategies, and long-term implications."


This snippet sets the temperature parameter to 0 for deterministic response generation and
queries the GraphQAChain with the question. The function `get_answer_and_embedding`
returns the original answer as a string along with its embedding. The answer is then printed for review.

In [None]:
# NOTE(review): `llm` is not read again in this cell, and get_answer_and_embedding
# builds its own model from `temp` — confirm whether this line is still needed.
llm = OpenAI()  # Initialize the OpenAI client without temperature
temp = 0  # temperature forwarded to get_answer_and_embedding

# Ask the graph QA chain; the call returns the answer text and its embedding together.
original_answer_str, original_answer_embedding = get_answer_and_embedding(question, temp, graph)
print(original_answer_str)
# (the embedding for the original answer was already computed by the call above)

In [None]:
def wrap_text(node, relation, max_words=5):
    """Shorten a node of more than `max_words` words to '<first word>_explanation'.

    Shorter nodes are returned unchanged. `relation` is accepted but not used.
    """
    tokens = node.split()
    return node if len(tokens) <= max_words else f"{tokens[0]}_explanation"

_SPACY_PIPELINES = {}  # model name -> loaded spaCy pipeline, filled on first use


def _load_spacy_pipeline(model_name="en_core_web_sm"):
    """Load a spaCy pipeline once and hand back the same object on later calls."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def extract_entities_relations(question):
    """
    Extracts entities and key relations (verbs) from the question using spaCy.

    Returns:
        Tuple[list, list]: the text of every named entity found in the question,
        and the lemma of every token tagged as a verb.
    """
    # Reuse the cached pipeline; this function is called once per perturbation.
    nlp = _load_spacy_pipeline()
    doc = nlp(question)
    entities = [ent.text for ent in doc.ents]
    relations = [token.lemma_ for token in doc if token.pos_ == "VERB"]  # Focus on verbs as relations

    return entities, relations

def generate_chain_of_thought(kg, question):
    """
    Collect and print the triples of `kg` that mention an entity found in `question`.

    A triple is kept when an entity (case-insensitively) is a substring of its
    subject or its object; it is kept once per matching entity.

    Returns:
        Tuple[set, list]: the display labels of all nodes in the kept triples,
        and the kept triples themselves in match order.
    """
    print("\n--- Reasoning Chain ---")
    entities, _relations = extract_entities_relations(question)

    # Entity-major order: all matches of the first entity, then of the second, and so on.
    reasoning_chain = [
        (subject, predicate, obj)
        for entity in entities
        for subject, predicate, obj in ((t[0], t[1], t[2]) for t in kg)
        if entity.lower() in subject.lower() or entity.lower() in obj.lower()
    ]
    relevant_nodes = {
        node
        for subject, _predicate, obj in reasoning_chain
        for node in (subject, obj)
    }

    for subject, predicate, obj in reasoning_chain:
        print(f"{subject} --[{predicate}]--> {obj}")

    highlighted = {wrap_label(wrap_text(node, '')) for node in relevant_nodes}
    return highlighted, reasoning_chain

def build_graph(kg):
    """Build a directed graph from the triples and return it with a circular layout and uniform styling."""
    G = nx.DiGraph()
    for node1, relation, node2 in kg:
        # Long node texts are shortened first, then wrapped for display.
        source = wrap_label(wrap_text(node1, relation))
        target = wrap_label(wrap_text(node2, relation))
        G.add_edge(source, target, label=wrap_label(relation))

    pos = nx.circular_layout(G, scale=3)  # nodes evenly spaced on a circle
    node_sizes = [1200 + 100 * degree for _node, degree in G.degree()]  # larger for higher degree
    node_colors = ['#8da0cb'] * G.number_of_nodes()
    edge_colors = ['black'] * G.number_of_edges()

    return G, pos, node_sizes, node_colors, edge_colors

def visualize_graph(G, pos, node_sizes, node_colors, edge_colors, highlighted_nodes, reasoning_chain):
    """
    Draw the graph at the given positions, highlighting the nodes in `highlighted_nodes`.

    `node_colors` and `reasoning_chain` are accepted but not used; node colours come
    from the highlight set. The figure is saved as
    'knowledge_graph_circular_optimized.png' and then shown.
    """
    plt.figure(figsize=(7, 7))

    # Highlighted nodes in red, all others in light grey
    highlight_node_colors = []
    for node in G.nodes():
        if node in highlighted_nodes:
            highlight_node_colors.append('#ff9999')
        else:
            highlight_node_colors.append('#d3d3d3')

    nx.draw(
        G,
        pos,
        node_size=node_sizes,
        node_color=highlight_node_colors,
        edge_color=edge_colors,
        width=1.5,
        with_labels=True,
        arrows=True,
        font_size=7,
    )

    # Label every edge with its relation
    edge_labels = dict((edge, G.edges[edge]['label']) for edge in G.edges())
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=7)

    plt.title("Circular Knowledge Graph Visualization", fontsize=12, fontweight='bold')
    plt.savefig('knowledge_graph_circular_optimized.png', bbox_inches='tight')
    plt.show()



# Question used for entity extraction and, later in this cell, for the LLM prompt

question = "Analyze the cybersecurity risks posed by malware such as Mirai and MS Antivirus in comparison to legitimate security software like Network Security Services (NSS). Consider the historical impact, attack vectors, mitigation strategies, and long-term implications."

# Collect the triples that mention an entity of the question, plus the node labels to highlight
highlighted_nodes, reasoning_chain = generate_chain_of_thought(kg, question)

# Build the circular-layout graph from the full knowledge graph
G, pos, node_sizes, node_colors, edge_colors = build_graph(kg)

# Draw the graph with the relevant nodes highlighted
visualize_graph(G, pos, node_sizes, node_colors, edge_colors, highlighted_nodes, reasoning_chain)


def format_triples_for_prompt(reasoning_chain):
    """
    Render the triples as a bulleted list followed by the notebook-level `question`,
    ready to be sent to the LLM.
    """
    header = "The following triples represent relevant knowledge about the question:\n\n"
    bullet_lines = [
        f"- {subject} --[{predicate}]--> {obj}\n"
        for subject, predicate, obj in reasoning_chain
    ]
    footer = "\nBased on the above information, answer the question:\n"
    return header + "".join(bullet_lines) + footer + question

# Turn the triples selected above into the text prompt for the LLM
structured_prompt = format_triples_for_prompt(reasoning_chain)

# Send the prompt to the chat model; a later cell treats this reply as the original answer
final_answer = get_chat_response(structured_prompt)

# # Print the LLM's response
# print("\n--- LLM Response ---\n")
# print(final_answer)


In [None]:
# Use the triple-prompt reply as the reference answer. This rebinds
# original_answer_str / original_answer_embedding, which an earlier cell set
# from get_answer_and_embedding.
original_answer = final_answer  # raw reply from get_chat_response
original_answer_str = str(final_answer)  # string form used for printing

# Show the reference answer
print(original_answer_str)
# Embedding of the reference answer; the perturbation loop compares against it
original_answer_embedding = get_embedding(original_answer)

In [None]:
# Accumulators filled by the perturbation loop below (re-created on every run of this cell)
similarities_cosine =[]  # cosine similarity of each perturbed answer to the original answer
similarities_wd=[]  # Wasserstein distance of each perturbed answer embedding to the original
generated_embeddings = []  # embedding of each perturbed answer
similarities = []  # NOTE(review): never appended to in this cell — possibly unused
perturbations_vect2 = []  # one 0/1 presence vector per perturbation
perturbation_texts = []  # LLM reply text per perturbation

def format_triples_for_prompt(reasoning_chain, question_text=None):
    """
    Formats the extracted triples into a readable structure for LLM input.

    Parameters:
        reasoning_chain: Iterable of 3-item lists/tuples (subject, relation, object).
            Entries of any other shape are skipped.
        question_text (str, optional): Question appended to the prompt. When omitted,
            the notebook-level `question` is used, so existing one-argument calls
            behave as before.

    Returns:
        str: The prompt text, or a fixed notice (without the question) when
        `reasoning_chain` is empty or None.
    """
    if not reasoning_chain:
        return "No relevant triples extracted from the knowledge graph."

    if question_text is None:
        # Resolved only here, so the empty-chain path never needs the global.
        question_text = question

    if isinstance(reasoning_chain, set):
        reasoning_chain = list(reasoning_chain)

    bullet_lines = [
        f"- {triple[0]} --[{triple[1]}]--> {triple[2]}\n"
        for triple in reasoning_chain
        if isinstance(triple, (list, tuple)) and len(triple) == 3
    ]

    return (
        "The following triples represent relevant knowledge about the question:\n\n"
        + "".join(bullet_lines)
        + "\nBased on the above information, answer the question:\n"
        + question_text
    )

# Loop for perturbations
# NOTE(review): `random` is not seeded in this cell, so the sampled perturbations differ between runs.
for i in range(20):
    perturbation_vector = original.copy().flatten()

    # Randomly choose parts to remove (at least one, possibly all)
    num_parts_to_remove = random.randint(1, len(part_names))
    parts_to_remove_indices = random.sample(range(len(part_names)), num_parts_to_remove)

    # Set selected parts to 0
    for part_idx in parts_to_remove_indices:
        perturbation_vector[part_idx] = 0

    # Perturb the KG
    parts_to_remove = [part_names[idx] for idx in parts_to_remove_indices]
    perturbed_kg = perturb_kg_by_removing_parts(kg, parts_to_remove)

    # generate_chain_of_thought returns (highlighted_nodes, reasoning_chain).
    # Only the triples belong in the prompt; passing the whole tuple made the
    # formatter skip everything.
    _, extracted_triples = generate_chain_of_thought(perturbed_kg, question)

    # Format extracted triples for LLM input
    structured_prompt = format_triples_for_prompt(extracted_triples)

    # Get response from LLM
    temp_response = get_chat_response(structured_prompt)

    # Handle None or empty response
    if not temp_response:
        temp_response = "No meaningful response generated."

    # Get embedding for the perturbed response
    try:
        temp_response_embedding = get_embedding(temp_response)
    except Exception as e:
        # Drop the whole sample: nothing has been recorded yet, so the perturbation
        # vectors stay aligned with the similarity lists used by the regression.
        print(f"Error generating embedding for response: {temp_response}")
        print(f"Exception: {e}")
        continue

    # Record the sample only after every piece of it is available
    perturbations_vect2.append(perturbation_vector)
    perturbation_texts.append(temp_response)
    generated_embeddings.append(temp_response_embedding)

    # Calculate Wasserstein distance between the original and perturbed responses
    similarity_wd = wasserstein_distance(original_answer_embedding, temp_response_embedding)
    similarities_wd.append(similarity_wd)

    # Calculate cosine similarity between the original and perturbed responses
    similarity_cosine = 1 - cosine(original_answer_embedding, temp_response_embedding)
    similarities_cosine.append(similarity_cosine)

    print(f"Iteration {i + 1}")
    print(f"Parts removed: {parts_to_remove}")
    print(f"original_answer response: {original_answer_str}")
    print(f"Perturbed response: {temp_response}")
    print(f"Wasserstein Distance with original answer: {similarity_wd}")
    print(f"Cosine Similarity with original answer: {similarity_cosine}\n")

# Stack the 0/1 perturbation vectors into a 2-D array (one row per perturbation)
perturbations_vect2 = np.array(perturbations_vect2)

# Cosine distance of every perturbation vector to the all-ones original vector
distances = sklearn.metrics.pairwise_distances(perturbations_vect2, original, metric='cosine').ravel()

# Sample weights from an exponential kernel on that distance:
# weight = sqrt(exp(-d^2 / kernel_width^2)), so closer perturbations weigh more
kernel_width = 0.25
weights = np.sqrt(np.exp(-(distances**2)/kernel_width**2))

# Print all similarities and weights
print(f"Wasserstein Distances: {similarities_wd}")
print(f"Cosine Similarities: {similarities_cosine}")
print(f"Weights: {weights}")

# Print all perturbation reply texts together for a consolidated view
print("\n--- Summary of Perturbations ---")
for i, text in enumerate(perturbation_texts):
    print(f"Perturbation {i + 1}: {text}")




In [None]:
# min_value_wd= min(similarities_wd)
# max_value_wd = max(similarities_wd)
# # Scale between 0 and 1
# Similarities_ = [(value - min_value_wd) / (max_value_wd - min_value_wd) for value in similarities_wd]
# print(Similarities_)
# simpler_model = LinearRegression()
# simpler_model.fit(X=perturbations_vect2, y=Similarities_, sample_weight=weights)
# coeff = simpler_model.coef_
# coeff

In [None]:
epsilon = 1e-6

# Turn each Wasserstein distance into a similarity by inverting it;
# epsilon keeps the division defined when a distance is exactly 0
inverse_similarities_wd = [1.0 / (dist + epsilon) for dist in similarities_wd]

# Find the minimum and maximum of the inverse Wasserstein distances
min_value = min(inverse_similarities_wd)
max_value = max(inverse_similarities_wd)

# Scale inverse Wasserstein distances between 0 and 1
value_range = max_value - min_value
if value_range == 0:
    # Every perturbation is equally far from the original: min-max scaling is
    # undefined, so use a constant target instead of dividing by zero.
    Similarities_ = [0.0 for _ in inverse_similarities_wd]
else:
    Similarities_ = [(value - min_value) / value_range for value in inverse_similarities_wd]
print(Similarities_)

# Weighted linear surrogate: one coefficient per part of the knowledge graph
simpler_model = LinearRegression()
simpler_model.fit(X=perturbations_vect2, y=Similarities_, sample_weight=weights)
coeff = simpler_model.coef_
coeff

In [None]:
plot_knowledge_graph_explainability(kg, part_indices, coeff)

In [None]:
y_true = np.array(Similarities_).ravel()
y_pred = simpler_model.predict(perturbations_vect2).ravel()

# Call the function
calculate_fidelity_metrics(y_true, y_pred, weights, coeff)

In [None]:
# Call the function
plot_actual_vs_predicted(y_true, y_pred, weights)

# Preprompt

# Simple Running of Prompt

In [None]:
question = "Which software in the dataset has been misused for malicious purposes, and what were the consequences?"
#Portion 3

In [None]:
#new
temp = 0
original_answer_str, original_answer_embedding = get_answer_and_embedding(question, temp, graph)
print(original_answer_str)
# Compute the embedding for the original answer

In [None]:
temp = 0
original_answer_str, original_answer_embedding = get_answer_and_embedding(question, temp, graph)
print(original_answer_str)
# Compute the embedding for the original answer

# PrePrompt Approach

    Generate multiple rephrasings of the question.
    Query the LLM with these variations.
    Aggregate the answers to determine a single, accurate answer based on the combined responses, without showing each answer individually.

Approach

To achieve this, we can use the following strategy:

    Cosine Similarity Aggregation: Instead of a simple voting mechanism, use an embedding-based approach where we:
        Convert each generated answer into embeddings.
        Compute cosine similarity between all the answers to find the most representative response (centroid).
    Weighted Aggregation: Use embeddings to weigh the answers and pick the answer closest to the centroid of all embeddings, making the final answer more robust and accurate.

This method provides a form of explainability since the answer chosen is the one most aligned with all variations.

Explanation

    Rephrasing: The rephrase_question_via_openai function generates different versions of the input question using OpenAI.
    Running the Questions: The run_questions_with_graph_qa function queries the LLM using each rephrased question.
    Embedding-Based Aggregation:
        Uses SentenceTransformer to generate embeddings for each answer.
        Finds the centroid (average embedding) of all answers.
        Selects the answer closest to the centroid using cosine similarity, providing a robust and representative final answer.

Requirements

Install the necessary packages:

pip install openai langchain sentence-transformers scikit-learn

Notes

    The all-MiniLM-L6-v2 model from sentence-transformers is a lightweight and efficient model for embedding generation.
    This method is more robust and explainable than simple voting or direct answer selection, as it considers the semantic similarity between answers.

In [None]:
import numpy as np
from langchain.chains import GraphQAChain
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import openai
import logging

# Initialize logging: INFO level, timestamped "<time> - <level> - <message>" records.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")


# Initialize LLM and GraphQAChain
# NOTE(review): `ChatOpenAI`, `temp` and `graph` are not imported or defined in this cell;
# they must already exist in the kernel from earlier cells — confirm on a fresh Run All.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=temp)

# Initialize the GraphQAChain with the correct LLM.
# This module-level chain is what run_questions_with_graph_qa reads when it answers questions.
graph_qa_chain = GraphQAChain.from_llm(llm, graph=graph, verbose=False)

# Sentence-embedding model used by aggregate_answers_with_embeddings.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# OpenAI client used by get_chat_response; rebinding `client` here replaces any client
# created by an earlier cell.
client = openai.Client()

def get_chat_response(text):
    """
    Send `text` as a single user message to gpt-3.5-turbo and return the reply.

    The reply is returned with surrounding whitespace removed. Any failure
    (request error, missing content, ...) is logged and reported to the
    caller as an empty string instead of an exception.
    """
    user_turn = {"role": "user", "content": text}
    try:
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[user_turn],
        )
        reply = completion.choices[0].message.content
        cleaned = reply.strip()
    except Exception as err:
        # Best-effort by design: callers treat "" as "no response".
        logging.error(f"Error during chat response: {err}")
        cleaned = ""
    return cleaned

def rephrase_question_via_openai(question, num_variants=5):
    """
    Create different rephrased versions of the input question.

    The model is asked for `num_variants` paraphrases and its reply is split
    into one candidate per non-empty line. Because chat models usually answer
    with a numbered or bulleted list, leading list markers such as "1.", "2)",
    "-", "*" or a bullet character are stripped so that the marker is not sent
    on to the graph QA chain as part of the question.

    Args:
        question: The question to paraphrase.
        num_variants: Maximum number of paraphrases to return.

    Returns:
        A list of at most `num_variants` paraphrased questions. The list is
        empty when the chat call fails (get_chat_response returns "").
    """
    import re  # local import so the cell does not depend on a notebook-level `re`

    text = f"Please rephrase the following question in {num_variants} different ways:\nQuestion: {question}"
    response_text = get_chat_response(text)

    # A marker only counts when followed by whitespace (or end of line), so a
    # question starting with e.g. "3.5 million ..." is left untouched.
    list_marker = re.compile(r"^(?:\d+[.)]|[-*\u2022])(?:\s+|$)")

    rephrased_questions = []
    for line in response_text.split("\n"):
        candidate = list_marker.sub("", line.strip()).strip()
        if candidate:
            rephrased_questions.append(candidate)
    return rephrased_questions[:num_variants]

def run_questions_with_graph_qa(questions, chain=None):
    """
    Run the list of questions through a GraphQAChain and collect answers.

    Args:
        questions: Iterable of question strings.
        chain: Optional chain object exposing `.run(question)`. When omitted,
            the notebook-level `graph_qa_chain` is used, which keeps existing
            one-argument calls working. Passing a chain makes it possible to
            query a different graph (for example a perturbed one) without
            rebinding the global.

    Returns:
        A dict mapping each question to its answer. A question whose chain
        call raises is mapped to the string "Error: <message>" and the
        failure is logged; processing continues with the next question.
    """
    # Resolve the global lazily so an explicit `chain` never requires it to exist.
    qa_chain = graph_qa_chain if chain is None else chain

    results = {}
    for question in questions:
        try:
            results[question] = qa_chain.run(question)
        except Exception as e:
            # Keep going (best-effort), but leave a trace instead of failing silently.
            logging.error(f"Error while answering {question!r}: {e}")
            results[question] = f"Error: {str(e)}"
    return results

def filter_valid_answers(answers):
    """
    Remove answers that carry no usable information.

    An answer is dropped when it:
      * contains "I do not know" / "I don't know" (matched case-insensitively,
        with a typographic apostrophe treated like a plain one),
      * starts with "Error:", the placeholder run_questions_with_graph_qa
        stores when a chain call fails, which would otherwise be embedded and
        aggregated as if it were a real answer,
      * is blank or not a string.

    Args:
        answers: Dict mapping question -> answer.

    Returns:
        A new dict with only the valid question/answer pairs, in the original order.
    """
    refusal_markers = ("i do not know", "i don't know")

    def is_valid(answer):
        if not isinstance(answer, str):
            return False
        stripped = answer.strip()
        if not stripped or stripped.startswith("Error:"):
            return False
        normalized = stripped.replace("\u2019", "'").lower()
        return not any(marker in normalized for marker in refusal_markers)

    return {q: ans for q, ans in answers.items() if is_valid(ans)}

def aggregate_answers_with_embeddings(valid_answers):
    """
    Pick the most representative answer from `valid_answers`.

    Every answer is embedded, the mean embedding (centroid) is computed, and
    the answer whose embedding has the highest cosine similarity to that
    centroid is returned. When there is nothing to aggregate the fixed string
    "No valid answers available." is returned instead.
    """
    if not valid_answers:
        return "No valid answers available."

    candidates = [*valid_answers.values()]
    vectors = embedding_model.encode(candidates)

    # Centroid as a single-row matrix so it can be compared against all rows at once.
    mean_vector = np.mean(vectors, axis=0)[np.newaxis, :]
    scores = cosine_similarity(vectors, mean_vector).ravel()

    winner = int(np.argmax(scores))
    return candidates[winner]

# Example usage: answer one question through the rephrase -> query -> filter -> aggregate pipeline.
input_question = "Which software in the dataset has been misused for malicious purposes, and what were the consequences?"


def _print_qa_section(title, qa_pairs):
    """Print a titled block of question/answer pairs.

    Kept in a function so the loop variables stay local and do not overwrite
    notebook-level names such as `question`.
    """
    print(f"\n{title}\n" + "=" * 50)
    for asked, reply in qa_pairs.items():
        print(f"Question: {asked}\nAnswer: {reply}\n{'-' * 50}")


# Generate rephrased questions
rephrased_questions = rephrase_question_via_openai(input_question)

# Run the rephrased questions through GraphQAChain
answers = run_questions_with_graph_qa(rephrased_questions)

# Filter valid answers
filtered_answers = filter_valid_answers(answers)

# Aggregate only when something survived filtering; otherwise use the fixed fallback text.
if not filtered_answers:
    final_answer = "No valid answers available."
else:
    final_answer = aggregate_answers_with_embeddings(filtered_answers)

# Save the final answer to a file.
# The encoding is explicit because LLM output may contain non-ASCII characters and the
# platform default encoding is not guaranteed to be able to represent them.
with open("final_answer.txt", "w", encoding="utf-8") as file:
    file.write(final_answer)

# Print results: all answers first (including "I don't know"), then only the valid ones.
_print_qa_section("Rephrased Questions and Their Answers:", answers)
_print_qa_section("Filtered Questions and Their Answers:", filtered_answers)

print("\nFinal Aggregated Answer:\n" + "="*50)
print(final_answer)


In [None]:
# Use the aggregated answer as the baseline that perturbed answers are compared against.
original_answer = final_answer
original_answer_str = str(original_answer)
print(original_answer_str)

# Embed the baseline answer once so every perturbation can be compared to it.
original_answer_embedding = get_embedding(original_answer)

In [None]:
similarities_cosine = []
similarities_wd = []
generated_embeddings = []
perturbations_vect2 = []
perturbation_texts = []

# run_questions_with_graph_qa answers through the notebook-level `graph_qa_chain`.
# Previously the perturbed graph (`graph_temp`) was built but never handed to any chain,
# so every "perturbed" answer was actually computed on the unperturbed graph.
# The chain is therefore rebound to the perturbed graph inside the loop and the
# baseline chain is restored afterwards, even if an iteration fails.
baseline_graph_qa_chain = graph_qa_chain

try:
    # Loop for perturbations
    for i in range(20):
        # Start from the original indicator vector and zero out a random, non-empty subset of parts.
        perturbation_vector = original.copy().flatten()
        num_parts_to_remove = random.randint(1, len(part_names))
        parts_to_remove_indices = random.sample(range(len(part_names)), num_parts_to_remove)

        for part_idx in parts_to_remove_indices:
            perturbation_vector[part_idx] = 0

        perturbations_vect2.append(perturbation_vector)
        parts_to_remove = [part_names[idx] for idx in parts_to_remove_indices]
        perturbed_kg = perturb_kg_by_removing_parts(kg, parts_to_remove)

        # Rebuild an entity graph from the remaining triples.
        graph_temp = NetworkxEntityGraph()
        for (node1, relation, node2) in perturbed_kg:
            graph_temp.add_triple(KnowledgeTriple(node1, relation, node2))

        # Point the QA chain at the perturbed graph for this iteration.
        graph_qa_chain = GraphQAChain.from_llm(llm, graph=graph_temp, verbose=False)

        rephrased_questions = rephrase_question_via_openai(input_question)
        perturbed_answers = run_questions_with_graph_qa(rephrased_questions)
        filtered_answers = filter_valid_answers(perturbed_answers)
        temp_response = aggregate_answers_with_embeddings(filtered_answers)

        # Defensive fallback so an empty response can still be embedded and reported.
        if not temp_response:
            temp_response = "No valid response generated."

        perturbation_texts.append(temp_response)
        temp_response_embedding = get_embedding(temp_response)
        generated_embeddings.append(temp_response_embedding)

        # Calculate Wasserstein distance between the original and perturbed responses
        similarity_wd = wasserstein_distance(original_answer_embedding, temp_response_embedding)
        similarities_wd.append(similarity_wd)

        # Calculate cosine similarity between the original and perturbed responses
        similarity_cosine = 1 - cosine(original_answer_embedding, temp_response_embedding)
        similarities_cosine.append(similarity_cosine)

        print(f"Iteration {i + 1}")
        print(f"Parts removed: {parts_to_remove}")
        print(f"Original response: {original_answer_str}")
        print(f"Perturbed response: {temp_response}")
        print(f"Wasserstein Distance with original answer: {similarity_wd}")
        print(f"Cosine Similarity with original answer: {similarity_cosine}\n")
finally:
    # Later cells must keep answering against the original graph.
    graph_qa_chain = baseline_graph_qa_chain

# Convert perturbations_vect2 to a numpy array for pairwise distance calculation
perturbations_vect2 = np.array(perturbations_vect2)

# Calculate cosine distances between perturbation vectors and the original vector
distances = sklearn.metrics.pairwise_distances(perturbations_vect2, original, metric='cosine').ravel()

# Turn distances into sample weights with an exponential kernel: perturbations closer to
# the original vector get weights nearer 1. The weights are consumed by the surrogate fit.
kernel_width = 0.25
weights = np.sqrt(np.exp(-(distances**2)/kernel_width**2))

# Print all similarities and weights
print(f"Wasserstein Distances: {similarities_wd}")
print(f"Cosine Similarities: {similarities_cosine}")
print(f"Weights: {weights}")

# Optionally print all perturbation texts together for a consolidated view
print("\n--- Summary of Perturbations ---")
for i, text in enumerate(perturbation_texts):
    print(f"Perturbation {i + 1}: {text}")


In [None]:
epsilon = 1e-6

# Calculate the inverse of each Wasserstein distance, adding a small epsilon to avoid division by zero
inverse_similarities_wd = [1.0 / (dist + epsilon) for dist in similarities_wd]

# Find the minimum and maximum of the inverse Wasserstein distances
min_value = min(inverse_similarities_wd)
max_value = max(inverse_similarities_wd)

# Scale inverse Wasserstein distances between 0 and 1.
# When every perturbation yields the same distance the range is zero; dividing by 1
# instead maps all values to 0.0 rather than raising ZeroDivisionError.
value_range = max_value - min_value
if value_range == 0:
    value_range = 1.0

Similarities_ = [(value - min_value) / value_range for value in inverse_similarities_wd]
print(Similarities_)

# Fit a weighted linear surrogate: perturbation vectors -> scaled similarity.
simpler_model = LinearRegression()
simpler_model.fit(X=perturbations_vect2, y=Similarities_, sample_weight=weights)
coeff = simpler_model.coef_
coeff

In [None]:
# Presumably renders the surrogate coefficients (`coeff`) on the knowledge graph `kg` — confirm against the helper.
# NOTE(review): `part_indices` is not defined in the cells visible here; it must come from an earlier cell.
plot_knowledge_graph_explainability(kg, part_indices, coeff)

In [None]:
# Flatten the targets the surrogate was fitted on and its predictions for the same perturbations.
y_true = np.ravel(np.array(Similarities_))
y_pred = np.ravel(simpler_model.predict(perturbations_vect2))

# Report how well the surrogate matches the targets, using the same sample weights as the fit.
calculate_fidelity_metrics(y_true, y_pred, weights, coeff)

In [None]:
# Plot the scaled similarities (`y_true`) against the surrogate's predictions (`y_pred`);
# `weights` are the kernel sample weights passed through to the helper.
plot_actual_vs_predicted(y_true, y_pred, weights)

In [None]:
# Temperature passed to get_answer_and_embedding below (rebinds the notebook-level `temp`).
temp = 0
question = "Which software in the dataset has been misused for malicious purposes, and what were the consequences?"
# Unpack the answer text and its embedding for `question`.
# NOTE: this overwrites `original_answer_str` / `original_answer_embedding` set by earlier cells.
original_answer_str, original_answer_embedding = get_answer_and_embedding(question, temp, graph)
print(original_answer_str)
# NOTE(review): no separate embedding step follows — the embedding was already unpacked above.