In [15]:
from falkordb import FalkorDB
from faker import Faker
import numpy as np
import random
from langchain.chat_models import ChatHuggingFace
from langchain.schema import HumanMessage, SystemMessage
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain import HuggingFaceHub


# Initialize Faker and FalkorDB
faker = Faker()
db = FalkorDB(host='localhost', port=6379)
graph = db.select_graph('Vulnerabilities_v1')

# Step 1: Generate synthetic dataset with 1000 vulnerabilities
def generate_dataset(num_records=1000):
    devices = []
    vulnerabilities = []
    severities = ['low', 'medium', 'high', 'critical']
    tags_pool = ['buffer overflow', 'encryption', 'malware', 'misconfiguration', 
                 'authentication', 'data leak', 'phishing']

    for _ in range(num_records):
        device_name = faker.hostname()
        vuln_id = faker.uuid4()
        severity = random.choice(severities)
        tags = random.sample(tags_pool, random.randint(1, 3))
        # Using tags as a representation for vector rather than random description
        description_vector = [random.uniform(0.1, 1.0) for _ in range(5)]

        devices.append({
            "name": device_name
        })

        vulnerabilities.append({
            "device": device_name,
            "id": vuln_id,
            "severity": severity,
            "tags": tags,
            "description_vector": description_vector
        })
    
    return devices, vulnerabilities

devices, vulnerabilities = generate_dataset()

# Step 2: Insert the generated dataset into FalkorDB
def insert_data(devices, vulnerabilities):
    create_queries = []
    
    for vuln in vulnerabilities:
        query = f"""
        CREATE 
            (:Device {{name: '{vuln["device"]}'}})-[:HAS_VULNERABILITY]->(
                :Vulnerability {{
                    id: '{vuln["id"]}', 
                    severity: '{vuln["severity"]}', 
                    description: vecf32({vuln["description_vector"]}), 
                    tags: {vuln["tags"]}
                }})
        """
        create_queries.append(query)
    
    for query in create_queries:
        graph.query(query)

insert_data(devices, vulnerabilities)

# Step 3: Create a vector index for vulnerabilities
graph.create_node_vector_index('Vulnerability_v1', 'description', dim=5, similarity_function='cosine')

# Step 4: Query Function to find similar vulnerabilities
def find_similar_vulnerabilities(query_text, top_k=10):
    try:
        # For simplicity, we simulate a query vector based on the tags (you could use an embedding model here)
        query_vector = [random.uniform(0.1, 1.0) for _ in range(5)]  # Replace this with real embedding logic

        # Vector search to find the most similar vulnerabilities
        vector_search_result = graph.query(f"""
            CALL db.idx.vector.queryNodes('Vulnerability', 'description', {top_k}, vecf32({query_vector}))
            YIELD node 
            RETURN node.id AS id, node.severity AS severity, node.tags AS tags
        """)

        vector_results = vector_search_result.result_set

        if not vector_results:
            print("No similar vulnerabilities found.")
        return vector_results
    except Exception as e:
        print(f"Error in vector search: {e}")
        return []

# Step 5: Knowledge Graph query for devices with high vulnerabilities
def find_devices_with_high_vulnerabilities():
    try:
        # Querying devices with critical or high vulnerabilities
        knowledge_query = """
            MATCH (d:Device)-[:HAS_VULNERABILITY]->(v:Vulnerability)
            WHERE v.severity IN ['high', 'critical']
            RETURN d.name AS device_name, v.id AS vuln_id, v.severity AS severity, v.tags AS tags
        """
        knowledge_results = graph.query(knowledge_query)
        return knowledge_results.result_set
    except Exception as e:
        print(f"Error in knowledge graph query: {e}")
        return []

# Step 6: Filter and format results
def filter_results(knowledge_results, vector_results, query_text, max_items=5):
    # Simple filtering by relevant tags (you can add more complex logic here)
    relevant_tags = [tag for tag in query_text.lower().split() if tag in ['phishing', 'malware', 'data leak', 'encryption']]

    # Filter knowledge graph results
    filtered_knowledge = [
        (result[0], result[1], result[2], result[3])
        for result in knowledge_results if any(tag in result[3] for tag in relevant_tags)
    ][:max_items]

    # Filter vector search results (this could be based on similarity score or tags)
    filtered_vector = [
        (result[0], random.uniform(0.5, 1.0))  # Simulating similarity score for demo purposes
        for result in vector_results
    ][:max_items]

    return filtered_knowledge, filtered_vector

In [23]:
# Step 7: Generate response based on query and search results
def generate_response(query_text):
    try:
        # Vector search results
        vector_results = find_similar_vulnerabilities(query_text)
        knowledge_results = find_devices_with_high_vulnerabilities()

        # Filter results based on query text (relevant tags)
        filtered_knowledge, filtered_vector = filter_results(knowledge_results, vector_results, query_text, max_items=5)
        
        if not filtered_vector:
            vector_output = "No similar vulnerabilities found."
        else:
            vector_output = "\n".join(
                [f"Vulnerability ID: {v[0]}, Similarity: {v[1]:.2f}" for v in filtered_vector]
            )
        
        if not filtered_knowledge:
            knowledge_output = "No devices found with high or critical vulnerabilities related to the query."
        else:
            knowledge_output = "\n".join(
                [f"Device: {row[0]}, Vulnerability ID: {row[1]}, Severity: {row[2]}, Tags: {row[3]}" for row in filtered_knowledge]
            )
        
        # LangChain setup for LLM invocation
        messages = [
            SystemMessage(content="You are a cybersecurity assistant"),
            HumanMessage(content=f"""
                    Query: {query_text}
                    Knowledge Graph Output:
                    {knowledge_output}
                    
                    Vector Search Output:
                    {vector_output}
                """)
        ]

        # Using HuggingFace Pipeline model
        llm = HuggingFaceHub(
            repo_id="Qwen/Qwen2.5-3B-Instruct", 
            huggingfacehub_api_token='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',
            model_kwargs={"max_tokens": 1024}
         )

        chat_model = ChatHuggingFace(llm=llm)
        ai_msg = chat_model.invoke(messages)
        return ai_msg.content

    except Exception as e:
        print(f"Error generating response: {e}")
        return "An error occurred while generating the response."

# Example query
query_text = "What are the vulnerabilities involving phishing or malware?"
response = generate_response(query_text)
print("LLM Response:\n", response)


Error in vector search: Invalid arguments for procedure 'db.idx.vector.queryNodes'
LLM Response:
 <|im_start|>system
You are a cybersecurity assistant<|im_end|>
<|im_start|>user

                    Query: What are the vulnerabilities involving phishing or malware?
                    Knowledge Graph Output:
                    Device: db-55.hamilton-collier.info, Vulnerability ID: fa97a9f8-c279-4fca-8d43-8af3e81d7d81, Severity: high, Tags: ['malware', 'phishing', 'misconfiguration']
Device: laptop-02.adams-welch.com, Vulnerability ID: f785d3b9-2e70-441c-ae67-4c5b8f98f335, Severity: high, Tags: ['malware', 'phishing', 'encryption']
Device: desktop-18.galloway.com, Vulnerability ID: dac8cc1d-7ba9-4274-9e4d-a2fe28fd21d9, Severity: critical, Tags: ['malware', 'phishing']
Device: email-49.thomas-brown.info, Vulnerability ID: 186ae146-e05f-453a-8aae-9050c8d8aa44, Severity: high, Tags: ['phishing']
Device: email-16.miranda-lewis.com, Vulnerability ID: 1fac4317-3ae8-4140-aa4b-e0c0b9d6899d, Se