In [None]:
import pandas as pd
import pymongo
import gensim
from gensim import corpora
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

# Load the DataFrame from the Pickle file
df = pd.read_pickle('C:\\Users\\ted59\\Knapp069-Practicum-1-Project\\Processed Data\\processed_document_data.pkl')

# Load the pre-trained spaCy model
nlp = spacy.load("en_core_web_lg")

# Load the pre-trained LDA model
lda_model = gensim.models.LdaMulticore.load('C:\\Users\\ted59\\Knapp069-Practicum-1-Project\\Processed Data\\lda_model.pkl')

# Create a dictionary for topic modeling
dictionary = corpora.Dictionary(df['Trigram'].apply(lambda x: x.split()))

# MongoDB Connection
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["WS_Data_DB"]
collection = db["LogRhythm7_15Docs"]

# Define predefined topics
predefined_topics = [
    "agent",
    "siem",
    "logrhythm",
    "network monitoring", 
    "threat detection", 
    "system configuration",
    "AI Engine",
    "AI Engine Rules",
    "System Monitor Agent",
    "Disaster Recovery",
    "High Availability",
    "Log Sources",
    "Data Processor",
    "Mediator",
    "Open Collector",
    "Installation",
    "Upgrade",
    "MPE",
    "Message Processing Engine",
    "Platform Manager",
    "Configuration Manager",
    "Data Indexer",
    "Global Log Processing",
    "Security Policies",
    "Log Source Virtualization",
    "Advanced Reporting",
    "SIEM Platform Optimization",
    "Real-Time Data Analysis",
    "Log Data Normalization",
    "Security Event Correlation",
    "Network Behavior Analysis",
    "Custom Rule Creation",
    "Cloud-Based Log Management",
    "Data Privacy and Protection",
    "Automated Threat Hunting",
    "Log Source Integration",
    "Compliance Reporting",
    "Scalable Architecture",
    "Customizable Dashboards",
    "Security Information Management",
    "Advanced Threat Intelligence",
    "AI-Driven Security Analysis",
    "User and Entity Behavior Profiling",
    "Security Workflow Automation",
    "Incident Management and Response",
    "Endpoint Data Collection",
    "Forensic Data Analysis",
    "Cybersecurity Risk Assessment",
    "Log Archive and Retrieval",
    "Security Operations Center (SOC) Analytics",
    "Threat Lifecycle Management Framework",
    "LogRhythm NetMon Integration",
    "CloudAI Security Analytics",
    "LogRhythm Labs Threat Research",
    "Managed Security Services",
    "Data Exfiltration Detection",
    "Customizable Alerting System",
    "Cyber Resilience Strategies",
    "Log Source Management",
    "Advanced Security Orchestration",
    "Regulatory Compliance Assurance",
    "Multi-Tenant Security Management",
    "Extended Detection and Response (XDR)",
    "LogRhythm API Capabilities",
    "Security Data Lake",
    "Cross-Platform Security Integration",
    "LogRhythm Training Programs",
    "Cybersecurity Policy Enforcement",
    "Threat Detection Algorithms",
    "Security Data Visualization",
    "Log Management Best Practices"
]
    
# Function to fetch content for a given topic from MongoDB
def fetch_content_for_topic(topic):
    documents = collection.find({})
    for doc in documents:
        content_sections = doc.get('content_sections', {})
        if isinstance(content_sections, dict):
            for section_title, section_content in content_sections.items():
                if topic.lower() in section_title.lower():
                    return section_content
    return "Sorry, I don't have information on that topic."

# Define the keyword_based_extraction function
def keyword_based_extraction(input_text):
    lower_text = input_text.lower()
    for keyword in predefined_topics:
        if keyword in lower_text:
            return keyword
    return None

# Enhanced topic extraction function
def enhanced_extract_topics(input_text):
    input_vec = nlp(input_text)
    topic_scores = {topic: input_vec.similarity(nlp(topic)) for topic in predefined_topics}
    relevant_topic = max(topic_scores, key=topic_scores.get, default=None)
    return relevant_topic if relevant_topic and topic_scores[relevant_topic] > 0.5 else None

# Main chat loop
print("Welcome to the Chatbot! Type 'exit' to end the conversation.")
while True:
    user_input = input("You: ").strip()
    if user_input.lower() == "exit":
        print("Bot: Goodbye! Have a great day!")
        break

    # First, try keyword-based extraction
    topic = keyword_based_extraction(user_input)
    if not topic:
        # If no keyword match, use enhanced topic modeling
        topic = enhanced_extract_topics(user_input)

    response = fetch_content_for_topic(topic) if topic else "I'm sorry, I don't understand."
    print("Bot:", response)

Welcome to the Chatbot! Type 'exit' to end the conversation.
You: siem
Bot: To effectively fight threats, you need the right tools. The LogRhythm SIEM Platform aligns your team, technology, and processes. It helps you see across your IT environment, identify threats, and quickly mitigate and recover from security incidents.

You: logrhythm
Bot: To effectively fight threats, you need the right tools. The LogRhythm SIEM Platform aligns your team, technology, and processes. It helps you see across your IT environment, identify threats, and quickly mitigate and recover from security incidents.

You: agent
Bot: LogRhythm provides support for Agent failover across several Data Processors (up to three is the most common configuration). Three prioritized Processors configured into the Agent as mediator1, mediator2, and mediator3 (ordered list). Collection performance is maintained across Mediator failover, and this capability can also be used to support Agent load balancing.
One of three scena

  topic_scores = {topic: input_vec.similarity(nlp(topic)) for topic in predefined_topics}


Bot: Onboarding new log sources should be easy. That’s why we’ve expanded the number of Beats LogRhythm Administrators
can manage from the Web Console
. By onboarding log sources in the Web Console, you can save time and cut your Beat Administration workload in half. In this latest release, LogRhythm now supports management for six additional Beats including:
Gmail Message Tracking
Gmail Message Tracking
GSuite
GSuite
Okta
Okta
Darktrace
Darktrace
Sophos
Sophos
Qualys FIM
Qualys FIM


You: manager
Bot: The Platform Manager (PM) is the hub of a LogRhythm installation. It serves as the central repository for events, configuration and licensing information, the LogRhythm Knowledge Base, and LogMart. In small deployments, the Platform Manager can also host other LogRhythm components. In larger deployments, it should be a dedicated system. There is only one PM per deployment
The Platform Manager (PM) is a Windows Server system running SQL Server, the LogRhythm Alarming and Response Manager 

document = collection.find_one({"topic": "agent"})
if document:
    print(document['content'])  # This should print the content related to LogRhythm
else:
    print("Content not found in the database.")

In [28]:
# Example topic for testing
test_topic = "Network"  # Replace with a known topic from your database

# Call the function and print the result
test_result = fetch_content_for_topic(test_topic)
print("Content for '{}':\n{}".format(test_topic, test_result))

Content for 'Network':
Sorry, I don't have information on that topic.


print("Content for 'SIEM':", fetch_content_for_topic("SIEM"))
print("Content for 'Agent':", fetch_content_for_topic("Agent"))

import pymongo

# MongoDB connection setup
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["WS_Data_DB"]  # Replace with your database name
collection = db["LogRhythm7_15Docs"]  # Replace with your collection name

# Define your predefined topics
predefined_topics = [
    "agent",
    "siem",
    "logrhythm",
    "network monitoring", 
    "threat detection", 
    "system configuration",
    "AI Engine",
    "AI Engine Rules",
    "System Monitor Agent",
    "Disaster Recovery",
    "High Availability",
    "Log Sources",
    "Data Processor",
    "Mediator",
    "Open Collector",
    "Installation",
    "Upgrade",
    "MPE",
    "Message Processing Engine",
    "Platform Manager",
    "Configuration Manager",
    "Data Indexer",
    "Global Log Processing",
    "Security Policies",
    "Log Source Virtualization",
    "Advanced Reporting",
    "SIEM Platform Optimization",
    "Real-Time Data Analysis",
    "Log Data Normalization",
    "Security Event Correlation",
    "Network Behavior Analysis",
    "Custom Rule Creation",
    "Cloud-Based Log Management",
    "Data Privacy and Protection",
    "Automated Threat Hunting",
    "Log Source Integration",
    "Compliance Reporting",
    "Scalable Architecture",
    "Customizable Dashboards",
    "Security Information Management",
    "Advanced Threat Intelligence",
    "AI-Driven Security Analysis",
    "User and Entity Behavior Profiling",
    "Security Workflow Automation",
    "Incident Management and Response",
    "Endpoint Data Collection",
    "Forensic Data Analysis",
    "Cybersecurity Risk Assessment",
    "Log Archive and Retrieval",
    "Security Operations Center (SOC) Analytics",
    "Threat Lifecycle Management Framework",
    "LogRhythm NetMon Integration",
    "CloudAI Security Analytics",
    "LogRhythm Labs Threat Research",
    "Managed Security Services",
    "Data Exfiltration Detection",
    "Customizable Alerting System",
    "Cyber Resilience Strategies",
    "Log Source Management",
    "Advanced Security Orchestration",
    "Regulatory Compliance Assurance",
    "Multi-Tenant Security Management",
    "Extended Detection and Response (XDR)",
    "LogRhythm API Capabilities",
    "Security Data Lake",
    "Cross-Platform Security Integration",
    "LogRhythm Training Programs",
    "Cybersecurity Policy Enforcement",
    "Threat Detection Algorithms",
    "Security Data Visualization",
    "Log Management Best Practices"
]

# Function to fetch and print content for a given topic
def print_content_for_topic(topic):
    document = collection.find_one({"topic": topic})
    if document:
        print(f"Content for '{topic}': {document['content']}")
    else:
        print(f"No content found for topic '{topic}'.")

# Test each predefined topic
for topic in predefined_topics:
    print_content_for_topic(topic) 

In [29]:
import pymongo

# Connect to MongoDB
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["WS_Data_DB"]  # Replace with your database name
collection = db["LogRhythm7_15Docs"]  # Replace with your collection name

# Get distinct topics
topics = collection.distinct("content_sections")

# Print the list of topics
for topic in topics:
    print(topic)

{'General': 'Go to homepage\nLogRhythm SIEM\nlogrhythm.com\ncommunity\nsupport\nuniversity\nfeedback\n', 'LogRhythm SIEM': 'To effectively fight threats, you need the right tools. The LogRhythm SIEM Platform aligns your team, technology, and processes. It helps you see across your IT environment, identify threats, and quickly mitigate and recover from security incidents.\n', 'Documentation': '7.15.0.64 Release Notes - 5 February 2024\n7.15.0 GA Release Notes - 4 January 2024\n7.14.0.107 Release Notes - 18 October 2023\n7.14.1 Release Notes - 13 November 2023\n7.14.0 GA Release Notes - 2 October 2023\n7.13.0.78 Release Notes - 2 August 2023\n7.13.0.76 Release Notes - 17 July 2023\n7.13.0 GA Release Notes - 29 June 2023\n7.12.0 GA Release Notes - 3 April 2023\n7.11.0.43 Release Notes - 6 March 2023\n7.10.0.124 Release Notes - 6 March 2023\n7.11.0 GA Release Notes - 5 January 2023\n7.10.0.123 Release Notes - 11 October 2022\n7.10.0 GA Release Notes - 29 September 2022\n7.9.0 GA Release No