# LogRhythm Chatbot for Document Query and Summarization

This notebook presents an NLP-based chatbot that utilizes Word2Vec, TF-IDF, spaCy, and Hugging Face Transformers to analyze, categorize, and summarize textual data from LogRhythm documentation. It aims to provide relevant responses to user queries by identifying the most appropriate category and document.

In [None]:
# Importing necessary libraries
import heapq
import re
from collections import Counter

import joblib
import nltk
import numpy as np
import pandas as pd
import spacy
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from summa import keywords
from transformers import pipeline

### Download libraries, trained models, and cleaned data used in the chatbot

In [None]:
# NLTK tokenizer and stop-word resources
nltk.download('punkt')
nltk.download('stopwords')

# Cleaned documentation sections, each labelled with a category
df = pd.read_csv('cleaned_section_data_with_categories.csv')

# Precomputed combined feature vectors
# (presumably row i belongs to row i of `df` — verify against the training notebook)
combined_embeddings = np.load('combined_features.npy')

# Previously trained Word2Vec model and fitted TF-IDF vectorizer
# NOTE(review): joblib.load unpickles the file — only load artifacts from a trusted source
word2vec_model = Word2Vec.load("word2vec_model.bin")
tfidf_vectorizer = joblib.load("tfidf_model.pkl")

# spaCy language model used for extractive summarization
nlp = spacy.load("en_core_web_lg")

# Hugging Face pipeline (library-default model) used for abstractive summarization
abstractive_summarizer = pipeline("summarization")

### Setting Up Categories and Keyword Rules

Define categories and corresponding keywords to classify user inputs effectively.

In [None]:
# Categories and their associated keywords.
# Order matters: determine_category() returns the FIRST category with a matching
# keyword, so earlier categories take precedence over later ones.
categories = {
    'Installation & Setup': [
        'install', 'setup', 'implementation', 'deployment', 'configure', 'initialization',
        'installing', 'deploy', 'configuration', 'set-up', 'initiate', 'launch', 'activate',
        'how to install', 'setting up', 'installation guide', 'deploying', 'configuring'
    ],
    'Maintenance & Management': [
        'maintain', 'maintenance', 'servicing', 'management', 'optimization', 'service',
        'manage', 'routine check', 'system upkeep', 'system care', 'upkeep', 'tune-up',
        'maintaining', 'managing', 'service routine', 'optimizing', 'how to maintain'
    ],
    'Troubleshooting & Support': [
        'troubleshoot', 'error', 'issue', 'problem', 'diagnosis', 'resolution', 'fix',
        'solve', 'rectify', 'repair', 'resolve', 'correct', 'debug', 'fault finding',
        'troubleshooting', 'solving issues', 'fixing errors', 'diagnosing problems', 'resolving'
    ],
    'Upgrades & Updates': [
        'upgrade', 'update', 'new version', 'patch', 'release', 'enhancement', 'updating',
        'upgrading', 'version upgrade', 'system update', 'software update', 'patching',
        'how to upgrade', 'applying updates', 'version updating', 'software enhancement'
    ],
    'General Information & Overview': [
        'overview', 'introduction', 'info', 'summary', 'guide', 'documentation',
        'information', 'details', 'background', 'basics', 'general data', 'key points',
        'what is', 'explain', 'description of', 'details about'
    ],
    'Security & Monitoring': [
        'surveillance', 'log management', 'event tracking', 'real-time analysis',
        'security watch', 'monitoring', 'security check', 'system monitoring', 'network watch',
        'security overview', 'monitoring setup', 'event tracking system'
    ],
    'Threat Detection & Analysis': [
        'threat detection', 'anomaly detection', 'intrusion detection', 'threat intelligence',
        'security alerts', 'risk detection', 'threat identification', 'vulnerability detection',
        'security threat detection', 'analyzing threats', 'identifying risks', 'detecting anomalies'
    ],
    'Incident Response & Management': [
        'incident response', 'incident management', 'forensics', 'mitigation', 'recovery',
        'incident handling', 'crisis management', 'incident analysis', 'emergency response',
        'responding to incidents', 'managing incidents', 'incident recovery'
    ],
    'Compliance & Auditing': [
        'compliance', 'regulatory compliance', 'audit', 'reporting', 'policy enforcement',
        'regulation management', 'compliance tracking', 'legal compliance', 'audit management',
        'compliance policies', 'auditing processes', 'regulatory reporting'
    ],
    'Integration & Compatibility': [
        'integration', 'compatibility', 'third-party integration', 'API', 'interoperability',
        'system merging', 'software integration', 'data integration', 'platform integration',
        'integrating systems', 'API usage', 'compatibility issues'
    ],
    'Network Security & Protection': [
        'network security', 'firewall', 'traffic analysis', 'intrusion prevention',
        'network protection', 'cybersecurity', 'network defense', 'network safeguard',
        'protecting networks', 'network firewalls', 'cybersecurity measures'
    ]
}


def determine_category(user_input):
    """Return the first category whose keyword list matches the user's query.

    Matching is a case-insensitive substring test: a category matches when any
    of its keywords occurs anywhere in ``user_input``. Categories are checked
    in the insertion order of ``categories``.

    Args:
        user_input (str): The raw query typed by the user.

    Returns:
        str: The matching category name, or ``'Other'`` when no keyword matches.
    """
    # Lowercase once instead of once per keyword.
    normalized_input = user_input.lower()
    for category, category_keywords in categories.items():
        # Keywords are lowercased too: the input is lowercased, so mixed-case
        # keywords such as 'API' could otherwise never match.
        if any(keyword.lower() in normalized_input for keyword in category_keywords):
            return category
    return 'Other'

### Text Embedding and Summarization Functions

Set up functions for generating combined text embeddings, extractive and abstractive summarization, and text highlighting.

In [None]:
# Function to generate a combined embedding for a given text
def get_combined_embedding(text):
    """Build the combined Word2Vec + TF-IDF feature vector for ``text``.

    The Word2Vec part is the mean of the vectors of all tokens known to
    ``word2vec_model`` (a zero vector of ``vector_size`` when none are known).
    The TF-IDF part is the dense row produced by ``tfidf_vectorizer`` for the
    text. Both parts are concatenated, Word2Vec first.

    Args:
        text (str): Text to embed.

    Returns:
        numpy.ndarray: 1-D array holding the Word2Vec part followed by the TF-IDF part.
    """
    vocabulary = word2vec_model.wv

    # Collect vectors only for tokens the Word2Vec model knows
    known_vectors = []
    for token in word_tokenize(text):
        if token in vocabulary:
            known_vectors.append(vocabulary[token])

    if known_vectors:
        dense_part = np.mean(known_vectors, axis=0)
    else:
        # No known token: fall back to an all-zero vector of the model's size
        dense_part = np.zeros(word2vec_model.vector_size)

    sparse_part = tfidf_vectorizer.transform([text]).toarray()[0]
    return np.hstack((dense_part, sparse_part))

# Extractive summarization function using spaCy
def summarize_text_spacy(text, num_sentences=3):
    """Extractive summary: return the highest-scoring sentences of ``text``.

    Each sentence is scored by the summed frequency of its tokens (stop words
    and punctuation count as zero), normalized by the highest word frequency.
    The top ``num_sentences`` sentences are returned in descending score order,
    joined by single spaces.

    Args:
        text (str): Text to summarize.
        num_sentences (int): Maximum number of sentences in the summary.

    Returns:
        str: The selected sentences, or ``''`` when the text contains no
        non-stop-word, non-punctuation tokens.
    """
    doc = nlp(text)
    word_frequencies = Counter(
        token.text.lower() for token in doc if not token.is_stop and not token.is_punct
    )

    # max() raises ValueError on an empty sequence; empty or stop-word-only
    # text has nothing to rank, so return an empty summary instead.
    if not word_frequencies:
        return ''

    max_frequency = max(word_frequencies.values())
    # Counter returns 0 for missing keys, so stop words and punctuation add nothing.
    sentence_scores = {
        sentence: sum(word_frequencies[token.text.lower()] for token in sentence) / max_frequency
        for sentence in doc.sents
    }
    summary_sentences = heapq.nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    return ' '.join(sentence.text for sentence in summary_sentences)

# Abstractive summarization function using Hugging Face Transformers
def generate_summary(text):
    """Return an abstractive summary of ``text`` from the Hugging Face pipeline.

    The summarizer is called with ``max_length=130``, ``min_length=30`` and
    ``do_sample=False``; the ``'summary_text'`` of the first result is returned.
    """
    generation_options = {'max_length': 130, 'min_length': 30, 'do_sample': False}
    results = abstractive_summarizer(text, **generation_options)
    first_result = results[0]
    return first_result['summary_text']

# Keyword extraction function using summa
def summarize_text_summa(text, num_keywords=10):
    """Extract keywords from ``text`` with summa's keyword extractor.

    Args:
        text (str): Text to analyze.
        num_keywords (int): Passed to summa as ``words`` to limit the number of keywords.

    Returns:
        Whatever ``summa.keywords.keywords`` returns for these arguments
        (presumably a newline-separated string — confirm against summa's docs).
    """
    extracted_keywords = keywords.keywords(text, words=num_keywords)
    return extracted_keywords

# Function to highlight context in the text
def highlight_context(full_text, user_input):
    """Return the sentences of ``full_text`` that mention a query term, highlighted.

    The text is split into sentences on ``'.'``. Every sentence containing at
    least one whitespace-separated term of ``user_input`` (case-insensitive
    substring match) is kept, and every occurrence of every term in it is
    wrapped in ``**...**``, keeping the casing found in the text.

    Args:
        full_text (str): Document text to search.
        user_input (str): The user's query; its whitespace-separated tokens are the terms.

    Returns:
        str: The matching sentences, stripped and joined with ``'. '``;
        ``''`` when the query has no terms or nothing matches.
    """
    terms = user_input.split()
    if not terms:
        return ''

    # One case-insensitive pattern for all terms, so the highlight uses the same
    # rule as the match (a case-sensitive replace could silently do nothing).
    # Longest terms first so a term is preferred over its own prefix.
    pattern = re.compile(
        '|'.join(re.escape(term) for term in sorted(terms, key=len, reverse=True)),
        re.IGNORECASE,
    )

    highlighted_sentences = []
    for sentence in full_text.split('.'):
        if pattern.search(sentence):
            # Strip the leading space left by the split to avoid double spaces after the join.
            highlighted_sentences.append(
                pattern.sub(lambda match: f'**{match.group(0)}**', sentence.strip())
            )

    return '. '.join(highlighted_sentences)

### Chatbot Response Generation

Create a function to generate chatbot responses based on user inputs, using the most relevant document and summarization techniques.

In [None]:
# Function to find the most relevant document
def find_most_relevant_document(input_text, filtered_df):
    """Find the document in ``filtered_df`` most similar to ``input_text``.

    The query is embedded with ``get_combined_embedding`` and compared by
    cosine similarity against the rows of ``combined_embeddings`` selected by
    the index values of ``filtered_df``.

    Args:
        input_text (str): The user's query.
        filtered_df (pandas.DataFrame): Candidate documents; its index values
            are used to index ``combined_embeddings``.

    Returns:
        tuple: ``(index, similarity)`` of the best match, or ``(None, 0)`` when
        there are no candidates or no candidate has a similarity above 0.
    """
    if len(filtered_df) == 0:
        return None, 0

    input_embedding = get_combined_embedding(input_text)
    doc_indices = filtered_df.index.to_numpy()

    # One batched call instead of one cosine_similarity call per row.
    similarities = cosine_similarity([input_embedding], combined_embeddings[doc_indices])[0]

    # argmax returns the first maximum, like a strict ">" scan would.
    best_position = int(np.argmax(similarities))
    best_similarity = similarities[best_position]

    # Only strictly positive similarities count as a match.
    if not best_similarity > 0:
        return None, 0
    return doc_indices[best_position], best_similarity

# Function to generate chatbot response
def generate_chatbot_response(user_input, min_similarity=0.15):
    """Build the chatbot's reply to ``user_input``.

    The query is assigned a category, the documents of that category are
    searched for the most similar one, and the sentences of that document
    mentioning the query terms are returned with the terms highlighted.

    Args:
        user_input (str): The user's query.
        min_similarity (float): Similarity the best document must exceed to be
            used. Defaults to 0.15.

    Returns:
        str: The highlighted excerpt, or a fallback message when no document is
        similar enough or the document has no sentence matching the query.
    """
    user_category = determine_category(user_input)
    filtered_df = df[df['Category'] == user_category]
    doc_index, similarity = find_most_relevant_document(user_input, filtered_df)

    if doc_index is None or not similarity > min_similarity:
        return "I'm sorry, I couldn't find relevant information based on your query. Please try asking differently or consult the LogRhythm documentation."

    # doc_index is an index label taken from the filtered frame, so look it up
    # by label (.loc) rather than by position (.iloc).
    full_text = df.loc[doc_index, 'Cleaned Text']

    # A missing text cell is read back as NaN (a float); treat it as "nothing to show".
    highlighted_text = highlight_context(full_text, user_input) if isinstance(full_text, str) else ''

    if highlighted_text.strip():
        return f"Here is the information I found:\n\n{highlighted_text}"
    return "I found a document, but it doesn't contain enough information to summarize. Please try a more specific query or consult the LogRhythm documentation."

# Main chatbot interaction loop
# Interactive loop: read a query, print the chatbot's reply, stop on 'quit'.
print("LogRhythm Chatbot — Type 'quit' to exit.")
while True:
    try:
        user_input = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed input stream or an interrupt at the prompt: exit cleanly, no traceback.
        print("\n\033[1mLogRhythm Chatbot: Goodbye!\033[0m")
        break
    if not user_input:
        # Nothing typed: prompt again rather than running the retrieval pipeline.
        continue
    if user_input.lower() == "quit":
        print("\033[1mLogRhythm Chatbot: Goodbye!\033[0m")
        break
    response = generate_chatbot_response(user_input)
    print("\033[1mLogRhythm Chatbot:\033[0m", response)