In [5]:
import pymongo
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from bs4 import BeautifulSoup, Comment, MarkupResemblesLocatorWarning
from collections import Counter
import re
import warnings
import pandas as pd
from langdetect import detect, DetectorFactory
import pickle
import gensim
import os
from tqdm import tqdm

# Download Necessary NLTK Data
# 'wordnet' is the resource needed for lemmatization.
for _nltk_package in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_nltk_package)

# MongoDB Connection and Data Retrieval
# Default to an empty list; it is only replaced when the fetch succeeds.
documents = []
try:
    client = pymongo.MongoClient("mongodb://localhost:27017/")
    db = client["WS_Data_DB"]
    collection = db["LogRhythm7_15Docs"]
    # Fetch documents from MongoDB: every document of the collection is
    # pulled into memory as a list.
    documents = list(collection.find())
except Exception as exc:
    # Any failure is reported and the script continues with no documents.
    print(f"Error connecting to MongoDB: {exc}")
# NOTE(review): the client is never closed in this cell — confirm whether
# later code still needs `client` / `collection` before adding a close().

# Function to remove HTML, JavaScript, and CSS
def clean_html_and_js(text):
    """Strip markup from *text* and return only the remaining text.

    ``<script>`` and ``<style>`` elements and HTML comments are removed
    before the text is extracted with a single space as separator.

    Args:
        text: Raw HTML or plain text. Any other type is tolerated.

    Returns:
        The extracted text with leading/trailing whitespace stripped, or
        ``""`` when *text* is not a string, is blank, or cleaning raises.
    """
    if not isinstance(text, str):
        return ""
    if not text.strip():
        return ""

    # Silence BeautifulSoup's MarkupResemblesLocatorWarning category.
    warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
    try:
        soup = BeautifulSoup(text, "html.parser")

        # Drop executable / styling elements together with their content.
        for element in soup(["script", "style"]):
            element.decompose()

        # Drop HTML comments.
        html_comments = soup.find_all(string=lambda node: isinstance(node, Comment))
        for html_comment in html_comments:
            html_comment.extract()

        visible_text = soup.get_text(separator=' ')
        return visible_text.strip()
    except Exception as exc:
        print(f"Error in cleaning HTML/JS: {exc}")
        return ""

# Phrase Normalization Function
def normalize_phrases(text):
    """Rewrite known phrase variants in *text* to their canonical spelling.

    Matching is case-insensitive, so ``LogRhythm.com`` and ``logrhythm.com``
    are both normalised. All phrases are replaced in a single pass, so the
    output of one replacement is never re-scanned by another.

    Args:
        text: The string to normalise.

    Returns:
        *text* with every mapped phrase replaced by its canonical form.
    """
    phrase_map = {
        'logrhythm siem': 'LogRhythm SIEM',
        'logrhythm.com': 'LogRhythm',
        # Add more mappings as needed (keys are matched case-insensitively)
    }
    # Longest variants first so a longer phrase wins over a shorter one that
    # happens to be its prefix.
    ordered = sorted(phrase_map.items(), key=lambda item: len(item[0]), reverse=True)
    # One capturing group per variant: the group that matched identifies the
    # replacement, so no case-folding lookup of the matched text is needed.
    matcher = re.compile(
        '|'.join(f'({re.escape(variant)})' for variant, _ in ordered),
        re.IGNORECASE,
    )
    return matcher.sub(lambda match: ordered[match.lastindex - 1][1], text)

# Improved regex pattern to remove punctuation while keeping certain cases
#
# Written for re.VERBOSE. preprocess_text passes it to re.sub with ' ' as the
# replacement, so each alternative describes text that is REPLACED BY A SPACE:
#   1. '/', ',' or '(' with no ASCII letter/digit directly before or after it
#   2. ',' unless it directly follows 'i.e', 'e.g', 'i.e.' or 'e.g.'
#      (each starting at a word boundary)
#   3. '/' or ':' located between two digits
#   4. '/' that is neither preceded by 'n' nor followed by 'a'
#   5. one of . : , ; ! ? preceded by whitespace and followed by whitespace
#      or the end of the string
#   6. '.' after a single word character that itself follows whitespace,
#      when followed by whitespace or the end of the string
# NOTE(review): the inline "Keeps ..." notes on alternatives 1-4 read as the
# opposite of what those expressions match — confirm which behaviour is
# intended before changing either the notes or the expressions.
pattern = r"""
    (?<![0-9a-zA-Z])[\/,(](?![0-9a-zA-Z])|        # Keeps slashes, commas, and parentheses not surrounded by alphanumeric characters
    (?<!\b(?:i\.e|e\.g)\.)(?<!\b(?:i\.e|e\.g)),|  # Keeps commas not preceded by 'i.e.' or 'e.g.'
    (?<=\d)[\/:](?=\d)|                           # Keeps slashes and colons between numbers
    (?<!n)[/](?!a)|                               # Keeps slash between 'n' and 'a' in 'n/a'
    (?<=\s)[.:,;!?](?=\s|$)|                      # Removes punctuation with a space before and after it or at the end of a string
    (?<=\s\w)\.(?=\s|$)                           # Removes period with a letter before it and a space after it or at the end of a string
"""

# Lemmatization Function
# Single WordNetLemmatizer instance created once at module level and reused
# by lemmatize_text for every call.
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    """Lemmatize every whitespace-separated token of *text*.

    Args:
        text: String to process; it is split on whitespace.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    tokens = text.split()
    lemmas = (lemmatizer.lemmatize(token) for token in tokens)
    return ' '.join(lemmas)

# Function to preprocess text
def preprocess_text(text):
    """Clean one document's raw text and return it lower-cased.

    Steps, in order: strip HTML/JS/CSS, normalise known phrases, delete
    URLs, replace punctuation matched by ``pattern`` with spaces (fragments
    of the form ``/.../flags`` are shielded from that step and restored
    afterwards), collapse whitespace runs, lower-case.

    Args:
        text: Raw document text, possibly containing markup.

    Returns:
        The cleaned string. The final lower-casing also applies to the
        restored slash-delimited fragments.
    """
    cleaned = normalize_phrases(clean_html_and_js(text))
    cleaned = re.sub(r'http\S+|www\S+|https\S+', '', cleaned)

    # Preserve regex patterns: each one is swapped for a numbered placeholder
    shielded = []

    def _stash(match):
        # Remember the fragment and hand back a placeholder carrying its index.
        shielded.append(match.group())
        return f'<<{len(shielded) - 1}>>'

    cleaned = re.sub(r'\/.*?\/[a-z]*', _stash, cleaned)
    cleaned = re.sub(pattern, ' ', cleaned, flags=re.VERBOSE)

    # Restore preserved regex patterns
    for index, fragment in enumerate(shielded):
        cleaned = cleaned.replace(f'<<{index}>>', fragment)

    # Replace multiple spaces with single space, then convert to lowercase
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned.lower()

# Function for Data Quality Check
def data_quality_check(texts):
    """Placeholder for data quality checks on *texts*; currently a no-op.

    Args:
        texts: The preprocessed texts to inspect (unused for now).

    Returns:
        None.
    """
    # Implement checks here (e.g., checking length, unexpected characters)
    return None

# Function to detect language
def detect_language(text):
    """Return the language code detected for *text*, or None on failure.

    Args:
        text: The string whose language should be detected.

    Returns:
        Whatever ``detect`` returns for *text*; ``None`` when detection
        raises (the error is printed, not propagated).
    """
    try:
        language_code = detect(text)
    except Exception as exc:
        print(f"Error in language detection: {exc}")
        language_code = None
    return language_code

# Function to remove stopwords from text
# Lazily-built cache of NLTK's English stop words, so the corpus is read and
# converted to a set once instead of on every remove_stopwords() call.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the cached English stop-word set, loading it on first use."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def remove_stopwords(text, stop_words=None):
    """Remove stop words from *text*.

    The comparison is case-sensitive and token-exact: *text* is split on
    whitespace and each token is kept unless it is in *stop_words*.

    Args:
        text: The string to filter.
        stop_words: Optional container of words to drop. Defaults to NLTK's
            English stop-word list (loaded once and cached). A set or
            frozenset gives the fastest membership tests.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)

# Function to find frequent trigrams from the raw text
def find_frequent_trigrams(texts, threshold=5):
    """Count word trigrams across *texts* and return the frequent ones.

    Args:
        texts: Iterable of strings; each is split on whitespace into words.
        threshold: Minimum number of occurrences (inclusive) for a trigram
            to be reported.

    Returns:
        A list of ``(trigram, count)`` tuples ordered by descending count,
        where ``trigram`` is the three words joined by single spaces.
    """
    trigram_counts = Counter()
    for document in texts:
        tokens = document.split()
        trigram_counts.update(' '.join(triple) for triple in ngrams(tokens, 3))

    # most_common() lists (item, count) pairs from most to least frequent.
    return [
        (trigram, occurrences)
        for trigram, occurrences in trigram_counts.most_common()
        if occurrences >= threshold
    ]

# Preprocess documents and extract raw text
raw_texts = []
for doc in tqdm(documents, desc="Preprocessing Documents"):
    try:
        # Join all section texts of the document into one string, then clean it.
        section_texts = doc['content_sections'].values()
        raw_text = ' '.join(section_texts)
        cleaned_text = preprocess_text(raw_text)
    except Exception as e:
        # A document that cannot be processed is reported and skipped.
        print(f"Error in processing document: {e}")
    else:
        raw_texts.append(cleaned_text)

# Implement data quality check
data_quality_check(raw_texts)
        
# Language Detection and Filtering
# Seed langdetect's detector factory before any detection runs.
DetectorFactory.seed = 0
filtered_texts = []
for text in tqdm(raw_texts, desc="Detecting Language and Filtering"):
    if detect_language(text) != 'en':
        # Skip anything not detected as English, including texts for which
        # detection failed (detect_language returns None in that case).
        continue
    filtered_texts.append(text)

# Removing stopwords and finding frequent trigrams
_stopword_progress = tqdm(filtered_texts, desc="Removing Stopwords")
filtered_texts = list(map(remove_stopwords, _stopword_progress))
frequent_trigrams = find_frequent_trigrams(filtered_texts)

# Saving results to DataFrame: one row per (trigram, count) pair
df = pd.DataFrame(data=frequent_trigrams, columns=['Trigram', 'Count'])

# Saving DataFrame to Pickle and Text Files
pickle_path = 'C:\\Users\\ted59\\Knapp069-Practicum-1-Project\\Processed Data\\processed_document_data.pkl'
text_path = 'C:\\Users\\ted59\\Knapp069-Practicum-1-Project\\Processed Data\\processed_document_data.txt'

# Make sure the destination folder exists before writing; otherwise a missing
# directory raises only after the whole pipeline has already been computed.
for _output_path in (pickle_path, text_path):
    _output_dir = os.path.dirname(_output_path)
    if _output_dir:  # empty when the path has no directory component on this OS
        os.makedirs(_output_dir, exist_ok=True)

df.to_pickle(pickle_path)
# NOTE: despite the .txt extension, this file is written by to_csv.
df.to_csv(text_path, index=False)

print(f"Data saved to {pickle_path} and {text_path}")

# Creating dictionary and corpus for LDA
# The dictionary is built from the words of the frequent trigrams only, not
# from the full texts.
trigram_tokens = [trigram.split() for trigram in df['Trigram']]
dictionary = gensim.corpora.Dictionary(trigram_tokens)
# One bag-of-words entry per filtered text.
# NOTE(review): presumably words absent from the dictionary are ignored by
# doc2bow — confirm this vocabulary restriction is intended.
corpus = [dictionary.doc2bow(text.split()) for text in filtered_texts]

# Train LDA model
# NOTE(review): no random_state is passed — confirm whether run-to-run
# reproducibility of the topics is wanted.
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,
    passes=2,
    workers=2,
)

# Save LDA model
lda_model.save('C:\\Users\\ted59\\Knapp069-Practicum-1-Project\\Processed Data\\lda_model.pkl')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ted59\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ted59\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ted59\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Preprocessing Documents: 100%|██████████| 1084/1084 [00:02<00:00, 487.17it/s]
Detecting Language and Filtering: 100%|██████████| 1084/1084 [00:13<00:00, 77.92it/s] 
Removing Stopwords: 100%|██████████| 1084/1084 [00:00<00:00, 1771.02it/s]


Data saved to C:\Users\ted59\Knapp069-Practicum-1-Project\Processed Data\processed_document_data.pkl and C:\Users\ted59\Knapp069-Practicum-1-Project\Processed Data\processed_document_data.txt
