# Text Preprocessing for NLP
This notebook outlines the steps for fetching and preprocessing text data from a MongoDB database for NLP tasks. It includes cleaning the text, removing unnecessary characters, tokenization, lemmatization, and more.

In [1]:
# Imports
import pymongo
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from bs4 import BeautifulSoup, Comment
from collections import Counter
import re
import warnings
from langdetect import detect, DetectorFactory
import os

## Downloading Necessary NLTK Data
The following cells download the required NLTK packages for stopwords, tokenization, and lemmatization.

In [2]:
# Download Necessary NLTK Data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ted59\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ted59\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ted59\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## MongoDB Connection and Data Retrieval
Establish a connection to MongoDB and retrieve documents from the specified collection for processing.

In [3]:
# MongoDB connection
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["WS_Data_DB"]  # Database name
collection = db["LogRhythm7_15Docs"]  # Collection name

# Fetch data from MongoDB
documents = collection.find().limit(100)  #<--------------------------------
df = pd.DataFrame(list(documents))

In [4]:
print(documents)

<pymongo.cursor.Cursor object at 0x000002F4A5E97990>


## Text Preprocessing Functions
Define functions for preprocessing the text data, including HTML tag removal, lowercasing, removing URLs and special characters, tokenization, stopwords removal, lemmatization, and generating n-grams.

In [5]:
# Suppress Warnings for BeautifulSoup
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

# Function to remove HTML, JavaScript, and CSS
def clean_html_and_js(text):
    soup = BeautifulSoup(text, "html.parser")
    for script_or_style in soup(["script", "style"]):
        script_or_style.decompose()
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()
    text = soup.get_text(separator=' ')
    return text.strip()

# Function to parse the user input into indices, including ranges
def parse_input_indices(input_str):
    indices = set()
    if not input_str.strip():
        return list(indices)  # Return an empty list if input is empty

    parts = input_str.split(',')

    for part in parts:
        try:
            if '-' in part:
                start, end = map(int, part.split('-'))
                indices.update(range(start, end + 1))
            else:
                indices.add(int(part.strip()))
        except ValueError:
            print(f"Invalid input: {part}. Please enter valid indices.")
            continue

    return list(indices)

# Make boilerplate phrase list
def populate_boilerplate_phrases(frequent_phrases, input_indices):
    return [frequent_phrases[i-1][0] for i in input_indices if i <= len(frequent_phrases) and i > 0]

# Function to display phrases and ask for removal after every 20 phrases
def display_and_select_phrases(phrases):
    selected_phrases = []
    phrases_per_batch = 20

    print(f"Total phrases: {len(phrases)}")  # Debug: Print the total number of phrases

    for i in range(0, len(phrases), phrases_per_batch):
        batch = phrases[i:i+phrases_per_batch]
        print("\nFrequent Phrases:")
        for j, (phrase, count) in enumerate(batch, 1):
            print(f"{i+j}. {phrase} (Count: {count})")

        input_str = input("\nEnter the indices or ranges of indices (e.g., 1-3, 5, 7) you want to remove from this batch: ")
        input_indices = parse_input_indices(input_str)
        selected_phrases.extend(populate_boilerplate_phrases(batch, input_indices))

    return selected_phrases


## Extracting Frequent Phrases
Process the fetched documents to extract and display frequent phrases for user selection.

In [6]:
# Define a function to find frequent phrases from the documents
def find_frequent_phrases(documents, threshold=1):  
    phrase_counter = Counter()
    
    for doc in documents:
        # Make sure to replace 'text_field' with your actual field name
        text = clean_html_and_js(doc['content_sections'])
        words = word_tokenize(text.lower())
        # Count bigrams in the document
        phrase_counter.update([' '.join(pair) for pair in ngrams(words, 2)])

    frequent_phrases = [(phrase, count) for phrase, count in phrase_counter.items() if count >= threshold]
    return frequent_phrases

frequent_phrases = find_frequent_phrases(documents)
print("Frequent Phrases:", frequent_phrases[:10])  # Print the top 10 frequent phrases to check


# Display and select phrases to remove
selected_phrases_for_removal = display_and_select_phrases(frequent_phrases)
print("Selected phrases for removal:", selected_phrases_for_removal)

Frequent Phrases: []
Total phrases: 0
Selected phrases for removal: []


## Preprocessing Document Sections
Apply the preprocessing steps to each section in the documents.

In [7]:
# Function for text preprocessing
def preprocess_text(text):
    text = clean_html_and_js(text)
    text = re.sub(r'http\S+|www\S+|https\S+|@[A-Za-z0-9]+', '', text)
    text = re.sub(r'[^A-Za-z0-9\s]', ' ', text)
    words = word_tokenize(text)
    words = [word for word in words if word.lower() not in stopwords.words('english')]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word.lower()) for word in words]
    return ' '.join(words)

# Function to preprocess each section in a document
def preprocess_document_sections(document_sections):
    processed_sections = {}
    for section, content in document_sections.items():
        processed_text = preprocess_text(content)
        processed_sections[section] = processed_text
    return processed_sections

# Apply preprocessing
df['processed_content_sections'] = df['content_sections'].apply(preprocess_document_sections)

  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")


## Language Detection and Filtering
Detect the language of each document and filter out non-English documents to maintain consistency in language processing.

In [8]:
# Language Detection Setup
DetectorFactory.seed = 0

# Function to detect language
def detect_language(text):
    try:
        return detect(text)
    except:
        return None

# Detect language for each document
df['language'] = df['processed_content_sections'].apply(lambda x: detect_language(' '.join(x.values())))

# Keep only English language documents
df = df[df['language'] == 'en']

## Saving Processed Data
Save the processed DataFrame to a Pickle file for future use and also export it as a text file.

In [9]:
# Saving DataFrame to Pickle and Text Files
df.to_pickle('C:\\Users\\ted59\\Knapp069-Practicum-1-Project\\Processed Data\\processed_document_data.pkl')
df_string = df.to_string()
with open('C:\\Users\\ted59\\Knapp069-Practicum-1-Project\\Processed Data\\processed_document_data.txt', 'w', encoding='utf-8') as file:
    file.write(df_string)