# In [None]:
# notebooks/02_preprocessing.ipynb
# ==============================================================================
# Intelligent Document Classification System
# Preprocessing Pipeline Notebook
# ==============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
import spacy
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Download NLTK resources
print("üì• Downloading NLTK resources...")
# - 'omw-1.4' is the Open Multilingual WordNet package the WordNet corpus
#   reader looks for ('omw-eng' is not a valid resource id, and with
#   quiet=True the failed download went unnoticed).
# - 'punkt_tab' is what newer NLTK releases load for word_tokenize; the legacy
#   'punkt' package is still requested for older installations.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource, quiet=True)

# Load spaCy model
print("üì• Loading spaCy model...")
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    # Catching only that keeps unrelated errors (and Ctrl-C) visible.
    print("Downloading spaCy model...")
    import importlib
    import subprocess
    import sys

    # Use the interpreter that is running this notebook so the model is
    # installed into the same environment; check=True surfaces a failed
    # download here instead of as a second, confusing load error below.
    subprocess.run(
        [sys.executable, '-m', 'spacy', 'download', 'en_core_web_sm'],
        check=True,
    )
    # Make the freshly installed package discoverable in this process.
    importlib.invalidate_caches()
    nlp = spacy.load('en_core_web_sm')

# ==============================================================================
# 1. Configuration & Settings
# ==============================================================================

class PreprocessingConfig:
    """Switches and limits that drive the preprocessing pipeline.

    Every option is a plain class attribute, so it can be read from the class
    itself or from an instance.
    """

    # --- Character-level cleaning -------------------------------------------
    REMOVE_SPECIAL_CHARS = True   # replace non-word, non-space characters
    REMOVE_DIGITS = False         # keep numbers in the text
    CONVERT_TO_LOWERCASE = True   # lowercase before any other cleaning step
    REMOVE_EXTRA_SPACES = True    # collapse whitespace runs and strip the ends

    # --- Stopword handling --------------------------------------------------
    REMOVE_STOPWORDS = True
    # Extra words dropped in addition to the standard English stopword list
    CUSTOM_STOPWORDS = ['example', 'document', 'file', 'page']

    # --- Token normalisation ------------------------------------------------
    LEMMATIZE = True
    STEM = False

    # --- Pattern-based removals / expansions --------------------------------
    REMOVE_EMAIL = True
    REMOVE_URL = True
    REMOVE_HTML = True
    EXPAND_CONTRACTIONS = True

    # --- Length limits ------------------------------------------------------
    MIN_TOKEN_LENGTH = 2          # shortest token kept (characters)
    MAX_TOKEN_LENGTH = 100        # longest token kept (characters)
    MAX_DOCUMENT_LENGTH = 5_000   # documents are cut to this many tokens

    # --- Tokenizer backend --------------------------------------------------
    TOKENIZER = 'spacy'  # Options: 'spacy', 'nltk'


# Shared configuration instance used by the rest of the notebook
config = PreprocessingConfig()

# ==============================================================================
# 2. Load Data
# ==============================================================================

print("üìä Loading data...")

# Read the three raw splits written by the EDA notebook
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/raw/train.csv',
        '../data/raw/val.csv',
        '../data/raw/test.csv',
    )
)

# One combined frame (fresh 0..n-1 index) so every split is inspected together
all_data = pd.concat((train_df, val_df, test_df), ignore_index=True)

print(f"Total documents: {len(all_data):,}")
print(f"Columns: {all_data.columns.tolist()}")

# Names of the text and label columns -- adjust these to match your dataset
text_col = 'text'
target_col = 'category'

# ==============================================================================
# 3. Preprocessing Class Definition
# ==============================================================================

class DocumentPreprocessor:
    """Complete document preprocessing pipeline.

    The pipeline for one document is: clean the raw text, tokenize it, drop
    stopwords, lemmatize or stem, filter tokens by length and finally truncate
    to ``config.MAX_DOCUMENT_LENGTH`` tokens.

    Args:
        config: Object exposing the option attributes read by the methods
            below (``CONVERT_TO_LOWERCASE``, ``REMOVE_STOPWORDS``,
            ``TOKENIZER``, ...).
    """

    # Cleaning patterns, compiled once instead of on every document
    _HTML_TAG_RE = re.compile(r'<.*?>')
    _URL_RE = re.compile(r'https?://\S+|www\.\S+')
    _EMAIL_RE = re.compile(r'\S+@\S+')
    _SPECIAL_CHAR_RE = re.compile(r'[^\w\s]')
    _DIGITS_RE = re.compile(r'\d+')
    _WHITESPACE_RE = re.compile(r'\s+')

    def __init__(self, config):
        self.config = config
        self.stop_words = set(stopwords.words('english'))
        self.stop_words.update(config.CUSTOM_STOPWORDS)
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()

        # Contraction mapping
        self.contraction_map = {
            "ain't": "is not", "aren't": "are not", "can't": "cannot",
            "can't've": "cannot have", "'cause": "because", "could've": "could have",
            "couldn't": "could not", "didn't": "did not", "doesn't": "does not",
            "don't": "do not", "hadn't": "had not", "hasn't": "has not",
            "haven't": "have not", "he'd": "he would", "he'll": "he will",
            "he's": "he is", "how'd": "how did", "how'll": "how will",
            "how's": "how is", "i'd": "i would", "i'll": "i will",
            "i'm": "i am", "i've": "i have", "isn't": "is not",
            "it'd": "it would", "it'll": "it will", "it's": "it is",
            "let's": "let us", "ma'am": "madam", "might've": "might have",
            "mightn't": "might not", "must've": "must have", "mustn't": "must not",
            "needn't": "need not", "oughtn't": "ought not", "shan't": "shall not",
            "she'd": "she would", "she'll": "she will", "she's": "she is",
            "should've": "should have", "shouldn't": "should not", "so've": "so have",
            "that's": "that is", "there's": "there is", "they'd": "they would",
            "they'll": "they will", "they're": "they are", "they've": "they have",
            "wasn't": "was not", "we'd": "we would", "we'll": "we will",
            "we're": "we are", "we've": "we have", "weren't": "were not",
            "what'll": "what will", "what're": "what are", "what's": "what is",
            "what've": "what have", "where's": "where is", "who'll": "who will",
            "who's": "who is", "won't": "will not", "would've": "would have",
            "wouldn't": "would not", "you'd": "you would", "you'll": "you will",
            "you're": "you are", "you've": "you have"
        }
        self._prepare_contractions()

    def _prepare_contractions(self):
        """Compile ``self.contraction_map`` into a single matching regex.

        Call this again after editing ``contraction_map`` so the change takes
        effect.
        """
        lookup = {
            key.lower(): value
            for key, value in self.contraction_map.items()
            if key
        }
        # Longest first, so "can't've" wins over its prefix "can't".
        ordered = sorted(lookup, key=len, reverse=True)
        self._contraction_lookup = lookup
        if ordered:
            alternation = '|'.join(re.escape(key) for key in ordered)
            # The lookarounds require the contraction to be a whole word, so
            # "it's" no longer fires inside a possessive such as "spirit's".
            self._contraction_pattern = re.compile(
                r"(?<!\w)(?:" + alternation + r")(?!\w)", re.IGNORECASE
            )
        else:
            self._contraction_pattern = None

    def _expand_contractions(self, text):
        """Return ``text`` with every known contraction written out in full."""
        if not hasattr(self, '_contraction_pattern'):
            # Instance created without __init__ (or by a subclass that skipped it)
            self._prepare_contractions()
        pattern = self._contraction_pattern
        if pattern is None:
            return text

        # Typographic apostrophes (U+2019) would otherwise never match the map
        text = text.replace('\u2019', "'")
        lookup = self._contraction_lookup

        def _replace(match):
            found = match.group(0)
            expansion = lookup.get(found.lower(), found)
            # Keep a leading capital when lowercasing is disabled
            if expansion and found[0].isupper():
                expansion = expansion[0].upper() + expansion[1:]
            return expansion

        return pattern.sub(_replace, text)

    def clean_text(self, text):
        """Apply the character-level cleaning steps enabled in the config.

        Args:
            text: Raw document; missing values (``None``/NaN) are accepted and
                any other value is converted with ``str``.

        Returns:
            The cleaned string, or ``""`` for a missing value.
        """
        if pd.isna(text):
            return ""

        text = str(text)

        # Convert to lowercase
        if self.config.CONVERT_TO_LOWERCASE:
            text = text.lower()

        # Remove HTML tags
        if self.config.REMOVE_HTML:
            text = self._HTML_TAG_RE.sub('', text)

        # Remove URLs
        if self.config.REMOVE_URL:
            text = self._URL_RE.sub('', text)

        # Remove emails
        if self.config.REMOVE_EMAIL:
            text = self._EMAIL_RE.sub('', text)

        # Expand contractions (must run before apostrophes are stripped below)
        if self.config.EXPAND_CONTRACTIONS:
            text = self._expand_contractions(text)

        # Replace special characters with a space so adjacent words stay apart
        if self.config.REMOVE_SPECIAL_CHARS:
            text = self._SPECIAL_CHAR_RE.sub(' ', text)

        # Remove digits
        if self.config.REMOVE_DIGITS:
            text = self._DIGITS_RE.sub('', text)

        # Remove extra whitespace
        if self.config.REMOVE_EXTRA_SPACES:
            text = self._WHITESPACE_RE.sub(' ', text).strip()

        return text

    def tokenize(self, text, method='spacy'):
        """Split ``text`` into a list of token strings.

        Args:
            text: Text to tokenize.
            method: ``'spacy'`` uses the module-level ``nlp`` model, ``'nltk'``
                uses ``word_tokenize``; any other value falls back to
                ``str.split``.

        Returns:
            List of token strings.
        """
        if method == 'spacy':
            # Only token text is needed, so run the tokenizer alone rather
            # than the whole pipeline; whitespace-only tokens (possible when
            # space collapsing is disabled) are dropped.
            doc = nlp.make_doc(text)
            tokens = [token.text for token in doc if not token.is_space]
        elif method == 'nltk':
            tokens = word_tokenize(text)
        else:
            tokens = text.split()
        return tokens

    def remove_stopwords(self, tokens):
        """Drop stopwords when ``config.REMOVE_STOPWORDS`` is set.

        The comparison is case-insensitive so capitalised stopwords are also
        removed when lowercasing is disabled.
        """
        if self.config.REMOVE_STOPWORDS:
            tokens = [
                token for token in tokens
                if token.lower() not in self.stop_words
            ]
        return tokens

    def lemmatize_tokens(self, tokens):
        """Lemmatize each token when ``config.LEMMATIZE`` is set."""
        if self.config.LEMMATIZE:
            tokens = [self.lemmatizer.lemmatize(token) for token in tokens]
        return tokens

    def stem_tokens(self, tokens):
        """Stem each token when ``config.STEM`` is set."""
        if self.config.STEM:
            tokens = [self.stemmer.stem(token) for token in tokens]
        return tokens

    def filter_tokens_by_length(self, tokens):
        """Keep tokens whose length is within the configured min/max bounds."""
        shortest = self.config.MIN_TOKEN_LENGTH
        longest = self.config.MAX_TOKEN_LENGTH
        return [token for token in tokens if shortest <= len(token) <= longest]

    def preprocess_document(self, text, return_as='string'):
        """
        Complete preprocessing pipeline for a single document

        Args:
            text: Input text
            return_as: 'string' for a space-joined string; any other value
                (conventionally 'tokens') returns the token list

        Returns:
            The processed document as a string or as a list of tokens.
        """
        # Clean text
        cleaned_text = self.clean_text(text)

        # Tokenize
        tokens = self.tokenize(cleaned_text, method=self.config.TOKENIZER)

        # Remove stopwords
        tokens = self.remove_stopwords(tokens)

        # Lemmatize or stem (lemmatization takes precedence when both are set)
        if self.config.LEMMATIZE:
            tokens = self.lemmatize_tokens(tokens)
        elif self.config.STEM:
            tokens = self.stem_tokens(tokens)

        # Filter by length
        tokens = self.filter_tokens_by_length(tokens)

        # Truncate if too long (slicing is a no-op for shorter documents)
        tokens = tokens[:self.config.MAX_DOCUMENT_LENGTH]

        if return_as == 'string':
            return ' '.join(tokens)
        return tokens

    def batch_preprocess(self, texts, return_as='string', n_jobs=-1):
        """Preprocess multiple documents.

        Args:
            texts: Sized iterable of raw documents (``len`` is called on it).
            return_as: Forwarded to ``preprocess_document``.
            n_jobs: Worker count handed to joblib; ``1`` forces the
                sequential path.

        Returns:
            List with one processed document per input, in input order.
        """
        from tqdm import tqdm
        # Registers tqdm's pandas integration; kept because code outside this
        # class may rely on it having been called.
        tqdm.pandas()

        print(f"Preprocessing {len(texts)} documents...")

        # Use parallel processing for large datasets
        if len(texts) > 1000 and n_jobs != 1:
            from joblib import Parallel, delayed
            # NOTE(review): with a process-based joblib backend the bound
            # method (and so this instance) is shipped to the workers --
            # confirm this is acceptable for the spaCy model in use.
            results = Parallel(n_jobs=n_jobs)(
                delayed(self.preprocess_document)(text, return_as)
                for text in tqdm(texts, desc="Preprocessing")
            )
        else:
            results = [
                self.preprocess_document(text, return_as)
                for text in tqdm(texts, desc="Preprocessing")
            ]

        return results

# ==============================================================================
# 4. Initialize Preprocessor
# ==============================================================================

print("\nüõ†Ô∏è Initializing preprocessor...")
preprocessor = DocumentPreprocessor(config)

# ==============================================================================
# 5. Sample Preprocessing
# ==============================================================================

print("\nüîç Sample preprocessing demonstration:")

# Get sample documents
sample_texts = all_data[text_col].head(5).tolist()

for i, text in enumerate(sample_texts[:3]):
    # Missing cells load as float NaN and other cells may be non-strings;
    # coerce for display so len()/slicing below cannot raise.
    original = "" if pd.isna(text) else str(text)

    print(f"\n{'='*60}")
    print(f"Sample {i+1} - Original:")
    print('-'*30)
    print(original[:500] + "..." if len(original) > 500 else original)

    # Run the pipeline once: the string form is exactly the space-joined tokens
    tokens = preprocessor.preprocess_document(text, return_as='tokens')
    processed = ' '.join(tokens)

    print(f"\nSample {i+1} - Processed:")
    print('-'*30)
    print(processed[:500] + "..." if len(processed) > 500 else processed)

    # Show tokens
    print(f"\nTokens ({len(tokens)}):")
    print(tokens[:20])

# ==============================================================================
# 6. Apply Preprocessing to Entire Dataset
# ==============================================================================

print("\n‚öôÔ∏è Applying preprocessing to entire dataset...")

# Create copies to avoid modifying originals
train_processed = train_df.copy()
val_processed = val_df.copy()
test_processed = test_df.copy()

# Run the pipeline ONCE per split and keep the token lists. The string form is
# by construction ' '.join(tokens), so deriving it here avoids preprocessing
# every document a second time.
train_tokens = preprocessor.batch_preprocess(
    train_df[text_col], return_as='tokens', n_jobs=4
)
val_tokens = preprocessor.batch_preprocess(
    val_df[text_col], return_as='tokens', n_jobs=4
)
test_tokens = preprocessor.batch_preprocess(
    test_df[text_col], return_as='tokens', n_jobs=4
)

# Apply preprocessing
train_processed['processed_text'] = [' '.join(doc) for doc in train_tokens]
val_processed['processed_text'] = [' '.join(doc) for doc in val_tokens]
test_processed['processed_text'] = [' '.join(doc) for doc in test_tokens]

# Also store tokenized version for some models
print("\nüìù Creating tokenized versions...")
train_processed['tokens'] = train_tokens
val_processed['tokens'] = val_tokens
test_processed['tokens'] = test_tokens

# Calculate statistics (character length and word count of the processed text)
train_processed['processed_length'] = train_processed['processed_text'].apply(len)
train_processed['processed_word_count'] = train_processed['processed_text'].apply(
    lambda x: len(x.split())
)
