# Text Preprocessing Pipeline

## 1. Setup & Imports

In [1]:
import pandas as pd
import sys
import os
from tqdm import tqdm # Progress bar for long operations

# Add the project root to system path to import from 'src'
sys.path.append(os.path.abspath('..'))

from src.preprocessing import clean_text

# Step 1: read the consolidated raw dataset (the five complaint classes)
# that was saved by the previous notebook.
input_path = '../data/raw/complaints_5_classes_raw.csv'

print("‚è≥ Loading raw consolidated data...")

# Read the CSV and, as a safety net, discard any row that is missing either
# its text or its target label, all in a single expression.
df = pd.read_csv(input_path).dropna(subset=['text', 'target'])

print(f"‚úÖ Data Loaded. Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

‚úÖ SpaCy model 'en_core_web_sm' loaded successfully.
‚è≥ Loading raw consolidated data...
‚úÖ Data Loaded. Shape: (333096, 2)
Columns: ['text', 'target']


## 2. Sampling (Optional)

In [None]:
# OPTIONAL fast-iteration mode: uncomment the two lines below to replace `df`
# with a random 50,000-row subset (fixed seed 42, so the same rows are drawn
# on every run) and print its new shape. Leave them commented out to process
# the full dataset.
# df = df.sample(n=50000, random_state=42)
# print(f"Sampled down to: {df.shape}")

## 3. Processing

In [2]:
# Hook tqdm into pandas so that Series objects gain `progress_apply`
tqdm.pandas()

# Announce the pipeline stages in one call; each argument is emitted on its
# own line because of sep="\n".
print(
    "---------------------------------------------------------",
    "‚è≥ Starting NLP Preprocessing Pipeline...",
    "   - Lowercasing",
    "   - Removing 'XXXX' anonymization masks",
    "   - Removing URLs/HTML",
    "   - SpaCy Lemmatization & Stopword Removal",
    "---------------------------------------------------------",
    sep="\n",
)

# Run clean_text over every raw complaint while displaying a progress bar,
# storing the result in a new 'processed_text' column.
df['processed_text'] = df['text'].progress_apply(clean_text)

print("\n‚úÖ Preprocessing complete!")

---------------------------------------------------------
‚è≥ Starting NLP Preprocessing Pipeline...
   - Lowercasing
   - Removing 'XXXX' anonymization masks
   - Removing URLs/HTML
   - SpaCy Lemmatization & Stopword Removal
---------------------------------------------------------


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 333096/333096 [4:58:51<00:00, 18.58it/s]      


‚úÖ Preprocessing complete!





## 4. Final Quality Check and Save

In [None]:
# Remove rows whose cleaned text is effectively empty.
# The filter keeps only rows with MORE than one character, so besides truly
# empty results (e.g., a complaint that was just "XXXX XXXX") any
# single-character leftover is dropped as well.
initial_count = len(df)
df_final = df[df['processed_text'].str.len() > 1]
dropped_count = initial_count - len(df_final)

print(f"Rows dropped (empty after cleaning): {dropped_count}")
print(f"Final rows for modeling: {len(df_final)}")

# Save to processed folder.
# to_csv does not create missing directories, so make sure the target folder
# exists first; otherwise the save would fail only after the long
# preprocessing step has already finished.
output_path = '../data/processed/complaints_processed.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_final.to_csv(output_path, index=False)

print(f"üíæ Processed dataset saved to: {output_path}")

# Show a random sample to verify quality.
# Cap the sample at the number of available rows: sampling more rows than
# exist (without replacement) raises ValueError, which could happen on a
# small test subset.
print("\n--- Cleaning Sample (Before vs After) ---")
sample_size = min(20, len(df_final))  # Show up to 20 random samples
sample = df_final.sample(sample_size, random_state=42)
for index, row in sample.iterrows():
    # Truncate both versions to 100 characters to keep the output readable
    print(f"ORIGINAL: {row['text'][:100]}...")
    print(f"CLEANED:  {row['processed_text'][:100]}...")
    print("-" * 50)

Rows dropped (empty after cleaning): 24
Final rows for modeling: 333072
üíæ Processed dataset saved to: ../data/processed/complaints_processed.csv

--- Cleaning Sample (Before vs After) ---
ORIGINAL: This company has been constantly calling my home for the last XXXX yrs all through out the day. The ...
CLEANED:  company constantly call home yrs day everyday afternoon ask respond numerous time let know person pe...
--------------------------------------------------
ORIGINAL: I was walking through the XXXX XXXX in XXXX, Ohio and was passing by the Kay Jewelers store. A Kay J...
CLEANED:  walk ohio pass kay jeweler store kay jeweler employee ask pass say say need signature day meet goal ...
--------------------------------------------------
ORIGINAL: Amount was paid in full in 2014 but still appear on credit report plz remove...
CLEANED:  pay appear credit report plz remove...
--------------------------------------------------
ORIGINAL: I have been conversing with SunTrust Bank regarding