In [35]:
import pandas as pd
import re
import unicodedata
from typing import List, Optional

In [36]:
class TextCleaner:
    """Normalize free text and strip markup, noise tokens, and stopwords.

    The pipeline applied by :meth:`clean_text` is:

    1. ASCII-fold and lowercase the text (:meth:`normalize_text`).
    2. Replace HTML tags, URLs, email addresses, standalone numbers, and
       punctuation with spaces (:meth:`remove_patterns`).
    3. Drop stopwords and tokens shorter than ``min_word_length``.

    Args:
        min_word_length: Minimum number of characters a token must have to
            be kept.
        custom_stopwords: Words to drop instead of the built-in list.
            ``None`` selects the built-in list; an empty list disables
            stopword filtering. Entries are passed through
            :meth:`normalize_text`, so they match the (lowercased,
            ASCII-folded) tokens regardless of how they were spelled.

    Attributes:
        min_word_length: The value passed to the constructor.
        stopwords: Set of normalized stopwords used by :meth:`clean_text`.
    """

    # Built-in stopword list used when no custom list is supplied.
    _DEFAULT_STOPWORDS = (
        "the", "a", "an", "and", "or", "but", "in", "on", "at", "to",
        "for", "of", "with", "by", "from", "up", "about", "into", "over",
        "after", "is", "are", "was", "were",
    )

    # Compiled once at class creation. They are applied in this order:
    # punctuation must come last, otherwise the '<', '/', '@' and '.'
    # characters the earlier patterns rely on would already be gone.
    _PATTERNS = tuple(
        re.compile(pattern)
        for pattern in (
            r"<[^>]+>",  # HTML tags
            r"https?://\S+|www\.\S+",  # URLs
            r"\S+@\S+",  # Email addresses
            r"\b\d+\b",  # Numeric tokens
            r"[^\w\s]",  # Punctuation (underscore is a \w char, so it stays)
        )
    )

    def __init__(
        self, min_word_length: int = 3, custom_stopwords: Optional[List[str]] = None
    ):
        self.min_word_length = min_word_length
        # `is None` rather than truthiness: an explicit empty list means
        # "no stopwords", not "fall back to the defaults".
        words = (
            self._DEFAULT_STOPWORDS if custom_stopwords is None else custom_stopwords
        )
        # Tokens are compared after normalize_text(), so the stopwords must be
        # normalized the same way or mixed-case/accented entries never match.
        self.stopwords = {self.normalize_text(word) for word in words}

    def normalize_text(self, text: str) -> str:
        """Return ``text`` as lowercase ASCII.

        The input is converted with ``str()``, decomposed with Unicode NFKD
        so accented letters split into a base letter plus combining marks,
        and then encoded to ASCII with ``errors="ignore"``. Combining marks
        and any other non-ASCII characters are therefore dropped.
        """
        ascii_bytes = unicodedata.normalize("NFKD", str(text)).encode(
            "ascii", "ignore"
        )
        return ascii_bytes.decode("utf-8").lower()

    def remove_patterns(self, text: str) -> str:
        """Replace markup and noise in ``text`` with single spaces.

        Removes, in order: HTML tags, URLs, email addresses, standalone
        digit runs, and punctuation. Each match becomes one space, so the
        result may contain runs of whitespace.
        """
        for pattern in self._PATTERNS:
            text = pattern.sub(" ", text)

        return text

    def clean_text(self, text: str) -> str:
        """Run the full cleaning pipeline on one value.

        Args:
            text: The value to clean. Missing values (as reported by
                ``pd.isna``) produce an empty string.

        Returns:
            The surviving tokens joined by single spaces: lowercase ASCII
            words that are not stopwords and have at least
            ``min_word_length`` characters.
        """
        if pd.isna(text):
            return ""

        text = self.normalize_text(text)
        text = self.remove_patterns(text)

        cleaned_words = [
            word
            for word in text.split()
            if word not in self.stopwords and len(word) >= self.min_word_length
        ]

        return " ".join(cleaned_words)

In [37]:
# Read the merged news CSV into a DataFrame
merged_csv = '../data/processed/merged_news.csv'
df = pd.read_csv(merged_csv)

# Show which columns the file provides
print(df.columns)

Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')


In [38]:
# Build a TextCleaner with its default settings
# (min_word_length=3 and the built-in stopword list)
cleaner = TextCleaner()

In [39]:
# Run every value of the 'text' column through the cleaner and store the
# result in a new 'cleaned_text' column; the other columns are not touched
df['cleaned_text'] = df['text'].map(cleaner.clean_text)

In [40]:
# Write the DataFrame (now including 'cleaned_text') to CSV, omitting the
# row index
cleaned_csv = '../data/processed/cleaned_news.csv'
df.to_csv(cleaned_csv, index=False)