### Feature Engineering Best Practices: Handling Text Data
**Question**: Load a dataset with text data (e.g., SMS Spam Collection), perform text
preprocessing, and extract numerical features using TF-IDF.

In [2]:
import pandas as pd
import string
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import unittest

# ---------------------------
# Error-safe Dataset Loader
# ---------------------------
def load_sms_dataset(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Will raise HTTPError for bad responses
        df = pd.read_csv(url, sep='\t', header=None, names=['label', 'message'])
        assert 'label' in df.columns and 'message' in df.columns
        return df
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None

# ---------------------------
# Preprocessing Function
# ---------------------------
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

# ---------------------------
# Main TF-IDF Pipeline
# ---------------------------
def run_pipeline():
    url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
    df = load_sms_dataset(url)
    if df is None:
        return

    df['clean_message'] = df['message'].apply(preprocess_text)
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(df['clean_message'])

    df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})
    y = df['label_num']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print(f"TF-IDF matrix shape: {X.shape}")
    print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")
    print(f"Train samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")

# ---------------------------
# Unit Tests
# ---------------------------
class TestPreprocessing(unittest.TestCase):

    def test_lowercase_conversion(self):
        self.assertEqual(preprocess_text("Hello World!"), "hello world")

    def test_punctuation_removal(self):
        self.assertEqual(preprocess_text("Yes!!! Great?"), "yes great")

    def test_empty_string(self):
        self.assertEqual(preprocess_text(""), "")

# ---------------------------
# Run everything
# ---------------------------
if __name__ == "__main__":
    run_pipeline()
    unittest.main(argv=['first-arg-is-ignored'], exit=False)


...
----------------------------------------------------------------------
Ran 3 tests in 0.002s

OK


TF-IDF matrix shape: (5572, 9277)
Vocabulary size: 9277
Train samples: 4457, Test samples: 1115
