In [1]:
# ===============================
# 02_feature_engineering.ipynb
# ===============================

# Cell 1: Imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import sys
sys.path.append('../src')
from data_loader import load_data
from feature_extraction import get_tfidf_features

# Cell 2: Load data
# Read the SMS spam CSV through the project's loader and report its shape.
DATA_PATH = '../data/sms_spam.csv'
df = load_data(DATA_PATH)
print(f"Dataset shape: {df.shape}")

# Cell 3: TF-IDF vectorization
# Targets come straight from the 'label' column; features are produced by the
# project helper, which also hands back the vectorizer object it used.
# NOTE(review): the helper receives every message before the train/test split
# in the next cell. If it fits the vectorizer on its input, held-out messages
# influence the vocabulary / IDF weights -- confirm whether fitting on the
# training portion only is wanted.
y = df['label']
messages = df['message']
X, vectorizer = get_tfidf_features(messages)

print(f"TF-IDF feature matrix shape: {X.shape}")

# Cell 4: Train-test split
# Hold out 20% of the rows, stratified on y, with a fixed seed (42) so the
# same partition is produced on every run.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"Training samples: {n_train}, Test samples: {n_test}")

# Cell 5: Save vectorizer and splits (optional)
# Persist the vectorizer and the four split objects under ../results using
# joblib.
import os

import joblib

# joblib.dump does not create missing parent directories; make sure the
# output folder exists so the cell does not fail with FileNotFoundError on a
# fresh checkout.
os.makedirs('../results', exist_ok=True)

joblib.dump(vectorizer, '../results/tfidf_vectorizer.pkl')
joblib.dump((X_train, X_test, y_train, y_test), '../results/train_test_split.pkl')
print("✅ Vectorizer and splits saved.")


Dataset shape: (3821, 2)
TF-IDF feature matrix shape: (3821, 3000)
Training samples: 3056, Test samples: 765
✅ Vectorizer and splits saved.
