# 📊 EDA & Feature Engineering — Interview Patterns

This notebook covers common EDA and feature engineering techniques tested in ML interviews.

**Topics:**
- Data loading and inspection
- Handling missing values
- Feature scaling (StandardScaler, MinMax)
- Encoding categorical variables
- Feature selection
- Cross-validation

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global RNG so every np.random draw in the cells below is reproducible.
np.random.seed(42)
print('✅ Setup complete')

## 1. Generate Synthetic Dataset

We'll create a realistic dataset with mixed types, missing values, and outliers.

In [None]:
# Number of synthetic samples; every array built below has length n.
n = 500

# Numeric features
# age: Normal(35, 10) draws, clipped to [18, 70] and truncated to integers.
age = np.random.normal(35, 10, n).clip(18, 70).astype(int)
# income: linear in age (1200 per year of age) plus Gaussian noise (sd 10000).
# It is not clipped, so individual values may fall below zero.
income = age * 1200 + np.random.normal(0, 10000, n)
# credit_score: Normal(700, 50) draws, clipped to [300, 850] and truncated to integers.
credit_score = np.random.normal(700, 50, n).clip(300, 850).astype(int)

# Categorical features (drawn independently of the numeric ones, with fixed class probabilities)
education = np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], n, p=[0.3, 0.4, 0.2, 0.1])
employment = np.random.choice(['Employed', 'Self-Employed', 'Unemployed'], n, p=[0.7, 0.2, 0.1])

# Target (binary)
# prob is a logistic (sigmoid) function of income and credit score; the Gaussian
# noise term sits inside the logit. `approved` is then one Bernoulli draw per sample.
prob = 1 / (1 + np.exp(-(income/30000 + credit_score/200 - 6 + np.random.normal(0, 0.5, n))))
approved = (np.random.random(n) < prob).astype(int)

# Inject missing values: 25 income entries (5% of n=500) and 15 credit entries (3%),
# each set chosen at distinct random positions (replace=False). The counts are
# hard-coded rather than derived from n.
# credit_score is an integer array, so it must be cast to float before it can hold NaN.
income_with_na = income.copy().astype(float)
income_with_na[np.random.choice(n, 25, replace=False)] = np.nan
credit_with_na = credit_score.copy().astype(float)
credit_with_na[np.random.choice(n, 15, replace=False)] = np.nan

# Summary of what was generated.
print(f'Dataset: {n} samples')
print(f'Missing income: {np.isnan(income_with_na).sum()}')
print(f'Missing credit: {np.isnan(credit_with_na).sum()}')
print(f'Approval rate: {approved.mean():.1%}')

## 2. Handling Missing Values

Three approaches: **mean**, **median**, and **indicator variable**.

In [None]:
def impute_with_indicator(arr, strategy='median'):
    """Fill NaN entries with a summary statistic and report where they were.

    Args:
        arr: Numeric array (or array-like) in which NaN marks a missing value.
        strategy: 'mean' or 'median'; the statistic is computed over the
            non-missing entries of the whole input.

    Returns:
        A tuple ``(imputed, is_missing)`` where ``imputed`` is ``arr`` with
        every NaN replaced by the fill value and ``is_missing`` is a float
        array of the same shape holding 1.0 where the input was NaN and
        0.0 elsewhere.

    Raises:
        ValueError: If ``strategy`` is not recognised, or if the input is
            non-empty and every entry is missing (no fill value exists).
    """
    if strategy == 'mean':
        reducer = np.nanmean
    elif strategy == 'median':
        reducer = np.nanmedian
    else:
        raise ValueError(f'Unknown strategy: {strategy}')

    # Compute the mask once; it drives both the fill and the indicator.
    missing = np.isnan(arr)

    # With no observed values the statistic itself is NaN, so "imputing"
    # would silently hand back an all-NaN array. Fail loudly instead.
    if missing.size and missing.all():
        raise ValueError('Cannot impute: every value is missing')

    fill_value = reducer(arr)
    imputed = np.where(missing, fill_value, arr)
    return imputed, missing.astype(float)

# Median-impute both columns, keeping a 0/1 "was missing" flag for each.
income_clean, income_missing_flag = impute_with_indicator(income_with_na, strategy='median')
credit_clean, credit_missing_flag = impute_with_indicator(credit_with_na, strategy='median')

# Report the fill values and confirm that no NaN survived imputation.
print(f'Income imputed with median: {np.nanmedian(income_with_na):.0f}')
print(f'Credit imputed with median: {np.nanmedian(credit_with_na):.0f}')
remaining_nan = np.count_nonzero(np.isnan(np.concatenate([income_clean, credit_clean])))
print(f'Remaining NaN: {remaining_nan}')

## 3. Feature Scaling

**StandardScaler**: `z = (x - μ) / σ` → mean=0, std=1  
**MinMax**: `z = (x - min) / (max - min)` → range [0, 1]

In [None]:
class StandardScaler:
    """Standardize each column to zero mean and unit variance: z = (x - mean) / std.

    ``fit`` stores two attributes:
      * ``mean_`` - per-column mean of the fitted data
      * ``std_``  - per-column population standard deviation (NumPy's default ddof=0)
    """

    def fit(self, X):
        """Learn the per-column mean and standard deviation of ``X``; return ``self``."""
        self.mean_ = np.mean(X, axis=0)
        self.std_ = np.std(X, axis=0)
        return self

    def transform(self, X):
        """Return ``(X - mean_) / std_`` using the statistics learned by ``fit``.

        Columns whose standard deviation is zero, or indistinguishable from
        floating-point rounding noise relative to their mean, are treated as
        constant and divided by 1, so they map to (approximately) 0 rather
        than dividing by zero. All other columns are divided by their true
        standard deviation; no epsilon is added, so the result is not biased
        and small-scale features are not squashed.
        """
        std = np.asarray(self.std_)
        # A spread below ~10 ulp of the column's magnitude cannot be told
        # apart from rounding error in the mean, so call the column constant.
        noise_floor = 10 * np.finfo(std.dtype).eps * np.abs(self.mean_)
        divisor = np.where(std <= noise_floor, 1.0, std)
        return (X - self.mean_) / divisor

    def fit_transform(self, X):
        """Fit on ``X`` and return the standardized ``X``."""
        return self.fit(X).transform(X)

class MinMaxScaler:
    """Rescale each column linearly to [0, 1]: z = (x - min) / (max - min).

    ``fit`` stores two attributes:
      * ``min_`` - per-column minimum of the fitted data
      * ``max_`` - per-column maximum of the fitted data
    """

    def fit(self, X):
        """Learn the per-column minimum and maximum of ``X``; return ``self``."""
        self.min_ = np.min(X, axis=0)
        self.max_ = np.max(X, axis=0)
        return self

    def transform(self, X):
        """Return ``(X - min_) / (max_ - min_)`` using the range learned by ``fit``.

        Constant columns (``max_ == min_``) are divided by 1 instead, so they
        map to 0 rather than dividing by zero. Other columns are divided by
        their true range with no epsilon added, so the fitted maximum maps to
        exactly 1 and features with a very small range are not compressed.
        Values outside the fitted range fall outside [0, 1].
        """
        span = self.max_ - self.min_
        divisor = np.where(span == 0, 1.0, span)
        return (X - self.min_) / divisor

    def fit_transform(self, X):
        """Fit on ``X`` and return the rescaled ``X``."""
        return self.fit(X).transform(X)

# Compare: stack the three numeric columns, scale them both ways, and plot
# the per-feature histograms of each version side by side.
X_numeric = np.column_stack((age, income_clean, credit_clean))
labels = ['Age', 'Income', 'Credit Score']

X_standard = StandardScaler().fit_transform(X_numeric)
X_minmax = MinMaxScaler().fit_transform(X_numeric)

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
panels = (
    ('Original', X_numeric),
    ('StandardScaler', X_standard),
    ('MinMaxScaler', X_minmax),
)
for ax, (title, data) in zip(axes, panels):
    # One overlaid histogram per feature (the columns of `data`).
    for label, column in zip(labels, data.T):
        ax.hist(column, bins=30, alpha=0.6, label=label)
    ax.set_title(title)
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Encoding Categorical Variables

**One-Hot Encoding** for nominal categories, **Ordinal Encoding** for ordered categories.

In [None]:
def one_hot_encode(arr, *, drop_first=False):
    """One-hot encode a 1-D categorical array.

    Args:
        arr: Sequence or array of category values (all mutually sortable).
        drop_first: If True, omit the column of the first (smallest)
            category so the remaining columns are not perfectly collinear
            with an intercept. Defaults to False (one column per category).

    Returns:
        A tuple ``(encoded, categories)``. ``encoded`` is a float array of
        shape ``(len(arr), len(categories))`` with a single 1 per row (or no
        1 for rows of the dropped category). ``categories`` is the sorted
        list of category values, as plain Python objects, in column order.
    """
    values = np.asarray(arr).ravel()
    # np.unique returns the sorted distinct values plus, for every element,
    # the index of its category - exactly the column that must be set to 1.
    uniques, codes = np.unique(values, return_inverse=True)

    encoded = np.zeros((values.size, uniques.size))
    encoded[np.arange(values.size), codes.ravel()] = 1

    categories = uniques.tolist()
    if drop_first:
        encoded, categories = encoded[:, 1:], categories[1:]
    return encoded, categories

def ordinal_encode(arr, order):
    """Map each value of ``arr`` to its position in ``order``.

    Returns a column vector of shape ``(len(arr), 1)``. A value that does
    not appear in ``order`` raises ``KeyError``.
    """
    rank_of = {}
    for position, category in enumerate(order):
        rank_of[category] = position

    ranks = np.array(list(map(rank_of.__getitem__, arr)))
    return ranks[:, np.newaxis]

# One-hot for employment (nominal: the categories have no inherent order)
emp_encoded, emp_cats = one_hot_encode(employment)
print(f'Employment one-hot: {emp_cats}')
print(f'Shape: {emp_encoded.shape}')
print(f'First 3 rows:\n{emp_encoded[:3]}\n')

# Ordinal for education (edu_order lists the levels from lowest to highest)
edu_order = ['High School', 'Bachelor', 'Master', 'PhD']
edu_encoded = ordinal_encode(education, edu_order)
# The displayed codes are derived from edu_order so the message cannot drift
# out of sync if a level is added or removed.
print(f'Education ordinal: {edu_order} → {list(range(len(edu_order)))}')
print(f'First 5: {edu_encoded[:5].ravel()}')

## 5. K-Fold Cross-Validation from Scratch

In [None]:
def k_fold_split(n_samples, k=5, shuffle=True):
    """Generate k-fold train/validation index splits.

    The indices ``0 .. n_samples-1`` are cut into ``k`` contiguous folds
    whose sizes differ by at most one (the first ``n_samples % k`` folds get
    the extra sample). Each fold serves once as the validation set while the
    remaining indices form the training set.

    Args:
        n_samples: Total number of samples.
        k: Number of folds; must satisfy ``2 <= k <= n_samples``.
        shuffle: If True, shuffle the indices first using NumPy's global
            RNG (so ``np.random.seed`` controls reproducibility).

    Returns:
        A list of ``k`` tuples ``(train_idx, val_idx)`` of index arrays.

    Raises:
        ValueError: If ``k`` is outside ``[2, n_samples]``; such a value
            would produce an empty training or validation set.
    """
    if not 2 <= k <= n_samples:
        raise ValueError(f'k must be between 2 and n_samples={n_samples}, got {k}')

    indices = np.arange(n_samples)
    if shuffle:
        np.random.shuffle(indices)

    folds = []
    start = 0
    # array_split spreads the remainder over the leading folds instead of
    # piling it all onto the last one.
    for val_idx in np.array_split(indices, k):
        stop = start + len(val_idx)
        train_idx = np.concatenate([indices[:start], indices[stop:]])
        folds.append((train_idx, val_idx))
        start = stop

    return folds

# Demo: simple accuracy per fold using nearest-centroid
# Design matrix: standardized numerics, ordinal education, one-hot employment,
# and the two "was missing" indicator flags.
# NOTE: X_standard and the imputed columns were computed from all rows before
# splitting, so each validation fold contributed to its own preprocessing
# statistics (mild leakage). For a strict estimate, refit imputation and
# scaling on the training rows inside the loop.
X_full = np.column_stack([X_standard, edu_encoded, emp_encoded, income_missing_flag, credit_missing_flag])
y_full = approved

folds = k_fold_split(len(X_full), k=5)
fold_accs = []

for i, (train_idx, val_idx) in enumerate(folds):
    # Simple nearest-centroid classifier
    X_tr, y_tr = X_full[train_idx], y_full[train_idx]
    X_val, y_val = X_full[val_idx], y_full[val_idx]

    # Class centroids are estimated from the training rows only.
    c0 = X_tr[y_tr == 0].mean(axis=0)
    c1 = X_tr[y_tr == 1].mean(axis=0)

    # Euclidean distance from each validation row to each centroid;
    # predict class 1 only when strictly closer to c1 (ties go to class 0).
    d0 = np.linalg.norm(X_val - c0, axis=1)
    d1 = np.linalg.norm(X_val - c1, axis=1)
    preds = (d1 < d0).astype(int)

    acc = np.mean(preds == y_val)
    fold_accs.append(acc)
    print(f'Fold {i+1}: Accuracy = {acc:.2%} (train={len(train_idx)}, val={len(val_idx)})')

print(f'\n📊 Mean CV Accuracy: {np.mean(fold_accs):.2%} ± {np.std(fold_accs):.2%}')

## 💡 Key Takeaways

| Technique | When to Use | Interview Tip |
|-----------|------------|---------------|
| **Mean/Median Imputation** | Small % of missing values | Always mention adding a missing indicator |
| **StandardScaler** | SVM, Logistic Regression, Neural Nets | Say "centers data, unit variance" |
| **MinMaxScaler** | When you need bounded range [0,1] | Works well for image pixels, distances |
| **One-Hot Encoding** | Nominal categories (no order) | Drop one column to avoid multicollinearity |
| **Ordinal Encoding** | Ordered categories | Preserves natural ordering |
| **K-Fold CV** | Model evaluation | Always use stratified for imbalanced data |