# Feature Engineering for Hit Song Prediction

## Objectives
1. Create interaction features (e.g., energy × danceability)
2. Add polynomial features for non-linear relationships
3. Engineer temporal features (month, day of week)
4. Create domain-specific features
5. Evaluate impact on model performance

---

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import f1_score, recall_score, precision_score, roc_auc_score

# Suppress all warnings so the cell output stays readable.
warnings.simplefilter('ignore')

# Single seed shared by every randomised step in this notebook.
RANDOM_SEED = 42

# The project root is taken to be the parent of the current working
# directory; data and figure folders hang off it.
project_root = Path.cwd().parent
processed_data_dir = project_root.joinpath('data', 'processed')
figures_dir = project_root.joinpath('figures')

print("‚úÖ Setup complete")

## 1. Load Original Data

In [None]:
# Location of the dataset this notebook builds on.
data_file = processed_data_dir.joinpath('hits_dataset.csv')

# Stop early, with instructions, when the prerequisite file is missing.
if not data_file.exists():
    print(
        "‚ùå ERROR: hits_dataset.csv not found!",
        f"   Expected location: {data_file}",
        "\nüìã Please run the following notebooks first:",
        "   1. 01_Week1_Data_Setup_EDA.ipynb",
        "   2. 02_Week2_Baseline_Modeling.ipynb",
        "\nThese notebooks will create the hits_dataset.csv file needed for feature engineering.",
        sep="\n",
    )
    raise FileNotFoundError(f"Required file not found: {data_file}")

df = pd.read_csv(data_file)

# Report the size of what was loaded (rows are samples, columns are features).
n_samples, n_features = df.shape
print(f"‚úÖ Dataset loaded: {df.shape}")
print(f"   Features: {n_features}")
print(f"   Samples: {n_samples}")

# Last expression of the cell: the notebook renders the first rows.
df.head()

## 2. Create Interaction Features

Combine features that might work together to predict hits.

In [None]:
# Candidate audio features: every column except the target, the year and
# any text (object-dtype) column.
exclude_cols = ['is_hit', 'year', *df.select_dtypes(include=['object']).columns]
audio_features = [name for name in df.columns if name not in exclude_cols]

# All engineered columns are added to a copy; `df` itself stays untouched.
df_engineered = df.copy()

# Each entry: (new column, source columns that must all be audio features,
# function building the new column from `df`, message printed on success).
# Entries are processed in order, so column order and output order are fixed.
interaction_specs = [
    (
        'energy_x_danceability',
        ('energy', 'danceability'),
        lambda frame: frame['energy'] * frame['danceability'],
        "‚úÖ Created: energy √ó danceability",
    ),
    (
        'valence_x_energy',
        ('valence', 'energy'),
        lambda frame: frame['valence'] * frame['energy'],
        "‚úÖ Created: valence √ó energy (happy & energetic)",
    ),
    (
        'loudness_x_energy',
        ('loudness', 'energy'),
        lambda frame: frame['loudness'] * frame['energy'],
        "‚úÖ Created: loudness √ó energy",
    ),
    (
        'acoustic_vs_energy',
        ('acousticness', 'energy'),
        lambda frame: frame['acousticness'] - frame['energy'],
        "‚úÖ Created: acousticness - energy (acoustic contrast)",
    ),
    (
        # Composite: plain mean of danceability, valence and energy.
        'party_factor',
        ('danceability', 'valence', 'energy'),
        lambda frame: (frame['danceability'] + frame['valence'] + frame['energy']) / 3,
        "‚úÖ Created: party_factor (avg of dance, valence, energy)",
    ),
]

for new_column, required, build, message in interaction_specs:
    # Skip a feature silently when any of its inputs is unavailable.
    if all(source in audio_features for source in required):
        df_engineered[new_column] = build(df)
        print(message)

print(f"\nDataset shape after interactions: {df_engineered.shape}")

## 3. Polynomial Features

Capture non-linear relationships.

In [None]:
# Square the three preferred features when all of them are available;
# otherwise fall back to the first three audio features.
preferred_features = ['danceability', 'energy', 'valence']
if all(name in audio_features for name in preferred_features):
    key_features = preferred_features
else:
    key_features = audio_features[:3]

for feature in key_features:
    if feature not in df.columns:
        continue
    squared = df[feature] ** 2
    df_engineered[f'{feature}_squared'] = squared
    print(f"‚úÖ Created: {feature}¬≤")

print(f"\nDataset shape after polynomial features: {df_engineered.shape}")

## 4. Temporal Features

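Derive year-based features (a normalized year and a three-way period bin) when a `year` column is available. Month, season, and day-of-week features would require a full release date and are not computed in this notebook.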

In [None]:
# Year-derived features; skipped entirely when the dataset has no 'year'.
if 'year' not in df.columns:
    print("‚ö†Ô∏è  No 'year' column found, skipping temporal features")
else:
    years = df['year']
    year_min = years.min()
    year_max = years.max()

    # Min-max scale the year to [0, 1]. When the maximum is not strictly
    # greater than the minimum there is nothing to scale, so a constant
    # 0.5 is used instead of dividing by zero.
    if not year_max > year_min:
        df_engineered['year_normalized'] = 0.5
        print("‚ö†Ô∏è  Warning: All songs from the same year, year_normalized set to 0.5")
    else:
        df_engineered['year_normalized'] = (years - year_min) / (year_max - year_min)

    # Three equal-width year bins coded 0/1/2 (early, mid, late). Binning is
    # only attempted when the year span is at least 2; any failure while
    # binning falls back to the constant middle code.
    try:
        span_is_wide_enough = year_max - year_min >= 2
        if span_is_wide_enough:
            period_codes = pd.cut(years, bins=3, labels=[0, 1, 2])
            df_engineered['year_period'] = period_codes.astype(int)
        else:
            df_engineered['year_period'] = 1
            print(f"‚ö†Ô∏è  Warning: Year range too small ({year_min}-{year_max}), year_period set to 1")
    except Exception as err:
        print(f"‚ö†Ô∏è  Warning: Could not create year_period bins: {err}")
        df_engineered['year_period'] = 1

    print("‚úÖ Created: year_normalized, year_period")

# Summary of everything added so far, relative to the loaded dataset.
print(f"\nFinal engineered dataset shape: {df_engineered.shape}")
print(f"Added {df_engineered.shape[1] - df.shape[1]} new features")

## 5. Feature Importance Analysis

In [None]:
# Model inputs: every engineered-dataset column except the target, the raw
# year and text (object-dtype) columns.
text_columns = df_engineered.select_dtypes(include=['object']).columns.tolist()
exclude_cols = ['is_hit', 'year'] + text_columns
feature_cols = [name for name in df_engineered.columns if name not in exclude_cols]

X = df_engineered[feature_cols].values
y = df_engineered['is_hit'].values

# 80/20 split, stratified on the target and seeded for reproducibility.
split_options = dict(test_size=0.2, random_state=RANDOM_SEED, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Standardise using statistics learned from the training rows only, then
# apply the same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set: {X_train.shape}")
print(f"Features: {len(feature_cols)}")

## 6. Compare Original vs Engineered Features

In [None]:
# Columns of the ORIGINAL dataset usable as model inputs: everything except
# the target, the year and text (object-dtype) columns. The excluded names
# are collected once instead of re-running select_dtypes for every column.
non_feature_cols = {'is_hit', 'year', *df.select_dtypes(include=['object']).columns}
original_features = [col for col in df.columns if col not in non_feature_cols]

# Split row indices with the same test size, seed and stratification target
# as the engineered-feature split, so both models see the same train/test rows.
X_orig = df[original_features].values
indices = np.arange(len(y))
train_idx, test_idx = train_test_split(indices, test_size=0.2, random_state=RANDOM_SEED, stratify=y)

# Apply the index split to the original features
X_orig_train = X_orig[train_idx]
X_orig_test = X_orig[test_idx]
y_orig_train = y[train_idx]
y_orig_test = y[test_idx]

# Verify both splits agree BEFORE spending time on training. Raised
# explicitly (rather than via `assert`) so the check survives `python -O`.
if not np.array_equal(y_orig_test, y_test):
    raise AssertionError("Test sets don't match! Check random seed.")

# Scale original features (statistics learned from the training rows only)
scaler_orig = StandardScaler()
X_orig_train_scaled = scaler_orig.fit_transform(X_orig_train)
X_orig_test_scaled = scaler_orig.transform(X_orig_test)

# Train the baseline model on the original features
model_orig = LogisticRegression(class_weight='balanced', random_state=RANDOM_SEED, max_iter=1000)
model_orig.fit(X_orig_train_scaled, y_orig_train)
y_pred_orig = model_orig.predict(X_orig_test_scaled)

# Train the engineered-features model on the already split and scaled
# engineered matrices, using identical hyper-parameters.
model_eng = LogisticRegression(class_weight='balanced', random_state=RANDOM_SEED, max_iter=1000)
model_eng.fit(X_train_scaled, y_train)
y_pred_eng = model_eng.predict(X_test_scaled)

# Compare metrics on the SAME test set
print("\n" + "="*60)
print("ORIGINAL vs ENGINEERED FEATURES COMPARISON")
print("="*60)

metrics = {
    'Features Count': [len(original_features), len(feature_cols)],
    'Precision': [precision_score(y_orig_test, y_pred_orig), precision_score(y_test, y_pred_eng)],
    'Recall': [recall_score(y_orig_test, y_pred_orig), recall_score(y_test, y_pred_eng)],
    'F1 Score': [f1_score(y_orig_test, y_pred_orig), f1_score(y_test, y_pred_eng)]
}

comparison = pd.DataFrame(metrics, index=['Original', 'Engineered'])
print(comparison)

# Relative F1 change in percent. A baseline F1 of 0 makes the ratio
# undefined, so report that explicitly instead of printing inf/nan.
f1_original = comparison.loc['Original', 'F1 Score']
f1_engineered = comparison.loc['Engineered', 'F1 Score']
if f1_original > 0:
    improvement = (f1_engineered - f1_original) / f1_original * 100
    print(f"\nF1 Score Improvement: {improvement:+.2f}%")
else:
    improvement = float('nan')
    print("\nF1 Score Improvement: n/a (original F1 score is 0)")

## 7. Top Engineered Features

In [None]:
# Rank features by the magnitude of their logistic-regression coefficient
# (the model was fitted on standardised inputs).
coefficients = model_eng.coef_[0]
feature_importance = pd.DataFrame({
    'Feature': feature_cols,
    'Coefficient': coefficients,
    'Abs_Coefficient': np.abs(coefficients)
}).sort_values('Abs_Coefficient', ascending=False)

# Take the top slice once; it holds fewer than 10 rows when the model has
# fewer than 10 features, and the printed counts below reflect that.
top_features = feature_importance.head(10)
original_feature_set = set(original_features)  # O(1) membership tests

print(f"\nTop {len(top_features)} Most Important Features:")
print("="*60)

# Count engineered features while listing the top entries
top10_engineered = 0
for name, coefficient in zip(top_features['Feature'], top_features['Coefficient']):
    is_engineered = name not in original_feature_set
    top10_engineered += int(is_engineered)
    feature_type = "ENGINEERED" if is_engineered else "ORIGINAL"
    print(f"{name:30s} {coefficient:+.4f}  [{feature_type}]")

print(f"\n{top10_engineered}/{len(top_features)} top features are engineered")

## 8. Save Engineered Dataset

In [None]:
# Destination for the engineered dataset; create its folder if needed.
output_file = processed_data_dir.joinpath('hits_dataset_engineered.csv')
output_file.parent.mkdir(parents=True, exist_ok=True)

# Write without the index so the CSV holds only dataset columns.
df_engineered.to_csv(output_file, index=False)

# Gather the figures for the summary, then print it in one go.
n_original = len(original_features)
n_total = len(feature_cols)
n_rows, n_columns = df_engineered.shape
size_kb = output_file.stat().st_size / 1024

summary_lines = [
    f"‚úÖ Saved engineered dataset to: {output_file}",
    f"   Original features: {n_original}",
    f"   Total features: {n_total}",
    f"   New features: {n_total - n_original}",
    "\nüìä Dataset info:",
    f"   Rows: {n_rows:,}",
    f"   Columns: {n_columns}",
    f"   File size: {size_kb:.1f} KB",
]
print("\n".join(summary_lines))

---

## ✅ Feature Engineering Complete!

### New Features Created:
1. **Interaction Terms**: energy × danceability, valence × energy, etc.
2. **Polynomial Features**: Squared terms for key features
3. **Domain Features**: party_factor, acoustic_vs_energy (acoustic contrast)
4. **Temporal Features**: year_normalized, year_period

### Impact:
- Original features: Basic Spotify audio features
- Engineered features: Enhanced with domain knowledge
- Can improve model performance by capturing complex patterns

### Usage:
Use `hits_dataset_engineered.csv` in subsequent modeling for potentially better results!

---