# Feature Engineering for Hit Song Prediction

## Objectives
1. Create interaction features (e.g., energy × danceability)
2. Add polynomial features for non-linear relationships
3. Engineer temporal features (normalized year, year period)
4. Create domain-specific features
5. Evaluate impact on model performance

---

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import f1_score, recall_score, precision_score, roc_auc_score

# Silence every warning for the rest of the session. This keeps cell output
# compact, but it also hides any warning raised by the libraries used below.
warnings.filterwarnings('ignore')
# Single seed reused for every `random_state` argument in this notebook.
RANDOM_SEED = 42

# All paths hang off the parent of the current working directory.
# NOTE(review): presumably the notebook is launched from a subfolder of the
# project root (e.g. notebooks/) - confirm before running from elsewhere.
project_root = Path.cwd().parent
processed_data_dir = project_root / 'data' / 'processed'
figures_dir = project_root / 'figures'

print("✅ Setup complete")

## 1. Load Original Data

In [None]:
# Read the processed hits dataset; `df` is kept untouched as the baseline
# that the engineered copy is later compared against.
df = pd.read_csv(processed_data_dir / 'hits_dataset.csv')
print(f"Original dataset: {df.shape}")
# Trailing expression: shown as the cell output when run in a notebook.
df.head()

## 2. Create Interaction Features

Combine features that might work together to predict hits.

In [None]:
# Candidate inputs: every column except the target, the raw year, and any
# text (object-dtype) column.
exclude_cols = ['is_hit', 'year'] + df.select_dtypes(include=['object']).columns.tolist()
audio_features = [col for col in df.columns if col not in exclude_cols]

# New columns are added to a copy so `df` stays available as the baseline.
df_engineered = df.copy()

# Each entry describes one derived column:
#   (columns that must be present, new column name, builder, message to print)
# Entries are applied in order; an entry is skipped when any required column
# is missing from `audio_features`.
_interaction_specs = [
    (
        ('energy', 'danceability'),
        'energy_x_danceability',
        lambda d: d['energy'] * d['danceability'],
        "✅ Created: energy × danceability",
    ),
    (
        ('valence', 'energy'),
        'valence_x_energy',
        lambda d: d['valence'] * d['energy'],
        "✅ Created: valence × energy (happy & energetic)",
    ),
    (
        ('loudness', 'energy'),
        'loudness_x_energy',
        lambda d: d['loudness'] * d['energy'],
        "✅ Created: loudness × energy",
    ),
    (
        ('acousticness', 'energy'),
        'acoustic_vs_energy',
        lambda d: d['acousticness'] - d['energy'],
        "✅ Created: acousticness - energy (acoustic contrast)",
    ),
    (
        # Composite: plain mean of the three "upbeat" descriptors.
        ('danceability', 'valence', 'energy'),
        'party_factor',
        lambda d: (d['danceability'] + d['valence'] + d['energy']) / 3,
        "✅ Created: party_factor (avg of dance, valence, energy)",
    ),
]

for _required, _new_col, _build, _message in _interaction_specs:
    if all(_name in audio_features for _name in _required):
        # Builders read from the untouched `df`, never from derived columns.
        df_engineered[_new_col] = _build(df)
        print(_message)

print(f"\nDataset shape after interactions: {df_engineered.shape}")

## 3. Polynomial Features

Capture non-linear relationships.

In [None]:
# Square a small set of key features. Prefer the three named descriptors;
# if any of them is missing, fall back to the first three audio features.
_preferred = ['danceability', 'energy', 'valence']
if all(_name in audio_features for _name in _preferred):
    key_features = _preferred
else:
    key_features = audio_features[:3]

for feature in key_features:
    if feature not in df.columns:
        continue
    df_engineered[f'{feature}_squared'] = df[feature].pow(2)
    print(f"✅ Created: {feature}²")

print(f"\nDataset shape after polynomial features: {df_engineered.shape}")

## 4. Temporal Features

Derive year-based features (a 0–1 normalized year and a coarse three-way year period) when a `year` column is available.

In [None]:
# Year-based features (only built when the dataset has a `year` column)
if 'year' in df.columns:
    _year_min = df['year'].min()
    _year_span = df['year'].max() - _year_min

    # Normalize year (0-1 scale). If every row shares the same year the span
    # is zero and min-max scaling would evaluate 0/0, filling the whole column
    # with NaN; use a constant 0.0 in that case so the column stays usable.
    if _year_span > 0:
        df_engineered['year_normalized'] = (df['year'] - _year_min) / _year_span
    else:
        df_engineered['year_normalized'] = 0.0

    # Year bins (early, mid, late period): three equal-width bins over the
    # observed year range, encoded as the integers 0, 1, 2.
    df_engineered['year_period'] = pd.cut(df['year'], bins=3, labels=[0, 1, 2]).astype(int)

    print("✅ Created: year_normalized, year_period")

print(f"\nFinal engineered dataset shape: {df_engineered.shape}")
print(f"Added {df_engineered.shape[1] - df.shape[1]} new features")

## 5. Prepare Data for Modeling

In [None]:
# Model inputs: every engineered-frame column except the target, the raw
# year, and any text (object-dtype) column.
_text_cols = df_engineered.select_dtypes(include=['object']).columns.tolist()
exclude_cols = ['is_hit', 'year'] + _text_cols
feature_cols = [col for col in df_engineered.columns if col not in exclude_cols]

X = df_engineered[feature_cols].to_numpy()
y = df_engineered['is_hit'].to_numpy()

# 80/20 split, stratified on the target and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_SEED,
    stratify=y,
)

# Scaling statistics come from the training rows only and are then applied
# unchanged to the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set: {X_train.shape}")
print(f"Features: {len(feature_cols)}")

## 6. Compare Original vs Engineered Features

In [None]:
# Baseline feature set: columns of the raw data minus the target, the year,
# and text (object-dtype) columns. The object-column lookup is done once
# rather than once per column.
_object_cols = set(df.select_dtypes(include=['object']).columns)
original_features = [
    col for col in df.columns
    if col not in ('is_hit', 'year') and col not in _object_cols
]
X_orig = df[original_features].values

# Same split parameters as the engineered split, so both models are trained
# and evaluated on the same rows.
X_orig_train, X_orig_test, y_orig_train, y_orig_test = train_test_split(
    X_orig, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y
)

# Scaler is fitted on the baseline training rows only.
scaler_orig = StandardScaler()
X_orig_train_scaled = scaler_orig.fit_transform(X_orig_train)
X_orig_test_scaled = scaler_orig.transform(X_orig_test)

# Original features model
model_orig = LogisticRegression(class_weight='balanced', random_state=RANDOM_SEED, max_iter=1000)
model_orig.fit(X_orig_train_scaled, y_orig_train)
y_pred_orig = model_orig.predict(X_orig_test_scaled)

# Engineered features model
model_eng = LogisticRegression(class_weight='balanced', random_state=RANDOM_SEED, max_iter=1000)
model_eng.fit(X_train_scaled, y_train)
y_pred_eng = model_eng.predict(X_test_scaled)

# Compare
print("\n" + "="*60)
print("ORIGINAL vs ENGINEERED FEATURES COMPARISON")
print("="*60)

# Each model is scored against the labels of its OWN test split. The two
# label vectors coincide today only because both splits share seed and `y`;
# pairing them explicitly keeps the metrics right if either split changes.
# NOTE(review): the baseline excludes `year`, while the engineered set can
# include year-derived columns, so part of any gain may come from year
# information rather than from the interaction/polynomial terms.
metrics = {
    'Features Count': [len(original_features), len(feature_cols)],
    'Precision': [precision_score(y_orig_test, y_pred_orig), precision_score(y_test, y_pred_eng)],
    'Recall': [recall_score(y_orig_test, y_pred_orig), recall_score(y_test, y_pred_eng)],
    'F1 Score': [f1_score(y_orig_test, y_pred_orig), f1_score(y_test, y_pred_eng)]
}

comparison = pd.DataFrame(metrics, index=['Original', 'Engineered'])
print(comparison)

# Relative F1 change in percent. A baseline F1 of 0 would make the ratio
# undefined (inf/nan), so report that case explicitly instead.
_f1_orig = comparison.loc['Original', 'F1 Score']
_f1_eng = comparison.loc['Engineered', 'F1 Score']
if _f1_orig > 0:
    improvement = (_f1_eng - _f1_orig) / _f1_orig * 100
    print(f"\nF1 Score Improvement: {improvement:+.2f}%")
else:
    improvement = float('nan')
    print("\nF1 Score Improvement: n/a (original F1 score is 0)")

## 7. Top Engineered Features

In [None]:
# Rank features by the magnitude of their logistic-regression coefficient
# (the model was fitted on standardized inputs).
_coefs = model_eng.coef_[0]
feature_importance = (
    pd.DataFrame({
        'Feature': feature_cols,
        'Coefficient': _coefs,
        'Abs_Coefficient': np.abs(_coefs),
    })
    .sort_values('Abs_Coefficient', ascending=False)
)

_top10 = feature_importance.head(10)

print("\nTop 10 Most Important Features:")
print("="*60)
for _name, _coef in zip(_top10['Feature'], _top10['Coefficient']):
    # Anything not in the baseline feature list was created in this notebook.
    feature_type = "ORIGINAL" if _name in original_features else "ENGINEERED"
    print(f"{_name:30s} {_coef:+.4f}  [{feature_type}]")

# Count engineered features in top 10
top10_engineered = sum(_name not in original_features for _name in _top10['Feature'])
print(f"\n{top10_engineered}/10 top features are engineered")

## 8. Save Engineered Dataset

In [None]:
# Write the full engineered frame (all columns, no index) next to the input
# dataset, then summarize how many model features were added.
output_file = processed_data_dir.joinpath('hits_dataset_engineered.csv')
df_engineered.to_csv(output_file, index=False)

_n_original = len(original_features)
_n_total = len(feature_cols)

print(f"✅ Saved engineered dataset to: {output_file}")
print(f"   Original features: {_n_original}")
print(f"   Total features: {_n_total}")
print(f"   New features: {_n_total - _n_original}")

---

## ✅ Feature Engineering Complete!

### New Features Created:
1. **Interaction Terms**: energy×danceability, valence×energy, etc.
2. **Polynomial Features**: Squared terms for key features
3. **Domain Features**: party_factor, acoustic_vs_energy (acoustic contrast)
4. **Temporal Features**: year_normalized, year_period

### Impact:
- Original features: Basic Spotify audio features
- Engineered features: Enhanced with domain knowledge
- Can improve model performance by capturing complex patterns

### Usage:
Use `hits_dataset_engineered.csv` in subsequent modeling for potentially better results!

---