# Feature Engineering for Time Series Forecasting
## Creating 73 Features from Daily Revenue Data

**Objectives:**
- Create temporal features (calendar + cyclical encoding)
- Create lag features (1, 2, 3, 7, 14, 21, 28 days)
- Create rolling statistics (mean, std, min, max)
- Create domain-specific features (momentum, RSI, volatility)
- Create interaction features
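- **CRITICAL:** Avoid data leakage: every feature derived from revenue must be shifted so it only uses values from before the day being predicted (calendar features are known in advance and need no shift)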

In [None]:
# Import libraries
import sys
sys.path.insert(0, '../src')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from features.feature_engineering import TimeSeriesFeatureEngine

# Display settings
pd.set_option('display.max_columns', None)  # show every column of wide feature tables

# The seaborn-derived style sheets were renamed with a "seaborn-v0_8-" prefix in
# Matplotlib 3.6 and the unprefixed names were removed later, so use whichever
# name this installation provides rather than failing on a cosmetic setting.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    print('Note: seaborn darkgrid style not available; using Matplotlib defaults')

print('✓ Libraries imported')

## 1. Load Processed Daily Revenue Data

In [None]:
# Load daily revenue time series, indexed by the parsed 'date' column
df = pd.read_csv('../data/processed/daily_revenue.csv',
                 index_col='date', parse_dates=True)

# The CSV's row order is not guaranteed. The features built later (lags,
# rolling windows) are presumably computed from preceding rows - TODO confirm
# against TimeSeriesFeatureEngine - so make sure rows are in chronological
# order. A stable sort keeps the relative order of any duplicate dates.
if not df.index.is_monotonic_increasing:
    df = df.sort_index(kind='stable')
    print("Note: rows were not in chronological order; sorted by date")

print(f"Dataset shape: {df.shape}")
print(f"Date range: {df.index.min()} to {df.index.max()}")
print(f"Total days: {len(df)}")

# Last expression of the cell: rendered by the notebook as a preview table
df.head(10)

In [None]:
# Summary statistics for the raw target series
revenue = df['revenue']
print("Daily Revenue Statistics:")
print(revenue.describe())

# Line chart of the raw series over time (single axes, wide aspect)
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(df.index, revenue, linewidth=2, alpha=0.7)
ax.set_xlabel('Date', fontsize=11)
ax.set_ylabel('Revenue ($)', fontsize=11)
ax.set_title('Daily Revenue - Raw Data', fontsize=13, fontweight='bold')
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

## 2. Initialize Feature Engineering Pipeline

In [None]:
# Build the feature engine. The lookback window is named once so the
# confirmation message reports the value that was actually passed in.
lookback_days = 28
fe = TimeSeriesFeatureEngine(lookback_days=lookback_days)

print(f"Feature engine initialized with lookback_days={lookback_days}")

## 3. Create All Features

### Feature Categories:
1. **Temporal (22 features)**: Day of week, month, quarter, cyclical encoding
2. **Lag (7 features)**: Past values shifted by 1, 2, 3, 7, 14, 21, 28 days
3. **Rolling (24 features)**: Rolling statistics with windows 3, 7, 14, 28
4. **Expanding (4 features)**: Cumulative statistics
5. **Domain (13 features)**: Momentum, RSI, volatility, distance from MA
6. **Interaction (3 features)**: Feature combinations

In [None]:
# Run the full feature pipeline on the 'revenue' column. The result is
# unpacked into the feature frame and the collection of feature column names.
df_features, feature_cols = fe.create_all_features(df, target_col='revenue')

n_features = len(feature_cols)
print(
    f"\n✓ Created {n_features} features",
    f"Dataset shape: {df_features.shape}",
    sep="\n",
)

## 4. Explore Created Features

In [None]:
# Ask the engine for its feature grouping (mapping of group name -> features)
groups = fe.get_feature_importance_groups()

print("Feature Groups Breakdown:")
print("=" * 50)
# One entry per group: its size, up to three example names, then a blank line
for label, members in groups.items():
    print(
        f"{label:15s}: {len(members):3d} features",
        f"  Examples: {members[:3]}",
        "",
        sep="\n",
    )

In [None]:
# Count missing values per feature column and report the worst offenders
print("Missing Values Analysis:")
print("=" * 50)

missing = df_features[feature_cols].isna().sum()
has_gaps = missing.gt(0)
missing_features = missing.loc[has_gaps].sort_values(ascending=False)

print(f"Features with missing values: {len(missing_features)}")
print("\nTop 10 features with most missing values:")
print(missing_features.head(10))

print("\nNote: Missing values are expected for lag/rolling features at the beginning of the series.")

In [None]:
# Preview a hand-picked subset of columns: the target next to a few
# calendar, lag, rolling and change features.
print("Sample Features (first 10 rows, selected features):")
sample_features = [
    'revenue',
    'dayofweek',
    'is_weekend',
    'revenue_lag_1',
    'revenue_lag_7',
    'revenue_rolling_mean_7',
    'revenue_change_1d',
    'revenue_pct_change_1d',
]
# Last expression of the cell: rendered by the notebook as a table
df_features.loc[:, sample_features].head(10)

## 5. Visualize Key Features

In [None]:
# 2x2 diagnostic grid: three scatter panels of today's revenue against a
# past-looking feature, plus a histogram of the day-over-day change.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

revenue_today = df_features['revenue']

# (grid position, feature column, x-axis label, panel title, extra scatter kwargs)
scatter_panels = [
    ((0, 0), 'revenue_lag_1', 'Revenue Lag 1 (yesterday)',
     'Revenue vs Lag 1', {}),
    ((0, 1), 'revenue_lag_7', 'Revenue Lag 7 (last week)',
     'Revenue vs Lag 7 (Weekly Pattern)', {'color': 'orange'}),
    ((1, 0), 'revenue_rolling_mean_7', '7-Day Rolling Mean',
     'Revenue vs 7-Day Rolling Mean', {'color': 'green'}),
]

for position, column, x_label, title, extra in scatter_panels:
    panel = axes[position]
    panel.scatter(df_features[column], revenue_today, alpha=0.6, **extra)
    panel.set_xlabel(x_label, fontsize=10)
    panel.set_ylabel('Revenue (today)', fontsize=10)
    panel.set_title(title, fontsize=11, fontweight='bold')
    panel.grid(alpha=0.3)

# Bottom-right panel: distribution of the one-day change (NaN rows dropped)
change_panel = axes[1, 1]
daily_change = df_features['revenue_change_1d'].dropna()
change_panel.hist(daily_change, bins=30, alpha=0.7, color='purple')
change_panel.set_xlabel('Daily Revenue Change ($)', fontsize=10)
change_panel.set_ylabel('Frequency', fontsize=10)
change_panel.set_title('Distribution of Daily Revenue Change', fontsize=11, fontweight='bold')
change_panel.grid(alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## 6. Prepare for Modeling

In [None]:
# Split the feature frame into model inputs X and target y, asking the engine
# to drop rows containing NaN (dropna=True).
X, y = fe.prepare_for_modeling(df_features, target_col='revenue', dropna=True)

first_date = X.index.min()
last_date = X.index.max()

print("Final dataset after dropping NaN:")
print(
    f"  X shape: {X.shape}",
    f"  y shape: {y.shape}",
    f"  Date range: {first_date} to {last_date}",
    f"  Total samples: {len(X)}",
    sep="\n",
)

In [None]:
# Rank features by the magnitude of their correlation with the target
print("Top 20 Features Most Correlated with Revenue:")
print("=" * 50)

abs_corr = X.corrwith(y).abs()  # sign is discarded; only strength matters here
correlations = abs_corr.sort_values(ascending=False)
print(correlations.head(20))

In [None]:
# Horizontal bar chart of the 15 strongest absolute correlations
top_corr = correlations.head(15)
bar_positions = list(range(len(top_corr)))

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(bar_positions, top_corr.values, alpha=0.7)
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_corr.index)
ax.set_xlabel('Absolute Correlation with Revenue', fontsize=11)
ax.set_title('Top 15 Features by Correlation with Revenue', fontsize=13, fontweight='bold')
ax.invert_yaxis()  # strongest feature at the top of the chart
ax.grid(alpha=0.3, axis='x')
fig.tight_layout()
plt.show()

## 7. Save Processed Features

In [None]:
# Save full dataset with features
df_features.to_csv('../data/processed/daily_revenue_with_features.csv')
print('✓ Full dataset saved to: ../data/processed/daily_revenue_with_features.csv')

# Save X and y
X.to_csv('../data/processed/X.csv')
y.to_csv('../data/processed/y.csv')
print('✓ X and y saved to: ../data/processed/X.csv and y.csv')

# Save feature names, one per line. The encoding is pinned to UTF-8 so the
# file is written identically on every platform instead of depending on the
# locale's default encoding, and the lines are written in a single batch.
with open('../data/processed/feature_names.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{feat}\n" for feat in feature_cols)
print('✓ Feature names saved to: ../data/processed/feature_names.txt')

## 8. Summary

In [None]:
# Final recap: feature count, usable sample size, date coverage, per-group
# sizes and the five features most correlated with the target.
banner = "=" * 70

print(banner)
print(" FEATURE ENGINEERING SUMMARY")
print(banner)

start_day = X.index.min().date()
end_day = X.index.max().date()
print(f"\nTotal features created: {len(feature_cols)}")
print(f"Final samples (after dropping NaN): {len(X)}")
print(f"Date range: {start_day} to {end_day}")

print("\nFeature breakdown:")
for label, members in groups.items():
    print(f"  {label:15s}: {len(members):3d} features")

print("\nTop 5 features by correlation:")
top_five = correlations.head(5)
for rank, (name, value) in enumerate(top_five.items(), start=1):
    print(f"  {rank}. {name:40s}: {value:.4f}")

print(f"\n{banner}")
print(" READY FOR MODEL TRAINING")
print(banner)