# 01. Data Preprocessing and Feature Engineering

**Objective:** Load and prepare inflation data for forecasting models

**Steps:**
1. Load raw data
2. Exploratory data analysis
3. Handle missing values
4. Feature engineering (optional lag features)
5. Train/test split
6. Save processed data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Plot styling shared by every figure in this notebook
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Input/output locations, relative to the notebook's working directory
DATA_ROOT = Path('../data')
RAW_DATA_PATH = DATA_ROOT / 'raw'
PROCESSED_DATA_PATH = DATA_ROOT / 'processed'

# Create the output folder up front so the save step cannot fail on a missing directory
PROCESSED_DATA_PATH.mkdir(parents=True, exist_ok=True)

## 1. Load Raw Data

In [None]:
# Read the raw dataset (change the filename to match your actual data file).
# The first CSV column becomes the index and is parsed as dates.
raw_csv_file = RAW_DATA_PATH / 'df_raw.csv'
df_raw = pd.read_csv(
    raw_csv_file,
    index_col=0,
    parse_dates=True,
)

# Quick sanity check: dimensions and the time span covered by the index
print(f"Data shape: {df_raw.shape}")
print(f"\nDate range: {df_raw.index.min()} to {df_raw.index.max()}")
df_raw.head()

## 2. Exploratory Data Analysis

In [None]:
# Basic statistics
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df_raw.info()
print("\nBasic Statistics:")
# Last expression of the cell: the notebook renders the summary table.
df_raw.describe()

In [None]:
# Count missing entries per column and express them as a share of all rows
null_mask = df_raw.isnull()
missing_values = null_mask.sum()
missing_pct = missing_values / len(df_raw) * 100

missing_df = pd.DataFrame(
    {'Missing Count': missing_values, 'Percentage': missing_pct}
)

# Display only the columns that actually contain missing values
has_missing = missing_df['Missing Count'] > 0
missing_df[has_missing]

In [None]:
# Plot the inflation series over time.
# The first column is assumed to hold the inflation rate; change this if
# your dataset stores it under a different column.
inflation_col = df_raw.columns[0]

fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(df_raw.index, df_raw[inflation_col], linewidth=2)
ax.set_title('Inflation Rate Over Time', fontsize=16)
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Inflation Rate (%)', fontsize=12)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 3. Handle Missing Values

In [None]:
# Handle missing values
# Option 1: Forward fill (carry the last observed value forward).
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated since pandas 2.1 and no longer accepted by pandas 3.
# NOTE: forward fill cannot fill gaps at the very start of a column, so the
# count printed below may still be non-zero if the series begins with NaNs.
df_filled = df_raw.ffill()

# Option 2: Drop rows with missing values
# df_filled = df_raw.dropna()

# Option 3: Interpolate
# df_filled = df_raw.interpolate(method='linear')

# Total number of missing cells remaining across all columns
print(f"Missing values after handling: {df_filled.isnull().sum().sum()}")

## 4. Feature Engineering (Optional)

In [None]:
# Create lag features if needed
# Example: Create 1-12 month lags

def create_lag_features(df, target_col, n_lags=12):
    """
    Add lagged copies of a column for time series forecasting.

    For every lag k from 1 to n_lags, a column named '<target_col>_lag<k>'
    is added that holds the target column shifted down by k rows, so the
    first k entries of that column are missing.

    Args:
        df: Input DataFrame; it is not modified.
        target_col: Name of the column to lag.
        n_lags: Number of lag columns to create (default 12).

    Returns:
        A new DataFrame with the original columns followed by the lag columns.
    """
    lagged_columns = {
        f'{target_col}_lag{step}': df[target_col].shift(step)
        for step in range(1, n_lags + 1)
    }
    # assign() works on a copy, so the caller's frame is left untouched
    return df.assign(**lagged_columns)

# Uncomment if you need lag features.
# Each lag column starts with missing values (shifting leaves the first rows
# empty), so dropna() removes those incomplete leading rows.
# If you enable these two lines, remove the default assignment below;
# otherwise it overwrites df_processed and the lag columns are discarded.
# df_with_lags = create_lag_features(df_filled, inflation_col, n_lags=12)
# df_processed = df_with_lags.dropna()

# Default: no lag features; continue with a copy of the filled data
df_processed = df_filled.copy()

## 5. Train/Test Split

In [None]:
# Positional split that keeps row order: first 80% of rows for training,
# remaining 20% for testing
n_rows = len(df_processed)
train_size = int(n_rows * 0.8)

df_train, df_test = (
    df_processed.iloc[:train_size],
    df_processed.iloc[train_size:],
)

# Report the size and index range of each partition
print(f"Train set: {len(df_train)} samples ({df_train.index.min()} to {df_train.index.max()})")
print(f"Test set:  {len(df_test)} samples ({df_test.index.min()} to {df_test.index.max()})")

In [None]:
# Show both partitions on one axis, with a marker at the first test observation
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(df_train.index, df_train[inflation_col], label='Train', linewidth=2)
ax.plot(df_test.index, df_test[inflation_col], label='Test', linewidth=2)

# Dashed vertical line at the boundary between training and test data
split_point = df_test.index[0]
ax.axvline(x=split_point, color='red', linestyle='--', alpha=0.7, label='Train/Test Split')

ax.set_title('Train/Test Split Visualization', fontsize=16)
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Inflation Rate (%)', fontsize=12)
ax.legend(fontsize=12)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 6. Save Processed Data

In [None]:
# Write the full, train and test frames to the processed-data folder
outputs = (
    (df_processed, 'df_processed.csv'),
    (df_train, 'df_train.csv'),
    (df_test, 'df_test.csv'),
)
for frame, filename in outputs:
    frame.to_csv(PROCESSED_DATA_PATH / filename)

# Confirm where each file was written
print("✓ Processed data saved successfully!")
print(f"  - Full dataset: {PROCESSED_DATA_PATH / 'df_processed.csv'}")
print(f"  - Train set: {PROCESSED_DATA_PATH / 'df_train.csv'}")
print(f"  - Test set: {PROCESSED_DATA_PATH / 'df_test.csv'}")

## Summary

**Data preprocessing completed:**
- ✓ Loaded raw data
- ✓ Handled missing values
- ✓ Created features (if applicable)
- ✓ Split into train/test sets
- ✓ Saved processed data

**Next steps:**
- Run econometric models (Notebook 02)
- Run linear ML models (Notebook 03)
- Run nonlinear ML models (Notebook 04)