# Data Exploration and Preprocessing

This notebook covers:
1. Loading and exploring time series data
2. Data quality assessment
3. Feature engineering for LSTM models
4. Data preprocessing and transformation
5. Chronological train/validation/test split
6. Creating LSTM input sequences
7. Saving the processed data

In [None]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Silence all Python warnings for the rest of the session. Note that this also
# hides deprecation warnings raised by pandas/matplotlib/seaborn in later cells.
warnings.filterwarnings('ignore')

# Plot styling: a matplotlib stylesheet plus seaborn's "husl" colour palette,
# applied globally so every figure below picks them up.
# NOTE(review): the 'seaborn-v0_8' style name presumably requires
# matplotlib >= 3.6 — confirm against the project's pinned version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Make the project's `src` directory importable so the local preprocessing
# utilities can be loaded. The path is relative, so it is resolved against the
# kernel's current working directory; the append must run before the import.
import sys

sys.path.append('../src')
from data_processing.preprocessor import TimeSeriesPreprocessor

## 1. Load Sample Time Series Data

For this example, we'll create synthetic time series data. In a real project, you would load your actual dataset.

In [None]:
# Build a reproducible synthetic daily series:
#   value = linear trend + yearly sine cycle + weekly sine cycle + Gaussian noise
# plus two random side features that are generated independently of `value`.
np.random.seed(42)
dates = pd.date_range('2020-01-01', '2023-12-31', freq='D')
n_points = len(dates)
day_index = np.arange(n_points)

# Deterministic components of the target
linear_trend = np.linspace(100, 150, n_points)
yearly_cycle = 10 * np.sin(2 * np.pi * day_index / 365.25)
weekly_cycle = 5 * np.sin(2 * np.pi * day_index / 7)

# Random draws. Their order (noise, feature_1, feature_2) is what makes the
# seeded stream reproduce the same numbers, so do not reorder these lines.
gaussian_noise = np.random.normal(0, 5, n_points)
feature_1 = np.random.normal(50, 10, n_points)
feature_2 = np.random.exponential(2, n_points)

values = linear_trend + yearly_cycle + weekly_cycle + gaussian_noise

# Assemble the working frame
df = pd.DataFrame({
    'date': dates,
    'value': values,
    'feature_1': feature_1,
    'feature_2': feature_2,
})

first_date, last_date = df['date'].min(), df['date'].max()
print(f"Dataset shape: {df.shape}")
print(f"Date range: {first_date} to {last_date}")
df.head()

## 2. Exploratory Data Analysis

In [None]:
# Basic statistics
# DataFrame.info() writes its report (dtypes, non-null counts, memory usage)
# to stdout itself and returns None, so it is called directly; wrapping it in
# print() would append a stray "None" line to the output.
print("Dataset Info:")
df.info()
print("\nDescriptive Statistics:")
print(df.describe())

In [None]:
# 2x2 overview figure: raw series, value histogram, monthly means and the
# correlation between the target and the two side features.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_series, ax_hist), (ax_monthly, ax_corr) = axes

# Top-left: the raw daily series
ax_series.plot(df['date'], df['value'])
ax_series.set(title='Time Series Data', xlabel='Date', ylabel='Value')

# Top-right: distribution of the target values
ax_hist.hist(df['value'], bins=50, alpha=0.7)
ax_hist.set(title='Value Distribution', xlabel='Value', ylabel='Frequency')

# Bottom-left: calendar-month means of the target. This is a plain resample,
# not a full seasonal decomposition; it just smooths out the weekly cycle.
value_by_date = df.set_index('date')['value']
monthly_avg = value_by_date.resample('M').mean()
ax_monthly.plot(monthly_avg.index, monthly_avg.values)
ax_monthly.set(title='Monthly Average', xlabel='Date', ylabel='Monthly Avg Value')

# Bottom-right: pairwise correlations, colour scale centred on zero
corr_columns = ['value', 'feature_1', 'feature_2']
corr_matrix = df[corr_columns].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax_corr)
ax_corr.set_title('Feature Correlation')

plt.tight_layout()
plt.show()

## 3. Data Quality Assessment

In [None]:
# Completeness: count of nulls per column
print("Missing Values:")
print(df.isnull().sum())

# Uniqueness: fully duplicated rows
n_duplicates = df.duplicated().sum()
print(f"\nDuplicate rows: {n_duplicates}")

# Regularity: every gap between consecutive timestamps should be exactly one
# day; the first row has no predecessor, so its NaT gap is dropped.
expected_freq = pd.Timedelta(days=1)
gaps = df['date'].diff().dropna()
irregular_dates = gaps[gaps != expected_freq]
print(f"\nIrregular date intervals: {len(irregular_dates)}")

# Outliers: values outside the 1.5 * IQR fences around the quartiles
q1, q3 = (df['value'].quantile(q) for q in (0.25, 0.75))
iqr = q3 - q1
lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr

below_fence = df['value'] < lower_bound
above_fence = df['value'] > upper_bound
outliers = df[below_fence | above_fence]
outlier_pct = len(outliers)/len(df)*100
print(f"\nOutliers detected: {len(outliers)} ({outlier_pct:.2f}%)")

## 4. Feature Engineering

In [None]:
# Preprocessor used by the later cells; configured for the 'value' column
# with a sequence length of 30.
preprocessor = TimeSeriesPreprocessor(target_column='value', sequence_length=30)

# Work on a copy so the raw frame `df` stays untouched
df_features = df.copy()

# Calendar features: new column name -> pandas `.dt` accessor attribute
calendar_attrs = {
    'year': 'year',
    'month': 'month',
    'day_of_week': 'dayofweek',
    'day_of_year': 'dayofyear',
    'quarter': 'quarter',
}
for column, attr in calendar_attrs.items():
    df_features[column] = getattr(df_features['date'].dt, attr)

# pandas numbers Monday as 0, so 5 and 6 are Saturday and Sunday
df_features['is_weekend'] = (df_features['day_of_week'] >= 5).astype(int)

# Lagged copies of the target (1 day, 1 week, 30 days back)
target = df_features['value']
for lag in (1, 7, 30):
    df_features[f'value_lag_{lag}'] = target.shift(lag)

# Trailing-window mean and standard deviation; the window ends at (and
# includes) the current row, so the first `window - 1` rows are NaN.
for window in (7, 30):
    rolling = target.rolling(window=window)
    df_features[f'value_rolling_mean_{window}'] = rolling.mean()
    df_features[f'value_rolling_std_{window}'] = rolling.std()

print(f"Features created. New shape: {df_features.shape}")
print("\nNew features:")
original_columns = set(df.columns)
new_features = [name for name in df_features.columns if name not in original_columns]
print(new_features)

## 5. Data Preprocessing

In [None]:
# The lag and rolling features leave NaNs in the leading rows; drop those rows
# and renumber the index from zero.
df_clean = df_features.dropna().reset_index(drop=True)
print(f"Clean dataset shape: {df_clean.shape}")

# Fit the preprocessor on the cleaned frame and keep what it returns.
# NOTE(review): this fit happens on the full dataset, before the chronological
# split — if fit_transform learns scaling statistics, validation/test rows
# influence them. Confirm whether that is intended.
scaled_values = preprocessor.fit_transform(df_clean)
print(f"Scaled values shape: {scaled_values.shape}")

# Collect every numeric column except 'date' for the model's feature matrix
numeric_features = []
for col in df_clean.columns:
    if col == 'date':
        continue
    if pd.api.types.is_numeric_dtype(df_clean[col]):
        numeric_features.append(col)
print(f"Numeric features: {numeric_features}")

# NOTE(review): the matrix is taken from df_clean's raw columns, not from
# `scaled_values` — verify that create_sequences applies the scaling itself,
# otherwise the sequences built from this matrix are unscaled.
feature_matrix = df_clean[numeric_features].values
print(f"Feature matrix shape: {feature_matrix.shape}")

## 6. Train/Validation/Test Split

In [None]:
# Chronological split: first 70% of rows for training, next 20% for
# validation, remainder for testing. Rows are never shuffled, so each
# partition lies strictly after the previous one in time.
total_samples = len(df_clean)
train_size = int(0.7 * total_samples)
val_size = int(0.2 * total_samples)
val_end = train_size + val_size

train_data = df_clean.iloc[:train_size]
val_data = df_clean.iloc[train_size:val_end]
test_data = df_clean.iloc[val_end:]

partitions = (('Train', train_data), ('Validation', val_data), ('Test', test_data))
for label, part in partitions:
    print(f"{label} set: {len(part)} samples ({len(part)/total_samples*100:.1f}%)")

# Plot each partition in its own colour, with a dashed vertical line at the
# last date of the train and validation partitions.
plt.figure(figsize=(15, 6))
for label, part in partitions:
    plt.plot(part['date'], part['value'], label=label, alpha=0.8)

boundaries = ((train_data, 'red', 'Train/Val Split'), (val_data, 'orange', 'Val/Test Split'))
for part, colour, label in boundaries:
    plt.axvline(x=part['date'].iloc[-1], color=colour, linestyle='--', alpha=0.7, label=label)

plt.title('Train/Validation/Test Split')
plt.xlabel('Date')
plt.ylabel('Value')
plt.legend()
plt.show()

## 7. Create LSTM Sequences

In [None]:
def _first_target_row(sample):
    """Return the first target row of a single `y` sample as a 1-D array.

    Leading axes are peeled off (taking index 0 each time) until at most one
    dimension remains, so the same code works whether a sample is shaped
    (1, n_features), (n_features,) or is a bare scalar.
    """
    row = np.asarray(sample)
    while row.ndim > 1:
        row = row[0]
    return np.atleast_1d(row)


# Split the feature matrix with the same 70/20/remainder proportions used for
# the DataFrame split, so the array partitions line up with train/val/test.
total_samples = len(feature_matrix)
train_size = int(0.7 * total_samples)
val_size = int(0.2 * total_samples)

train_features = feature_matrix[:train_size]
val_features = feature_matrix[train_size:train_size + val_size]
test_features = feature_matrix[train_size + val_size:]

# Build the sequences per partition so no window spans a split boundary
X_train, y_train = preprocessor.create_sequences(train_features)
X_val, y_val = preprocessor.create_sequences(val_features)
X_test, y_test = preprocessor.create_sequences(test_features)

print(f"Training sequences: X={X_train.shape}, y={y_train.shape}")
print(f"Validation sequences: X={X_val.shape}, y={y_val.shape}")
print(f"Test sequences: X={X_test.shape}, y={y_test.shape}")

# Visualize a few sequences (column 0 of the feature matrix is 'value').
# Never index past the number of sequences actually available.
n_examples = min(4, len(X_train))
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()

for i in range(n_examples):
    # Input window for the first feature
    axes[i].plot(X_train[i, :, 0], label='Input Sequence', marker='o', markersize=3)
    # Matching target for the first feature, regardless of y_train's rank
    target_value = _first_target_row(y_train[i])[0]
    axes[i].axhline(y=target_value, color='red', linestyle='--', label='Target')
    axes[i].set_title(f'Training Sequence {i+1}')
    axes[i].set_xlabel('Time Step')
    # NOTE(review): the label says "Scaled" but feature_matrix comes from the
    # raw df_clean columns — confirm create_sequences scales its input.
    axes[i].set_ylabel('Scaled Value')
    axes[i].legend()
    axes[i].grid(True, alpha=0.3)

# Hide panels that have no sequence to show
for unused_ax in axes[n_examples:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

# Display information about the sequences. The annotations are derived from
# the arrays themselves so they stay correct if the sequence length or the
# feature set changes.
n_timesteps, n_features = X_train.shape[1], X_train.shape[2]
first_target = _first_target_row(y_train[0])

print("\nSequence shapes:")
print(f"X_train sample shape: {X_train[0].shape} ({n_timesteps} timesteps, {n_features} features)")
print(f"y_train sample shape: {np.shape(y_train[0])} ({first_target.size} target features)")
print("\nFirst sequence target values (all features):")
print(first_target)
print(f"\nTarget value (first feature only): {first_target[0]}")

## 8. Save Processed Data

In [None]:
# Output location; created (with parents) on first run, reused afterwards
data_dir = Path('../data/processed')
data_dir.mkdir(parents=True, exist_ok=True)

# Tabular partitions -> CSV, without the index column
csv_outputs = {
    'train_data.csv': train_data,
    'val_data.csv': val_data,
    'test_data.csv': test_data,
}
for filename, frame in csv_outputs.items():
    frame.to_csv(data_dir / filename, index=False)

# Sequence arrays -> NumPy .npy files
array_outputs = {
    'X_train.npy': X_train,
    'y_train.npy': y_train,
    'X_val.npy': X_val,
    'y_val.npy': y_val,
    'X_test.npy': X_test,
    'y_test.npy': y_test,
}
for filename, array in array_outputs.items():
    np.save(data_dir / filename, array)

print("Processed data saved successfully!")
print(f"Files saved to: {data_dir.absolute()}")

## Summary

In this notebook, we:
1. ✅ Generated and explored synthetic time series data
2. ✅ Performed data quality assessment
3. ✅ Created time-based and lag features
4. ✅ Preprocessed and scaled the data
5. ✅ Split data chronologically for time series
6. ✅ Created LSTM input sequences
7. ✅ Saved processed data for model training

**Next Steps:**
- Use the processed data in `03_model_training.ipynb`
- Train and evaluate LSTM models
- Experiment with different architectures and hyperparameters