# Module 2: Exploratory Data Analysis & Feature Engineering

This notebook covers:
- Loading and exploring the datasets
- Visualizing distributions and correlations
- Creating derived per-driver aggregate features for ML models and saving them to `driver_features.csv`

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Global plot appearance for every figure produced in this notebook.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# All three input CSVs live in the same directory, addressed relative to the
# notebook's working directory.
data_dir = Path('../data')

# Only the telematics file needs date parsing (its `timestamp` column);
# the other two files are read with pandas defaults.
telematics = pd.read_csv(data_dir / 'telematics.csv', parse_dates=['timestamp'])
ratings, feedback = (
    pd.read_csv(data_dir / csv_name) for csv_name in ('ratings.csv', 'feedback.csv')
)

# Row counts per table, as a quick sanity check that the load succeeded.
print(f"Loaded: {len(telematics)} telematics, {len(ratings)} ratings, {len(feedback)} feedback")

In [None]:
# Attach the rating of each trip to the telematics rows. `merge` performs an
# inner join by default, so rows whose trip_id has no match in `ratings`
# are not carried into `df`.
trip_ratings = ratings[['trip_id', 'rating']]
df = telematics.merge(trip_ratings, on='trip_id')

# Columns that take part in the pairwise correlation matrix.
numeric_cols = [
    'speed', 'throttle', 'brake', 'steering_angle', 'accel_x', 'accel_y',
    'trip_duration_sec', 'distance_km', 'hard_brake', 'overspeed', 'harsh_turn', 'rating',
]
corr_matrix = df[numeric_cols].corr()

# Annotated heatmap drawn on an explicit Axes; the diverging colormap is
# centred on zero so positive and negative correlations are visually symmetric.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f', square=True, ax=ax)
ax.set_title('Correlation Heatmap')
fig.tight_layout()
plt.show()

In [None]:
# Feature Engineering - one row of aggregate features per driver.
#
# Named aggregation binds every output column to its source column and
# reducer in one place, so the column labels cannot drift out of sync with
# the aggregation spec (as a separate positional rename list could).
driver_features = df.groupby('driver_id').agg(
    speed_mean=('speed', 'mean'),
    speed_std=('speed', 'std'),  # sample std: NaN for a driver with a single row
    speed_max=('speed', 'max'),
    hard_brake_count=('hard_brake', 'sum'),
    overspeed_count=('overspeed', 'sum'),
    harsh_turn_count=('harsh_turn', 'sum'),
    avg_trip_duration=('trip_duration_sec', 'mean'),
    total_distance=('distance_km', 'sum'),
    avg_rating=('rating', 'mean'),
    # Count distinct trips, not rows: a trip_id that appears on several rows
    # of `df` must contribute one trip. Identical to a plain row count when
    # every trip_id occurs exactly once.
    trip_count=('trip_id', 'nunique'),
).reset_index()

# NOTE(review): every other aggregate above is still computed over the rows
# of `df`. If a trip spans multiple rows, per-trip columns such as
# `distance_km` / `trip_duration_sec` would be weighted by row count -
# confirm the granularity of telematics.csv.

# Save features
driver_features.to_csv(data_dir / 'driver_features.csv', index=False)
print(f"Created {len(driver_features)} driver feature records")
driver_features.head(10)