# ACIS Insurance Risk Analytics – Task 2

**Objective:** This notebook cleans the raw data, engineers new features, prepares the data for modeling, and versions the processed dataset using DVC.

## Load Dataset

In [None]:
import pandas as pd
import numpy as np
import sys
import os

# Add the src directory to the import path. The relative path is resolved
# against the current working directory, so this assumes the notebook is run
# from a directory that sits next to src/ — TODO confirm.
sys.path.append(os.path.abspath('../src'))

# Project-local helpers, importable thanks to the sys.path entry added above.
from data_loader import load_raw_data
from dvc_utils import dvc_steps

# Load the raw dataset. Every later cell guards on `df is not None`, so
# load_raw_data presumably returns None when loading fails — verify in src/.
df = load_raw_data('../data/MachineLearningRating_v3.txt')

## Feature Engineering

In [None]:
def _ratio_or_zero(numerator, denominator, valid):
    """Return ``numerator / denominator`` where ``valid`` is True, else 0.

    All three arguments are index-aligned pandas Series; ``valid`` is a boolean
    mask. Denominators on invalid rows are blanked out before dividing, so no
    division by zero is ever performed, and those rows are then set to 0.
    Rows that are valid but contain NaN keep a NaN result.
    """
    return (numerator / denominator.where(valid)).where(valid, 0)


if df is not None:
    # 1. Loss Ratio = TotalClaims / TotalPremium.
    # Computed column-wise rather than with a row-wise apply, which would run
    # a Python lambda once per row. A zero premium yields 0; a missing premium
    # leaves the ratio as NaN (NaN != 0 is True, so the row counts as valid).
    premium = df['TotalPremium']
    df['LossRatio'] = _ratio_or_zero(df['TotalClaims'], premium, premium != 0)

    # 2. Claim Severity = TotalClaims / NumberOfClaims.
    # The column is optional: when it is absent the feature is skipped with a
    # warning. Rows with a non-positive or missing claim count get severity 0.
    if 'NumberOfClaims' in df.columns:
        claim_count = df['NumberOfClaims']
        df['ClaimSeverity'] = _ratio_or_zero(df['TotalClaims'], claim_count, claim_count > 0)
    else:
        print("Warning: 'NumberOfClaims' column not found. Skipping ClaimSeverity calculation.")

    # 3. Vehicle Age = current_year - RegistrationYear.
    # The reference year is a fixed constant; pd.Timestamp.now().year would
    # follow the calendar but make reruns produce different values.
    current_year = 2025
    if 'RegistrationYear' in df.columns:
        # Non-numeric registration years are coerced to NaN first.
        registration_year = pd.to_numeric(df['RegistrationYear'], errors='coerce')
        # Missing ages become 0, and negative ages (registration year after
        # the reference year) are clamped to 0.
        df['VehicleAge'] = (current_year - registration_year).fillna(0).clip(lower=0)
    else:
        print("Warning: 'RegistrationYear' column not found.")

    print("New features created.")

## Cleaning

In [None]:
if df is not None:
    # Missing values, numeric columns: impute with the column median.
    # (An all-NaN column has a NaN median and is therefore left unchanged.)
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        df[col] = df[col].fillna(df[col].median())

    # Missing values, object (text) columns: impute with the most frequent value.
    cat_cols = df.select_dtypes(include=['object']).columns
    for col in cat_cols:
        # mode() is computed once per column; it is empty when the column
        # holds no non-null values, in which case nothing can be imputed.
        modes = df[col].mode()
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

    # Outlier handling (simple capping for demo): cap TotalClaims at its
    # 99th percentile. The column is checked first, like the optional columns
    # in the other cells, so a missing column produces a warning, not a KeyError.
    # NOTE(review): if the vast majority of rows have no claim, the 99th
    # percentile can itself be 0, and this cap would flatten every positive
    # claim — check the claim frequency before relying on it.
    if 'TotalClaims' in df.columns:
        cap_val = df['TotalClaims'].quantile(0.99)
        # clip returns a new Series, so a float cap never has to be written
        # in place into an integer column.
        df['TotalClaims'] = df['TotalClaims'].clip(upper=cap_val)
    else:
        print("Warning: 'TotalClaims' column not found. Skipping outlier capping.")

    print("Cleaning completed.")

## Encoding

In [None]:
if df is not None:
    # Integer-code a fixed set of categorical columns (label-encoding
    # equivalent). Each column that is present gets a sibling
    # "<name>_Encoded" column; the original column is kept as-is.
    cols_to_encode = ['Make', 'Province', 'VehicleType']
    for source_col in cols_to_encode:
        if source_col not in df.columns:
            continue
        # factorize returns (codes, uniques); only the integer codes are stored.
        codes, _uniques = pd.factorize(df[source_col])
        df[source_col + '_Encoded'] = codes

    print("Encoding completed.")

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

if df is not None:
    scaler = StandardScaler()

    # Columns we would like to standardise; keep only those that exist in df.
    wanted_cols = ('TotalPremium', 'TotalClaims', 'SumInsured')
    scale_cols = [name for name in wanted_cols if name in df.columns]

    if len(scale_cols) > 0:
        # Fit on the selected columns and overwrite them with the scaled values.
        scaled_values = scaler.fit_transform(df[scale_cols])
        df[scale_cols] = scaled_values
        print("Scaling completed.")

## Save Clean Dataset

In [None]:
import os
# Preferred output location: the project-level data directory, addressed with
# the same '../data' prefix used to read the raw input file.
output_dir = '../data/processed'
if not os.path.isdir(output_dir):
    if os.path.isdir('../data'):
        # First run: ../data exists but has no processed/ sub-directory yet.
        # Create it there rather than writing under the current directory.
        os.makedirs(output_dir, exist_ok=True)
    else:
        # No ../data directory: fall back to a path relative to the current
        # working directory, creating it if necessary.
        output_dir = 'data/processed'
        os.makedirs(output_dir, exist_ok=True)

if df is not None:
    # Write the processed frame without the index column.
    output_path = os.path.join(output_dir, 'cleaned_data.csv')
    df.to_csv(output_path, index=False)
    print(f"Saved processed data to {output_path}")

## Dataset Summary

In [None]:
if df is not None:
    # Report the final shape, then list the columns added by this notebook:
    # the three engineered features plus any "*Encoded*" columns.
    print(f"New Shape: {df.shape}")

    engineered_names = ['LossRatio', 'ClaimSeverity', 'VehicleAge']
    created_columns = []
    for name in df.columns:
        if 'Encoded' in name or name in engineered_names:
            created_columns.append(name)
    print("New Columns Created:", created_columns)

    # Show the first three rows as a rendered table.
    display(df.head(3))

## DVC Versioning

In [None]:
# Print whatever dvc_steps() (imported from the project's dvc_utils module) returns.
print(dvc_steps())

In [None]:
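# NOTE(review): the paths below have no '../' prefix, unlike the
# '../data/processed' output directory used when saving — run these from the
# directory that contains data/ (presumably the repository root) or adjust the paths.
# !dvc add data/processed/cleaned_data.csv
# !git add data/processed/cleaned_data.csv.dvc
# !git commit -m "task-2: add processed dataset with new features"
# !dvc push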

## Closing Notes
The data is now cleaned, feature-engineered, encoded, and scaled, making it ready for modeling in Task 3. We have also listed the DVC steps needed for reproducible data versioning.