# ACIS Insurance Risk Analytics – Task 2

**Objective:** Clean the raw data, engineer new features, encode and scale them so the dataset is ready for modeling, and version the processed dataset with DVC.

## Load Dataset

In [1]:
import os
import shutil
import sys
from pathlib import Path

import numpy as np
import pandas as pd

# Add the src directory to the path
sys.path.append(os.path.abspath('../src'))

from data_loader import load_raw_data
from dvc_utils import dvc_steps

# Load the raw dataset. The path is relative to the notebook's working
# directory, so the notebook is expected to run one level below the project root.
# NOTE(review): later cells guard with `if df is not None`, which suggests
# load_raw_data returns None on failure — confirm against src/data_loader.py.
df = load_raw_data('../data/MachineLearningRating_v3.txt')

Data loaded successfully. Shape: (1000098, 52)


## Feature Engineering

In [2]:
if df is not None:
    # 1. Loss Ratio = TotalClaims / TotalPremium
    # Computed column-wise rather than with a row-wise df.apply, which would call
    # a Python lambda once per row (about a million calls on this dataset).
    # Rows whose premium is exactly 0 get a ratio of 0 instead of inf/NaN.
    premium = df['TotalPremium']
    df['LossRatio'] = (df['TotalClaims'] / premium).where(premium != 0, 0)

    # 2. Claim Severity = TotalClaims / NumberOfClaims
    # The raw file may not carry a 'NumberOfClaims' column; when it is absent the
    # feature is skipped with a warning instead of being approximated.
    # Rows with no (or a missing) claim count get a severity of 0.
    if 'NumberOfClaims' in df.columns:
        claim_count = df['NumberOfClaims']
        df['ClaimSeverity'] = (df['TotalClaims'] / claim_count).where(claim_count > 0, 0)
    else:
        print("Warning: 'NumberOfClaims' column not found. Skipping ClaimSeverity calculation.")

    # 3. Vehicle Age = current_year - RegistrationYear
    # The reference year is pinned so that re-running the notebook later yields
    # the same feature values (pd.Timestamp.now().year would drift over time).
    current_year = 2025
    if 'RegistrationYear' in df.columns:
        # Unparseable registration years become NaN and are then treated as age 0.
        # NOTE(review): age 0 is indistinguishable from a brand-new vehicle; the
        # median may be a better fill value.
        vehicle_age = current_year - pd.to_numeric(df['RegistrationYear'], errors='coerce')
        # Negative ages (registration year in the future) are floored at 0.
        df['VehicleAge'] = vehicle_age.fillna(0).clip(lower=0)
    else:
        print("Warning: 'RegistrationYear' column not found.")

    print("New features created.")

New features created.


## Cleaning

In [3]:
if df is not None:
    # Missing values, numeric columns: impute with the column median.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        # A column with no observed values has no median. Asking for one only
        # emits NumPy's "Mean of empty slice" RuntimeWarning and fills nothing,
        # so such columns are skipped and left untouched.
        if df[col].notna().any():
            df[col] = df[col].fillna(df[col].median())

    # Missing values, categorical columns: impute with the most frequent value.
    cat_cols = df.select_dtypes(include=['object']).columns
    for col in cat_cols:
        modes = df[col].mode()
        # mode() is empty when the column has no observed values at all.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

    # Outlier handling (simple capping for demo):
    # cap TotalClaims at its 99th percentile.
    # NOTE(review): LossRatio was derived in the feature-engineering cell before
    # this cap is applied, so it still reflects the uncapped claim amounts.
    cap_val = df['TotalClaims'].quantile(0.99)
    df['TotalClaims'] = df['TotalClaims'].clip(upper=cap_val)

    print("Cleaning completed.")

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Cleaning completed.


## Encoding

In [4]:
if df is not None:
    # Integer label encoding via pd.factorize: each distinct value in a column is
    # mapped to an integer code (codes follow order of first appearance), and the
    # result is stored in a new '<column>_Encoded' column next to the original.
    # NOTE(review): factorize marks missing values with -1 by default; the
    # cleaning cell fills missing categoricals first, so none are expected here.
    cols_to_encode = ['Make', 'Province', 'VehicleType']
    missing_cols = [col for col in cols_to_encode if col not in df.columns]

    for col in cols_to_encode:
        if col in df.columns:
            df[col + '_Encoded'] = pd.factorize(df[col])[0]

    # Surface skipped columns instead of dropping them silently, so a misspelled
    # or differently-cased column name is noticed.
    if missing_cols:
        print(f"Warning: columns not found, skipping encoding: {missing_cols}")

    print("Encoding completed.")

Encoding completed.


## Feature Scaling

In [5]:
from sklearn.preprocessing import StandardScaler

if df is not None:
    scaler = StandardScaler()

    # Monetary columns to standardise; any that are absent from the frame are
    # dropped from the list before fitting.
    candidate_cols = ('TotalPremium', 'TotalClaims', 'SumInsured')
    scale_cols = [name for name in candidate_cols if name in df.columns]

    if len(scale_cols) > 0:
        # Fit on the selected columns and overwrite them with the scaled values.
        scaled_values = scaler.fit_transform(df[scale_cols])
        df[scale_cols] = scaled_values
        print("Scaling completed.")

Scaling completed.


## Save Clean Dataset

In [6]:
import os

# Write next to the data directory the notebook reads from: '../data' when the
# working directory is one level below the project root, otherwise 'data'
# (working directory is the project root). The 'processed' folder is created on
# demand, so a fresh checkout does not end up with a stray 'data/processed'
# directory inside the notebook folder.
data_root = '../data' if os.path.isdir('../data') else 'data'
output_dir = f'{data_root}/processed'
os.makedirs(output_dir, exist_ok=True)

if df is not None:
    output_path = os.path.join(output_dir, 'cleaned_data.csv')
    # index=False: the row index carries no information and would otherwise be
    # written as an extra unnamed column.
    df.to_csv(output_path, index=False)
    print(f"Saved processed data to {output_path}")

Saved processed data to ../data/processed\cleaned_data.csv


## Dataset Summary

In [7]:
if df is not None:
    # Columns added by this notebook: the engineered features plus every
    # '<column>_Encoded' label column.
    engineered_names = ['LossRatio', 'ClaimSeverity', 'VehicleAge']
    new_columns = [
        name for name in df.columns
        if 'Encoded' in name or name in engineered_names
    ]

    print(f"New Shape: {df.shape}")
    print("New Columns Created:", new_columns)
    # Rich preview of the first three rows.
    display(df.head(3))

New Shape: (1000098, 56)
New Columns Created: ['LossRatio', 'VehicleAge', 'Province_Encoded', 'VehicleType_Encoded']


Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,...,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims,LossRatio,VehicleAge,Province_Encoded,VehicleType_Encoded
0,145249,12827,2015-03-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,-0.173593,0.001403,0.0,21,0,0
1,145249,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,-0.173593,0.001403,0.0,21,0,0
2,145249,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,-0.268822,0.001403,0.0,21,0,0


## DVC Versioning

In [8]:
# Print the DVC command checklist returned by dvc_steps() (imported from
# dvc_utils in the first cell). This only displays text; it runs no DVC command.
print(dvc_steps())


    DVC Steps:
    1. Initialize DVC: dvc init
    2. Configure remote: dvc remote add -d myremote <storage_path>
    3. Add data: dvc add data/processed/cleaned_data.csv
    4. Commit DVC file: git add data/processed/cleaned_data.csv.dvc
    5. Commit changes: git commit -m "Add processed data"
    6. Push data: dvc push
    


In [None]:
# !dvc add data/processed/cleaned_data.csv
# !git add data/processed/cleaned_data.csv.dvc
# !git commit -m "task-2: add processed dataset with new features"
# !dvc push

In [None]:
# %% [markdown]
# ## 4. Task 2: Data Version Control with DVC

# %%
# Import DVC setup module
sys.path.append('../src')
from dvc_setup import DVCSetup

# %%
# Build the DVC helper, pointing it at the directory above the notebook.
print("Setting up DVC...")
dvc_manager = DVCSetup(project_root="..")

# Ask the helper which parts of the DVC setup are already in place and list
# every check with a tick or a cross.
print("\nCurrent DVC Status:")
status = dvc_manager.verify_dvc_setup()
for check_name, passed in status.items():
    mark = '✓' if passed else '✗'
    print(f"  {check_name}: {mark}")

# %%
# Run `initialize_dvc` only when the status check reported DVC as not yet set up.
if status['dvc_initialized']:
    print("\n✓ DVC already initialized")
else:
    print("\nInitializing DVC...")
    success = dvc_manager.initialize_dvc()
    print("✓ DVC initialized successfully" if success else "✗ DVC initialization failed")

# %%
# Configure the local remote only when the status check found none.
if status['remote_configured']:
    print("\n✓ Remote storage already configured")
else:
    print("\nSetting up local remote storage...")
    success = dvc_manager.setup_local_remote(storage_path="dvc_storage")
    print("✓ Local remote storage configured" if success else "✗ Failed to configure remote storage")

# %%
# The remaining DVC steps always run, in this order: track the raw data file,
# create the pipeline, commit to Git, push to the remote. Each entry holds
# (banner, action, success message, failure message); a failed step is reported
# and the following steps still run.
pipeline_steps = [
    (
        "\nAdding data to DVC tracking...",
        lambda: dvc_manager.add_data_to_dvc(data_path="data/MachineLearningRating_v3.txt"),
        "✓ Data added to DVC tracking",
        "✗ Failed to add data to DVC",
    ),
    (
        "\nCreating DVC pipeline...",
        lambda: dvc_manager.create_dvc_pipeline(),
        "✓ DVC pipeline created",
        "✗ Failed to create DVC pipeline",
    ),
    (
        "\nCommitting DVC changes to Git...",
        lambda: dvc_manager.commit_changes(message="Task 2: Add DVC setup and data versioning"),
        "✓ Changes committed",
        "✗ No changes to commit or commit failed",
    ),
    (
        "\nPushing data to DVC remote...",
        lambda: dvc_manager.push_to_remote(),
        "✓ Data pushed to remote storage",
        "✗ Failed to push data",
    ),
]

for banner, run_step, ok_message, failure_message in pipeline_steps:
    print(banner)
    success = run_step()
    print(ok_message if success else failure_message)

# %%
# Closing banner followed by a cheat-sheet of everyday and diagnostic DVC commands.
banner_rule = "=" * 60
print(f"\n{banner_rule}")
print("DVC SETUP COMPLETE")
print(banner_rule)

print("\nDVC Commands Available:")
for command_line in (
    "  Check status:          dvc status",
    "  Track new file:        dvc add <file>",
    "  Run pipeline:          dvc repro",
    "  Pull data:             dvc pull",
    "  Push data:             dvc push",
    "  View pipeline:         dvc dag",
):
    print(command_line)

print("\nVerification Commands:")
for command_line in (
    "  dvc doctor                    # Check DVC installation",
    "  dvc version                   # Check DVC version",
    "  dvc remote list               # List configured remotes",
):
    print(command_line)

# %%
# Text of the DVC setup notes that get written one directory above the notebook.
# NOTE(review): the commands listed in the text are static documentation; they
# are not derived from what the DVC helper actually executed — confirm they match.
dvc_config_content = """# DVC Configuration Documentation
# Project: Insurance Risk Analytics
# Repository: insurance-risk-analytics

## Setup Commands Executed:
# 1. dvc init --no-scm
# 2. dvc remote add -d localstorage ./dvc_storage
# 3. dvc add data/MachineLearningRating_v3.txt
# 4. git add data/MachineLearningRating_v3.txt.dvc .dvc
# 5. dvc push

## Reproducibility:
# To reproduce this analysis:
# 1. Clone repository: git clone <repo-url>
# 2. Install dependencies: pip install -r requirements.txt
# 3. Pull data: dvc pull
# 4. Run pipeline: dvc repro

## Data Versioning:
# Current data version tracked in: data/MachineLearningRating_v3.txt.dvc
# Storage: Local (./dvc_storage)
# Hash algorithm: md5

## Pipeline Stages:
# Defined in dvc.yaml
# Run specific stage: dvc repro <stage_name>
"""

# Overwrite any existing copy of the notes file.
doc_path = "../DVC_SETUP.md"
with open(doc_path, "w") as doc_file:
    doc_file.write(dvc_config_content)

print("DVC documentation created: DVC_SETUP.md")

# %% [markdown]
# ## 5. Create DVC Artifacts for Submission

# %%
# Gather the DVC configuration/metadata files into one folder for submission.
# Uses `Path` (pathlib) and `shutil`, which must be imported in the notebook's
# import cell; without them this cell fails with NameError on a fresh kernel.
artifacts_dir = Path("../artifacts/dvc_setup")
artifacts_dir.mkdir(parents=True, exist_ok=True)

# Files to collect, given relative to the directory above the notebook.
dvc_files = [
    ".dvc/.gitignore",
    ".dvc/config",
    ".dvcignore",
    "data/MachineLearningRating_v3.txt.dvc",
    "dvc.yaml",
    "params/eda_params.yaml",
    "params/preprocess_params.yaml"
]

print("\nCreating DVC artifacts...")
for relative_path in dvc_files:
    source = Path("..") / relative_path
    if source.exists():
        # Mirror the original relative layout under the artifacts folder.
        dest = artifacts_dir / relative_path
        dest.parent.mkdir(parents=True, exist_ok=True)
        # copy2 also copies file metadata such as modification times.
        shutil.copy2(source, dest)
        print(f"  ✓ Copied: {relative_path}")
    else:
        # A missing file is reported but does not stop the remaining copies.
        print(f"  ✗ Missing: {relative_path}")

# Report what ended up in the artifacts folder.
print("\n" + "="*60)
print("DVC ARTIFACTS CREATED")
print("="*60)
print(f"Location: {artifacts_dir.absolute()}")
print("\nFiles included:")
for item in artifacts_dir.rglob("*"):
    if item.is_file():
        print(f"  {item.relative_to(artifacts_dir)}")

## Closing Notes
The data is now cleaned, feature-engineered, label-encoded, and scaled, making it ready for modeling in Task 3. The DVC steps above version the dataset so that the exact data used for modeling can be reproduced.