In [None]:
# ============================
# TASK 4: PREDICTIVE MODELING
# ============================

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Modeling libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Regression & Classification models
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBRegressor, XGBClassifier

# Metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# SHAP for interpretability
import shap
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12,8)

# Section banner (three lines, identical to three separate print calls)
print("="*80, "TASK 4: PREDICTIVE MODELING FOR RISK-BASED PRICING", "="*80, sep="\n")

# -----------------------------
# 1. Load Data
# -----------------------------
# Read the processed dataset and trim stray whitespace from the header names
# so later column lookups by exact name succeed.
df = pd.read_csv('../data/processed/processed_data.csv').rename(columns=str.strip)
print(f"Data loaded: {df.shape}")

# -----------------------------
# 2. Data Preparation
# -----------------------------
# Target variables:
#   ClaimOccurred - 1 when the row has a positive TotalClaims, else 0
#   Margin        - TotalPremium minus TotalClaims
df['ClaimOccurred'] = (df['TotalClaims'] > 0).astype(int)
df['Margin'] = df['TotalPremium'] - df['TotalClaims']

# Drop irrelevant columns (only those actually present, so a missing column
# does not raise a KeyError)
drop_cols = [
    'UnderwrittenCoverID','PolicyID','TransactionMonth','VehicleIntroDate','YearMonth','LossRatio',
    'NumberOfVehiclesInFleet','CrossBorder','Citizenship','LegalType','Title','Language','Bank',
    'AccountType','MaritalStatus','Country','MainCrestaZone','SubCrestaZone','ItemType','mmcode',
    'make','Model','CustomValueEstimate','AlarmImmobiliser','TrackingDevice','CapitalOutstanding',
    'NewVehicle','WrittenOff','Rebuilt','Converted','TermFrequency','ExcessSelected','CoverCategory',
    'CoverType','CoverGroup','Section','Product','StatutoryClass','StatutoryRiskType'
]
existing_drop_cols = [c for c in drop_cols if c in df.columns]
df_model = df.drop(columns=existing_drop_cols)
print(f"After dropping columns: {df_model.shape}")

# Missing value handling
# Numeric columns (targets and their source amounts excluded) get the column median.
num_cols = df_model.select_dtypes(include=['int64','float64']).columns.tolist()
num_cols = [c for c in num_cols if c not in ['TotalClaims','TotalPremium','ClaimOccurred','Margin']]
for c in num_cols:
    # Assign the result back instead of `df_model[c].fillna(..., inplace=True)`:
    # the in-place form mutates a temporary under pandas copy-on-write and then
    # silently leaves df_model unchanged.
    df_model[c] = df_model[c].fillna(df_model[c].median())

# Categorical columns get their most frequent value, or 'Unknown' when the
# column has no non-null value at all.
cat_cols = df_model.select_dtypes(include=['object']).columns.tolist()
for c in cat_cols:
    modes = df_model[c].mode()  # computed once per column
    fill_value = modes.iloc[0] if not modes.empty else 'Unknown'
    df_model[c] = df_model[c].fillna(fill_value)

# -----------------------------
# 3. Feature Engineering
# -----------------------------
# Reference year used to turn RegistrationYear into an age; the result is
# clipped to the range [0, 50].
current_year = 2015
df_model['VehicleAge'] = (current_year - df_model['RegistrationYear']).clip(0,50)
# +1 in the denominator avoids division by zero when SumInsured is 0
df_model['PremiumToSumInsured'] = df_model['CalculatedPremiumPerTerm'] / (df_model['SumInsured']+1)

# Convert Alarm/Tracking to binary
# The source columns were dropped from df_model, so they are read from df.
# Values other than 'Yes'/'No' (including missing) become 0. When the source
# column is absent the flag is 0 for every row; the earlier `pd.Series(0)`
# fallback only covered index 0 and left NaN everywhere else after alignment.
yes_no_map = {'Yes':1,'No':0}
for flag_col, source_col in (('HasAlarm','AlarmImmobiliser'), ('HasTracking','TrackingDevice')):
    if source_col in df.columns:
        df_model[flag_col] = df[source_col].map(yes_no_map).fillna(0)
    else:
        df_model[flag_col] = 0.0  # float, matching the dtype of the mapped branch

# Log transform for monetary skew
# Negative values are clipped to 0 first so log1p is always defined.
for col in ['TotalPremium','TotalClaims','SumInsured','CalculatedPremiumPerTerm','Margin']:
    if col in df_model.columns:
        df_model[f'Log_{col}'] = np.log1p(df_model[col].clip(lower=0))

print(f"After feature engineering: {df_model.shape}")

# ==============================
# 4. Claim Severity Prediction
# ==============================
# Severity is modelled only on rows that carry a positive claim amount.
df_severity = df_model.loc[df_model['TotalClaims']>0].copy()

# Candidate predictors, restricted to the columns that are actually present.
severity_features = [
    c for c in (
        'SumInsured','CalculatedPremiumPerTerm','VehicleAge','RegistrationYear',
        'Cylinders','cubiccapacity','kilowatts','NumberOfDoors','VehicleType',
        'bodytype','Province','Gender','PremiumToSumInsured','HasAlarm','HasTracking','PostalCode',
    )
    if c in df_severity.columns
]
X_sev = df_severity[severity_features]
y_sev = df_severity['TotalClaims']

# Split (80/20, fixed seed for reproducibility)
X_train_sev, X_test_sev, y_train_sev, y_test_sev = train_test_split(
    X_sev,
    y_sev,
    test_size=0.2,
    random_state=42,
)

# Preprocessing
# Object-typed columns are treated as categorical; int64/float64 as numeric.
cat_feats_sev = list(X_sev.select_dtypes(include=['object']).columns)
num_feats_sev = list(X_sev.select_dtypes(include=['int64','float64']).columns)

preprocessor_sev = ColumnTransformer(transformers=[
    # numeric: median imputation
    ('num', SimpleImputer(strategy='median'), num_feats_sev),
    # categorical: fill with 'Unknown', then one-hot encode (unseen categories
    # at predict time are encoded as all zeros)
    (
        'cat',
        Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
            ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
        ]),
        cat_feats_sev,
    ),
])

# Models
severity_models = dict([
    ('LinearRegression', LinearRegression()),
    ('Ridge', Ridge()),
    ('DecisionTree', DecisionTreeRegressor(max_depth=10, random_state=42)),
    ('RandomForest', RandomForestRegressor(n_estimators=100,max_depth=10,random_state=42)),
    ('GradientBoosting', GradientBoostingRegressor(n_estimators=100,max_depth=5,random_state=42)),
    ('XGBoost', XGBRegressor(n_estimators=100,max_depth=5,random_state=42,verbosity=0)),
])

# Fit every candidate, score it on the held-out set, and remember the
# pipeline with the lowest RMSE.
severity_results = []
best_severity_model = None
best_rmse = float('inf')

for name, model in severity_models.items():
    pipe = Pipeline(steps=[
        ('preprocessor', preprocessor_sev),
        ('scaler', StandardScaler()),
        ('model', model),
    ])
    pipe.fit(X_train_sev, y_train_sev)
    y_pred = pipe.predict(X_test_sev)

    rmse = np.sqrt(mean_squared_error(y_test_sev, y_pred))
    mae = mean_absolute_error(y_test_sev, y_pred)
    r2 = r2_score(y_test_sev, y_pred)
    severity_results.append(dict(Model=name, RMSE=rmse, MAE=mae, R2=r2))

    if rmse < best_rmse:
        best_rmse, best_severity_model = rmse, pipe

# Leaderboard, best (lowest RMSE) first
severity_df = pd.DataFrame(severity_results).sort_values(by='RMSE')
print("\nSeverity model results:")
print(severity_df.to_string(index=False))

# ==============================
# 5. Claim Probability Prediction
# ==============================
# Feature list: severity features plus the engineered ones, de-duplicated
# while keeping first-seen order (dict keys preserve insertion order).
claim_prob_features = list(dict.fromkeys(
    severity_features + ['VehicleAge','PremiumToSumInsured','HasAlarm','HasTracking']
))

# The classifier is trained on all rows, not only those with a claim.
X_prob = df_model[claim_prob_features]
y_prob = df_model['ClaimOccurred']

# Stratified split keeps the claim / no-claim ratio equal in train and test.
X_train_prob, X_test_prob, y_train_prob, y_test_prob = train_test_split(
    X_prob, y_prob, test_size=0.2, random_state=42, stratify=y_prob)

cat_feats_prob = X_prob.select_dtypes(include=['object']).columns.tolist()
num_feats_prob = X_prob.select_dtypes(include=['int64','float64']).columns.tolist()

preprocessor_prob = ColumnTransformer([
    ('num', SimpleImputer(strategy='median'), num_feats_prob),
    ('cat', Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
                      ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]), cat_feats_prob)
])

classification_models = {
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=100,max_depth=10,random_state=42),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100,max_depth=5,random_state=42),
    'XGBoost': XGBClassifier(n_estimators=100,max_depth=5,random_state=42,verbosity=0)
}

classification_results = []
best_classification_model = None
# Start below any attainable F1 so a model is always selected, even when every
# candidate scores F1 == 0 (e.g. no positive predictions at the default
# threshold). Starting at 0 with a strict '>' would leave the best model as
# None and break the premium calculation that follows.
best_f1 = float('-inf')

for name, model in classification_models.items():
    pipe = Pipeline([('preprocessor', preprocessor_prob),
                     ('scaler', StandardScaler()),
                     ('model', model)])
    pipe.fit(X_train_prob, y_train_prob)
    y_pred = pipe.predict(X_test_prob)
    y_pred_proba = pipe.predict_proba(X_test_prob)[:,1]  # probability of the positive class
    accuracy = accuracy_score(y_test_prob, y_pred)
    # zero_division=0 makes the "no positive predictions" case explicit:
    # the metric is reported as 0 rather than relying on a suppressed warning.
    precision = precision_score(y_test_prob, y_pred, zero_division=0)
    recall = recall_score(y_test_prob, y_pred, zero_division=0)
    f1 = f1_score(y_test_prob, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_test_prob, y_pred_proba)
    classification_results.append({'Model':name,'Accuracy':accuracy,'Precision':precision,
                                   'Recall':recall,'F1-Score':f1,'ROC-AUC':roc_auc})
    if f1 > best_f1:
        best_f1 = f1
        best_classification_model = pipe

# Leaderboard, best (highest F1) first
classification_df = pd.DataFrame(classification_results).sort_values('F1-Score', ascending=False)
print("\nClassification model results:")
print(classification_df.to_string(index=False))

# ==============================
# 6. Risk-Based Premium Calculation
# ==============================
# Risk-based premium = P(claim) * predicted claim severity, per policy row.
print("\nCalculating risk-based premiums for test set...")
prob_pred = best_classification_model.predict_proba(X_test_prob)[:,1]

# Both predictions must refer to the SAME rows. The severity test split only
# holds rows with a claim, so it has a different length and different policies
# than X_test_prob; slicing it cannot align the two. Score severity on the
# classification test rows instead, restricted to the columns the severity
# pipeline was fitted on.
sev_pred = best_severity_model.predict(X_test_prob[severity_features])

premium_pred = prob_pred * sev_pred
# Keep the original row labels so results can be joined back to the source data.
df_risk_premium = pd.DataFrame({
    'PredictedProbability': prob_pred,
    'PredictedSeverity': sev_pred,
    'RiskBasedPremium': premium_pred
}, index=X_test_prob.index)
print(df_risk_premium.head())

# ==============================
# 7. Model Interpretability (SHAP)
# ==============================
print("\nRunning SHAP analysis for the best severity model...")
# NOTE(review): shap.Explainer without background data is expected to work for
# tree-based estimators; if a linear model wins the comparison a masker /
# background sample may be required - confirm against the shap docs.
explainer = shap.Explainer(best_severity_model.named_steps['model'])

# Feed SHAP exactly what the estimator was trained on: the output of every
# pipeline step before the model (preprocessor AND scaler). Using the
# preprocessor alone would explain the model on unscaled inputs it never saw.
X_sample = best_severity_model[:-1].transform(X_test_sev)
shap_values = explainer(X_sample)

# Feature importance plot
# StandardScaler keeps the column order, so the preprocessor's output names
# still label the scaled matrix correctly.
shap_feature_names = best_severity_model.named_steps['preprocessor'].get_feature_names_out()
shap.summary_plot(shap_values, X_sample, feature_names=shap_feature_names, show=True)


TASK 4: PREDICTIVE MODELING FOR RISK-BASED PRICING
Data loaded: (1000096, 56)
After dropping columns: (1000096, 17)
After feature engineering: (1000096, 26)

Severity model results:
           Model         RMSE          MAE       R2
    RandomForest 34561.937612 15760.674535 0.257249
    DecisionTree 35350.376735 15911.170130 0.222975
GradientBoosting 35576.869529 16364.494598 0.212986
           Ridge 35799.455856 20251.869840 0.203107
LinearRegression 35809.123039 20241.413776 0.202677
         XGBoost 38044.833755 16864.632727 0.100009
