In [1]:
# Clone the repository
!git clone https://github.com/habeshaethiopia/AlphaCare-Insurance-Solutions.git
%cd AlphaCare-Insurance-Solutions

# Change the working directory

# Install the necessary dependencies (if any)
# !pip install -r requirements.txt  # If you have a requirements.txt file

Cloning into 'AlphaCare-Insurance-Solutions'...
remote: Enumerating objects: 97, done.[K
remote: Counting objects: 100% (97/97), done.[K
remote: Compressing objects: 100% (76/76), done.[K
remote: Total 97 (delta 44), reused 50 (delta 18), pack-reused 0 (from 0)[K
Receiving objects: 100% (97/97), 1.62 MiB | 3.74 MiB/s, done.
Resolving deltas: 100% (44/44), done.
/content/AlphaCare-Insurance-Solutions


In [2]:
# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import sys
import os

# Ensure the src directory is in the Python path
sys.path.append(
    os.path.abspath(
        os.path.join(
            "..", "src" # Adjust the directory path as necessary
        )
    )
)
print(os.path.abspath(
        os.path.join(
            "..", "src" # Adjust the directory path as necessary
        )
    ))
from src.data_preparation import DataLoader
# Importing the necessary functions from data_processing.py

import pandas as pd


/content/src


In [None]:
# Load a sample dataset (replace with your own file path)
file_path = r"https://drive.google.com/file/d/1GOLY8weQTQxcv3zyTN35z7va-WZobgpU/view?usp=drive_link" # Change this to your actual CSV file path
DL=DataLoader(file_path)
df = DL.load_data_from_drive_zip(file_path)

# Show the first few rows of the dataset
df.head()


In [None]:
# Impute or remove missing data
df.fillna(df.median(numeric_only=True), inplace=True)  # For numeric columns
df.fillna("Unknown", inplace=True)  # For categorical columns


In [None]:
# Example of creating new features
df['ClaimsToPremiumRatio'] = df['TotalClaims'] / (df['TotalPremium'] + 1e-5)  # Avoid division by zero
df['IsHighRisk'] = (df['ClaimsToPremiumRatio'] > 0.5).astype(int)


In [None]:
from sklearn.preprocessing import OneHotEncoder

# Perform one-hot encoding on categorical features
categorical_cols = ['Province', 'VehicleType', 'Gender']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


In [None]:
from sklearn.model_selection import train_test_split

X = df_encoded.drop(columns=['TotalClaims', 'TotalPremium'])
y_claims = df_encoded['TotalClaims']
y_premium = df_encoded['TotalPremium']

# Split data into training and testing sets (80:20)
X_train, X_test, y_claims_train, y_claims_test = train_test_split(X, y_claims, test_size=0.2, random_state=42)
_, _, y_premium_train, y_premium_test = train_test_split(X, y_premium, test_size=0.2, random_state=42)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Build and evaluate Linear Regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_claims_train)
y_claims_pred = linear_model.predict(X_test)

mse = mean_squared_error(y_claims_test, y_claims_pred)
r2 = r2_score(y_claims_test, y_claims_pred)
print("Linear Regression Claims - MSE:", mse, "R2:", r2)


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Build and evaluate Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_claims_train)
y_claims_pred_rf = rf_model.predict(X_test)

mse_rf = mean_squared_error(y_claims_test, y_claims_pred_rf)
r2_rf = r2_score(y_claims_test, y_claims_pred_rf)
print("Random Forest Claims - MSE:", mse_rf, "R2:", r2_rf)


In [None]:
import xgboost as xgb

# Build and evaluate XGBoost model
xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_claims_train)
y_claims_pred_xgb = xgb_model.predict(X_test)

mse_xgb = mean_squared_error(y_claims_test, y_claims_pred_xgb)
r2_xgb = r2_score(y_claims_test, y_claims_pred_xgb)
print("XGBoost Claims - MSE:", mse_xgb, "R2:", r2_xgb)


In [None]:
# Compare model performances
print("Model Comparison for Total Claims:")
print(f"Linear Regression - MSE: {mse}, R2: {r2}")
print(f"Random Forest - MSE: {mse_rf}, R2: {r2_rf}")
print(f"XGBoost - MSE: {mse_xgb}, R2: {r2_xgb}")


In [None]:
import shap

# Explain model predictions using SHAP
explainer = shap.TreeExplainer(rf_model)
shap_values = explainer.shap_values(X_test)

# Visualize feature importance
shap.summary_plot(shap_values, X_test)


In [None]:
from lime.lime_tabular import LimeTabularExplainer

# Explain specific predictions using LIME
lime_explainer = LimeTabularExplainer(X_train.values, feature_names=X_train.columns, class_names=['TotalClaims'], mode='regression')
exp = lime_explainer.explain_instance(X_test.iloc[0].values, rf_model.predict)
exp.show_in_notebook()
