In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
import shap
import matplotlib.pyplot as plt
import os

In [9]:
# NOTE(review): both paths are absolute, machine-specific Windows locations;
# consider deriving them from a configurable data directory.

# Source export: a pipe-delimited text file.
txt_file_path = r'c:\data science\10 academy\week3\Technical Content\Data\MachineLearningRating_v3.txt'

# Destination for the comma-separated copy of the same data.
csv_file_path = r'c:\data science\10 academy\week3\Technical Content\MachineLearningRating_v3.csv'

# Read the TXT file using '|' as the delimiter.
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The saved cell output shows a warning pointing at
# this call -- presumably DtypeWarning for mixed-type columns; confirm.
df = pd.read_csv(txt_file_path, delimiter='|', low_memory=False)

# Save the DataFrame to a CSV file (without the positional index column).
df.to_csv(csv_file_path, index=False)

# Print confirmation message
print(f"Data successfully converted from .txt to {csv_file_path}")

# Reload from the CSV that was just written, reusing csv_file_path so the
# location is defined in exactly one place.
df = pd.read_csv(csv_file_path, low_memory=False)

  df = pd.read_csv(txt_file_path, delimiter='|')


Data successfully converted from .txt to c:\data science\10 academy\week3\Technical Content\MachineLearningRating_v3.csv


  df = pd.read_csv(r'c:\data science\10 academy\week3\Technical Content\MachineLearningRating_v3.csv'  )


In [10]:
# Step 1: Data Preparation

In [None]:

# --- Missing-value imputation ------------------------------------------------
# Numeric columns: fill gaps with the per-column median.
numeric_medians = df.select_dtypes(include='number').median()
df = df.fillna(numeric_medians)

# Non-numeric columns: fill gaps with the most frequent value (mode).
# The guard avoids an IndexError from .iloc[0] when mode() returns no rows
# (e.g. the frame has no non-numeric columns).
non_numeric_modes = df.select_dtypes(exclude='number').mode()
if not non_numeric_modes.empty:
    df = df.fillna(non_numeric_modes.iloc[0])

# --- Feature engineering -----------------------------------------------------
# ClaimsRatio = TotalClaims / TotalPremium.
# A zero premium produces +/-inf (or NaN for 0/0), which scikit-learn
# estimators reject, so those rows are mapped to 0.0 instead.
# NOTE(review): this feature is computed from TotalPremium, which is the target
# below, and it remains in X -- that leaks the target into the features. It is
# kept here only because the SHAP cell plots 'ClaimsRatio'; reconsider it.
df['ClaimsRatio'] = (
    (df['TotalClaims'] / df['TotalPremium'])
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0.0)
)

# --- High-cardinality reduction ----------------------------------------------
# Keep the top_n most frequent postal codes and bucket the rest as 'Other'.
# The frequency table is computed once, not once per row.
top_n = 100
top_postal_codes = df['PostalCode'].value_counts().index[:top_n]
df['PostalCode'] = df['PostalCode'].where(df['PostalCode'].isin(top_postal_codes), 'Other')

# --- Encoding ----------------------------------------------------------------
# One-hot encode Province, Gender and the reduced PostalCode.
df_encoded = pd.get_dummies(df, columns=['Province', 'Gender', 'PostalCode'])

# --- Features / target -------------------------------------------------------
# Target is TotalPremium; TotalClaims is removed from the features as well.
X = df_encoded.drop(columns=['TotalPremium', 'TotalClaims'])
# The estimators fitted later need numeric input. Columns that are still
# non-numeric after the encoding above are dropped here.
# NOTE(review): encode such columns explicitly if they should be used as features.
X = X.select_dtypes(include=['number', 'bool'])
y = df_encoded['TotalPremium']

# Train-Test Split (80% training and 20% testing), seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Step 2: Model Building


In [None]:

# Linear regression baseline: construct and fit on the training split in one step
# (fit returns the estimator itself).
lr_model = LinearRegression().fit(X_train, y_train)

# Random forest regressor: 100 trees, fixed seed.
rf_settings = {"n_estimators": 100, "random_state": 42}
rf_model = RandomForestRegressor(**rf_settings)
rf_model.fit(X_train, y_train)

In [None]:
# XGBoost regressor with the squared-error objective and a fixed seed,
# fitted on the same training split as the other models.
xgb_settings = dict(objective='reg:squarederror', random_state=42)
xgb_model = xgb.XGBRegressor(**xgb_settings)
xgb_model.fit(X_train, y_train)

In [None]:
# Step 3: Model Evaluation

In [None]:
def evaluate_model(model, X_test, y_test, model_name):
    """Score a fitted regressor on held-out data and print the metrics.

    Parameters
    ----------
    model : object with a ``predict(X)`` method
        An already fitted estimator.
    X_test : feature matrix passed to ``model.predict``.
    y_test : true target values matching ``X_test`` row for row.
    model_name : str
        Label used in the printed line and in the returned dict.

    Returns
    -------
    dict
        ``{"model", "mse", "rmse", "r2"}`` for the given model.
    """
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    # RMSE is derived from MSE so it works on every scikit-learn version.
    rmse = float(np.sqrt(mse))
    r2 = r2_score(y_test, predictions)
    print(f"{model_name} -> MSE: {mse:.4f} | RMSE: {rmse:.4f} | R^2: {r2:.4f}")
    return {"model": model_name, "mse": mse, "rmse": rmse, "r2": r2}


# Evaluate Linear Regression
lr_metrics = evaluate_model(lr_model, X_test, y_test, "Linear Regression")

# Evaluate Random Forest
rf_metrics = evaluate_model(rf_model, X_test, y_test, "Random Forest")

# Evaluate XGBoost
xgb_metrics = evaluate_model(xgb_model, X_test, y_test, "XGBoost")


In [None]:
# Step 4: Feature Importance Analysis

In [None]:
def _importance_table(importances, feature_names):
    """Pair each feature name with its importance, sorted from highest to lowest."""
    table = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    return table.sort_values(by='Importance', ascending=False)


# Feature Importance for Random Forest
rf_importances = rf_model.feature_importances_
rf_importance_df = _importance_table(rf_importances, X_train.columns)
print("\nRandom Forest Feature Importances:\n", rf_importance_df)

# Feature Importance for XGBoost
xgb_importances = xgb_model.feature_importances_
xgb_importance_df = _importance_table(xgb_importances, X_train.columns)
print("\nXGBoost Feature Importances:\n", xgb_importance_df)

# Plotting Feature Importance for XGBoost.
# Only the strongest features are drawn: after one-hot encoding there can be
# far more columns than fit legibly on a single bar chart.
top_k = 20
xgb_top_features = xgb_importance_df.head(top_k)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(xgb_top_features['Feature'], xgb_top_features['Importance'], color='skyblue')
# barh draws the first row at the bottom; invert so the most important is on top.
ax.invert_yaxis()
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title(f'XGBoost Feature Importance (top {len(xgb_top_features)})')
plt.show()

In [None]:
# Step 5: Model Interpretability using SHAP

In [None]:
# Load SHAP's JavaScript support into the notebook. Without it the interactive
# force plot at the end of this cell is not rendered.
shap.initjs()

# Initialize SHAP for XGBoost and compute SHAP values for the test split.
# NOTE(review): this runs on all of X_test; if it is slow, consider a sample.
explainer = shap.Explainer(xgb_model)
shap_values = explainer.shap_values(X_test)

# SHAP Summary Plot (bar form)
shap.summary_plot(shap_values, X_test, plot_type="bar", feature_names=X_test.columns)

# SHAP Dependence Plot for a specific feature (example: 'ClaimsRatio')
shap.dependence_plot('ClaimsRatio', shap_values, X_test, feature_names=X_test.columns)

# SHAP Force Plot for a single instance (the first row of X_test).
# Kept as the last expression so the notebook displays the returned plot.
shap.force_plot(explainer.expected_value, shap_values[0,:], X_test.iloc[0,:], feature_names=X_test.columns)