In [1]:
import os

In [2]:
%pwd

'c:\\Users\\KUNAL MEHTA\\Desktop\\Data Science Training\\Projects\\Auto-Insurance-Risk-Profiling\\research'

In [3]:
os.chdir('../')

In [4]:
%pwd

'c:\\Users\\KUNAL MEHTA\\Desktop\\Data Science Training\\Projects\\Auto-Insurance-Risk-Profiling'

In [5]:
import pandas as pd
import numpy as np

In [14]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class UserAppConfig:
    root_dir: Path
    risk_profiles_path: Path
    class_model_path: Path
    reg_model_path: Path
    test_data_path: Path
    scaler_path: Path
    params: dict

In [7]:
from AutoInsurance.constants import *
from AutoInsurance.utils.common import read_yaml, create_directories, save_json

In [20]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_user_app_config(self) -> UserAppConfig:
        config = self.config.user_app
        params = self.params.RiskProfiles

        create_directories([config.root_dir])

        user_app_config = UserAppConfig(
            root_dir= Path(config.root_dir),
            test_data_path= Path(config.test_data_path),
            risk_profiles_path= Path(config.risk_profiles_path),
            class_model_path= Path(config.class_model_path),
            reg_model_path= Path(config.reg_model_path),
            scaler_path= Path(config.scaler_path),
            params= params
        )

        return user_app_config       

In [9]:
import pandas as pd
import joblib

In [48]:
class RiskProfileModel:
    def __init__(self, config: UserAppConfig):
        self.config = config
        self.params = config.params

    # def load_data(self):
    #     return pd.read_csv(Path(self.config.risk_profiles_path))

    def transform_user_data_to_df(self, user_data, columns, dtypes):
        data = {col: np.zeros(1, dtype=dt) if dt == 'float64' else np.zeros(1, dtype=bool) for col, dt in dtypes.items()}
        df = pd.DataFrame(data)
        
        if 'credit_score' in user_data:
            df['credit_score'] = user_data['credit_score']
        if 'traffic_index' in user_data:
            df['traffic_index'] = user_data['traffic_index']
        if 'veh_value' in user_data:
            df['veh_value'] = user_data['veh_value']
        
        for key, value in user_data.items():
            if key in ['gender', 'area', 'veh_body', 'agecat', 'veh_age']:
                column_name = f'{key}_{value}'
                if column_name in df.columns:
                    df[column_name] = True
        
        return df
    
    def predict(self, user_data, columns, dtypes):
        df = self.transform_user_data_to_df(user_data, columns, dtypes)
        class_predictions_probs = joblib.load(Path(self.config.class_model_path)).predict_proba(df)[:, 1]
        reg_predictions = joblib.load(Path(self.config.reg_model_path)).predict(df)
        reg_predictions = np.expm1(reg_predictions)
        
        claim_likelihood = class_predictions_probs[0]
        claim_amount = reg_predictions[0]
        
        return claim_likelihood, claim_amount
    
    def normalize_predictions(self, claim_likelihood, claim_amount):

        features = pd.DataFrame([[claim_likelihood, claim_amount]], columns=['claim_probability', 'claim_amount'])
        norm_values =  joblib.load(Path(self.config.scaler_path)).transform(features)
        normalized_claim_likelihood = norm_values[0, 0]
        normalized_claim_amount = norm_values[0, 1]
        
        return normalized_claim_likelihood, normalized_claim_amount
    
    def classify_risk(self, normalized_claim_likelihood, normalized_claim_amount, claim_amount):

        risk_profiles_df = pd.read_csv(Path(self.config.risk_profiles_path))

        quantiles_prob = risk_profiles_df['claim_probability'].quantile(self.params['claim_probability_thresholds'])
        if normalized_claim_likelihood <= quantiles_prob.iloc[0]:
            risk_profile_probability = 'Low'
        elif normalized_claim_likelihood <= quantiles_prob.iloc[1]:
            risk_profile_probability = 'Medium'
        else:
            risk_profile_probability = 'High'
        
        if claim_amount == 0:
            risk_profile_cost = 'No Claim'
        else:
            quantiles_cost = risk_profiles_df.loc[risk_profiles_df['claim_amount'] > 0, 'claim_amount'].quantile(self.params['claim_amount_thresholds'])
            if normalized_claim_amount <= quantiles_cost.iloc[0]:
                risk_profile_cost = 'Low'
            elif normalized_claim_amount <= quantiles_cost.iloc[1]:
                risk_profile_cost = 'Medium'
            else:
                risk_profile_cost = 'High'

        weights_probability = self.params['weights_probability']
        weights_cost = self.params['weights_cost']
        
        weighted_probability_score = normalized_claim_likelihood * weights_probability[risk_profile_probability]
        weighted_cost_score = normalized_claim_amount * weights_cost[risk_profile_cost]
        dynamic_combined_risk_score = weighted_probability_score + weighted_cost_score
        
        quantiles_risk = risk_profiles_df['dynamic_combined_risk_score'].quantile(self.params['risk_score_thresholds'])
        if dynamic_combined_risk_score <= quantiles_risk.iloc[0]:
            risk_group = 'Low Risk'
        elif dynamic_combined_risk_score <= quantiles_risk.iloc[1]:
            risk_group = 'Medium Risk'
        else:
            risk_group = 'High Risk'
        
        return {
            'claim_likelihood': claim_likelihood,
            'claim_amount': claim_amount,
            'normalized_claim_likelihood': normalized_claim_likelihood,
            'normalized_claim_amount': normalized_claim_amount,
            'risk_profile_probability': risk_profile_probability,
            'risk_profile_cost': risk_profile_cost,
            'dynamic_combined_risk_score': dynamic_combined_risk_score,
            'risk_group': risk_group
        }


In [49]:
config = ConfigurationManager()
user_app_config = config.get_user_app_config()
risk_profile_model = RiskProfileModel(config = user_app_config)

[2024-05-29 00:03:05,438: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-05-29 00:03:05,443: INFO: common: yaml file: params.yaml loaded successfully]
[2024-05-29 00:03:05,448: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-05-29 00:03:05,449: INFO: common: created directory at: artifacts]
[2024-05-29 00:03:05,451: INFO: common: created directory at: artifacts/user_app]


In [42]:
test = pd.read_csv(user_app_config.test_data_path)
columns = test.columns
dtypes = test.dtypes

In [52]:
def user_input_features():
    data = {
        'gender': 'F',
        'agecat': '2',
        'credit score': 800,
        'area': 'A',
        'traffic index': 1.2,
        'vehicle age': '2',
        'vehicle body': 'SEDAN',
        'vehicle value': 0.9
        }
    return data

In [53]:
user_data = user_input_features()

In [54]:
user_data_df = pd.DataFrame(list(user_data.items()), columns=['Feature', 'Value']).astype(str)

In [55]:
claim_likelihood, claim_amount = risk_profile_model.predict(user_data, columns, dtypes)
normalized_claim_likelihood, normalized_claim_amount = risk_profile_model.normalize_predictions(claim_likelihood, claim_amount)
risk_profile = risk_profile_model.classify_risk(normalized_claim_likelihood, normalized_claim_amount, claim_amount)

In [56]:
risk_profile

{'claim_likelihood': 0.1380524665259975,
 'claim_amount': 2922.3115154880243,
 'normalized_claim_likelihood': 0.14037485951549103,
 'normalized_claim_amount': 0.0693165265079635,
 'risk_profile_probability': 'High',
 'risk_profile_cost': 'Low',
 'dynamic_combined_risk_score': 0.11195152631248002,
 'risk_group': 'High Risk'}

# Testing

In [30]:
test = pd.read_csv('artifacts\data_transformation\Processed_test_data.csv')

In [32]:
test.columns

Index(['credit_score', 'traffic_index', 'veh_value', 'gender_M', 'area_B',
       'area_C', 'area_D', 'area_E', 'area_F', 'veh_body_CONVT',
       'veh_body_COUPE', 'veh_body_HBACK', 'veh_body_HDTOP', 'veh_body_MCARA',
       'veh_body_MIBUS', 'veh_body_PANVN', 'veh_body_RDSTR', 'veh_body_SEDAN',
       'veh_body_STNWG', 'veh_body_TRUCK', 'veh_body_UTE', 'agecat_2',
       'agecat_3', 'agecat_4', 'agecat_5', 'agecat_6', 'veh_age_2',
       'veh_age_3', 'veh_age_4'],
      dtype='object')

In [97]:
test.dtypes

credit_score      float64
traffic_index     float64
veh_value         float64
gender_M             bool
area_B               bool
area_C               bool
area_D               bool
area_E               bool
area_F               bool
veh_body_CONVT       bool
veh_body_COUPE       bool
veh_body_HBACK       bool
veh_body_HDTOP       bool
veh_body_MCARA       bool
veh_body_MIBUS       bool
veh_body_PANVN       bool
veh_body_RDSTR       bool
veh_body_SEDAN       bool
veh_body_STNWG       bool
veh_body_TRUCK       bool
veh_body_UTE         bool
agecat_2             bool
agecat_3             bool
agecat_4             bool
agecat_5             bool
agecat_6             bool
veh_age_2            bool
veh_age_3            bool
veh_age_4            bool
dtype: object

In [152]:
import pandas as pd
import numpy as np

# Define the function to transform user data into the DataFrame
def transform_user_data_to_df(user_data, columns, dtypes):
    # Initialize the DataFrame with zeros and the specified data types
    data = {col: np.zeros(1, dtype=dt) if dt == 'float64' else np.zeros(1, dtype=bool) for col, dt in dtypes.items()}
    df = pd.DataFrame(data)
    
    # Fill in the continuous/numerical data
    if 'credit_score' in user_data:
        df['credit_score'] = user_data['credit_score']
    if 'traffic_index' in user_data:
        df['traffic_index'] = user_data['traffic_index']
    if 'veh_value' in user_data:
        df['veh_value'] = user_data['veh_value']
    
    # Handle categorical data by setting the relevant column to True
    for key, value in user_data.items():
        if key in ['gender', 'area', 'veh_body', 'agecat', 'veh_age']:
            column_name = f'{key}_{value}'
            if column_name in df.columns:
                df[column_name] = True
    
    # Return the resulting DataFrame
    return df

# Example usage
user_data = {
    'gender': 'F',
    'agecat': '3',
    'credit_score': 500,
    'area': 'A',
    'traffic_index': 1.9,
    'veh_age': '1',
    'veh_body': 'SEDAN',
    'veh_value': 1.5
}

# Initialize an empty DataFrame with the specified columns and data types
columns = test.columns
dtypes = test.dtypes

# Transform the user data into the DataFrame
df = transform_user_data_to_df(user_data, columns, dtypes)
print(df)
print(df.dtypes)


   credit_score  traffic_index  veh_value  gender_M  area_B  area_C  area_D  \
0           500            1.9        1.5     False   False   False   False   

   area_E  area_F  veh_body_CONVT  ...  veh_body_TRUCK  veh_body_UTE  \
0   False   False           False  ...           False         False   

   agecat_2  agecat_3  agecat_4  agecat_5  agecat_6  veh_age_2  veh_age_3  \
0     False      True     False     False     False      False      False   

   veh_age_4  
0      False  

[1 rows x 29 columns]
credit_score        int64
traffic_index     float64
veh_value         float64
gender_M             bool
area_B               bool
area_C               bool
area_D               bool
area_E               bool
area_F               bool
veh_body_CONVT       bool
veh_body_COUPE       bool
veh_body_HBACK       bool
veh_body_HDTOP       bool
veh_body_MCARA       bool
veh_body_MIBUS       bool
veh_body_PANVN       bool
veh_body_RDSTR       bool
veh_body_SEDAN       bool
veh_body_STNWG      

In [153]:
import joblib
from pathlib import Path 

In [154]:
class_model = joblib.load(Path("artifacts\model_trainer\class_model.joblib"))

In [155]:
reg_model = joblib.load(Path('artifacts/model_trainer/reg_model.joblib'))

In [156]:
class_predictions_probs = class_model.predict_proba(df)[:, 1]

In [157]:
class_predictions_probs[0]

0.07961550057185947

In [158]:
reg_predictions = reg_model.predict(df)
reg_predictions = np.expm1(reg_predictions)
reg_predictions[0]

6434.927335660211

In [159]:
claim_likelihood = class_predictions_probs[0]
claim_amount = reg_predictions[0]

In [160]:
scaler = joblib.load(Path('artifacts/risk_profiles/minmax_scaler.pkl'))

In [161]:
predictions_df = pd.read_csv('artifacts/risk_profiles/risk_profiles.csv')

In [162]:
norm_values = scaler.transform([[claim_likelihood, claim_amount]])
normalized_claim_likelihood = norm_values[0, 0]
normalized_claim_amount = norm_values[0, 1]



In [163]:
predictions_df.columns

Index(['quote_number', 'gender', 'agecat', 'date_of_birth', 'credit_score',
       'area', 'traffic_index', 'veh_age', 'veh_body', 'veh_value', 'age',
       'claim_probability', 'claim', 'claim_amount',
       'normalized_claim_probability', 'normalized_claim_amount',
       'risk_profile_probability', 'risk_profile_cost',
       'weighted_probability_score', 'weighted_cost_score',
       'dynamic_combined_risk_score', 'risk_group'],
      dtype='object')

In [164]:
claim_probability_thresholds = [0.2, 0.45]
claim_amount_thresholds = [0.2, 0.45]
weights_probability = {'Low': 0.4, 'Medium': 0.5, 'High': 0.6}
weights_cost = {'No Claim': 0.3, 'Low': 0.4, 'Medium': 0.5, 'High': 0.6}
risk_score_thresholds = [0.2, 0.45]

In [165]:
quantiles_prob = predictions_df['claim_probability'].quantile(claim_probability_thresholds)
if normalized_claim_likelihood <= quantiles_prob.iloc[0]:
    risk_profile_probability = 'Low'
elif normalized_claim_likelihood <= quantiles_prob.iloc[1]:
    risk_profile_probability = 'Medium'
else:
    risk_profile_probability = 'High'

In [166]:
if claim_amount == 0:
    risk_profile_cost = 'No Claim'
else:
    quantiles_cost = predictions_df.loc[predictions_df['claim_amount'] > 0, 'claim_amount'].quantile(claim_amount_thresholds)
    if normalized_claim_amount <= quantiles_cost.iloc[0]:
        risk_profile_cost = 'Low'
    elif normalized_claim_amount <= quantiles_cost.iloc[1]:
        risk_profile_cost = 'Medium'
    else:
        risk_profile_cost = 'High'

In [167]:
weighted_probability_score = normalized_claim_likelihood * weights_probability[risk_profile_probability]
weighted_cost_score = normalized_claim_amount * weights_cost[risk_profile_cost]
    

In [168]:
dynamic_combined_risk_score = weighted_probability_score + weighted_cost_score

In [169]:
quantiles_risk = predictions_df['dynamic_combined_risk_score'].quantile(risk_score_thresholds)
if dynamic_combined_risk_score <= quantiles_risk.iloc[0]:
    risk_group = 'Low Risk'
elif dynamic_combined_risk_score <= quantiles_risk.iloc[1]:
    risk_group = 'Medium Risk'
else:
    risk_group = 'High Risk'
{
    'claim_likelihood': claim_likelihood,
    'claim_amount': claim_amount,
    'normalized_claim_likelihood': normalized_claim_likelihood,
    'normalized_claim_amount': normalized_claim_amount,
    'risk_profile_probability': risk_profile_probability,
    'risk_profile_cost': risk_profile_cost,
    'dynamic_combined_risk_score': dynamic_combined_risk_score,
    'risk_group': risk_group
    
}

{'claim_likelihood': 0.07961550057185947,
 'claim_amount': 6434.927335660211,
 'normalized_claim_likelihood': 0.07556854055743428,
 'normalized_claim_amount': 0.19096796236790398,
 'risk_profile_probability': 'Medium',
 'risk_profile_cost': 'Low',
 'dynamic_combined_risk_score': 0.11417145522587874,
 'risk_group': 'High Risk'}

In [61]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import MinMaxScaler
from pathlib import Path

class RiskProfileModel:
    def __init__(self, model_paths, scaler_path, risk_profile_path):
        self.class_model = joblib.load(Path(model_paths['class_model']))
        self.reg_model = joblib.load(Path(model_paths['reg_model']))
        self.scaler = joblib.load(Path(scaler_path))
        self.risk_profiles_df = pd.read_csv(risk_profile_path)
    
    def transform_user_data_to_df(self, user_data, columns, dtypes):
        data = {col: np.zeros(1, dtype=dt) if dt == 'float64' else np.zeros(1, dtype=bool) for col, dt in dtypes.items()}
        df = pd.DataFrame(data)
        
        if 'credit_score' in user_data:
            df['credit_score'] = user_data['credit_score']
        if 'traffic_index' in user_data:
            df['traffic_index'] = user_data['traffic_index']
        if 'veh_value' in user_data:
            df['veh_value'] = user_data['veh_value']
        
        for key, value in user_data.items():
            if key in ['gender', 'area', 'veh_body', 'agecat', 'veh_age']:
                column_name = f'{key}_{value}'
                if column_name in df.columns:
                    df[column_name] = True
        
        return df
    
    def predict(self, user_data, columns, dtypes):
        df = self.transform_user_data_to_df(user_data, columns, dtypes)
        class_predictions_probs = self.class_model.predict_proba(df)[:, 1]
        reg_predictions = self.reg_model.predict(df)
        reg_predictions = np.expm1(reg_predictions)
        
        claim_likelihood = class_predictions_probs[0]
        claim_amount = reg_predictions[0]
        
        return claim_likelihood, claim_amount
    
    def normalize_predictions(self, claim_likelihood, claim_amount):
        
        features = pd.DataFrame([[claim_likelihood, claim_amount]], columns=['claim_probability', 'claim_amount'])

        norm_values = self.scaler.transform(features)
        # norm_values = self.scaler.transform([[claim_likelihood, claim_amount]])
        normalized_claim_likelihood = norm_values[0, 0]
        normalized_claim_amount = norm_values[0, 1]
        
        return normalized_claim_likelihood, normalized_claim_amount
    
    def classify_risk(self, normalized_claim_likelihood, normalized_claim_amount, claim_amount):
        claim_probability_thresholds = [0.2, 0.45]
        claim_amount_thresholds = [0.2, 0.45]
        weights_probability = {'Low': 0.4, 'Medium': 0.5, 'High': 0.6}
        weights_cost = {'No Claim': 0.3, 'Low': 0.4, 'Medium': 0.5, 'High': 0.6}
        risk_score_thresholds = [0.2, 0.45]
        
        quantiles_prob = self.risk_profiles_df['claim_probability'].quantile(claim_probability_thresholds)
        if normalized_claim_likelihood <= quantiles_prob.iloc[0]:
            risk_profile_probability = 'Low'
        elif normalized_claim_likelihood <= quantiles_prob.iloc[1]:
            risk_profile_probability = 'Medium'
        else:
            risk_profile_probability = 'High'
        
        if claim_amount == 0:
            risk_profile_cost = 'No Claim'
        else:
            quantiles_cost = self.risk_profiles_df.loc[self.risk_profiles_df['claim_amount'] > 0, 'claim_amount'].quantile(claim_amount_thresholds)
            if normalized_claim_amount <= quantiles_cost.iloc[0]:
                risk_profile_cost = 'Low'
            elif normalized_claim_amount <= quantiles_cost.iloc[1]:
                risk_profile_cost = 'Medium'
            else:
                risk_profile_cost = 'High'
        
        weighted_probability_score = normalized_claim_likelihood * weights_probability[risk_profile_probability]
        weighted_cost_score = normalized_claim_amount * weights_cost[risk_profile_cost]
        dynamic_combined_risk_score = weighted_probability_score + weighted_cost_score
        
        quantiles_risk = self.risk_profiles_df['dynamic_combined_risk_score'].quantile(risk_score_thresholds)
        if dynamic_combined_risk_score <= quantiles_risk.iloc[0]:
            risk_group = 'Low Risk'
        elif dynamic_combined_risk_score <= quantiles_risk.iloc[1]:
            risk_group = 'Medium Risk'
        else:
            risk_group = 'High Risk'
        
        return {
            'claim_likelihood': claim_likelihood,
            'claim_amount': claim_amount,
            'normalized_claim_likelihood': normalized_claim_likelihood,
            'normalized_claim_amount': normalized_claim_amount,
            'risk_profile_probability': risk_profile_probability,
            'risk_profile_cost': risk_profile_cost,
            'dynamic_combined_risk_score': dynamic_combined_risk_score,
            'risk_group': risk_group
        }


model_paths = {
    'class_model': "artifacts/model_trainer/class_model.joblib",
    'reg_model': 'artifacts/model_trainer/reg_model.joblib'
}
scaler_path = 'artifacts/risk_profiles/minmax_scaler.pkl'
risk_profile_path = 'artifacts/risk_profiles/risk_profiles.csv'

# Initialize the model
risk_profile_model = RiskProfileModel(model_paths, scaler_path, risk_profile_path)

# Load test data to get columns and dtypes
test = pd.read_csv('artifacts/data_transformation/Processed_test_data.csv')
columns = test.columns
dtypes = test.dtypes

# Define user data
user_data = {
        'gender': 'F',
        'agecat': '2',
        'credit score': 800,
        'area': 'A',
        'traffic index': 0,
        'vehicle age': '2',
        'vehicle body': 'SEDAN',
        'vehicle value': 0
        }

# Get predictions
claim_likelihood, claim_amount = risk_profile_model.predict(user_data, columns, dtypes)

# Normalize predictions
normalized_claim_likelihood, normalized_claim_amount = risk_profile_model.normalize_predictions(claim_likelihood, claim_amount)

# Classify risk
risk_profile = risk_profile_model.classify_risk(normalized_claim_likelihood, normalized_claim_amount, claim_amount)

# Output the risk profile
print(risk_profile)


{'claim_likelihood': 0.1380524665259975, 'claim_amount': 2922.3115154880243, 'normalized_claim_likelihood': 0.14037485951549103, 'normalized_claim_amount': 0.0693165265079635, 'risk_profile_probability': 'High', 'risk_profile_cost': 'Low', 'dynamic_combined_risk_score': 0.11195152631248002, 'risk_group': 'High Risk'}


In [59]:
test.columns

Index(['credit_score', 'traffic_index', 'veh_value', 'gender_M', 'area_B',
       'area_C', 'area_D', 'area_E', 'area_F', 'veh_body_CONVT',
       'veh_body_COUPE', 'veh_body_HBACK', 'veh_body_HDTOP', 'veh_body_MCARA',
       'veh_body_MIBUS', 'veh_body_PANVN', 'veh_body_RDSTR', 'veh_body_SEDAN',
       'veh_body_STNWG', 'veh_body_TRUCK', 'veh_body_UTE', 'agecat_2',
       'agecat_3', 'agecat_4', 'agecat_5', 'agecat_6', 'veh_age_2',
       'veh_age_3', 'veh_age_4'],
      dtype='object')

In [188]:
test.veh_value.describe()

count    7464.000000
mean        1.003805
std         0.376802
min         0.000000
25%         0.747162
50%         0.970400
75%         1.210154
max         2.980619
Name: veh_value, dtype: float64