In [1]:
import os

In [2]:
%pwd

'c:\\Users\\KUNAL MEHTA\\Desktop\\Data Science Training\\Projects\\Auto-Insurance-Risk-Profiling\\research'

In [3]:
# Step up from research/ to the project root so that repo-relative paths used
# later in the notebook (e.g. the YAML config files) resolve.
# Guarded so the cell is idempotent: re-running it, or starting the kernel at
# the project root, no longer climbs out of the repository.
if os.path.basename(os.getcwd()) == "research":
    os.chdir('../')

In [4]:
%pwd

'c:\\Users\\KUNAL MEHTA\\Desktop\\Data Science Training\\Projects\\Auto-Insurance-Risk-Profiling'

In [5]:
import warnings

# Point MLflow at the DagsHub server for this project. Neither value is secret.
os.environ["MLFLOW_TRACKING_URI"]="https://dagshub.com/kunal1406/Auto-Insurance-Risk-Profiling.mlflow"
os.environ["MLFLOW_TRACKING_USERNAME"]="kunal1406"

# SECURITY: the access token must not live in the notebook. Provide
# MLFLOW_TRACKING_PASSWORD from outside (shell environment, .env loader, secrets
# manager) before starting the kernel. Any token that was ever committed in this
# cell should be treated as leaked and revoked.
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    warnings.warn(
        "MLFLOW_TRACKING_PASSWORD is not set; export the access token in the "
        "environment before logging to the remote MLflow server.",
        stacklevel=2,
    )

In [42]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ClassModelEvaluationConfig:
    """Frozen bundle of settings consumed by ``ClassModelEvaluation``.

    Built by ``ConfigurationManager.get_class_model_evaluation_config`` from the
    ``class_model_evaluation`` section of the config YAML and the
    ``GradientBoostingClassifier`` section of the params YAML.
    """
    # Output directory for this stage; the ConfigurationManager creates it.
    root_dir: Path
    # CSV read by ClassModelEvaluation.log_into_mlflow; must contain a 'claim' column.
    train_data_path: Path
    # Test-set location from the config; not read by the evaluation code in this notebook.
    test_data_path: Path
    # Location of the classification model (presumably the serialized trained
    # model — confirm); not loaded by the evaluation code in this notebook.
    class_model_path: Path
    # Hyperparameter mapping that is logged to MLflow via mlflow.log_params.
    all_params: dict
    # Destination of the JSON metrics file written with save_json.
    class_metric_file_name: Path
    # URI handed to mlflow.set_registry_uri.
    mlflow_uri: str

@dataclass(frozen=True)
class RegModelEvaluationConfig:
    """Frozen bundle of settings consumed by ``RegModelEvaluation``.

    Built by ``ConfigurationManager.get_reg_model_evaluation_config`` from the
    ``reg_model_evaluation`` section of the config YAML and the
    ``GradientBoostingRegressor`` section of the params YAML.
    """
    # Output directory for this stage; the ConfigurationManager creates it.
    root_dir: Path
    # CSV read by RegModelEvaluation.log_into_mlflow; must contain a 'log_amount' column.
    train_data_path: Path
    # Test-set location from the config; not read by the evaluation code in this notebook.
    test_data_path: Path
    # Location of the regression model (presumably the serialized trained
    # model — confirm); not loaded by the evaluation code in this notebook.
    reg_model_path: Path
    # Hyperparameter mapping that is logged to MLflow via mlflow.log_params.
    all_params: dict
    # Destination of the JSON metrics file written with save_json.
    reg_metric_file_name: Path
    # URI handed to mlflow.set_registry_uri.
    mlflow_uri: str

In [43]:
from AutoInsurance.constants import *
from AutoInsurance.utils.common import read_yaml, create_directories, save_json

In [44]:
class ConfigurationManager:
    """Read the project YAML files and assemble the model-evaluation configs.

    The constructor loads the config, params and schema files and makes sure the
    artifacts root directory exists. Each ``get_*_config`` method creates the
    stage's ``root_dir`` and returns a frozen dataclass for the matching
    evaluator.
    """

    # Fallback MLflow server, used when MLFLOW_TRACKING_URI is not set in the
    # environment. Kept in one place so both evaluation configs always agree.
    DEFAULT_MLFLOW_URI = "https://dagshub.com/kunal1406/Auto-Insurance-Risk-Profiling.mlflow"

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Load the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main config YAML.
            params_filepath: Path of the hyperparameter YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    @classmethod
    def _resolve_mlflow_uri(cls) -> str:
        """Return MLFLOW_TRACKING_URI from the environment, else the project default."""
        return os.environ.get("MLFLOW_TRACKING_URI") or cls.DEFAULT_MLFLOW_URI

    def get_class_model_evaluation_config(self) -> ClassModelEvaluationConfig:
        """Build the classification evaluation config and create its root directory.

        Returns:
            ClassModelEvaluationConfig populated from the ``class_model_evaluation``
            config section and the ``GradientBoostingClassifier`` params section.
        """
        config = self.config.class_model_evaluation
        params = self.params.GradientBoostingClassifier

        create_directories([config.root_dir])

        return ClassModelEvaluationConfig(
            root_dir=config.root_dir,
            train_data_path=config.train_data_class_path,
            test_data_path=config.test_data_path,
            class_model_path=config.class_model_path,
            all_params=params,
            class_metric_file_name=config.class_metric_file_name,
            mlflow_uri=self._resolve_mlflow_uri(),
        )

    def get_reg_model_evaluation_config(self) -> RegModelEvaluationConfig:
        """Build the regression evaluation config and create its root directory.

        Returns:
            RegModelEvaluationConfig populated from the ``reg_model_evaluation``
            config section and the ``GradientBoostingRegressor`` params section.
        """
        config = self.config.reg_model_evaluation
        params = self.params.GradientBoostingRegressor

        create_directories([config.root_dir])

        return RegModelEvaluationConfig(
            root_dir=config.root_dir,
            train_data_path=config.train_data_reg_path,
            test_data_path=config.test_data_path,
            reg_model_path=config.reg_model_path,
            all_params=params,
            reg_metric_file_name=config.reg_metric_file_name,
            mlflow_uri=self._resolve_mlflow_uri(),
        )

In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as datetime
import seaborn as sns
import math
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, auc, mean_squared_error, r2_score, classification_report
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.cluster import KMeans
import scipy.stats as stats
import joblib
from urllib.parse import urlparse
import mlflow
import mlflow.sklearn

In [46]:
class ClassModelEvaluation:
    """Cross-validated evaluation of the claim classifier, with MLflow logging."""

    def __init__(self, config: ClassModelEvaluationConfig):
        """Store the evaluation settings.

        Args:
            config: Paths, parameters and MLflow URI for this evaluation.
        """
        self.config = config

    def perform_k_fold(self, X, y):
        """Run 10-fold stratified CV and derive a decision threshold.

        A fresh GradientBoostingClassifier is fitted on each training fold and
        its positive-class probabilities for the held-out fold are collected, so
        every row ends up with exactly one out-of-fold prediction.

        Args:
            X: Feature table; indexed positionally with ``.iloc``.
            y: Binary target aligned with ``X``; indexed positionally with ``.iloc``.

        Returns:
            Tuple ``(cv_scores, optimal_threshold, report)`` where ``cv_scores`` is
            the list of per-fold ROC AUC values, ``optimal_threshold`` is the
            out-of-fold ROC threshold maximising ``tpr - fpr`` (Youden's J), and
            ``report`` is the ``classification_report`` dict at that threshold.
        """
        kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=45)
        cv_scores = []
        pred_full = np.zeros(y.shape[0])
        true_full = np.zeros(y.shape[0])

        for fold, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
            print(f"Fold {fold} started of {kf.n_splits}")
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # NOTE(review): these hyperparameters are hard-coded here, while
            # log_into_mlflow records self.config.all_params — confirm the two
            # agree. No random_state is set, so scores vary between runs.
            gb = GradientBoostingClassifier(
                learning_rate=0.1,
                max_depth=4,
                max_features=0.3,
                min_samples_leaf=5,
                n_estimators=100,
            )
            gb.fit(X_train, y_train)
            pred_probs = gb.predict_proba(X_test)[:, 1]

            # Scatter this fold's results back to their original row positions.
            pred_full[test_index] = pred_probs
            true_full[test_index] = y_test

            score = roc_auc_score(y_test, pred_probs)
            print('roc_auc_score', score)
            cv_scores.append(score)

        # Threshold is chosen on the pooled out-of-fold predictions.
        fpr, tpr, thresholds = roc_curve(true_full, pred_full)
        optimal_idx = np.argmax(tpr - fpr)
        optimal_threshold = thresholds[optimal_idx]
        print("optimal threshold is", optimal_threshold)

        predicted_labels = (pred_full >= optimal_threshold)
        report = classification_report(true_full, predicted_labels, output_dict=True)
        print(report)

        return cv_scores, optimal_threshold, report

    def evaluate_model(self, X, y):
        """Summarise the cross-validation results.

        Args:
            X: Feature table passed through to ``perform_k_fold``.
            y: Binary target passed through to ``perform_k_fold``.

        Returns:
            Tuple ``(mean_score, std_score, optimal_threshold, report)``: mean and
            standard deviation of the per-fold ROC AUC plus the threshold and
            report from ``perform_k_fold``.
        """
        cv_scores, optimal_threshold, report = self.perform_k_fold(X, y)
        mean_score = np.mean(cv_scores)
        std_score = np.std(cv_scores)
        print(f"Mean roc_auc_score: {mean_score}")
        print(f"Std roc_auc_score: {std_score}")
        return mean_score, std_score, optimal_threshold, report

    def log_into_mlflow(self):
        """Evaluate on the training CSV, save the metrics JSON and log to MLflow.

        Reads ``config.train_data_path``, uses the ``claim`` column as the target,
        writes mean ROC AUC and the threshold to ``config.class_metric_file_name``
        and logs parameters, summary metrics and per-class report metrics inside
        a single MLflow run.
        """
        mlflow.set_registry_uri(self.config.mlflow_uri)

        with mlflow.start_run():

            data = pd.read_csv(self.config.train_data_path)
            X = data.drop('claim', axis=1)
            print(X.shape)
            y = data['claim']
            print(y.shape)
            # Local names deliberately differ from the imported roc_auc_score
            # function so it is not shadowed inside this method.
            mean_auc, std_auc, optimal_threshold, report = self.evaluate_model(X, y)

            scores = {"roc_auc_score": mean_auc, "optimal_threshold": optimal_threshold}
            save_json(path=Path(self.config.class_metric_file_name), data=scores)

            mlflow.log_params(self.config.all_params)
            mlflow.log_metric("roc_auc_score", mean_auc)
            mlflow.log_metric("std roc_auc_score", std_auc)
            mlflow.log_metric("optimal_threshold", optimal_threshold)

            # Log only the per-class entries; "accuracy" is a bare float and the
            # two averages are skipped on purpose.
            for label, metric in report.items():
                if label not in ["accuracy", "macro avg", "weighted avg"]:
                    for metric_name, metric_value in metric.items():
                        mlflow.log_metric(f"{label}_{metric_name}", metric_value)

In [47]:
class RegModelEvaluation:
    """Evaluation of the claim-amount regressor, with MLflow logging."""

    def __init__(self, config: RegModelEvaluationConfig):
        """Store the evaluation settings.

        Args:
            config: Paths, parameters and MLflow URI for this evaluation.
        """
        self.config = config

    def perform_k_fold(self, X, y):
        """Score a GradientBoostingRegressor with 10-fold shuffled CV (R^2).

        Args:
            X: Feature table.
            y: Regression target aligned with ``X``.

        Returns:
            Tuple ``(model, cv_scores)``: the estimator instance that was passed
            to ``cross_val_score`` and the array of per-fold R^2 scores. The
            caller fits ``model`` afterwards via ``evaluate_model``.
        """
        # NOTE(review): these hyperparameters are hard-coded here, while
        # log_into_mlflow records self.config.all_params — confirm the two
        # agree. No random_state is set, so scores vary between runs.
        model = GradientBoostingRegressor(
            learning_rate=0.1,
            max_depth=4,
            max_features=0.3,
            min_samples_leaf=5,
            n_estimators=100
        )
        kf = KFold(n_splits=10, shuffle=True, random_state=45)
        cv_scores = cross_val_score(model, X, y, cv=kf, scoring='r2')

        return model, cv_scores

    def evaluate_model(self, model, X_train, y_train, X_test, y_test):
        """Fit ``model`` on the training split and score it on the test split.

        Args:
            model: Estimator exposing ``fit`` and ``predict``; fitted in place.
            X_train: Training features.
            y_train: Training target.
            X_test: Hold-out features.
            y_test: Hold-out target.

        Returns:
            Tuple ``(r2, rmse, mae, predictions)`` computed on the hold-out split.
        """
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        r2 = r2_score(y_test, predictions)
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        mae = mean_absolute_error(y_test, predictions)

        return r2, rmse, mae, predictions

    def log_into_mlflow(self):
        """Evaluate on the training CSV, save the metrics JSON and log to MLflow.

        Reads ``config.train_data_path``, uses ``log_amount`` as the target,
        cross-validates on the full table, then fits on an 80% split and scores
        the remaining 20%. Hold-out metrics go to ``config.reg_metric_file_name``;
        parameters, CV statistics and hold-out metrics are logged in one MLflow run.
        """
        mlflow.set_registry_uri(self.config.mlflow_uri)

        with mlflow.start_run():
            data = pd.read_csv(self.config.train_data_path)
            X = data.drop('log_amount', axis=1)
            y = data['log_amount']

            # NOTE(review): the hold-out set is carved out of the training file;
            # config.test_data_path is not used here — confirm this is intended.
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)

            model, cv_scores = self.perform_k_fold(X, y)

            # The hold-out predictions themselves are not needed for logging.
            r2, rmse, mae, _ = self.evaluate_model(model, X_train, y_train, X_test, y_test)

            scores = {"rmse": rmse, "mae": mae, "r2": r2}
            save_json(path=Path(self.config.reg_metric_file_name), data=scores)

            mlflow.log_params(self.config.all_params)
            mlflow.log_metric("mean_cv_r2_score", np.mean(cv_scores))
            mlflow.log_metric("std_cv_r2_score", np.std(cv_scores))
            mlflow.log_metric("r2_score", r2)
            mlflow.log_metric("rmse", rmse)
            mlflow.log_metric("mae", mae)
        


In [48]:
# Load the YAML configuration once; both evaluations share this manager.
config = ConfigurationManager()

# Classification: build the stage config, wrap it in the evaluator and run it.
# Despite its "_config" suffix, the name ends up bound to the evaluator object.
class_model_evaluation_config = ClassModelEvaluation(
    config=config.get_class_model_evaluation_config()
)
class_model_evaluation_config.log_into_mlflow()

# Regression: same sequence with the regression config and evaluator.
reg_model_evaluation_config = RegModelEvaluation(
    config=config.get_reg_model_evaluation_config()
)
reg_model_evaluation_config.log_into_mlflow()

[2024-05-22 23:58:02,320: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-05-22 23:58:02,325: INFO: common: yaml file: params.yaml loaded successfully]


[2024-05-22 23:58:02,330: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-05-22 23:58:02,332: INFO: common: created directory at: artifacts]
[2024-05-22 23:58:02,334: INFO: common: created directory at: artifacts/model_evaluation]
(60392, 29)
(60392,)
Fold 1 started of 10
roc_auc_score 0.8161758520349216
Fold 2 started of 10
roc_auc_score 0.8270359657576805
Fold 3 started of 10
roc_auc_score 0.8319984446976783
Fold 4 started of 10
roc_auc_score 0.8331943209291902
Fold 5 started of 10
roc_auc_score 0.8089516201197836
Fold 6 started of 10
roc_auc_score 0.8237128368666835
Fold 7 started of 10
roc_auc_score 0.8115629283713593
Fold 8 started of 10
roc_auc_score 0.8232392774021067
Fold 9 started of 10
roc_auc_score 0.8287553938660587
Fold 10 started of 10
roc_auc_score 0.8128729379771725
optimal threshold is 0.15642630430428492
{'0.0': {'precision': 0.9361289997987523, 'recall': 0.7389102895039911, 'f1-score': 0.8259094036442728, 'support': 50362.0}, '1.0': {'precision': 0.36