### Import Libraries

In [None]:
import numpy as np
import pandas as pd
from pandas import read_csv
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt # data visualization
import seaborn as sns #statistcal data visualization
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.inspection import PartialDependenceDisplay


# Data Preparation

In [None]:
# Read the raw train/test dataset from the working directory into a DataFrame
df = read_csv('train_test.csv')

In [None]:
# Initial data overview: first ten rows, (rows, columns) shape, and per-column dtypes / non-null counts
print(df.head(10))
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would only add a stray "None" line.
df.info()

In [None]:
# Number of missing (NaN / NaT) entries in every column
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted)
duplicates = df.duplicated(keep='first').sum()
print('Number of duplicate rows: {}'.format(duplicates))


### Convert dates to datetime

In [None]:
# Parse both date columns into datetime64 first, then echo them to confirm the conversion
for date_col in ('Purchase_Date', 'Cover_Start_Date'):
    df[date_col] = pd.to_datetime(df[date_col])

for date_col in ('Purchase_Date', 'Cover_Start_Date'):
    print(df[date_col])

## Exploratory Data Analysis (EDA)

In [None]:
# Descriptive statistics as returned by DataFrame.describe()
summary_stats = df.describe()
print(summary_stats)

In [None]:
# Correlation heatmap of the numeric features.
# numeric_only=True keeps the datetime and other non-numeric columns out of the
# computation; pandas >= 2.0 raises on them instead of silently dropping them.
corr_matrix = df.corr(numeric_only=True)
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Correlation of every numeric feature with the target Sale_Flag, sorted ascending.
# numeric_only=True is required on pandas >= 2.0, where non-numeric columns
# (e.g. the datetime columns) make DataFrame.corr() raise.
correlation_with_sale = df.corr(numeric_only=True)['Sale_Flag'].sort_values()
print(correlation_with_sale)

In [None]:
# Pairwise scatter plots for a hand-picked subset of numeric features
attributes = ['Claims_Amount', 'Claims_Count', 'Purchase_Price', 'Premium', 'Age']
selected_features = df[attributes]
pd.plotting.scatter_matrix(selected_features, figsize=(12, 8))
plt.show()

In [None]:
# One count plot per categorical variable, showing how many rows fall in each level
categorical_vars = ['Plan_Flag', 'PriceTest', 'Account', 'Category']
for var in categorical_vars:
    count_ax = sns.countplot(data=df, x=var)
    count_ax.set_title(f'Distribution of {var}')
    plt.show()

In [None]:
# Per-category breakdown: level counts, then the Sale_Flag split (in %) within each level
for var in categorical_vars:
    print(f"Counts for {var}: \n{df[var].value_counts()}\n")

    # Share of each Sale_Flag value within every level of `var`, as percentages.
    # This has to run inside the loop: computed after it, only the last
    # variable in categorical_vars would be reported.
    sale_distribution = df.groupby(var)['Sale_Flag'].value_counts(normalize=True).unstack() * 100
    print(f"Distribution of Sale_Flag within {var}:\n{sale_distribution}\n")

In [None]:
# Box plot of Premium, split by the Sale_Flag outcome
premium_ax = sns.boxplot(data=df, x='Sale_Flag', y='Premium')
premium_ax.set_title('Premium Distribution by Sale_Flag')
plt.show()

In [None]:
# Histogram grid produced by DataFrame.hist() over the whole frame
histogram_options = {'bins': 50, 'figsize': (20, 15)}
df.hist(**histogram_options)
plt.show()

In [None]:
# Histograms restricted to the key numerical features, with outlined bars
key_numeric_cols = ['Claims_Amount', 'Claims_Count', 'Purchase_Price', 'Age', 'Price_Diff']
df[key_numeric_cols].hist(bins=30, figsize=(15, 10), edgecolor='k')
plt.tight_layout()
plt.show()

In [None]:
# Purchase price against premium, coloured by the Sale_Flag outcome
scatter_ax = sns.scatterplot(data=df, x='Purchase_Price', y='Premium', hue='Sale_Flag')
scatter_ax.set_title('Purchase Price vs. Premium by Sale_Flag')
plt.show()


In [None]:
# Class balance of the target: relative frequencies first, then a count plot
target_share = df['Sale_Flag'].value_counts(normalize=True)
print(target_share)

target_ax = sns.countplot(data=df, x='Sale_Flag')
target_ax.set_title('Distribution of Sale_Flag')
plt.show()

# Price Elasticity Analysis 

In [None]:
# Split the rows by the PriceTest flag (0 -> flat_price_df, 1 -> random_price_df)
flat_price_df = df.loc[df['PriceTest'].eq(0)]
random_price_df = df.loc[df['PriceTest'].eq(1)]

In [None]:
# Per-segment conversion rate (mean of Sale_Flag) and average premium
conversion_flat, conversion_random = (
    segment['Sale_Flag'].mean() for segment in (flat_price_df, random_price_df)
)
avg_premium_flat, avg_premium_random = (
    segment['Premium'].mean() for segment in (flat_price_df, random_price_df)
)

In [None]:
# Price elasticity = relative change in conversion / relative change in average premium,
# both measured from the flat-price segment to the random-price segment
relative_conversion_change = (conversion_random - conversion_flat) / conversion_flat
relative_premium_change = (avg_premium_random - avg_premium_flat) / avg_premium_flat
price_elasticity = relative_conversion_change / relative_premium_change
print("Price Elasticity:", price_elasticity)

# Feature Engineering

In [None]:
# New time-based feature: whole days between the purchase date and the cover start date
cover_lead_time = df['Cover_Start_Date'] - df['Purchase_Date']
df['Days_till_Cover_Start'] = cover_lead_time.dt.days

In [None]:
# Discretise Age into three quantile-based bins, encoded as ordinal codes
kbd = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')
age_frame = df[['Age']]
kbd.fit(age_frame)
df['Age_binned'] = kbd.transform(age_frame)

In [None]:
# Remove columns that are constant or not used as predictors (modifies df in place)
columns_to_drop = ['Period_of_Cover', 'PriceTest', 'Purchase_Date', 'Cover_Start_Date']
df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# One-hot encode the nominal columns; the first level of each is dropped as the reference level
nominal_columns = ['Account', 'Category']
df_encoded = pd.get_dummies(data=df, columns=nominal_columns, drop_first=True)

In [None]:
# Split features / target, then hold out 20% of the rows as the final test set;
# the other 80% (training + validation) is split again afterwards.
X = df_encoded.drop(columns='Sale_Flag')
y = df_encoded['Sale_Flag']
# Stratify on the target so the test set keeps the same Sale_Flag class ratio
# as the full data; with an imbalanced target an unstratified split can skew it.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Split the training + validation portion into training and validation sets.
# test_size=0.25 of the 80% portion equals 20% of the full data (0.25 x 0.8 = 0.2).
# Stratifying keeps the Sale_Flag class ratio the same in both resulting sets.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val
)

In [None]:
# Oversample the training split only (validation and test stay untouched) to address class imbalance
smote = SMOTE(random_state=42)
resampled_features, resampled_target = smote.fit_resample(X_train, y_train)
X_train_sm, y_train_sm = resampled_features, resampled_target

In [None]:
# Show the training-target class counts before and after resampling
for stage, target in (("before", y_train), ("after", y_train_sm)):
    print(f"Class distribution {stage} SMOTE:")
    print(target.value_counts())


# Demand Modeling

In [None]:
# Standardise features: learn mean / std on the resampled training data only,
# then apply the same transformation to the training, validation and test sets
scaler = StandardScaler()
scaler.fit(X_train_sm)
X_train_sm_scaled = scaler.transform(X_train_sm)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

### Initialize models 

In [None]:
# Base estimators to be tuned by the grid search.
# The estimators that use randomness are seeded with the same value (42) as the
# data splits and SMOTE, so repeated runs produce the same tuned models.
# KNeighborsClassifier takes no random_state parameter.
logreg_model = LogisticRegression(random_state=42)
knn_model = KNeighborsClassifier()
gb_model = GradientBoostingClassifier(random_state=42)
rf_model = RandomForestClassifier(random_state=42)

### Hyperparameter Grids

In [None]:
# Candidate hyperparameter values for each model; every combination is tried by the grid search

# Logistic regression: regularisation strength C and optimisation solver
logreg_param_grid = dict(
    C=[0.01, 0.1, 1, 10],
    solver=['liblinear', 'lbfgs'],
)

# k-nearest neighbours: neighbourhood size, vote weighting and Minkowski power p
knn_param_grid = dict(
    n_neighbors=[3, 5, 7],
    weights=['uniform', 'distance'],
    p=[1, 2],
)

# Gradient boosting: number of stages, learning rate and tree depth
gb_param_grid = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.1, 0.01, 0.001],
    max_depth=[3, 5, 7],
)

# Random forest: number of trees, depth limit and minimum samples for splits / leaves
rf_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [None]:
# Grid-search inputs: estimators, their grids and display names, in matching order
models = [logreg_model, knn_model, gb_model, rf_model]
param_grids = [logreg_param_grid, knn_param_grid, gb_param_grid, rf_param_grid]
model_names = ['Logistic Regression', 'KNN', 'Gradient Boosting', 'Random Forest']

# Maps each display name to its best estimator found by the search
best_models = {}

# Pair every name with its (estimator, grid) tuple and tune each with 5-fold CV on ROC AUC
search_plan = dict(zip(model_names, zip(models, param_grids)))
for name, (model, param_grid) in search_plan.items():
    grid_search = GridSearchCV(model, param_grid, scoring='roc_auc', cv=5, n_jobs=-1)
    grid_search.fit(X_train_sm_scaled, y_train_sm)
    best_models[name] = grid_search.best_estimator_
    print(f"Best parameters for {name}: {grid_search.best_params_}")

## Evaluate each best model on the validation set

In [None]:
# Validation-set evaluation of every tuned model.
# One entry is appended to each list per model, in best_models iteration order.
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Label passed as pos_label to precision / recall / F1 (class 1 of the target)
pos_label_encoded = 1

# seaborn draws heatmap cell i over the interval [i, i + 1], so the centre of
# each cell - where its tick label belongs - is at i + 0.5.
class_tick_positions = [0.5, 1.5]
class_tick_labels = ['Class 0', 'Class 1']

for model_name, best_model in best_models.items():
    # Predict on the (scaled) validation set
    y_pred_val = best_model.predict(X_val_scaled)

    # Compute the metrics, rounded to 3 decimals, and store them
    accuracy_scores.append(np.round(accuracy_score(y_val, y_pred_val), 3))
    precision_scores.append(np.round(precision_score(y_val, y_pred_val, pos_label=pos_label_encoded), 3))
    recall_scores.append(np.round(recall_score(y_val, y_pred_val, pos_label=pos_label_encoded), 3))
    f1_scores.append(np.round(f1_score(y_val, y_pred_val, pos_label=pos_label_encoded), 3))

    # Per-class precision / recall / F1 report
    print(f"Classification Report - {model_name} Model:")
    print(classification_report(y_val, y_pred_val))

    # Confusion matrix heatmap (rows: true labels, columns: predicted labels)
    cm = confusion_matrix(y_val, y_pred_val)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - {model_name} Model')
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.xticks(class_tick_positions, class_tick_labels)
    plt.yticks(class_tick_positions, class_tick_labels, rotation=0)
    plt.show()


In [None]:
# Grouped bar chart of the validation metrics.
# The values come from the lists filled by the validation loop (accuracy_scores,
# precision_scores, recall_scores, f1_scores) rather than hand-copied numbers,
# so the chart always reflects the current run.

# The metric lists were appended in best_models iteration order; taking the
# names from the same dictionary keeps bars and labels aligned.
model_names = list(best_models)

metric_series = [
    ('Accuracy', accuracy_scores),
    ('Precision', precision_scores),
    ('Recall', recall_scores),
    ('F1-Score', f1_scores),
]
for metric_label, metric_values in metric_series:
    if len(metric_values) != len(model_names):
        raise ValueError(
            f"{metric_label} has {len(metric_values)} values for {len(model_names)} models; "
            "re-run the validation evaluation cell first"
        )

# Bar positions: four bars of width 0.2 centred on each model's tick
x = np.arange(len(model_names))
width = 0.2
bar_offsets = [-1.5 * width, -0.5 * width, 0.5 * width, 1.5 * width]

plt.figure(figsize=(10, 6))
for offset, (metric_label, metric_values) in zip(bar_offsets, metric_series):
    plt.bar(x + offset, metric_values, width, label=metric_label)

# Labels and title
plt.xlabel('Models')
plt.ylabel('Scores')
plt.title('Model Evaluation Metrics')
plt.xticks(x, model_names)
plt.legend()

# Display the plot
plt.tight_layout()
plt.show()


## Model Comparison using ROC AUC 

In [None]:
# Cross-validated ROC AUC comparison of the tuned models.
model_names = ['Logistic Regression', 'KNN', 'Gradient Boosting', 'Random Forest']

# Compare the tuned estimators from the grid search rather than the default
# ones: the model selected afterwards is taken from best_models, so the
# comparison must score the same configurations. cross_val_score clones each
# estimator, keeping its hyperparameters while refitting on every fold.
models = [best_models[name] for name in model_names]

# NOTE(review): the folds are cut from the SMOTE-resampled training data, so
# synthetic samples appear in both the fitting and scoring folds; these scores
# are likely optimistic compared with the untouched validation / test sets.

# Mean ROC AUC over 5 folds for each model, in model_names order
mean_roc_auc_scores = []
for model in models:
    roc_auc_scores = cross_val_score(model, X_train_sm_scaled, y_train_sm, cv=5, scoring='roc_auc', n_jobs=-1)
    mean_roc_auc_scores.append(np.mean(roc_auc_scores))

# Display the mean ROC AUC score for each model
for model_name, roc_auc in zip(model_names, mean_roc_auc_scores):
    print(f"Model: {model_name}")
    print(f"Mean ROC AUC Score: {roc_auc}")
    print("=============================")


In [None]:
# Bar chart of the mean cross-validated ROC AUC per model.
# mean_roc_auc_scores is the list computed by the cross-validation cell (one
# value per model, in this same order); it is deliberately not overwritten
# with hand-copied numbers so the chart matches the current run.
model_names = ['Logistic Regression', 'KNN', 'Gradient Boosting', 'Random Forest']

if len(mean_roc_auc_scores) != len(model_names):
    raise ValueError(
        f"Expected {len(model_names)} ROC AUC scores, got {len(mean_roc_auc_scores)}; "
        "re-run the cross-validation cell first"
    )

plt.figure(figsize=(10, 6))
plt.bar(model_names, mean_roc_auc_scores, color=['blue', 'orange', 'green', 'red'])
plt.xlabel('Models')
plt.ylabel('Mean ROC AUC Score')
plt.title('Model Comparison using Mean ROC AUC Scores')
plt.ylim([0, 1])  # ROC AUC is bounded by [0, 1]
plt.xticks(rotation=45)  # Rotate model names for better readability
plt.show()


## Best Model Evaluation on Test Set

In [None]:
# Final check of the tuned Random Forest on the held-out test set
rf_model_index = model_names.index('Random Forest')
best_rf_model = best_models['Random Forest']  # looked up by name

# Human-readable names for classes 0 and 1, reused by the report and the plot
outcome_labels = ['Not Accepted', 'Accepted']
# Heatmap cell centres, where the tick labels are placed
cell_centres = [0.5, 1.5]

# Predict on the scaled test features
y_pred_test_rf = best_rf_model.predict(X_test_scaled)

# Per-class precision / recall / F1 on the test set
print("Classification Report - Best Random Forest Model (Test Set):")
test_report = classification_report(y_test, y_pred_test_rf, target_names=outcome_labels)
print(test_report)

# Confusion matrix heatmap for the test set
cm_test_rf = confusion_matrix(y_test, y_pred_test_rf)
sns.heatmap(cm_test_rf, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix - Best Random Forest Model (Test Set)')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.xticks(cell_centres, outcome_labels)
plt.yticks(cell_centres, outcome_labels, rotation=0)
plt.show()


# Feature Importance and Partial Dependence Plots

In [None]:
# Feature importances and partial dependence for the tuned Random Forest
# ('best_rf_model', fitted on the standardised training features).

# Importances are in the column order of the training matrix, which matches X_train.columns
feature_importances = best_rf_model.feature_importances_
features_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
top_features_df = features_df.sort_values(by='Importance', ascending=False).head(5)

# Horizontal bar chart of the top 5 features
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=top_features_df, palette='viridis')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Top 5 Feature Importances from Random Forest Model')
plt.show()

# Names of the top 5 features
top_features_names = top_features_df['Feature'].tolist()

# The model was fitted on scaler-transformed data, so the partial dependence
# grid must be built from data on that same scale. Passing the raw X_train
# would evaluate the forest at feature values it never saw during fitting.
X_train_scaled_for_pdp = scaler.transform(X_train)
pdp_feature_names = X_train.columns.tolist()

# One partial dependence plot per top feature; the x-axis is in standardised units
for feature_name in top_features_names:
    fig, ax = plt.subplots(figsize=(8, 4))
    PartialDependenceDisplay.from_estimator(
        best_rf_model,
        X_train_scaled_for_pdp,
        features=[feature_name],
        feature_names=pdp_feature_names,
        ax=ax,
    )
    ax.set_title(f'Partial Dependence Plot for {feature_name} (standardised scale)')
    plt.tight_layout()
    plt.show()


# Impact of 10% Price Increase

In [None]:
# Simulate a 10% premium increase on the test set and compare the predicted
# acceptance probabilities before and after.

# Apply the increase to the raw Premium and only then standardise. Multiplying
# the already-scaled column by 1.10 would stretch z-scores around zero (values
# below the mean would even decrease), which is not a 10% price rise.
X_test_raised = X_test.copy()
X_test_raised['Premium'] = X_test_raised['Premium'] * 1.10
# NOTE(review): other features that may be derived from Premium (e.g. Price_Diff)
# are left unchanged here - confirm whether they should move with the price.
X_test_adjusted = scaler.transform(X_test_raised)

# Predicted probability of class 1 (acceptance) for the original and adjusted test sets
original_predictions = best_rf_model.predict_proba(X_test_scaled)[:, 1]
adjusted_predictions = best_rf_model.predict_proba(X_test_adjusted)[:, 1]

# Compare the average acceptance probability before and after the adjustment
print(f"Average probability of acceptance before adjustment: {np.mean(original_predictions)}")
print(f"Average probability of acceptance after adjustment: {np.mean(adjusted_predictions)}")

# Overlaid histograms of the two probability distributions
plt.figure(figsize=(12, 6))
plt.hist(original_predictions, bins=50, alpha=0.5, label='Original Probabilities')
plt.hist(adjusted_predictions, bins=50, alpha=0.5, label='Adjusted Probabilities', color='red')
plt.xlabel('Probability of Acceptance')
plt.ylabel('Frequency')
plt.title('Distribution of Acceptance Probabilities Before and After 10% Premium Increase')
plt.legend()
plt.show()
