# ASNA HACKATHON 2025

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import RFE
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import mutual_info_classif

### Load & Prepare Data

In [None]:
# Load the training data. The path is relative, so it is resolved against the
# directory the notebook kernel is running in.
df = pd.read_csv('./Data/train.csv')
# Show the column labels of the loaded frame.
df.columns

Index(['CustomerID', 'State', 'Customer Lifetime Value', 'Response',
       'Coverage', 'Coverage Index', 'Education', 'Education Index',
       'Effective To Date', 'Employment Status', 'Employment Status Index',
       'Gender', 'Income', 'Marital Status', 'Marital Status Index',
       'Months Since Last Claim', 'Months Since Policy Inception',
       'Number of Open Complaints', 'Number of Policies', 'Policy Type',
       'Policy Type Index', 'Policy', 'Policy Index', 'Renew Offer Type',
       'Sales Channel', 'Sales Channel Index', 'Vehicle Size',
       'Vehicle Size Index', 'Claim over 1k'],
      dtype='object')

In [None]:
# Summarise the frame: number of rows, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7290 entries, 0 to 7289
Data columns (total 29 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   CustomerID                     7290 non-null   object 
 1   State                          7290 non-null   object 
 2   Customer Lifetime Value        7290 non-null   float64
 3   Response                       7290 non-null   object 
 4   Coverage                       7290 non-null   object 
 5   Coverage Index                 7290 non-null   int64  
 6   Education                      7290 non-null   object 
 7   Education Index                7290 non-null   int64  
 8   Effective To Date              7290 non-null   object 
 9   Employment Status              7290 non-null   object 
 10  Employment Status Index        7290 non-null   int64  
 11  Gender                         7290 non-null   object 
 12  Income                         7290 non-null   i

### Categorical Variables

In [None]:
# Collect the remaining text (object-dtype) columns for inspection.
# The identifier, the date, and the text columns that already have a numeric
# "... Index" counterpart in the data are left out first.
columns_drop = [
    "CustomerID", "Effective To Date", "Marital Status", "Coverage",
    "Education", "Employment Status", "Policy", "Policy Type",
    "Sales Channel", "Vehicle Size",
]
objects = df.drop(columns=columns_drop).select_dtypes(include='object')

# One summary record per categorical column: its cardinality and its distinct
# values joined into a single comma-separated string.
rows = [
    {
        'Column': name,
        'Number of Unique Values': values.nunique(),
        'Unique Values': ", ".join(str(value) for value in values.unique()),
    }
    for name, values in objects.items()
]

unique_df = pd.DataFrame(rows)
unique_df

Unnamed: 0,Column,Number of Unique Values,Unique Values
0,State,5,"California, Washington, Oregon, Arizona, Nevada"
1,Response,2,"No, Yes"
2,Gender,2,"F, M"


In [None]:
# Label-encode the categorical variables: every distinct category in a column
# is replaced by an integer code. (This is label encoding, not one-hot
# encoding -- no indicator columns are created.)
#
# A separate encoder is fitted and kept for each column so that the
# category <-> code mapping stays available afterwards, e.g.
# `label_encoders["State"].classes_`, for decoding predictions or for encoding
# new data the same way. Re-fitting one shared encoder would keep only the
# mapping of the last column.
label_encoders = {}
for column in objects.columns:
    le = LabelEncoder()
    objects[column] = le.fit_transform(objects[column])
    label_encoders[column] = le

objects

Unnamed: 0,State,Response,Gender
0,1,0,0
1,4,0,1
2,3,0,1
3,4,0,0
4,0,0,1
...,...,...,...
7285,1,0,0
7286,1,0,0
7287,1,0,0
7288,1,0,0


### Feature Selection

In [None]:
# Assemble the modelling table: the float64/int64 columns of the raw frame
# placed side by side with the `objects` frame (aligned on the row index).
numerical = df.select_dtypes(include=["float64", "int64"])
df_new = pd.concat(objs=[numerical, objects], axis="columns")
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7290 entries, 0 to 7289
Data columns (total 19 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Customer Lifetime Value        7290 non-null   float64
 1   Coverage Index                 7290 non-null   int64  
 2   Education Index                7290 non-null   int64  
 3   Employment Status Index        7290 non-null   int64  
 4   Income                         7290 non-null   int64  
 5   Marital Status Index           7290 non-null   int64  
 6   Months Since Last Claim        7290 non-null   int64  
 7   Months Since Policy Inception  7290 non-null   int64  
 8   Number of Open Complaints      7290 non-null   int64  
 9   Number of Policies             7290 non-null   int64  
 10  Policy Type Index              7290 non-null   int64  
 11  Policy Index                   7290 non-null   int64  
 12  Renew Offer Type               7290 non-null   i

In [95]:
# Target variable: the 'Claim over 1k' column.
y = df_new.loc[:, 'Claim over 1k']
# Features: every other column of the modelling table.
X = df_new.drop('Claim over 1k', axis=1)

In [96]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
# NOTE(review): the split is not stratified on y -- consider `stratify=y` if
# the target classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Random Forest

In [99]:
# Fit a 100-tree random forest on the training split.
model = RandomForestClassifier(random_state=100, n_estimators=100)
model.fit(X_train, y_train)

# Positions of the features ordered from highest to lowest importance.
features = X_train.columns
importances = model.feature_importances_
indices = np.flip(np.argsort(importances))

# Report each feature with its importance, most important first.
print("Feature Importances:")
for i in indices:
    print(f"{features[i]}: {importances[i]:.4f}")

Feature Importances:
Customer Lifetime Value: 0.2036
Months Since Policy Inception: 0.0946
Number of Policies: 0.0931
Income: 0.0929
Months Since Last Claim: 0.0818
Coverage Index: 0.0624
Marital Status Index: 0.0594
Employment Status Index: 0.0435
Policy Index: 0.0388
State: 0.0380
Education Index: 0.0350
Sales Channel Index: 0.0342
Renew Offer Type: 0.0315
Vehicle Size Index: 0.0253
Number of Open Complaints: 0.0248
Gender: 0.0185
Policy Type Index: 0.0118
Response: 0.0110


#### Accuracy

In [100]:
# Hard class predictions of the random forest on the held-out split.
y_pred = model.predict(X_test)

# Accuracy and the per-class report use the hard labels; ROC AUC is computed
# from the second column of `predict_proba`.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(
    "ROC AUC:",
    roc_auc_score(y_true=y_test, y_score=model.predict_proba(X_test)[:, 1]),
)
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 0.9218106995884774
ROC AUC: 0.9148758519961051
              precision    recall  f1-score   support

           0       0.93      0.99      0.96      1300
           1       0.82      0.35      0.50       158

    accuracy                           0.92      1458
   macro avg       0.88      0.67      0.73      1458
weighted avg       0.92      0.92      0.91      1458



In [101]:
# 10-fold stratified cross-validation of the random forest over the complete
# X / y (not only the training split); the mean fold accuracy is displayed.
cv = StratifiedKFold(n_splits=10)

score = cross_val_score(estimator=model, X=X, y=y, scoring='accuracy', cv=cv)
avg_score = score.mean()
avg_score

0.9137174211248285

### Gradient Boosting

In [102]:
# Fit a gradient-boosting classifier on the training split.
gb_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=100,
)
gb_model.fit(X_train, y_train)

# Positions of the features ordered from highest to lowest importance.
gb_importances = gb_model.feature_importances_
gb_indices = np.flip(np.argsort(gb_importances))

# Report each feature with its importance, most important first.
# `features` is the column index defined in the Random Forest cell.
print("Feature Importances for Gradient Boosting:")
for i in gb_indices:
    print(f"{features[i]}: {gb_importances[i]:.4f}")

Feature Importances for Gradient Boosting:
Customer Lifetime Value: 0.2967
Number of Policies: 0.2687
Income: 0.1725
Marital Status Index: 0.1024
Coverage Index: 0.0785
Months Since Policy Inception: 0.0155
Months Since Last Claim: 0.0154
Education Index: 0.0126
Response: 0.0082
Number of Open Complaints: 0.0074
Gender: 0.0071
Employment Status Index: 0.0062
Vehicle Size Index: 0.0039
Policy Index: 0.0026
State: 0.0017
Renew Offer Type: 0.0004
Sales Channel Index: 0.0002
Policy Type Index: 0.0000


In [103]:
# Hard class predictions of the gradient-boosting model on the held-out split.
y_pred_gb = gb_model.predict(X_test)

# Overall accuracy followed by the per-class precision / recall / F1 report.
print(
    "Gradient Boosting Model Accuracy:",
    accuracy_score(y_true=y_test, y_pred=y_pred_gb),
)
print(
    "Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred_gb),
)

Gradient Boosting Model Accuracy: 0.9320987654320988
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.99      0.96      1300
           1       0.83      0.47      0.60       158

    accuracy                           0.93      1458
   macro avg       0.89      0.73      0.78      1458
weighted avg       0.93      0.93      0.92      1458



In [None]:
# Hyper-parameter grid for gradient boosting: 2 * 2 * 3 * 2 = 24 combinations.
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.1, 0.05],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 1.0],
}

# Exhaustive search over the grid, scored by 5-fold cross-validated accuracy
# on the training split only.
grid_search = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=100),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

Best parameters: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 200, 'subsample': 1.0}
Best cross-validation score: 0.93
