This code uses HistGradientBoostingClassifier and RandomizedSearchCV.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score
from scipy.stats import uniform, randint

# Seed NumPy's global random number generator for reproducibility
np.random.seed(42)

# Read the training and test sets from CSV files in the working directory
train = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Report the per-column count of missing values in each dataset
# (computed before duplicates are dropped, so the counts describe the raw files)
train_null_counts = train.isna().sum()
test_null_counts = test_data.isna().sum()
print("Train data null values:\n", train_null_counts)
print("Test data null values:\n", test_null_counts)

# Discard exact duplicate rows from the training data, if there are any
train = train.drop_duplicates()

# Keep the test LoanID column aside; the submission file needs it
test_loan_ids = test_data['LoanID']

# Define the OutlierRemoval class
class OutlierRemoval:
    """Clamp values to the 1.5 * IQR whiskers of a reference column.

    Despite the name, nothing is removed: a value outside the whiskers is
    replaced by the nearest whisker, and a value inside them is returned
    unchanged.

    The whiskers are computed once, in the constructor:
    ``lower_whisker = Q1 - 1.5 * IQR`` and ``upper_whisker = Q3 + 1.5 * IQR``.
    """

    def __init__(self, col):
        """Compute the whiskers from ``col``.

        Args:
            col: Object exposing ``quantile(q)`` (e.g. a pandas Series) from
                which the 25th and 75th percentiles are taken.
        """
        q1 = col.quantile(0.25)
        q3 = col.quantile(0.75)
        inter_quartile_range = q3 - q1
        self.upper_whisker = q3 + inter_quartile_range * 1.5
        self.lower_whisker = q1 - inter_quartile_range * 1.5

    def remove(self, row):
        """Return ``row`` clamped to ``[lower_whisker, upper_whisker]``.

        Args:
            row: A single scalar value (the method is applied element-wise).

        Returns:
            ``lower_whisker`` if ``row`` is below it, ``upper_whisker`` if
            ``row`` is above it, otherwise ``row`` itself. A NaN input is
            returned unchanged.
        """
        # NaN is the only value that is not equal to itself. Every ordering
        # comparison with NaN is False, so without this guard a missing value
        # would fall through the range checks and be replaced by a whisker,
        # hiding it from any later imputation step.
        if row != row:
            return row
        if row < self.lower_whisker:
            return self.lower_whisker
        if row > self.upper_whisker:
            return self.upper_whisker
        return row

# Numerical feature columns that receive outlier clipping (and, later, scaling)
numerical_columns = ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed',
                     'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio']

# Clip outliers in numerical columns of train and test data.
# The whiskers are derived from the training column only and the SAME bounds
# are applied to the test column, so both datasets go through an identical
# transformation (previously the test set was clipped to its own quantiles).
# Series.clip is vectorized and leaves NaN entries as NaN, so missing values
# remain visible to the KNN imputation performed later in this script.
for column in numerical_columns:
    remover = OutlierRemoval(train[column])
    lower, upper = remover.lower_whisker, remover.upper_whisker
    train[column] = train[column].clip(lower=lower, upper=upper)
    test_data[column] = test_data[column].clip(lower=lower, upper=upper)

# Integer codes for the categorical columns (column name -> {category: code})
mappings = {
    'MaritalStatus': {'Married': 0, 'Single': 1, 'Divorced': 2},
    'Education': {'High School': 0, "Bachelor's": 1, 'Master\'s': 2},
    'EmploymentType': {'Full-time': 0, 'Part-time': 1, 'Self-employed': 2, 'Unemployed': 3},
    'HasMortgage': {'Yes': 1, 'No': 0},
    'HasDependents': {'Yes': 1, 'No': 0},
    'LoanPurpose': {'Home': 0, 'Auto': 1, 'Education': 2, 'Business': 3, 'Other': 4},
    'HasCoSigner': {'Yes': 1, 'No': 0}
}

# Apply mappings to train and test data.
# Series.map converts every category that is absent from its mapping into NaN
# without any error, so such categories are reported here instead of being
# lost silently. The mapping itself is unchanged.
for col, mapping in mappings.items():
    for frame_name, frame in (('train', train), ('test', test_data)):
        values = frame[col]
        unmapped = sorted(set(values.dropna().unique()) - set(mapping), key=str)
        if unmapped:
            print(f"Warning: {frame_name} column '{col}' has values without a mapping "
                  f"(they become NaN): {unmapped}")
        frame[col] = values.map(mapping)

# Feature engineering: new feature = MaritalStatus code + HasDependents code.
# NOTE(review): a plain sum makes distinct combinations collide (Married with
# dependents -> 0 + 1 and Single without dependents -> 1 + 0 both give 1);
# confirm that this is intended.
train['Marital_Dependents'] = train['MaritalStatus'] + train['HasDependents']
test_data['Marital_Dependents'] = test_data['MaritalStatus'] + test_data['HasDependents']

# Drop the LoanID column from BOTH datasets: it is an identifier, not a feature
train = train.drop(columns=['LoanID'])
test_data = test_data.drop(columns=['LoanID'])

# Separate the features from the target variable
X = train.drop(columns=['Default'])
y = train['Default']

# Split data into training and validation sets (80% / 20%, fixed seed).
# The feature splits are copied explicitly so that the column assignments
# below write to independent frames rather than to objects derived from X,
# which avoids pandas' SettingWithCopyWarning.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X_train.copy()
X_val = X_val.copy()

# Put the test columns in the training feature order so the fitted
# transformers and the model see the same column layout. A KeyError here
# means the test set is missing a feature that the training set has.
test_data = test_data[X_train.columns].copy()

# Standardize numerical columns; the statistics are fit on the training split
# only and then reused for the validation and test data
scaler = StandardScaler()
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_val[numerical_columns] = scaler.transform(X_val[numerical_columns])
test_data[numerical_columns] = scaler.transform(test_data[numerical_columns])

# Apply KNN imputation (fit on the training split only). The transformers
# return plain arrays, so each frame is rebuilt with its original columns AND
# index, keeping the feature rows label-aligned with y_train / y_val.
knn_imputer = KNNImputer(n_neighbors=5)
X_train = pd.DataFrame(knn_imputer.fit_transform(X_train),
                       columns=X_train.columns, index=X_train.index)
X_val = pd.DataFrame(knn_imputer.transform(X_val),
                     columns=X_val.columns, index=X_val.index)
test_data = pd.DataFrame(knn_imputer.transform(test_data),
                         columns=test_data.columns, index=test_data.index)

# Search space sampled by RandomizedSearchCV.
# scipy's uniform(loc, scale) covers [loc, loc + scale]; randint(low, high)
# draws integers from low up to, but not including, high.
param_dist_hgb = dict(
    learning_rate=uniform(0.01, 0.2),
    max_depth=randint(3, 10),
    max_iter=randint(100, 300),
    min_samples_leaf=randint(20, 100),
    l2_regularization=uniform(0.0, 5.0),
)

# Randomized hyperparameter search over a HistGradientBoostingClassifier:
# 30 sampled candidates, 3-fold cross-validation, accuracy as the score
hgb = HistGradientBoostingClassifier(random_state=42)
random_search_hgb = RandomizedSearchCV(
    estimator=hgb,
    param_distributions=param_dist_hgb,
    n_iter=30,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
random_search_hgb.fit(X_train, y_train)

# Report the winning hyperparameter combination
print("Best hyperparameters for HistGradientBoostingClassifier: ", random_search_hgb.best_params_)

# The best estimator found by the search is used for all predictions below
best_hgb = random_search_hgb.best_estimator_

# Score the model on the held-out validation split
y_pred_hgb = best_hgb.predict(X_val)
accuracy_hgb = accuracy_score(y_val, y_pred_hgb)
print(f"Validation accuracy for HistGradientBoostingClassifier: {accuracy_hgb:.4f}")

# Predict the target for the test set
test_predictions_hgb = best_hgb.predict(test_data)

# Write the submission file: one row per test LoanID with its predicted Default
sample_submission_hgb = pd.DataFrame({'LoanID': test_loan_ids, 'Default': test_predictions_hgb})
sample_submission_hgb.to_csv('sample_submission_hgb.csv', index=False)

print("Submission file 'sample_submission_hgb.csv' created successfully.")
