# Home Credit Gradient Boosted Trees Model

Explaination of the business problem

## Importing packages and data

In [34]:
#Import necessary libraries

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
from sklearn.model_selection import cross_val_score
import warnings

In [35]:
# Load the train and test data
train_data = pd.read_csv('application_train.csv')
test_data = pd.read_csv('application_test.csv')

## EDA Continued

### Missing Data

In [36]:
# Remove columns with more than 30% null values
train_data = train_data.dropna(thresh=len(train_data) * 0.7, axis=1)
test_data = test_data.dropna(thresh=len(test_data) * 0.7, axis=1)

# Select valid numeric columns
numeric_columns = train_data.select_dtypes(include=np.number).columns

# Impute mean for numeric columns with less than 30% null values in train_data
train_data = train_data.loc[:, numeric_columns].fillna(train_data.loc[:, numeric_columns].mean())

# Select valid numeric columns in test
test_numeric_columns = test_data.select_dtypes(include=np.number).columns

# Impute mean for numeric columns with less than 30% null values in test_data
test_data = test_data.loc[:, test_numeric_columns].fillna(test_data.loc[:, test_numeric_columns].mean())

In [37]:
# Check for null values in train_data
null_counts = train_data.isnull().sum()

print(null_counts)


SK_ID_CURR                    0
TARGET                        0
CNT_CHILDREN                  0
AMT_INCOME_TOTAL              0
AMT_CREDIT                    0
                             ..
AMT_REQ_CREDIT_BUREAU_DAY     0
AMT_REQ_CREDIT_BUREAU_WEEK    0
AMT_REQ_CREDIT_BUREAU_MON     0
AMT_REQ_CREDIT_BUREAU_QRT     0
AMT_REQ_CREDIT_BUREAU_YEAR    0
Length: 61, dtype: int64


### Encoding Categorical Variables

In [38]:
# Encode categorical variables in train data
categorical_cols = train_data.select_dtypes(include=['object']).columns
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

encoded_cols = pd.DataFrame(encoder.fit_transform(train_data[categorical_cols]))
train_data = pd.concat([train_data, encoded_cols], axis=1).drop(categorical_cols, axis=1)

# Encode categorical variables in test data
encoded_cols = pd.DataFrame(encoder.transform(test_data[categorical_cols]))
test_data = pd.concat([test_data, encoded_cols], axis=1).drop(categorical_cols, axis=1)


### Scale and Normalized Numeric Features

In [24]:
# Exclude 'SK_ID_CURR' and 'TARGET' from the numerical columns
#numerical_cols = train_data.select_dtypes(include=['float', 'int']).columns
#numerical_cols = numerical_cols.drop(['SK_ID_CURR', 'TARGET'])

# Scale/Normalize numerical features in train data
#scaler = StandardScaler()
#train_data[numerical_cols] = scaler.fit_transform(train_data[numerical_cols].values)

# Exclude 'SK_ID_CURR' from the numerical columns
#test_numerical_cols = test_data.select_dtypes(include=['float', 'int']).columns
#test_numerical_cols = test_numerical_cols.drop('SK_ID_CURR')

# Scale/Normalize numerical features in test data
#scaler = StandardScaler()
#test_data[test_numerical_cols] = scaler.fit_transform(test_data[test_numerical_cols].values)

## Modeling Process

### Oversampling on the Training Set 

In [39]:
# Split the data into training and validation sets
y = train_data['TARGET']
X = train_data.drop('TARGET', axis=1)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform oversampling on the training set
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

### Hyperparameter Tuning

In [40]:
# Create an XGBoost classifier
xgb_model = xgb.XGBClassifier()

# Define the parameter distributions for randomized search
param_dist = {
    'n_estimators': randint(100, 500),       
    'max_depth': randint(3, 10),             
    'learning_rate': uniform(0.01, 0.1)    
}

# Perform randomized search for hyperparameter tuning with increased iterations and cv folds
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist, n_iter=5, cv=2, scoring='accuracy')
random_search.fit(X_train_resampled, y_train_resampled)

### Training the Model and Cross Validation

In [41]:
# Get the best estimator from randomized search
best_xgb_model = random_search.best_estimator_

In [42]:
# Evaluate the model using cross-validation
cv_scores = cross_val_score(best_xgb_model, X_train_resampled, y_train_resampled, cv=5, scoring='accuracy')

### Accuracy of the Model 

In [43]:
# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)
print("Mean Accuracy:", np.mean(cv_scores))

Cross-validation scores: [0.87453152 0.87457575 0.87896477 0.87794766 0.87981471]
Mean Accuracy: 0.8771668818385423


### Make Prediction on the Test Set

In [44]:
# Make predictions on the test set
test_predictions = best_xgb_model.predict(test_data)

In [45]:
# Create a submission DataFrame
submission = pd.DataFrame({
    "SK_ID_CURR": test_data["SK_ID_CURR"], 
    "TARGET": test_predictions
})

# Save the submission DataFrame to a CSV file
submission.to_csv("submission.csv", index=False)