In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score,classification_report, roc_auc_score, confusion_matrix
from sklearn.preprocessing import PolynomialFeatures
from imblearn.over_sampling import SMOTE

# Load Data


In [3]:
# Load train and test data
train_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/GreatLearning/Hackahon/Hackathon_25May2024/Train_set_(1)_(1).csv')
test_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/GreatLearning/Hackahon/Hackathon_25May2024/Test_set_(1)_(2).csv')


In [4]:
# Inspect the data
print("Inspect the data in train data:\n", train_data.head())
print("\nInspect the data in test data:\n", test_data.head())


Inspect the data in train data:
          ID  loan_amnt loan_term  interest_rate loan_grade loan_subgrade  \
0  72199369       9000   3 years           9.17          B            B2   
1  14257956      18000   3 years          13.65          C            C1   
2  66216451      16000   3 years           7.26          A            A4   
3  46974169      25000   3 years          13.99          C            C4   
4  46725961      17000   3 years           6.39          A            A2   

  job_experience home_ownership  annual_income income_verification_status  \
0       <5 Years            OWN        85000.0               Not Verified   
1       <5 Years            OWN        64000.0                   Verified   
2       <5 Years       MORTGAGE       150000.0            Source Verified   
3            NaN       MORTGAGE        59800.0                   Verified   
4      10+ years       MORTGAGE        72000.0            Source Verified   

   ... delinq_2yrs public_records  revolving_ba

# Exploratory Data Analysis (EDA)

In [5]:
# Check for missing values in train and test data
missing_values_train = train_data.isnull().sum()
missing_values_test = test_data.isnull().sum()

In [6]:
print("Missing values in train data:\n", missing_values_train)
print("\nMissing values in test data:\n", missing_values_test)

Missing values in train data:
 ID                               0
loan_amnt                        0
loan_term                        0
interest_rate                    0
loan_grade                       0
loan_subgrade                    0
job_experience                4702
home_ownership                   0
annual_income                    1
income_verification_status       0
loan_purpose                     0
state_code                       0
debt_to_income                   0
delinq_2yrs                      2
public_records                   2
revolving_balance                0
total_acc                        2
interest_receive                 0
application_type                 0
last_week_pay                 1924
total_current_balance         7386
total_revolving_limit         7386
default                          0
dtype: int64

Missing values in test data:
 ID                               0
loan_amnt                        0
loan_term                        0
interest_rate  

In [7]:
train_data.dtypes

ID                              int64
loan_amnt                       int64
loan_term                      object
interest_rate                 float64
loan_grade                     object
loan_subgrade                  object
job_experience                 object
home_ownership                 object
annual_income                 float64
income_verification_status     object
loan_purpose                   object
state_code                     object
debt_to_income                float64
delinq_2yrs                   float64
public_records                float64
revolving_balance               int64
total_acc                     float64
interest_receive              float64
application_type               object
last_week_pay                 float64
total_current_balance         float64
total_revolving_limit         float64
default                         int64
dtype: object

In [4]:
# Identify numerical and categorical columns
numerical_cols = train_data.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = train_data.select_dtypes(include=['object']).columns.tolist()


In [9]:
numerical_cols

['ID',
 'loan_amnt',
 'interest_rate',
 'annual_income',
 'debt_to_income',
 'delinq_2yrs',
 'public_records',
 'revolving_balance',
 'total_acc',
 'interest_receive',
 'last_week_pay',
 'total_current_balance',
 'total_revolving_limit',
 'default']

In [10]:
categorical_cols

['loan_term',
 'loan_grade',
 'loan_subgrade',
 'job_experience',
 'home_ownership',
 'income_verification_status',
 'loan_purpose',
 'state_code',
 'application_type']

In [5]:
# Remove target and ID columns from the list of features
numerical_cols.remove('default')
if 'ID' in numerical_cols:
    numerical_cols.remove('ID')
if 'ID' in categorical_cols:
    categorical_cols.remove('ID')

In [6]:
# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Impute missing values with median
    ('scaler', StandardScaler())
])


In [7]:
# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with most frequent value
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [8]:
# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [9]:
# Split the train data into train and validation sets
X = train_data.drop(['ID', 'default'], axis=1)
y = train_data['default']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


# Training the Model

In [16]:
# Define the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

In [17]:
# Train the model
model.fit(X_train, y_train)

In [18]:
# Validate the model
y_val_pred = model.predict(X_val)
print('Accuracy:', accuracy_score(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))
print('Confusion Matrix:\n', confusion_matrix(y_val, y_val_pred))

Accuracy: 0.820821035685538
              precision    recall  f1-score   support

           0       0.81      0.99      0.89     14083
           1       0.89      0.30      0.45      4552

    accuracy                           0.82     18635
   macro avg       0.85      0.65      0.67     18635
weighted avg       0.83      0.82      0.79     18635

Confusion Matrix:
 [[13915   168]
 [ 3171  1381]]


# Hyperparameter Tuning

In [19]:
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [10, 20, 30]
}

In [20]:
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [21]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
print('Best Params:', grid_search.best_params_)
print('Best Score:', grid_search.best_score_)

Best Params: {'classifier__max_depth': 30, 'classifier__n_estimators': 200}
Best Score: 0.8221602204844348


In [23]:

# Use the best model from grid search
best_model = grid_search.best_estimator_

In [24]:
# Predict on validation set
y_val_pred = best_model.predict(X_val)

In [25]:
# Validate the model
accuracy = accuracy_score(y_val, y_val_pred)
print('Accuracy:', accuracy)
print(classification_report(y_val, y_val_pred))
print('Confusion Matrix:\n', confusion_matrix(y_val, y_val_pred))

Accuracy: 0.8203380735175745
              precision    recall  f1-score   support

           0       0.81      0.99      0.89     14083
           1       0.89      0.30      0.45      4552

    accuracy                           0.82     18635
   macro avg       0.85      0.64      0.67     18635
weighted avg       0.83      0.82      0.78     18635

Confusion Matrix:
 [[13912   171]
 [ 3177  1375]]


In [30]:
# Remove the prefix 'classifier__' from best_params_
best_params = {param.replace('classifier__', ''): value for param, value in grid_search.best_params_.items()}


In [32]:
# Train the final model with the best parameters on the entire training data
final_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(**best_params, random_state=42))
])

In [33]:
final_model.fit(X, y)

In [34]:
# Prepare the test data
X_test = test_data.drop('ID', axis=1)

In [35]:
# Predict on test data
test_predictions = final_model.predict(X_test)


In [36]:
# Prepare submission file
submission = pd.DataFrame({
    'ID': test_data['ID'],
    'default': test_predictions
})

In [37]:
submission.to_csv('/content/drive/MyDrive/Colab Notebooks/GreatLearning/Hackahon/Hackathon_25May2024/Sample_Submission.csv', index=False)

In [38]:
print('Submission file created successfully.')

Submission file created successfully.


# New Section

In [10]:
# Preprocess data and generate polynomial features
preprocessed_X_train = preprocessor.fit_transform(X_train)
preprocessed_X_val = preprocessor.transform(X_val)

In [11]:
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly_train = poly.fit_transform(preprocessed_X_train)
X_poly_val = poly.transform(preprocessed_X_val)

In [12]:
# Apply SMOTE to balance the classes in the training set
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_poly_train, y_train)

In [13]:
# Define model candidates
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    'LightGBM': LGBMClassifier(random_state=42)
}

In [14]:
# Perform GridSearchCV for each model
best_models = {}
for model_name, model in models.items():
    pipeline = Pipeline(steps=[
        ('classifier', model)
    ])

    param_grid = {
        'RandomForest': {
            'classifier__n_estimators': [100, 200, 300],
            'classifier__max_depth': [10, 20, 30]
        },
        'GradientBoosting': {
            'classifier__n_estimators': [100, 200, 300],
            'classifier__learning_rate': [0.01, 0.1, 0.2],
            'classifier__max_depth': [3, 5, 7]
        },
        'XGBoost': {
            'classifier__n_estimators': [100, 200, 300],
            'classifier__learning_rate': [0.01, 0.1, 0.2],
            'classifier__max_depth': [3, 5, 7]
        },
        'LightGBM': {
            'classifier__n_estimators': [100, 200, 300],
            'classifier__learning_rate': [0.01, 0.1, 0.2],
            'classifier__max_depth': [10, 20, 30]
        }
    }

In [None]:
grid_search = GridSearchCV(pipeline, param_grid[model_name], cv=5, scoring='accuracy')
grid_search.fit(X_resampled, y_resampled)

[LightGBM] [Info] Number of positive: 45570, number of negative: 45569
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 9.032474 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 603159
[LightGBM] [Info] Number of data points in the train set: 91139, number of used features: 4206
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500005 -> initscore=0.000022
[LightGBM] [Info] Start training from score 0.000022
[LightGBM] [Info] Number of positive: 45570, number of negative: 45569
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 7.481227 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 585628
[LightGBM] [Info] Number of data points in the train set: 91139, number of used features: 4199
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500005 -> initscore=0.0000