In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Columns setup
columns_to_drop = ['cb_person_default_on_file']
onehot_cols = ['person_home_ownership', 'loan_intent']   # Specify your one-hot encoded columns here
ordinal_cols = ['loan_grade'] # Specify your ordinal encoded columns here

# Custom transformer to drop columns
class DropColumns(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop
    
    def fit(self, X, y=None):
        return self  # No fitting required for dropping columns
    
    def transform(self, X):
        return X.drop(columns=self.columns_to_drop, errors='ignore')

# Preprocessing pipelines
Dropper = Pipeline(steps=[
   ('drop_columns', DropColumns(columns_to_drop=columns_to_drop))
])

OneHot_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

Ordinal_transformer = Pipeline(steps=[
    ('label', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Bundle preprocessing for transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('dropper', Dropper, columns_to_drop),
        ('onehot', OneHot_transformer, onehot_cols),
        ('label', Ordinal_transformer, ordinal_cols)
    ],
    remainder='passthrough'
)

pipeline = Pipeline(steps =[
    ('preprocessing', preprocessor)
])


In [3]:
train_df = pd.read_csv(r'C:\Users\Karahan C\Desktop\Portfolio Projects\Kaggle\loan-approval-competion\playground-series-s4e10\train.csv', index_col='id')
test_df = pd.read_csv(r'C:\Users\Karahan C\Desktop\Portfolio Projects\Kaggle\loan-approval-competion\playground-series-s4e10\test.csv', index_col='id')

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 58645 entries, 0 to 58644
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  58645 non-null  int64  
 1   person_income               58645 non-null  int64  
 2   person_home_ownership       58645 non-null  object 
 3   person_emp_length           58645 non-null  float64
 4   loan_intent                 58645 non-null  object 
 5   loan_grade                  58645 non-null  object 
 6   loan_amnt                   58645 non-null  int64  
 7   loan_int_rate               58645 non-null  float64
 8   loan_percent_income         58645 non-null  float64
 9   cb_person_default_on_file   58645 non-null  object 
 10  cb_person_cred_hist_length  58645 non-null  int64  
 11  loan_status                 58645 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 5.8+ MB


In [5]:
train_df.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0
mean,27.550857,64046.17,4.701015,9217.556518,10.677874,0.159238,5.813556,0.142382
std,6.033216,37931.11,3.959784,5563.807384,3.034697,0.091692,4.029196,0.349445
min,20.0,4200.0,0.0,500.0,5.42,0.0,2.0,0.0
25%,23.0,42000.0,2.0,5000.0,7.88,0.09,3.0,0.0
50%,26.0,58000.0,4.0,8000.0,10.75,0.14,4.0,0.0
75%,30.0,75600.0,7.0,12000.0,12.99,0.21,8.0,0.0
max,123.0,1900000.0,123.0,35000.0,23.22,0.83,30.0,1.0


In [6]:
print(train_df.isna().sum(),
      train_df.isnull().sum())

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
loan_status                   0
dtype: int64 person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
loan_status                   0
dtype: int64


In [7]:
train_df.head()

Unnamed: 0_level_0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


In [8]:
processed_train = pipeline.fit_transform(train_df)
processed_test = pipeline.fit_transform(test_df)

In [9]:
# Retrieve feature names for one-hot and ordinal transformations
onehot_feature_names = pipeline.named_steps['preprocessing'].transformers_[1][1].named_steps['onehot'].get_feature_names_out(onehot_cols)
ordinal_feature_names = ordinal_cols  # Ordinal columns will keep their names
# Remaining numeric feature names (those that were passed through)
numeric_feature_names = train_df.select_dtypes(include='number').columns.tolist()

# Combine all feature names
all_feature_names = list(onehot_feature_names) + ordinal_feature_names + numeric_feature_names

processed_train = pd.DataFrame(processed_train, columns=all_feature_names)

# Display the first few rows to verify the headers
print(processed_train.head())


   person_home_ownership_MORTGAGE  person_home_ownership_OTHER  \
0                             0.0                          0.0   
1                             0.0                          0.0   
2                             0.0                          0.0   
3                             0.0                          0.0   
4                             0.0                          0.0   

   person_home_ownership_OWN  person_home_ownership_RENT  \
0                        0.0                         1.0   
1                        1.0                         0.0   
2                        1.0                         0.0   
3                        0.0                         1.0   
4                        0.0                         1.0   

   loan_intent_DEBTCONSOLIDATION  loan_intent_EDUCATION  \
0                            0.0                    1.0   
1                            0.0                    0.0   
2                            0.0                    0.0   
3     

In [10]:
# Retrieve feature names for one-hot and ordinal transformations
onehot_feature_names = pipeline.named_steps['preprocessing'].transformers_[1][1].named_steps['onehot'].get_feature_names_out(onehot_cols)
ordinal_feature_names = ordinal_cols  # Ordinal columns will keep their names
# Remaining numeric feature names (those that were passed through)
numeric_feature_names = test_df.select_dtypes(include='number').columns.tolist()

# Combine all feature names
all_feature_names = list(onehot_feature_names) + ordinal_feature_names + numeric_feature_names

processed_test = pd.DataFrame(processed_test, columns=all_feature_names)

# Display the first few rows to verify the headers
print(processed_test.head())
processed_test

   person_home_ownership_MORTGAGE  person_home_ownership_OTHER  \
0                             0.0                          0.0   
1                             1.0                          0.0   
2                             0.0                          0.0   
3                             0.0                          0.0   
4                             1.0                          0.0   

   person_home_ownership_OWN  person_home_ownership_RENT  \
0                        0.0                         1.0   
1                        0.0                         0.0   
2                        0.0                         1.0   
3                        0.0                         1.0   
4                        0.0                         0.0   

   loan_intent_DEBTCONSOLIDATION  loan_intent_EDUCATION  \
0                            0.0                    0.0   
1                            0.0                    0.0   
2                            0.0                    0.0   
3     

Unnamed: 0,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_DEBTCONSOLIDATION,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length
0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0,23.0,69000.0,3.0,25000.0,15.76,0.36,2.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,26.0,96000.0,6.0,10000.0,12.68,0.10,4.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,26.0,30000.0,5.0,4000.0,17.19,0.13,2.0
3,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,33.0,50000.0,4.0,7000.0,8.90,0.14,7.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,26.0,102000.0,8.0,15000.0,16.32,0.15,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39093,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,22.0,31200.0,2.0,3000.0,10.37,0.10,4.0
39094,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,22.0,48000.0,6.0,7000.0,6.03,0.15,3.0
39095,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,51.0,60000.0,0.0,15000.0,7.51,0.25,25.0
39096,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,22.0,36000.0,4.0,14000.0,15.62,0.39,4.0


In [11]:
y_train = processed_train.loan_status
x_train = processed_train.drop(columns=['loan_status'])


x_test = processed_test


In [14]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Initialize model
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)

# Define parameter grid
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.3],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0]
}

# Use GridSearchCV to search for the best hyperparameters
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3, verbose=1)
grid_search.fit(x_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_


# Fit model with best parameters
best_xgb_model = grid_search.best_estimator_


Fitting 3 folds for each of 2187 candidates, totalling 6561 fits


In [18]:
print("Best parameters found: ", best_params)

Best parameters found:  {'colsample_bytree': 0.7, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 7, 'min_child_weight': 3, 'n_estimators': 300, 'subsample': 0.8}


In [12]:
import xgboost as xgb

model = xgb.XGBRegressor(n_estimator=99, learning_rate=0.99) 
model.fit(x_train, y_train)
                

Parameters: { "n_estimator" } are not used.



In [15]:
y_preds = best_xgb_model.predict(x_test)
y_preds

array([1.0458772 , 0.03164774, 0.5534691 , ..., 0.01359512, 0.2297924 ,
       0.98628515], dtype=float32)

In [17]:
# Save test predictions to file
output = pd.DataFrame({'id': test_df.index,
                       'loan_status': y_preds})
output.to_csv('submission.csv', index=False)