### Загрузка датасета

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('train.csv')

### EDA

In [3]:
df.isna().mean()

id                            0.0
person_age                    0.0
person_income                 0.0
person_home_ownership         0.0
person_emp_length             0.0
loan_intent                   0.0
loan_grade                    0.0
loan_amnt                     0.0
loan_int_rate                 0.0
loan_percent_income           0.0
cb_person_default_on_file     0.0
cb_person_cred_hist_length    0.0
loan_status                   0.0
dtype: float64

In [4]:
df.describe()

Unnamed: 0,id,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0
mean,29322.0,27.550857,64046.17,4.701015,9217.556518,10.677874,0.159238,5.813556,0.142382
std,16929.497605,6.033216,37931.11,3.959784,5563.807384,3.034697,0.091692,4.029196,0.349445
min,0.0,20.0,4200.0,0.0,500.0,5.42,0.0,2.0,0.0
25%,14661.0,23.0,42000.0,2.0,5000.0,7.88,0.09,3.0,0.0
50%,29322.0,26.0,58000.0,4.0,8000.0,10.75,0.14,4.0,0.0
75%,43983.0,30.0,75600.0,7.0,12000.0,12.99,0.21,8.0,0.0
max,58644.0,123.0,1900000.0,123.0,35000.0,23.22,0.83,30.0,1.0


In [5]:
df.describe(include='object')

Unnamed: 0,person_home_ownership,loan_intent,loan_grade,cb_person_default_on_file
count,58645,58645,58645,58645
unique,4,6,7,2
top,RENT,EDUCATION,A,N
freq,30594,12271,20984,49943


In [6]:
df = df[df['person_age'] < 85]

### Train-test

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X = df.drop(['id', 'loan_status'], axis=1)
y = df['loan_status']

In [9]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, stratify=y, random_state=22, train_size=0.8)

In [10]:
len(Xtrain) == len(ytrain)

True

### Preprocessing

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures

In [12]:
nonum_pipe = Pipeline([('ohe', OneHotEncoder(drop='first'))])

In [13]:
num_pipe = Pipeline([('minmax', MinMaxScaler()), 
               ('feature_eng', PolynomialFeatures(include_bias=False))])

In [14]:
ct_preproc = ColumnTransformer([('num', num_pipe, X.select_dtypes(include='number').columns),
                               ('nonum', nonum_pipe, X.select_dtypes(exclude='number').columns)])

### Baseline

### catboost test hyperparametrs

In [15]:
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier
from sklearn.pipeline import FunctionTransformer

In [63]:
def to_df(X):
    df = pd.DataFrame(X, columns=ct_cat.get_feature_names_out())
    df = df.apply(pd.to_numeric, errors='ignore')
    global cat_features
    cat_features = list(df.select_dtypes(exclude='number').columns)
    return df
    
ct_cat = ColumnTransformer([('poly_cat', PolynomialFeatures(), X.select_dtypes(include='number').columns)], remainder='passthrough')

cat_pipe = Pipeline([('ct_category', ct_cat),
                     ('to_fr', FunctionTransformer(to_df)),
                    ('cat', CatBoostClassifier(verbose=0, cat_features=cat_features))])

In [65]:
params_cat = {'cat__iterations': [100, 300, 500, 1000],  
    'cat__learning_rate': [0.01, 0.03, 0.1, 0.3],  
    'cat__max_depth': [4, 6, 8],  
    'cat__l2_leaf_reg': [1, 3, 5, 7, 9],
              'ct_category__poly_cat__degree':[1,2,3]}

In [66]:
cat_mode = RandomizedSearchCV(cat_pipe, params_cat, cv=4, n_jobs=-1, scoring='roc_auc', n_iter=17)

In [68]:
cat_mode.fit(Xtrain, ytrain)

36 fits failed out of a total of 68.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\9b_gr\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\9b_gr\anaconda3\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\9b_gr\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\

In [70]:
cat_mode.best_score_

0.9481633903878303

In [71]:
cat_mode.best_params_

{'ct_category__poly_cat__degree': 2,
 'cat__max_depth': 6,
 'cat__learning_rate': 0.1,
 'cat__l2_leaf_reg': 7,
 'cat__iterations': 500}

In [77]:
cat_mode.best_estimator_.named_steps['cat'].get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,remainder__person_home_ownership,14.5703
1,poly_cat__loan_percent_income,12.328204
2,remainder__loan_grade,10.352306
3,remainder__loan_intent,9.080523
4,poly_cat__person_income,8.412421
5,poly_cat__person_income^2,7.337766
6,poly_cat__loan_int_rate^2,5.704876
7,poly_cat__loan_percent_income^2,5.363114
8,poly_cat__loan_int_rate,3.00893
9,poly_cat__loan_int_rate loan_percent_income,1.937079
