### Загрузка датасета

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('train.csv')

### EDA

In [3]:
df.isna().mean()

id                            0.0
person_age                    0.0
person_income                 0.0
person_home_ownership         0.0
person_emp_length             0.0
loan_intent                   0.0
loan_grade                    0.0
loan_amnt                     0.0
loan_int_rate                 0.0
loan_percent_income           0.0
cb_person_default_on_file     0.0
cb_person_cred_hist_length    0.0
loan_status                   0.0
dtype: float64

In [4]:
df.describe()

Unnamed: 0,id,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0
mean,29322.0,27.550857,64046.17,4.701015,9217.556518,10.677874,0.159238,5.813556,0.142382
std,16929.497605,6.033216,37931.11,3.959784,5563.807384,3.034697,0.091692,4.029196,0.349445
min,0.0,20.0,4200.0,0.0,500.0,5.42,0.0,2.0,0.0
25%,14661.0,23.0,42000.0,2.0,5000.0,7.88,0.09,3.0,0.0
50%,29322.0,26.0,58000.0,4.0,8000.0,10.75,0.14,4.0,0.0
75%,43983.0,30.0,75600.0,7.0,12000.0,12.99,0.21,8.0,0.0
max,58644.0,123.0,1900000.0,123.0,35000.0,23.22,0.83,30.0,1.0


In [5]:
df.describe(include='object')

Unnamed: 0,person_home_ownership,loan_intent,loan_grade,cb_person_default_on_file
count,58645,58645,58645,58645
unique,4,6,7,2
top,RENT,EDUCATION,A,N
freq,30594,12271,20984,49943


In [6]:
df = df[df['person_age'] < 85]

### Train-test

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X = df.drop(['id', 'loan_status'], axis=1)
y = df['loan_status']

In [9]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, stratify=y, random_state=22, train_size=0.8)

In [10]:
len(Xtrain) == len(ytrain)

True

### Preprocessing

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures

In [12]:
nonum_pipe = Pipeline([('ohe', OneHotEncoder(drop='first'))])

In [13]:
num_pipe = Pipeline([('minmax', MinMaxScaler()), 
               ('feature_eng', PolynomialFeatures(include_bias=False))])

In [14]:
ct_preproc = ColumnTransformer([('num', num_pipe, X.select_dtypes(include='number').columns),
                               ('nonum', nonum_pipe, X.select_dtypes(exclude='number').columns)])

### Baseline

### catboost test hyperparametrs (with error)

In [15]:
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier
from sklearn.pipeline import FunctionTransformer
from sklearn.feature_selection import SelectKBest

In [28]:

def to_df(X):
    df = pd.DataFrame(X, columns=ct_cat.get_feature_names_out())
    df = df.apply(pd.to_numeric, errors='ignore')
    global cat_features
    cat_features = list(df.select_dtypes(exclude='number').columns)
    return df
    
ct_cat = ColumnTransformer([('poly_cat', PolynomialFeatures(), X.select_dtypes(include='number').columns)], remainder='passthrough')

cat_pipe = Pipeline([('ct_category', ct_cat),
                     ('to_fr', FunctionTransformer(to_df)),
                    ('cat', CatBoostClassifier(verbose=0, cat_features=cat_features))])

In [29]:
cat_pipe.fit(Xtrain[:50], ytrain[:50])

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [30]:
params_cat = {'cat__iterations': [100, 300, 500, 1000],  
    'cat__learning_rate': [0.01, 0.03, 0.1, 0.3],  
    'cat__max_depth': [4, 6, 8],  
    'cat__l2_leaf_reg': [1, 3, 5, 7, 9],
              'ct_category__poly_cat__degree':[1,2,3]}

In [31]:
cat_mode = RandomizedSearchCV(cat_pipe, params_cat, cv=4, n_jobs=-1, scoring='roc_auc', n_iter=17)

In [32]:
cat_mode.fit(Xtrain[:50], ytrain[:50])

44 fits failed out of a total of 68.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
14 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\9b_gr\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\9b_gr\anaconda3\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\9b_gr\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\

In [33]:
cat_mode.best_score_

0.9090909090909092

In [70]:
cat_mode.best_score_

0.9481633903878303

In [71]:
cat_mode.best_params_

{'ct_category__poly_cat__degree': 2,
 'cat__max_depth': 6,
 'cat__learning_rate': 0.1,
 'cat__l2_leaf_reg': 7,
 'cat__iterations': 500}

In [77]:
cat_mode.best_estimator_.named_steps['cat'].get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,remainder__person_home_ownership,14.5703
1,poly_cat__loan_percent_income,12.328204
2,remainder__loan_grade,10.352306
3,remainder__loan_intent,9.080523
4,poly_cat__person_income,8.412421
5,poly_cat__person_income^2,7.337766
6,poly_cat__loan_int_rate^2,5.704876
7,poly_cat__loan_percent_income^2,5.363114
8,poly_cat__loan_int_rate,3.00893
9,poly_cat__loan_int_rate loan_percent_income,1.937079


In [78]:
test = pd.read_csv('test.csv')

In [84]:
Xt = test.drop('id', axis=1)
test['loan_status'] = cat_mode.best_estimator_.predict(Xt)

In [85]:
test[['id', 'loan_status']].to_csv('pipe_pred.csv', index=False)

### Linear models

In [35]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler

In [36]:
mod_base = cross_val_score(SVC(class_weight='balanced'), Xtrain.select_dtypes(include='number'), ytrain, cv=5, scoring='roc_auc')

In [37]:
np.mean(mod_base)

0.7786289605454184

In [38]:
pipe_num = Pipeline([('pipe_n', MinMaxScaler()),
             ('selector', SelectKBest())])

ct_svc = ColumnTransformer([('obj', OneHotEncoder(drop='first'), Xtrain.select_dtypes(exclude='number').columns),
                  ('num', pipe_num, Xtrain.select_dtypes(include='number').columns)])

In [39]:
pipe_svc = Pipeline([('ct', ct_svc),
                    ('svc', SVC(class_weight='balanced'))])

params_svc = { 'svc__C': [120, 130, 150, 170],                  # Регуляризация
    'svc__kernel': ['rbf', 'poly'],      # ЯдраF
    'svc__degree': [2, 3], 
              'ct__num__selector__k':[6,7,8]}

svc_mod = RandomizedSearchCV(pipe_svc, params_svc, n_jobs=-1, scoring='roc_auc', cv=5)

In [40]:
svc_mod.fit(Xtrain, ytrain)



In [74]:
svc_mod.best_score_, svc_mod.best_params_

(0.9181019593719235,
 {'svc__kernel': 'poly',
  'svc__degree': 2,
  'svc__C': 150,
  'ct__num__selector__k': 8})

In [41]:
svc_mod.best_score_, svc_mod.best_params_

(0.9181019593719235,
 {'svc__kernel': 'poly',
  'svc__degree': 2,
  'svc__C': 150,
  'ct__num__selector__k': 8})

### Logreg

In [42]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression

In [43]:
pipe_num_log = Pipeline([('pipe_n', MinMaxScaler()),
                     ('poly_f', PolynomialFeatures(include_bias=False)),
             ('selector', SelectKBest())])

ct_log = ColumnTransformer([('obj', OneHotEncoder(drop='first'), Xtrain.select_dtypes(exclude='number').columns),
                  ('num', pipe_num_log, Xtrain.select_dtypes(include='number').columns)])

In [44]:
pipe_las = Pipeline([('ct', ct_log),
                    ('log', LogisticRegression())])

params_las = { 'log__C': [5, 8, 10, 15, 20],      # Обратная сила регуляризации (1/alpha)
    'log__penalty': ['l1'],                    # L1-регуляризация
    'log__solver': ['liblinear'],      # Решатели, поддерживающие L1
    'log__max_iter': [40, 80, 100, 150],
              'ct__num__selector__k':[6,7,8],
             'ct__num__poly_f__degree':[None,2]}

log_mod = RandomizedSearchCV(pipe_las, params_las, n_jobs=-1, scoring='roc_auc', cv=4)

In [45]:
log_mod.fit(Xtrain, ytrain)

24 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\9b_gr\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\9b_gr\anaconda3\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\9b_gr\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\

In [68]:
log_mod.best_score_, log_mod.best_params_

(0.9069747977637368,
 {'log__solver': 'liblinear',
  'log__penalty': 'l1',
  'log__max_iter': 100,
  'log__C': 10,
  'ct__num__selector__k': 7,
  'ct__num__poly_f__degree': 2})

In [46]:
log_mod.best_score_, log_mod.best_params_

(0.9069325410627118,
 {'log__solver': 'liblinear',
  'log__penalty': 'l1',
  'log__max_iter': 80,
  'log__C': 10,
  'ct__num__selector__k': 7,
  'ct__num__poly_f__degree': 2})

### XGBoost

In [47]:
from xgboost import XGBClassifier


In [48]:
pipe_num_xgb = Pipeline([('pipe_n', MinMaxScaler()),
                     ('poly_f', PolynomialFeatures(include_bias=False)),
             ('selector', SelectKBest())])

ct_xgb = ColumnTransformer([('obj', OneHotEncoder(drop='first'), Xtrain.select_dtypes(exclude='number').columns),
                  ('num', pipe_num_xgb, Xtrain.select_dtypes(include='number').columns)])

In [49]:
pipe_xgb = Pipeline([('ct', ct_xgb),
                    ('xgb', XGBClassifier())])

params_xgb = {   # Важные параметры
    'xgb__n_estimators': [300, 500, 800],       # Количество деревьев
    'xgb__max_depth': [8, 9, 11, 14],                 # Глубина деревьев
    'xgb__learning_rate': [0.03, 0.05, 0.08],    # Скорость обучения
    'xgb__subsample': [0.4, 0.5, 0.6],              
    'xgb__colsample_bytree': [0.4, 0.6,0.7],       
    'xgb__gamma': [ 0.3, 0.4, 0.6],                     # Минимальный loss для split
    'xgb__reg_alpha': [0.05, 0.1, 0.2],
              'ct__num__selector__k':[ 9, 10, 12],
             'ct__num__poly_f__degree':[2,3]}

mod_xgb = RandomizedSearchCV(pipe_xgb, params_xgb, n_jobs=-1, scoring='roc_auc', cv=4)

In [50]:
mod_xgb.fit(Xtrain, ytrain)

In [51]:
mod_xgb.best_score_, mod_xgb.best_params_

(0.9476737852423799,
 {'xgb__subsample': 0.6,
  'xgb__reg_alpha': 0.2,
  'xgb__n_estimators': 500,
  'xgb__max_depth': 9,
  'xgb__learning_rate': 0.03,
  'xgb__gamma': 0.6,
  'xgb__colsample_bytree': 0.7,
  'ct__num__selector__k': 12,
  'ct__num__poly_f__degree': 2})

In [86]:
mod_xgb.best_score_, mod_xgb.best_params_

(0.9494196816875388,
 {'xgb__subsample': 0.6,
  'xgb__reg_alpha': 0.1,
  'xgb__n_estimators': 300,
  'xgb__max_depth': 8,
  'xgb__learning_rate': 0.05,
  'xgb__gamma': 0.3,
  'xgb__colsample_bytree': 0.7,
  'ct__num__selector__k': 9,
  'ct__num__poly_f__degree': 2})

### Stacking mod

In [52]:
from sklearn.ensemble import StackingClassifier

In [79]:
estimators = [('fst', svc_mod.best_estimator_),
             ('scd', log_mod.best_estimator_),
             ('thrd', mod_xgb.best_estimator_)]

In [80]:
stack_mod = StackingClassifier(estimators, final_estimator=LogisticRegression(),cv=5, n_jobs=-1, passthrough=False)

In [84]:
stack_mod.fit(Xtrain, ytrain)

In [85]:
from sklearn.metrics import roc_auc_score

In [86]:
stack_mod.

array(['stackingclassifier_fst', 'stackingclassifier_scd',
       'stackingclassifier_thrd'], dtype=object)

In [87]:
roc_auc_score(ytest, stack_mod.predict(Xtest))

0.8463864992948789