In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings, random
warnings.filterwarnings(action='ignore')

from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from category_encoders.ordinal import OrdinalEncoder
from sklearn.model_selection import StratifiedKFold

from sklearn.cluster import KMeans
from catboost import CatBoostClassifier, Pool

In [35]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [36]:
# Inspect dtypes and non-null counts; in the output below occyp_type is the
# only listed column with fewer non-null entries than rows.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26457 entries, 0 to 26456
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          26457 non-null  int64  
 1   gender         26457 non-null  object 
 2   car            26457 non-null  object 
 3   reality        26457 non-null  object 
 4   child_num      26457 non-null  int64  
 5   income_total   26457 non-null  float64
 6   income_type    26457 non-null  object 
 7   edu_type       26457 non-null  object 
 8   family_type    26457 non-null  object 
 9   house_type     26457 non-null  object 
 10  DAYS_BIRTH     26457 non-null  int64  
 11  DAYS_EMPLOYED  26457 non-null  int64  
 12  FLAG_MOBIL     26457 non-null  int64  
 13  work_phone     26457 non-null  int64  
 14  phone          26457 non-null  int64  
 15  email          26457 non-null  int64  
 16  occyp_type     18286 non-null  object 
 17  family_size    26457 non-null  float64
 18  begin_

In [37]:
# Summary statistics of the numeric training columns, used to spot constant
# columns (FLAG_MOBIL has std 0) and extreme values (child_num, family_size,
# DAYS_EMPLOYED maxima).
train.describe()

Unnamed: 0,index,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,family_size,begin_month,credit
count,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0
mean,13228.0,0.428658,187306.5,-15958.053899,59068.750728,1.0,0.224742,0.294251,0.09128,2.196848,-26.123294,1.51956
std,7637.622372,0.747326,101878.4,4201.589022,137475.427503,0.0,0.41742,0.455714,0.288013,0.916717,16.55955,0.702283
min,0.0,0.0,27000.0,-25152.0,-15713.0,1.0,0.0,0.0,0.0,1.0,-60.0,0.0
25%,6614.0,0.0,121500.0,-19431.0,-3153.0,1.0,0.0,0.0,0.0,2.0,-39.0,1.0
50%,13228.0,0.0,157500.0,-15547.0,-1539.0,1.0,0.0,0.0,0.0,2.0,-24.0,2.0
75%,19842.0,1.0,225000.0,-12446.0,-407.0,1.0,0.0,1.0,0.0,3.0,-12.0,2.0
max,26456.0,19.0,1575000.0,-7705.0,365243.0,1.0,1.0,1.0,1.0,20.0,0.0,2.0


In [38]:
# Same summary for the test set, to compare its value ranges with the training set.
test.describe()

Unnamed: 0,index,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,family_size,begin_month
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,31456.5,0.4347,185043.3,-16020.4664,59776.6904,1.0,0.2276,0.2963,0.0856,2.2027,-26.2724
std,2886.89568,0.729102,101539.8,4197.672887,138121.224504,0.0,0.419304,0.456648,0.279786,0.898272,16.348557
min,26457.0,0.0,27000.0,-25152.0,-15661.0,1.0,0.0,0.0,0.0,1.0,-60.0
25%,28956.75,0.0,121500.0,-19483.25,-3153.0,1.0,0.0,0.0,0.0,2.0,-39.0
50%,31456.5,0.0,157500.0,-15606.0,-1577.0,1.0,0.0,0.0,0.0,2.0,-25.0
75%,33956.25,1.0,225000.0,-12539.0,-410.0,1.0,0.0,1.0,0.0,3.0,-12.0
max,36456.0,5.0,1575000.0,-7489.0,365243.0,1.0,1.0,1.0,1.0,7.0,0.0


In [39]:
# Replace every missing value with the literal string 'NaN' so that missing
# entries become an ordinary category value for the later encoding step.
train = train.fillna('NaN')
test = test.fillna('NaN')

In [40]:
# Keep only training rows with family_size <= 7 and child_num <= 5; these
# limits equal the maxima reported by test.describe(). Then renumber the rows.
within_limits = train['family_size'].le(7) & train['child_num'].le(5)
train = train.loc[within_limits].reset_index(drop=True)

In [41]:
# Remove the row identifier and FLAG_MOBIL (constant 1.0 with zero spread in
# both describe() outputs) from both frames.
unused_cols = ['index', 'FLAG_MOBIL']
train = train.drop(columns=unused_cols)
test = test.drop(columns=unused_cols)

In [42]:
def to_zero(x):
    """Return 0 for a strictly positive value; return any other value unchanged."""
    return 0 if x > 0 else x

In [43]:
# Cap DAYS_EMPLOYED at 0: positive entries (describe() shows a maximum of
# 365243) are replaced by 0, negative entries are left untouched.
# NOTE(review): the positive value is presumably a "not employed" sentinel — confirm.
# Series.clip(upper=0) is the vectorised equivalent of applying to_zero to
# every element, without a Python-level call per row.
train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].clip(upper=0)
test['DAYS_EMPLOYED'] = test['DAYS_EMPLOYED'].clip(upper=0)

In [44]:
# Convert the day/month offset columns to absolute magnitudes in both frames.
feats = ['DAYS_BIRTH', 'begin_month', 'DAYS_EMPLOYED']
train[feats] = train[feats].abs()
test[feats] = test[feats].abs()

In [45]:
# Build a string key 'ID' per row by joining the string form of the columns
# below with '_' (in this exact order).
# NOTE(review): rows sharing a key presumably describe the same applicant — confirm.
id_parts = [
    'child_num', 'income_total', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
    'work_phone', 'phone', 'email', 'family_size',
    'gender', 'car', 'reality', 'income_type',
    'edu_type', 'family_type', 'house_type', 'occyp_type',
]
for df in [train, test]:
    key = df[id_parts[0]].astype(str)
    for col in id_parts[1:]:
        key = key + '_' + df[col].astype(str)
    df['ID'] = key

In [46]:
# Derive whole-year features from the (now non-negative) day counts,
# using floor division by 365.
for df in (train, test):
    df['age'] = df['DAYS_BIRTH'] // 365
    df['work_year'] = df['DAYS_EMPLOYED'] // 365

In [47]:
# The raw day counts are no longer needed once 'age' and 'work_year' exist.
day_cols = ['DAYS_BIRTH', 'DAYS_EMPLOYED']
train = train.drop(columns=day_cols)
test = test.drop(columns=day_cols)

In [71]:
# Names of the columns treated as numeric features.
num = [
    'child_num',
    'income_total',
    'work_phone',
    'phone',
    'email',
    'family_size',
    'begin_month',
    'age',
    'work_year',
]

In [72]:
# Names of the columns treated as categorical features: they are ordinal-encoded
# and then passed to CatBoost as cat_features.
cate = [
    'gender',
    'car',
    'reality',
    'income_type',
    'edu_type',
    'family_type',
    'house_type',
    'occyp_type',
    'ID',
]

In [51]:
# Replace income_total with log(1 + income_total) in both frames.
for df in (train, test):
    df['income_total'] = np.log1p(df['income_total'])

In [52]:
# Ordinal encoder for the categorical columns. The column list must be passed
# by keyword: the first positional parameter of category_encoders'
# OrdinalEncoder is `verbose`, so OrdinalEncoder(cate) would bind the list to
# `verbose` and leave `cols` unset.
encoder = OrdinalEncoder(cols=cate)

In [53]:
# Fit the encoder on the training categoricals, then apply the same fitted
# mapping to the test categoricals.
encoded_train = encoder.fit_transform(train[cate], train['credit'])
encoded_test = encoder.transform(test[cate])
train[cate] = encoded_train
test[cate] = encoded_test

# Make sure the encoded ID column is stored as int64 in both frames.
for df in (train, test):
    df['ID'] = df['ID'].astype('int64')

In [64]:
# Build the model inputs explicitly so this cell runs on a fresh kernel; no
# cell in the notebook defines X, y or X_test.
# NOTE(review): assumes every non-target column of `train` is a feature and
# that 'credit' is the target — confirm against the original (missing) cell.
X = train.drop(columns=['credit'])
y = train['credit']
X_test = test[X.columns]  # same columns, same order as the training features

train_data = Pool(data=X, label=y, cat_features=cate)
model_cat = CatBoostClassifier()
# No validation set is supplied, so use_best_model / early_stopping_rounds had
# nothing to act on: CatBoost warned and switched use_best_model off, and
# training ran the full 1000 iterations. They are omitted here.
model_cat.fit(train_data, verbose=100)
# Class-probability matrix for the test rows (one column per class).
cat_pred_test = model_cat.predict_proba(X_test)

You should provide test set for use best model. use_best_model parameter has been switched to false value.


Learning rate set to 0.093512
0:	learn: 1.0467333	total: 202ms	remaining: 3m 21s
100:	learn: 0.7069377	total: 3.89s	remaining: 34.7s
200:	learn: 0.6940443	total: 7.79s	remaining: 31s
300:	learn: 0.6823300	total: 11.9s	remaining: 27.6s
400:	learn: 0.6707725	total: 16.1s	remaining: 24.1s
500:	learn: 0.6594769	total: 20.3s	remaining: 20.2s
600:	learn: 0.6479815	total: 24.4s	remaining: 16.2s
700:	learn: 0.6373724	total: 28.6s	remaining: 12.2s
800:	learn: 0.6265669	total: 33s	remaining: 8.19s
900:	learn: 0.6156400	total: 37.3s	remaining: 4.1s
999:	learn: 0.6047288	total: 41.5s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1eefd732e20>

In [67]:
# Load the submission template whose columns '0', '1', '2' are filled in below.
sub = pd.read_csv('sample_submission.csv')

In [68]:
# Write the predicted probabilities into the three class columns.
# NOTE(review): assumes the predict_proba column order matches classes 0, 1, 2 — confirm via model_cat.classes_.
sub.loc[:,['0','1','2']] = cat_pred_test

In [70]:
# Save the filled-in submission without the DataFrame index column.
sub.to_csv('sub_last3.csv', index=False)