In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
from sklearn.preprocessing import MinMaxScaler
from catboost import CatBoostRegressor,CatBoostClassifier
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
warnings.filterwarnings('ignore')

In [2]:
# Inputs: the cleaned train / test-A feature tables, plus the raw test-A file.
data_train = pd.read_csv('../01_inter_data/train_cat_cleaned.csv')
data_test_a = pd.read_csv('../01_inter_data/test_a_cat_cleaned.csv')
data_test_a_id = pd.read_csv('../00_data/testA.csv')

In [3]:
# Keep the `id` column of the raw test-A file; it is paired with the test predictions later.
test_id = data_test_a_id['id']

In [4]:
# Column overview of the training frame; max_cols is raised so the per-column
# listing is shown instead of a truncated summary.
data_train.info(max_cols=1000)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 46 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  800000 non-null  int64  
 1   loanAmnt            800000 non-null  float64
 2   term                800000 non-null  int64  
 3   interestRate        800000 non-null  float64
 4   installment         800000 non-null  float64
 5   grade               800000 non-null  object 
 6   subGrade            800000 non-null  object 
 7   employmentTitle     800000 non-null  float64
 8   employmentLength    800000 non-null  float64
 9   homeOwnership       800000 non-null  int64  
 10  annualIncome        800000 non-null  float64
 11  verificationStatus  800000 non-null  int64  
 12  issueDate           800000 non-null  int64  
 13  isDefault           800000 non-null  int64  
 14  purpose             800000 non-null  int64  
 15  postCode            800000 non-nul

In [6]:
# Columns handed to CatBoost as categorical features (see `cat_features=` in cv_model).
cate_features = [
    'grade',
    'subGrade',
    'employmentTitle',
    'homeOwnership',
    'verificationStatus',
    'purpose',
    'postCode',
    'regionCode',
    'applicationType',
    'initialListStatus',
    'title',
    'employmentLength',
]


In [7]:
# Inspect the dtypes of the categorical columns before casting them.
data_train[cate_features].info(max_cols=1000)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   grade               800000 non-null  object 
 1   subGrade            800000 non-null  object 
 2   employmentTitle     800000 non-null  float64
 3   homeOwnership       800000 non-null  int64  
 4   verificationStatus  800000 non-null  int64  
 5   purpose             800000 non-null  int64  
 6   postCode            800000 non-null  float64
 7   regionCode          800000 non-null  int64  
 8   applicationType     800000 non-null  int64  
 9   initialListStatus   800000 non-null  int64  
 10  title               800000 non-null  float64
 11  employmentLength    800000 non-null  float64
dtypes: float64(4), int64(6), object(2)
memory usage: 73.2+ MB


In [8]:
# Apply the same dtype casts to the train and test frames (both are modified in place):
#   * employmentTitle / postCode / title / issueDate -> integer
#   * employmentLength -> string (it is listed in cate_features but stored as float64;
#     NOTE(review): presumably because CatBoost only accepts int/str categorical values — confirm)
for data in (data_train, data_test_a):
    data[['employmentTitle','postCode','title','issueDate']] = (
        data[['employmentTitle','postCode','title','issueDate']].astype('int')
    )
    data['employmentLength'] = data['employmentLength'].astype('str')

In [9]:
# Verify the casts from the previous cell took effect on the training frame.
data_train[['employmentTitle','postCode','title','issueDate','employmentLength']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   employmentTitle   800000 non-null  int32 
 1   postCode          800000 non-null  int32 
 2   title             800000 non-null  int32 
 3   issueDate         800000 non-null  int32 
 4   employmentLength  800000 non-null  object
dtypes: int32(4), object(1)
memory usage: 18.3+ MB


In [36]:
def cv_model(clf, train_x, train_y, test_x, clf_name):
    """Fit `clf` with 5-fold cross-validation and return out-of-fold and test predictions.

    For every fold a fresh model is trained on the training part, early-stopped on the
    held-out part, and then used to predict both the held-out rows and all of `test_x`.

    Parameters
    ----------
    clf : callable
        Model class (not an instance), e.g. ``CatBoostClassifier``. It is instantiated
        once per fold with the CatBoost parameters defined below.
    train_x : pandas.DataFrame
        Training features; rows are selected positionally via ``.iloc``.
    train_y : pandas.Series or numpy.ndarray
        Targets aligned row-for-row with `train_x`. The class weights below are keyed
        on labels 0 and 1.
    test_x : pandas.DataFrame
        Features to predict with every fold's model.
    clf_name : str
        Model key used in log lines. Only ``"cat"`` is implemented.

    Returns
    -------
    train : numpy.ndarray
        Out-of-fold positive-class probability for every row of `train_x`.
    test : numpy.ndarray
        Positive-class probability for every row of `test_x`, averaged over the folds.

    Raises
    ------
    ValueError
        If `clf_name` is not ``"cat"``.

    Notes
    -----
    The categorical column list is read from the module-level `cate_features`.
    Each fold's validation set is also the early-stopping `eval_set`, so the printed
    AUC scores are measured on data that selected the best iteration and are
    therefore somewhat optimistic.
    """
    # Fail fast: without this guard an unknown name would only surface inside the
    # fold loop as an undefined `val_pred`.
    if clf_name != "cat":
        raise ValueError("unsupported clf_name %r: only 'cat' is implemented" % (clf_name,))

    folds = 5
    seed = 2019
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])  # out-of-fold predictions
    test = np.zeros(test_x.shape[0])    # running fold-average of test predictions

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        # KFold yields positions, not labels: use .iloc so a Series with a
        # non-default index is still sliced correctly.
        if hasattr(train_y, 'iloc'):
            trn_y, val_y = train_y.iloc[train_index], train_y.iloc[valid_index]
        else:
            trn_y, val_y = train_y[train_index], train_y[valid_index]

        # NOTE(review): the class weights look like "balanced" weights
        # n_samples / (2 * n_class) — confirm against the training label counts.
        params = {'learning_rate': 0.05, 'depth': 6, 'l2_leaf_reg': 15, 'bootstrap_type': 'No',
                  'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False,
                  'class_weights' : {0:0.6246193725698402, 1: 2.506108639809536},'eval_metric':"AUC",
                  'loss_function':'Logloss'}

        # Extra options previously listed here but left disabled:
        # gpu_ram_part=0.9, gpu_cat_features_storage='CpuPinnedMemory', max_ctr_complexity=1
        model = clf(iterations=20000, task_type="CPU", **params)
        model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                  cat_features=cate_features, use_best_model=True, verbose=500)

        val_pred = model.predict_proba(val_x)[:, 1]
        test_pred = model.predict_proba(test_x)[:, 1]

        train[valid_index] = val_pred
        # Accumulate (not overwrite) so `test` ends up as the mean over all folds.
        test += test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))

        print(cv_scores)

    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return train, test

In [37]:
def cat_model(x_train, y_train, x_test):
    """Run `cv_model` with `CatBoostClassifier` under the model key "cat".

    Returns the pair produced by `cv_model`: predictions for the training rows
    and predictions for `x_test`.
    """
    return cv_model(CatBoostClassifier, x_train, y_train, x_test, "cat")

In [38]:
# Model inputs are every training column except the row id and the target.
features = [col for col in data_train.columns if col not in ('id', 'isDefault')]
x_train, x_test = data_train[features], data_test_a[features]
y_train = data_train['isDefault']

In [39]:
# Run the cross-validated CatBoost model. cat_train / cat_test are the two arrays
# returned by cv_model (predictions for the training rows and for test A).
# Long-running: in the captured log the first fold alone took roughly half an hour on CPU.
cat_train, cat_test = cat_model(x_train, y_train, x_test)

************************************ 1 ************************************
0:	test: 0.6984700	best: 0.6984700 (0)	total: 701ms	remaining: 3h 53m 34s
500:	test: 0.7408745	best: 0.7408745 (500)	total: 5m 42s	remaining: 3h 42m 5s
1000:	test: 0.7436331	best: 0.7436342 (995)	total: 11m 38s	remaining: 3h 41m
1500:	test: 0.7446771	best: 0.7446771 (1500)	total: 17m 33s	remaining: 3h 36m 22s
2000:	test: 0.7452198	best: 0.7452224 (1999)	total: 23m 40s	remaining: 3h 33m
2500:	test: 0.7455951	best: 0.7456028 (2493)	total: 29m 49s	remaining: 3h 28m 42s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7458353078
bestIteration = 2842

Shrink model to first 2843 iterations.
[0.7458353077885853]
************************************ 2 ************************************
0:	test: 0.7037753	best: 0.7037753 (0)	total: 692ms	remaining: 3h 50m 40s
500:	test: 0.7398550	best: 0.7398550 (500)	total: 5m 43s	remaining: 3h 43m 7s
1000:	test: 0.7418690	best: 0.7418724 (999)	total: 11m 31s	remai

In [40]:
# Re-read the raw test-A file and take its `id` column, which generate_result
# pairs with the test predictions.
data_test_a_id = pd.read_csv('../00_data/testA.csv')
test_id = data_test_a_id['id']

In [2]:
def generate_result(train_pred,test_pred):
    """Build labelled prediction frames for the test and training sets.

    Reads the module-level `test_id` and `data_train`. The pieces are joined
    column-wise with `pd.concat(axis=1)`, i.e. aligned on their indexes.

    Parameters
    ----------
    train_pred : array-like
        One prediction per row of `data_train`.
    test_pred : array-like
        One prediction per entry of `test_id`.

    Returns
    -------
    (test_result, train_result) : tuple of pandas.DataFrame
        `test_result` has columns ``id, pred_test``;
        `train_result` has columns ``id, pred_train, isDefault``.
    """
    test_parts = [test_id, pd.DataFrame(test_pred)]
    test_result = pd.concat(test_parts, axis=1)
    test_result.columns = ['id', 'pred_test']

    train_parts = [data_train.id, pd.DataFrame(train_pred), data_train.isDefault]
    train_result = pd.concat(train_parts, axis=1)
    train_result.columns = ['id', 'pred_train', 'isDefault']

    return test_result, train_result

In [None]:
# Keep both returned frames: the following cell writes `test_result` to disk,
# so discarding the return value would leave that name undefined.
test_result, train_result = generate_result(cat_train, cat_test)

In [None]:
# Write the test-set prediction frame to CSV, omitting the DataFrame index.
test_result.to_csv('../03_result/testA_pred_cat_cpu_7434.csv', index=False)