In [72]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [73]:
!pip install kaggler



In [74]:
import torch

import torchvision

from torchvision import transforms

import torchvision.transforms as transforms  # data processing

from torch.utils.data import DataLoader  # mini-batch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import kaggler
from kaggler.preprocessing import LabelEncoder
from kaggler.model import AutoLGB
import kaggler
import itertools 
import torch.nn as nn  # loss
import time
import statsmodels.api as sm
import torch.optim as optim  # optimizer
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print(device)

cuda:0


In [75]:
pip install --user lightgbm==3.1.1



In [76]:
#import required packages
import lightgbm as lgb
import xgboost as xgb
import gc
from hyperopt import hp, tpe, Trials, STATUS_OK
from hyperopt.fmin import fmin
from hyperopt.pyll.stochastic import sample
#optional but advised
import warnings
warnings.filterwarnings('ignore')

#GLOBAL HYPEROPT PARAMETERS
NUM_EVALS = 1000 #number of hyperopt evaluation rounds
N_FOLDS = 5 #number of cross-validation folds on data in each evaluation round

#LIGHTGBM PARAMETERS
LGBM_MAX_LEAVES = 2**11 #maximum number of leaves per tree for LightGBM
LGBM_MAX_DEPTH = 25 #maximum tree depth for LightGBM
EVAL_METRIC_LGBM_REG = 'mae' #LightGBM regression metric. Note that 'rmse' is more commonly used 
EVAL_METRIC_LGBM_CLASS = 'auc'#LightGBM classification metric

#XGBOOST PARAMETERS
XGB_MAX_LEAVES = 2**12 #maximum number of leaves when using histogram splitting
XGB_MAX_DEPTH = 25 #maximum tree depth for XGBoost
EVAL_METRIC_XGB_REG = 'mae' #XGBoost regression metric
EVAL_METRIC_XGB_CLASS = 'auc' #XGBoost classification metric

#CATBOOST PARAMETERS
CB_MAX_DEPTH = 8 #maximum tree depth in CatBoost
OBJECTIVE_CB_REG = 'MAE' #CatBoost regression metric
OBJECTIVE_CB_CLASS = 'Logloss' #CatBoost classification metric

#OPTIONAL OUTPUT
BEST_SCORE = 0

def quick_hyperopt(data, labels, package='lgbm', num_evals=NUM_EVALS, diagnostic=False):
    
    #==========
    #LightGBM
    #==========
    
    if package=='lgbm':
        
        print('Running {} rounds of LightGBM parameter optimisation:'.format(num_evals))
        #clear space
        gc.collect()
        
        integer_params = ['max_depth',
                         'num_leaves',
                          'max_bin',
                         'min_data_in_leaf',
                         'min_data_in_bin']
        
        def objective(space_params):
            
            #cast integer params from float to int
            for param in integer_params:
                space_params[param] = int(space_params[param])
            
            #extract nested conditional parameters
            if space_params['boosting']['boosting'] == 'goss':
                top_rate = space_params['boosting'].get('top_rate')
                other_rate = space_params['boosting'].get('other_rate')
                #0 <= top_rate + other_rate <= 1
                top_rate = max(top_rate, 0)
                top_rate = min(top_rate, 0.5)
                other_rate = max(other_rate, 0)
                other_rate = min(other_rate, 0.5)
                space_params['top_rate'] = top_rate
                space_params['other_rate'] = other_rate
            
            subsample = space_params['boosting'].get('subsample', 1.0)
            space_params['boosting'] = space_params['boosting']['boosting']
            space_params['subsample'] = subsample
            
            #for classification, set stratified=True and metrics=EVAL_METRIC_LGBM_CLASS
            cv_results = lgb.cv(space_params, train, nfold = N_FOLDS, stratified=False,
                                early_stopping_rounds=100, metrics=EVAL_METRIC_LGBM_REG, seed=42)
            
            best_loss = cv_results['l1-mean'][-1] #'l2-mean' for rmse
            #for classification, comment out the line above and uncomment the line below:
            #best_loss = 1 - cv_results['auc-mean'][-1]
            #if necessary, replace 'auc-mean' with '[your-preferred-metric]-mean'
            return{'loss':best_loss, 'status': STATUS_OK }
        
        train = lgb.Dataset(data, labels)
                
        #integer and string parameters, used with hp.choice()
        boosting_list = [{'boosting': 'gbdt',
                          'subsample': hp.uniform('subsample', 0.5, 1)},
                         {'boosting': 'goss',
                          'subsample': 1.0,
                         'top_rate': hp.uniform('top_rate', 0, 0.5),
                         'other_rate': hp.uniform('other_rate', 0, 0.5)}] #if including 'dart', make sure to set 'n_estimators'
        metric_list = ['MAE', 'RMSE'] 
        #for classification comment out the line above and uncomment the line below
        #metric_list = ['auc'] #modify as required for other classification metrics
        objective_list_reg = ['huber', 'gamma', 'fair', 'tweedie']
        objective_list_class = ['binary', 'cross_entropy']
        #for classification set objective_list = objective_list_class
        objective_list = objective_list_reg

        space ={'boosting' : hp.choice('boosting', boosting_list),
                'num_leaves' : hp.quniform('num_leaves', 2, LGBM_MAX_LEAVES, 1),
                'max_depth': hp.quniform('max_depth', 2, LGBM_MAX_DEPTH, 1),
                'max_bin': hp.quniform('max_bin', 32, 255, 1),
                'min_data_in_leaf': hp.quniform('min_data_in_leaf', 1, 256, 1),
                'min_data_in_bin': hp.quniform('min_data_in_bin', 1, 256, 1),
                'min_gain_to_split' : hp.quniform('min_gain_to_split', 0.1, 5, 0.01),
                'lambda_l1' : hp.uniform('lambda_l1', 0, 5),
                'lambda_l2' : hp.uniform('lambda_l2', 0, 5),
                'learning_rate' : hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
                'metric' : hp.choice('metric', metric_list),
                'objective' : hp.choice('objective', objective_list),
                'feature_fraction' : hp.quniform('feature_fraction', 0.5, 1, 0.01),
                'bagging_fraction' : hp.quniform('bagging_fraction', 0.5, 1, 0.01)
            }
        
        #optional: activate GPU for LightGBM
        #follow compilation steps here:
        #https://www.kaggle.com/vinhnguyen/gpu-acceleration-for-lightgbm/
        #then uncomment lines below:
        #space['device'] = 'gpu'
        #space['gpu_platform_id'] = 0,
        #space['gpu_device_id'] =  0

        trials = Trials()
        best = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=num_evals, 
                    trials=trials)
                
        #fmin() will return the index of values chosen from the lists/arrays in 'space'
        #to obtain actual values, index values are used to subset the original lists/arrays
        best['boosting'] = boosting_list[best['boosting']]['boosting']#nested dict, index twice
        best['metric'] = metric_list[best['metric']]
        best['objective'] = objective_list[best['objective']]
                
        #cast floats of integer params to int
        for param in integer_params:
            best[param] = int(best[param])
        
        print('{' + '\n'.join('{}: {}'.format(k, v) for k, v in best.items()) + '}')
        if diagnostic:
            return(best, trials)
        else:
            return(best)

In [77]:
def check_duplicate(df, *args): #중복 체크
  check_list = list(set(df.columns.tolist())- set(args))
  
  # 중복되는 데이터 모두 저장하는 데이터 프레임  [boolean Seires]
  dup_pair_df = df.duplicated(subset=check_list, keep=False) # 모든 중복 샘플을 모두 True 처리 
  
  sum_dup = dup_pair_df.sum()# 중복 데이터 개수 확인
  print(f"{args}을(를) 제외하고 중복되는 데이터 개수는", sum_dup, "입니다")
  

  # 중복이 있을 경우 해당 데이터들의 index 반환
  if dup_pair_df.sum() != 0:
    print("중복 데이터 비율:", round((sum_dup/len(df)*100),2))
    print()
    return df.loc[dup_pair_df].index

In [78]:

from sklearn.metrics import accuracy_score
def run_randomForest(X_train,X_test,y_train,y_test):  #변수선택기법 RF
    clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs = -1)
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print('정확도 : ' , accuracy_score(y_test, y_pred))
def run_LGBM(X_train,X_test,y_train,y_test):          #변수선택기법 LGBM
    clf = LGBMClassifier(n_estimators=100, random_state=0, n_jobs = -1)
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print('정확도 : ' , accuracy_score(y_test, y_pred))

In [79]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss
import random
from sklearn.preprocessing import OneHotEncoder


In [80]:
train=pd.read_csv('/content/drive/My Drive/train.csv')
test=pd.read_csv('/content/drive/My Drive/test.csv')
submit = pd.read_csv('/content/drive/My Drive/sample_submission.csv')
train = train.drop(['index'], axis=1)
test = test.drop(['index'], axis=1)
train['DAYS_BIRTH'] = train.DAYS_BIRTH.apply(lambda x: abs(x))
test['DAYS_BIRTH'] = test.DAYS_BIRTH.apply(lambda x: abs(x))
train['begin_month'] = train.begin_month.apply(lambda x: abs(x))
test['begin_month'] = test.begin_month.apply(lambda x: abs(x))
train['DAYS_EMPLOYED'] = train.DAYS_EMPLOYED.apply(lambda x: abs(x))
test['DAYS_EMPLOYED'] = test.DAYS_EMPLOYED.apply(lambda x: abs(x))

In [81]:
cat_cols = [x for x in train.columns if train[x].dtype == 'object']
num_cols = [x for x in train.columns if x not in cat_cols  + ['credit']]
num_cols.remove('DAYS_BIRTH')
num_cols.remove('DAYS_EMPLOYED')
num_cols.remove('begin_month')
feature_cols = num_cols + cat_cols
for col in ['DAYS_BIRTH']:
    train[col] = train[col]/365
    test[col] = train[col]/365
for col in ['DAYS_EMPLOYED']:
    train[col] = train[col]/365
    test[col] = train[col]/365
for col in ['begin_month']:
    train[col] = train[col]/12
    test[col] = train[col]/12
lbe = LabelEncoder(min_obs=10)
print(feature_cols)
train[cat_cols] = lbe.fit_transform(train[cat_cols])
test[cat_cols] = lbe.transform(test[cat_cols])

['child_num', 'income_total', 'FLAG_MOBIL', 'work_phone', 'phone', 'email', 'family_size', 'gender', 'car', 'reality', 'income_type', 'edu_type', 'family_type', 'house_type', 'occyp_type']


In [82]:
train

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,0,1,0,202500.0,2,1,0,2,38.079452,12.901370,1,0,0,0,0,2.0,0.500000,1.0
1,0,0,0,1,247500.0,2,0,2,0,31.178082,4.219178,1,0,0,1,1,3.0,0.416667,1.0
2,1,1,0,0,450000.0,1,1,0,0,52.293151,12.147945,1,0,1,0,4,2.0,1.833333,2.0
3,0,0,0,0,202500.0,2,0,0,0,41.336986,5.731507,1,0,1,0,3,2.0,3.083333,0.0
4,0,1,0,0,157500.0,4,1,0,0,41.197260,5.767123,1,0,0,0,4,2.0,2.166667,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,0,0,1,2,225000.0,4,0,0,0,33.093151,5.435616,1,0,0,0,2,4.0,0.166667,1.0
26453,0,0,0,1,180000.0,1,1,3,0,41.893151,6.780822,1,0,0,0,0,2.0,3.916667,2.0
26454,0,1,1,0,292500.0,1,0,2,1,27.621918,5.520548,1,0,0,0,2,2.0,2.083333,2.0
26455,1,0,0,0,171000.0,1,2,1,0,27.794521,0.293151,1,0,0,0,1,1.0,4.916667,2.0


In [83]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
train[num_cols] = scaler.fit_transform(train[num_cols])
test[num_cols] = scaler.transform(test[num_cols])

In [84]:
train_x=train.drop(['credit'],axis=1)
train_y=train['credit']

In [85]:
#lgbm_params = quick_hyperopt(X_train_rfe, train_y, 'lgbm', 100)

In [86]:
print(train_x)

       gender  car  reality  ...  occyp_type  family_size  begin_month
0           0    0        1  ...           0    -0.214735     0.500000
1           0    0        0  ...           1     0.876135     0.416667
2           1    1        0  ...           4    -0.214735     1.833333
3           0    0        0  ...           3    -0.214735     3.083333
4           0    1        0  ...           4    -0.214735     2.166667
...       ...  ...      ...  ...         ...          ...          ...
26452       0    0        1  ...           2     1.967005     0.166667
26453       0    0        0  ...           0    -0.214735     3.916667
26454       0    1        1  ...           2    -0.214735     2.083333
26455       1    0        0  ...           1    -1.305605     4.916667
26456       0    0        1  ...          10    -0.214735     0.750000

[26457 rows x 18 columns]


In [87]:
from sklearn.model_selection import KFold, StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=150)
folds=[]
for train_idx, valid_idx in skf.split(train_x, train_y):
    folds.append((train_idx, valid_idx))


In [88]:

'''for index in range(1,19): #변수선택과정
    rfe =RFE(RandomForestClassifier(n_estimators=100,random_state=0,n_jobs=-1), n_features_to_select=index)
    rfe.fit(X_train,y_train)
    X_train_rfe=rfe.transform(X_train)
    X_train_rfe=pd.DataFrame(X_train_rfe)
    X_valid_rfe=rfe.transform(X_valid)
    X_valid_rfe=pd.DataFrame(X_valid_rfe)
    print('feature : ' , index,'\n')
    run_randomForest(X_train_rfe,X_valid_rfe,y_train,y_valid)'''
    


"for index in range(1,19): #변수선택과정\n    rfe =RFE(RandomForestClassifier(n_estimators=100,random_state=0,n_jobs=-1), n_features_to_select=index)\n    rfe.fit(X_train,y_train)\n    X_train_rfe=rfe.transform(X_train)\n    X_train_rfe=pd.DataFrame(X_train_rfe)\n    X_valid_rfe=rfe.transform(X_valid)\n    X_valid_rfe=pd.DataFrame(X_valid_rfe)\n    print('feature : ' , index,'\n')\n    run_randomForest(X_train_rfe,X_valid_rfe,y_train,y_valid)"

In [89]:
from lightgbm import LGBMClassifier

X_train=train_x.iloc[train_idx]
y_train=train_y[train_idx]
y_valid= train_y[valid_idx]
X_valid = train_x.iloc[valid_idx]
for index in range(1,19):  #변수선택과정
    rfe=RFE(LGBMClassifier(n_estimators=100,random_state=0,n_jobs=-1), n_features_to_select=index)
    rfe.fit(X_train,y_train)
    X_train_rfe=rfe.transform(X_train)
    X_train_rfe=pd.DataFrame(X_train_rfe)
    X_valid_rfe=rfe.transform(X_valid)
    X_valid_rfe=pd.DataFrame(X_valid_rfe)
    print('feature : ' , index,'\n')
    run_LGBM(X_train_rfe,X_valid_rfe,y_train,y_valid)

feature :  1 

정확도 :  0.6414666414666415
feature :  2 

정확도 :  0.6471366471366471
feature :  3 

정확도 :  0.6528066528066528
feature :  4 

정확도 :  0.6958986958986959
feature :  5 

정확도 :  0.6964656964656964
feature :  6 

정확도 :  0.6981666981666982
feature :  7 

정확도 :  0.6977886977886978
feature :  8 

정확도 :  0.6977886977886978
feature :  9 

정확도 :  0.6996786996786997
feature :  10 

정확도 :  0.7004347004347005
feature :  11 

정확도 :  0.7011907011907011
feature :  12 

정확도 :  0.7000567000567001
feature :  13 

정확도 :  0.7006237006237006
feature :  14 

정확도 :  0.6991116991116991
feature :  15 

정확도 :  0.6994896994896995
feature :  16 

정확도 :  0.6994896994896995
feature :  17 

정확도 :  0.6994896994896995
feature :  18 

정확도 :  0.6994896994896995


In [90]:
lgb_params = {
  'bagging_fraction': 0.51,
  'boosting': 'goss',
  'feature_fraction': 0.8200000000000001,
  'lambda_l1': 2.0830165101593767,
  'lambda_l2': 3.5899014619176066,
  'learning_rate': 0.11565250641676775,
  'max_bin': 98,
  'max_depth': 11,
  'metric': 'multi_logloss',
  'min_data_in_bin': 151,
  'min_data_in_leaf': 2,
  'min_gain_to_split': 3.5100000000000002,
  'num_leaves': 412,
  'objective': 'multi_class',
  'other_rate': 0.09514122787212262,
  'top_rate': 0.19817258931181442
}


In [91]:

random.seed(42)

rfe=RFE(LGBMClassifier(n_estimators=100,random_state=0,n_jobs=-1), n_features_to_select=10)
lgb_models={}
for fold in range(5):
    print(f'===================================={fold+1}============================================')
    train_idx, valid_idx = folds[fold]
    X_train, X_valid, y_train, y_valid = train_x.iloc[train_idx].values, train_x.iloc[valid_idx].values,train_y[train_idx].values, train_y[valid_idx].values 
    #print(X_train.shape,X_valid.shape,y_train.shape)
    rfe.fit(X_train,y_train)
    X_train=rfe.transform(X_train)
    X_train=pd.DataFrame(X_train)
    X_valid=rfe.transform(X_valid)
    X_valid=pd.DataFrame(X_valid)
    lgb = LGBMClassifier(**lgb_params)
    lgb.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_valid, y_valid)], 
            early_stopping_rounds=100,verbose=100)
    lgb_models[fold]=lgb
    print(f'================================================================================\n\n')

Training until validation scores don't improve for 100 rounds.
[100]	training's multi_logloss: 0.691448	valid_1's multi_logloss: 0.77959
Did not meet early stopping. Best iteration is:
[100]	training's multi_logloss: 0.691448	valid_1's multi_logloss: 0.77959


Training until validation scores don't improve for 100 rounds.
[100]	training's multi_logloss: 0.689429	valid_1's multi_logloss: 0.791676
Did not meet early stopping. Best iteration is:
[100]	training's multi_logloss: 0.689429	valid_1's multi_logloss: 0.791676


Training until validation scores don't improve for 100 rounds.
[100]	training's multi_logloss: 0.691941	valid_1's multi_logloss: 0.779793
Did not meet early stopping. Best iteration is:
[100]	training's multi_logloss: 0.691941	valid_1's multi_logloss: 0.779793


Training until validation scores don't improve for 100 rounds.
[100]	training's multi_logloss: 0.690517	valid_1's multi_logloss: 0.78244
Did not meet early stopping. Best iteration is:
[100]	training's multi_loglo

In [92]:
test=rfe.transform(test)
test=pd.DataFrame(test)
submit.iloc[:,1:]=0
for fold in range(5):
    submit.iloc[:,1:] += lgb_models[fold].predict_proba(test)/5

In [93]:
submit.to_csv('/content/drive/My Drive/sample_submission.csv', index=False) # 0.7272812144