In [1]:
import joblib
import numpy as np
import pandas as pd
import gc
import time
import os
import sys
from contextlib import contextmanager
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import  f1_score
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OneHotEncoder
from optbinning import OptimalBinning
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from sklearn.decomposition import PCA
from functools import reduce
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm
import random
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import optuna
from optuna import Trial
from sklearn.metrics import f1_score
from optuna.samplers import TPESampler
sampler = TPESampler(seed=42)


(CVXPY) Jan 31 10:10:22 PM: Encountered unexpected exception importing solver GLOP:
RuntimeError('Unrecognized new version of ortools (9.8.3296). Expected < 9.8.0. Please open a feature request on cvxpy to enable support for this version.')
(CVXPY) Jan 31 10:10:22 PM: Encountered unexpected exception importing solver PDLP:
RuntimeError('Unrecognized new version of ortools (9.8.3296). Expected < 9.8.0. Please open a feature request on cvxpy to enable support for this version.')


> Seed Setting 

In [2]:

pd.set_option('display.max_rows', 500)
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(42)

In [3]:
# read data
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')
sample_submission = pd.read_csv('../data/sample_submission.csv')
original_columns = test.columns.tolist()

# Base Data Processing 

> to int, float 

In [4]:
import re
def extract_years(s):
    if "unknown" in s.lower():
        return np.nan  # or return some specific value
    elif "<" in s:
        return 0.5
    elif "+" in s :
        return 12 
    else:
        return int(re.search(r'\d+', s).group())
    
train['근로기간']= train['근로기간'].apply(extract_years)
test['근로기간']= test['근로기간'].apply(extract_years)

train['대출기간'] = train['대출기간'].apply(lambda x : int(x.split()[0]))
test['대출기간'] = test['대출기간'].apply(lambda x : int(x.split()[0]))


> Translate 

In [5]:
grade_dict = {'A':0, 'B':1, 'C':2, 'D':3, 'E':4, 'F':5, 'G':6}
column_dict = {
    '대출금액': 'amount',
    '대출기간': 'term',
    '근로기간': 'workperiod',
    '주택소유상태': 'homeown',
    '연간소득': 'income',
    '부채_대비_소득_비율': 'detinratio',
    '총계좌수': 'account_cnt',
    '대출목적': 'purpose',
    '최근_2년간_연체_횟수': 'delay_cnt',
    '총상환원금': 'principal_paid',
    '총상환이자': 'interest_paid',
    '총연체금액': 'delinquent_amount',
    '연체계좌수': 'delinquent_accounts',
    '대출등급': 'grade'
}

train['대출등급'] = train['대출등급'].map(grade_dict)

train = train.rename(columns=column_dict)
test = test.rename(columns=column_dict)

> Log Transform 


In [6]:
for var in ['income','principal_paid','interest_paid','delinquent_amount']:
    train[var+'_log'] = np.log(train[var]+1)
    test[var+'_log'] = np.log(test[var]+1)

> ANY to other

In [7]:
median_train = train['workperiod'].median()
train['homeown'] = train['homeown'].replace('ANY', 'MORTGAGE')
train['workperiod'] = train['workperiod'].fillna(median_train)
test['workperiod'] = test['workperiod'].fillna(median_train)

# Additional Data Processing

>With domain 

In [8]:

# '원금대비이자' 컬럼의 평균값 계산
mean_value = train['interest_paid'].mean()/train['principal_paid'].mean()
m1,m2 = train['interest_paid'].mean(), train['principal_paid'].mean()
def make_cols(data):
    data['timerate'] = data['workperiod']-(data['term']//12)
    data['debt'] = data['income']*data['detinratio']

    data['amountperyear'] =data['amount']//data['term']
    data['totalpaid'] = data['interest_paid']+data['principal_paid']
    data['interestpaid_peryear'] = data['interest_paid']/data['term']
    data['principal_paid_peryear'] = data['principal_paid']/data['term']

    data['prin_int_rate'] = np.where((data['interest_paid'] == 0) & (data['principal_paid'] == 0), m1/(m1+m2), data['interest_paid']/(data['interest_paid']+data['principal_paid']))
    data['paid_rate']=np.where((data['interest_paid']> data['principal_paid']),1,0)

    data['bad1'] = data['debt']*data['prin_int_rate']
    data['bad2'] = data['prin_int_rate']/data['term']
    data['paidrate']= data['totalpaid']/data['amountperyear']
    
    
make_cols(train)
make_cols(test)



> WoE Features 

In [9]:
# Woe Features
for variable in ['amount','workperiod','income','detinratio','principal_paid','interest_paid','delinquent_amount']:

    optb = OptimalBinning(name=variable, dtype="numerical", solver="cp")

    x = train[variable].values
    y = train['grade']

    optb.fit(x, y)

    binning_table = optb.binning_table

    a1 = binning_table.build()
    
    train[variable+'_woe'] = pd.cut(train[variable],bins=optb.splits.tolist()).astype(str)
    test[variable+'_woe'] = pd.cut(test[variable],bins=optb.splits.tolist()).astype(str)

> Group Operations 

In [10]:
# Group Operation
for group in ['term', 'homeown', 'purpose' ]:
    for num_var in ['amount', 'income','detinratio','delay_cnt','principal_paid','interest_paid','delinquent_amount']:
      
        train,test= train.copy(), test.copy()
        
        for stat in ['mean', 'min','max']:
            stat_dict = train.groupby([group])[num_var].agg(stat).to_dict()
            
            train[num_var +'_'+group + '_'+ stat]= train[group].map(stat_dict)
            test[num_var +'_'+group + '_'+ stat]= test[group].map(stat_dict)
            
            train[num_var+'_minus_'+num_var+'_'+group+'_'+stat] = train[num_var] - train[num_var+'_'+group+'_'+stat]
            test[num_var+'_minus_'+num_var+'_'+group+'_'+stat] = test[num_var] - test[num_var+'_'+group+'_'+stat]
            
            train[num_var+'_divide_'+num_var+'_'+group+'_'+stat] = np.log(train[num_var]+0.00001) / np.log(train[num_var+'_'+group+'_'+stat]+0.00001)
            test[num_var+'_divide_'+num_var+'_'+group+'_'+stat] = np.log(test[num_var]+0.00001) / np.log(test[num_var+'_'+group+'_'+stat]+0.00001)

> Feature Clustering 

In [11]:
from sklearn.cluster import KMeans

# 클러스터링할 피처들을 지정합니다.
features = ['amount','workperiod','income','detinratio']

# 최적의 클러스터 수로 KMeans 모델을 생성하고 학습합니다.
kmeans = KMeans(n_clusters=5, init='k-means++', max_iter=800, n_init=25, random_state=42)
kmeans.fit(train[features])

train['clust_num'] = kmeans.predict(train[features])
test['clust_num'] = kmeans.predict(test[features])



> Encoding

In [12]:
# Catboost Encoder 
import category_encoders as ce

test['purpose'] = test['purpose'].replace('결혼', '기타')



In [13]:

# 원-핫 인코더 생성
encoder = OneHotEncoder(sparse=False)

# # 훈련 세트를 기반으로 인코더 학습
# encoder.fit(train[['homeown', 'term']])

# # 학습된 인코더를 훈련 세트와 테스트 세트에 적용
# train_encoded = encoder.transform(train[['homeown', 'term']])
# test_encoded = encoder.transform(test[['homeown', 'term']])

# # 인코딩된 결과를 데이터프레임으로 변환
# feature_names = np.concatenate(encoder.categories_).tolist()
# train_encoded_df = pd.DataFrame(train_encoded, columns=feature_names)
# test_encoded_df = pd.DataFrame(test_encoded, columns=feature_names)

# # 원래의 데이터프레임에서 '주택소유상태', '대출기간' 칼럼을 제거하고, 인코딩된 데이터프레임을 합침
# train = pd.concat([train.drop(['homeown', 'term'], axis=1), train_encoded_df], axis=1)
# test = pd.concat([test.drop(['homeown', 'term'], axis=1), test_encoded_df], axis=1)



encoder.fit(train[['homeown','term']])


# 학습된 인코더를 훈련 세트와 테스트 세트에 적용
train_encoded = encoder.transform(train[['homeown','term']])
test_encoded = encoder.transform(test[['homeown','term']])

# 인코딩된 결과를 데이터프레임으로 변환
feature_names = np.concatenate(encoder.categories_).tolist()
train_encoded_df = pd.DataFrame(train_encoded, columns=feature_names)
test_encoded_df = pd.DataFrame(test_encoded, columns=feature_names)

# 원래의 데이터프레임에서 '주택소유상태', '대출기간' 칼럼을 제거하고, 인코딩된 데이터프레임을 합침
train = pd.concat([train.drop(['homeown','term'], axis=1), train_encoded_df], axis=1)
test = pd.concat([test.drop(['homeown','term'], axis=1), test_encoded_df], axis=1)


encoder = LabelEncoder() 
    
for column in ['purpose','amount_woe','workperiod_woe','income_woe','detinratio_woe','principal_paid_woe','interest_paid_woe','delinquent_amount_woe']:
    # train 데이터를 기반으로 인코딩을 학습하고 적용
    encoder.fit(train[column])
    train[column] = encoder.transform(train[column])
    test[column] = encoder.transform(test[column])

> Feature Making

In [15]:
# feature making
from itertools import chain, product 
candidate_var = ['amount','workperiod','income','detinratio','account_cnt','principal_paid','interest_paid']
pairs = list(chain(product(candidate_var, candidate_var), product(candidate_var, candidate_var))) 
pairs = pd.Series([sorted([i,j]) for (i,j) in set(pairs) if i!=j]).drop_duplicates().reset_index(drop=True).tolist()
pairs = sorted(pairs)
# pairs_to_remove = [['detinratio', 'income'],['interest_paid', 'principal_paid']]

# pairs = [pair for pair in pairs if pair not in pairs_to_remove]

print(len(pairs))
epsilon = 0.001  # 분모에 더해줄 작은 값

for i in range(len(pairs)):
    train[pairs[i][0]+'M'+pairs[i][1]] = train[pairs[i][0]] * train[pairs[i][1]]
    test[pairs[i][0]+'M'+pairs[i][1]] = test[pairs[i][0]] * test[pairs[i][1]]
    
    # 분모에 작은 값을 더해서 나눗셈 수행
    train[pairs[i][0]+'D'+pairs[i][1]] = train[pairs[i][0]] / (train[pairs[i][1]] + epsilon)
    test[pairs[i][0]+'D'+pairs[i][1]] = test[pairs[i][0]] / (test[pairs[i][1]] + epsilon)


21


  train[pairs[i][0]+'D'+pairs[i][1]] = train[pairs[i][0]] / (train[pairs[i][1]] + epsilon)
  train[pairs[i][0]+'M'+pairs[i][1]] = train[pairs[i][0]] * train[pairs[i][1]]
  train[pairs[i][0]+'D'+pairs[i][1]] = train[pairs[i][0]] / (train[pairs[i][1]] + epsilon)
  train[pairs[i][0]+'M'+pairs[i][1]] = train[pairs[i][0]] * train[pairs[i][1]]
  train[pairs[i][0]+'D'+pairs[i][1]] = train[pairs[i][0]] / (train[pairs[i][1]] + epsilon)
  train[pairs[i][0]+'M'+pairs[i][1]] = train[pairs[i][0]] * train[pairs[i][1]]
  train[pairs[i][0]+'D'+pairs[i][1]] = train[pairs[i][0]] / (train[pairs[i][1]] + epsilon)
  train[pairs[i][0]+'M'+pairs[i][1]] = train[pairs[i][0]] * train[pairs[i][1]]
  train[pairs[i][0]+'D'+pairs[i][1]] = train[pairs[i][0]] / (train[pairs[i][1]] + epsilon)
  train[pairs[i][0]+'M'+pairs[i][1]] = train[pairs[i][0]] * train[pairs[i][1]]
  train[pairs[i][0]+'D'+pairs[i][1]] = train[pairs[i][0]] / (train[pairs[i][1]] + epsilon)
  train[pairs[i][0]+'M'+pairs[i][1]] = train[pairs[i][0]] *

In [102]:
# # 일단 보류
# scaler = StandardScaler() 

# cat_features = ['ID','purpose','homeown', 'term','amount_woe','workperiod_woe','income_woe','detinratio_woe','principal_paid_woe','interest_paid_woe','grade']

# # cat_features에 해당하지 않는 피처만 선택
# num_features = [col for col in train.columns if col not in cat_features]

# # StandardScaler 객체 생성
# scaler = StandardScaler()

# # num_features에 대해 스케일링 수행
# train[num_features] = scaler.fit_transform(train[num_features])
# test[num_features] = scaler.transform(test[num_features])

# Data Processing

In [16]:
# 'train' 데이터프레임의 칼럼 이름을 문자열로 변환
train.columns = train.columns.map(str)
test.columns = test.columns.map(str)

x = train.drop(['ID','grade'], axis=1 )
y = train['grade']
test  = test.drop('ID', axis=1)


# Feature Selection 

> Delete low Variances

In [69]:
from sklearn.feature_selection import VarianceThreshold
X = train.drop(['ID','grade'], axis=1 )
# VarianceThreshold는 분산이 특정 임계값 이하인 피처들을 제거합니다.
selector = VarianceThreshold()
x_new = selector.fit_transform(x)


# get_support 메소드를 사용해 선택된 피처의 인덱스를 가져옵니다.
selected_features = selector.get_support(indices=True)
selected_columns = x.columns[selected_features]

test = test.loc[:, selected_columns]
x = x.loc[:, selected_columns]



> Percentile

In [75]:
from sklearn.feature_selection import SelectPercentile

# SelectPercentile을 사용하여 특성의 %를 선택하는 모델 생성
select = SelectPercentile(percentile=90)

# 특성 선택
select.fit(x, y)
X_train_selected = select.transform(x)

selected_features_mask = select.get_support()
selected_columns = x.columns[selected_features_mask]

test = test.loc[:, selected_columns]
x = x.loc[:, selected_columns]


> RFE Selection 

In [18]:
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler


# Xtrain_fs = train.drop(['ID','grade'],axis=1)
# Ytrain_fs = train['grade']
# test_fs   = test.drop(['ID'],axis=1)


model = XGBClassifier() 
selectRFE = RFE(model, n_features_to_select=180)

xtrain_selected = selectRFE.fit_transform(x,y)

xtest_selected = selectRFE.transform(test)


KeyboardInterrupt: 

# Model Training 

In [113]:
# 데이터를 훈련 세트와 테스트 세트로 분리
X_train, X_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=42)


In [115]:
from xgboost import XGBClassifier

# 파라미터 설정
params = {'n_estimators': 752, 
          'learning_rate': 0.09452538539023868, 
          'lambda': 0.021601515127063885, 
          'alpha': 0.6741902847915616, 
          'colsample_bytree': 0.5685207565862274, 
          'subsample': 0.9909153361196764, 
          'max_depth': 8, 
          'min_child_weight': 1}

# XGBClassifier에 파라미터 설정
xgb = XGBClassifier(**params)

# 모델 학습
xgb.fit(x, y)

# 예측
pred = xgb.predict(test)
#  and parameters: {'n_estimators': 752, 'learning_rate': 0.08513602429461944, 'lambda': 0.03391515402646526, 'alpha': 0.6532768202909376, 'colsample_bytree': 0.4470774309800684, 'subsample': 0.7236838914160535, 'max_depth': 8, 'min_child_weight': 1}. Best is trial 13 with value: 0.937929329398787.

# [I 2024-01-22 14:42:19,100] Trial 8 finished with value: 0.9374472481030386 and parameters: {'n_estimators': 353, 'learning_rate': 0.08163306443215815, 'lambda': 0.014951498272501501, 'alpha': 0.8023947837732857, 'colsample_bytree': 0.44473038620786254, 'subsample': 0.9921321619603104, 'max_depth': 8, 'min_child_weight': 10}. Best is trial 8 with value: 0.9374472481030386.
# .9351960090507084 and parameters: {'n_estimators': 752, 'learning_rate': 0.09452538539023868, 'lambda': 0.021601515127063885, 'alpha': 0.6741902847915616, 'colsample_bytree': 0.5685207565862274, 'subsample': 0.9909153361196764, 'max_depth': 8, 'min_child_weight': 1}. Best is trial 13 with value: 0.9351960090507084.


In [55]:
from sklearn.tree import DecisionTreeClassifier

# 파라미터 설정
params = {'criterion': 'gini', 
          'splitter': 'best', 
          'max_depth': 20, 
          'min_samples_split': 5, 
          'min_samples_leaf': 2}

# DecisionTreeClassifier에 파라미터 설정
dtc = DecisionTreeClassifier(**params)

# 모델 학습
dtc.fit(x, y)

# 예측
pred = dtc.predict(test)


In [50]:
#LGBM 단일모델
params = {'num_leaves': 151, 
          'max_depth': 9, 
          'learning_rate': 0.01771521082629245, 
          'n_estimators': 199, 
          'min_child_weight': 3, 
          'min_child_samples': 8, 
          'subsample': 0.7287247706737471, 
          'colsample_bytree': 0.8848250935856582, 
          'reg_alpha': 0.5288165395694552, 
          'reg_lambda': 3.3416664166015955}


# XGBClassifier에 파라미터 설정
lgbm = LGBMClassifier(**params)


# 모델 훈련
lgbm.fit(x, y)
pred = lgbm.predict(test)

## Hyperparameter Tuning 

> Xgboost Gridsearch 

In [47]:
# XGBoost Grid
from sklearn.model_selection import GridSearchCV

xgb_cv = StratifiedKFold(n_splits=5, shuffle = True, random_state=42)
params = {
    # 상황 보면서 파라미터 조절하기 
    'n_estimators': [100,200,300],
    'learning_rate': [0.01,0.05,0.1,0.15],
    'max_depth':[3,4,5,6,7,8] , 
    # 'gamma': [0, 0.5, 1],
    # 'alpha': [0, 0.5, 1],
    # 'lambda': [0.5, 1, 5],
    # 'min_child_weight': [1,2,3]
}

xgb_grid = GridSearchCV(XGBClassifier(n_jobs=-1), params, n_jobs=-1, cv=xgb_cv, scoring="f1_macro")
xgb_grid.fit(X, y)

print('Best score:', xgb_grid.best_score_)
print('Best score:', xgb_grid.best_params_)

> LGBM Optuna

In [30]:
from functools import partial
def macro_f1_score(y_true, y_pred):
    f1 = f1_score(y_true, y_pred, average='macro')
    return f1

def objective(trial, train_x, train_y, val_x, val_y):
    
    params = {
       
        'objective': 'multiclass', 
        'num_leaves': trial.suggest_int('num_leaves', 5, 200, step=1, log=False), 
        'max_depth': trial.suggest_int('max_depth', 1, 10, step=1, log=False), 
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.15, log=False), 
        'n_estimators': trial.suggest_int('n_estimators', 8, 15000, step=1, log=True), 
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 50),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 30, step=1, log=False), 
        'subsample': trial.suggest_uniform('subsample', 0.7, 1.0), 
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.1, 1.0),
        'reg_alpha': trial.suggest_uniform('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_uniform('reg_lambda', 0.0, 5.0)
    }
    model = LGBMClassifier(**params)  
    model.fit(train_x, train_y)
    preds = model.predict(val_x)
    score = macro_f1_score(val_y, preds)
    return score
study =  optuna.create_study(study_name='Lgbm', direction='maximize', sampler=TPESampler(seed=42))
study.optimize(lambda trial: objective(trial, X_train,  y_train, X_valid, y_valid), n_trials=600, timeout=9000)
print('Best trial:', study.best_trial.params)
print('Best score:', study.best_value)

# Best trial: {'num_leaves': 151, 'max_depth': 9, 'learning_rate': 0.01771521082629245, 'n_estimators': 199, 'min_child_weight': 3, 'min_child_samples': 8, 'subsample': 0.7287247706737471, 'colsample_bytree': 0.8848250935856582, 'reg_alpha': 0.5288165395694552, 'reg_lambda': 3.3416664166015955}
# Best score: 0.9236156401294165


[I 2024-01-22 11:23:35,536] A new study created in memory with name: Lgbm
[I 2024-01-22 11:23:56,277] Trial 0 finished with value: 0.9038118240445676 and parameters: {'num_leaves': 78, 'max_depth': 10, 'learning_rate': 0.11006709732989936, 'n_estimators': 710, 'min_child_weight': 8, 'min_child_samples': 9, 'subsample': 0.7174250836504598, 'colsample_bytree': 0.8795585311974417, 'reg_alpha': 0.6011150117432088, 'reg_lambda': 3.540362888980227}. Best is trial 0 with value: 0.9038118240445676.
[I 2024-01-22 11:23:57,589] Trial 1 finished with value: 0.8941435470702823 and parameters: {'num_leaves': 9, 'max_depth': 10, 'learning_rate': 0.12503395347926283, 'n_estimators': 38, 'min_child_weight': 10, 'min_child_samples': 9, 'subsample': 0.7912726728878613, 'colsample_bytree': 0.5722807884690141, 'reg_alpha': 0.43194501864211576, 'reg_lambda': 1.4561457009902097}. Best is trial 0 with value: 0.9038118240445676.
[I 2024-01-22 11:23:58,968] Trial 2 finished with value: 0.829061443362989 and pa

Best trial: {'num_leaves': 151, 'max_depth': 9, 'learning_rate': 0.01771521082629245, 'n_estimators': 199, 'min_child_weight': 3, 'min_child_samples': 8, 'subsample': 0.7287247706737471, 'colsample_bytree': 0.8848250935856582, 'reg_alpha': 0.5288165395694552, 'reg_lambda': 3.3416664166015955}
Best score: 0.9236156401294165


>DecisionTree Optuna

In [54]:
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier

def macro_f1_score(y_true, y_pred):
    f1 = f1_score(y_true, y_pred, average='macro')
    return f1  # Optuna는 최대화를 목표로 하므로 1 - f1이 아닌 f1을 반환

def objective(trial, train_x, train_y, val_x, val_y):
    param = {
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'splitter': trial.suggest_categorical('splitter', ['best', 'random']),
        'max_depth': trial.suggest_int('max_depth', 2, 32),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 6),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4),
    }
    model = DecisionTreeClassifier(**param)  
    model.fit(train_x, train_y)
    preds = model.predict(val_x)
    score = macro_f1_score(val_y, preds)
    return score
          
study = optuna.create_study(study_name='DecisionTree', direction='maximize', sampler=TPESampler(seed=42))
study.optimize(lambda trial: objective(trial, X_train, y_train, X_valid, y_valid), n_trials=30, timeout=8300)

print('Best trial:', study.best_trial.params)
print('Best score:', study.best_value)

# Best trial: {'criterion': 'gini', 'splitter': 'best', 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 2}
# Best score: 0.9196209123092121


[I 2024-01-22 16:13:46,988] A new study created in memory with name: DecisionTree
[I 2024-01-22 16:13:50,912] Trial 0 finished with value: 0.8659258297235672 and parameters: {'criterion': 'entropy', 'splitter': 'best', 'max_depth': 6, 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.8659258297235672.
[I 2024-01-22 16:13:57,511] Trial 1 finished with value: 0.9018378053974401 and parameters: {'criterion': 'gini', 'splitter': 'best', 'max_depth': 32, 'min_samples_split': 6, 'min_samples_leaf': 1}. Best is trial 1 with value: 0.9018378053974401.
[I 2024-01-22 16:13:58,233] Trial 2 finished with value: 0.7855091531894477 and parameters: {'criterion': 'entropy', 'splitter': 'random', 'max_depth': 15, 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 1 with value: 0.9018378053974401.
[I 2024-01-22 16:13:59,050] Trial 3 finished with value: 0.835505545483368 and parameters: {'criterion': 'entropy', 'splitter': 'random', 'max_depth': 26, 'min_samples_sp

Best trial: {'criterion': 'gini', 'splitter': 'best', 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 2}
Best score: 0.9196209123092121


> Xgboost Optuna 

In [114]:
# XGBoost Optuna


def macro_f1_score(y_true, y_pred):
    f1 = f1_score(y_true, y_pred, average='macro')
    return f1  # Optuna는 최대화를 목표로 하므로 1 - f1이 아닌 f1을 반환


def objective(trial, train_x, train_y, val_x, val_y):
    param = {
        'objective': 'multi:softmax',  # 목표를 다중 클래스 분류로 설정
        'n_estimators': trial.suggest_int('n_estimators', 170, 1000),
        'learning_rate': trial.suggest_float('learning_rate',0.0005, 0.15),
        'lambda': trial.suggest_float('lambda', 1e-3, 0.1),
        'alpha': trial.suggest_float('alpha', 1e-3, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1),
        'subsample': trial.suggest_float('subsample', 0.4, 1),
        'max_depth': trial.suggest_int('max_depth',2,10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 50),
    }
    model =XGBClassifier(**param)  
    model.fit(train_x, train_y)
    preds = model.predict(val_x)
    score = macro_f1_score(val_y, preds)
    return score
          
# study_xgb.optimize(objective, n_trials=5)
study =  optuna.create_study(study_name='Xgb', direction='maximize',sampler=TPESampler(seed=42) )
study.optimize(lambda trial: objective(trial,X_train,  y_train ,X_valid , y_valid ),n_trials=150, timeout = 8300 )
print('Best trial:', study.best_trial.params)
print('Best score:', study.best_value)

# 기존 특성들 :[I 2024-01-22 14:42:19,100] Trial 8 finished with value: 0.9374472481030386 and parameters: {'n_estimators': 353, 'learning_rate': 0.08163306443215815, 'lambda': 0.014951498272501501, 'alpha': 0.8023947837732857, 'colsample_bytree': 0.44473038620786254, 'subsample': 0.9921321619603104, 'max_depth': 8, 'min_child_weight': 10}. Best is trial 8 with value: 0.9374472481030386.
# 새 특성+ 분산0인것 제외 :  and parameters: {'n_estimators': 752, 'learning_rate': 0.08513602429461944, 'lambda': 0.03391515402646526, 'alpha': 0.6532768202909376, 'colsample_bytree': 0.4470774309800684, 'subsample': 0.7236838914160535, 'max_depth': 8, 'min_child_weight': 1}. Best is trial 13 with value: 0.937929329398787.

[I 2024-01-24 23:46:58,992] A new study created in memory with name: Xgb
[I 2024-01-24 23:48:21,292] Trial 0 finished with value: 0.9143664778764311 and parameters: {'n_estimators': 481, 'learning_rate': 0.14263178880828245, 'lambda': 0.07346740023932911, 'alpha': 0.5990598257128396, 'colsample_bytree': 0.4936111842654619, 'subsample': 0.49359671220172163, 'max_depth': 2, 'min_child_weight': 44}. Best is trial 0 with value: 0.9143664778764311.
[I 2024-01-24 23:52:11,980] Trial 1 finished with value: 0.9288955145072946 and parameters: {'n_estimators': 669, 'learning_rate': 0.1063568503805088, 'lambda': 0.0030378649352844423, 'alpha': 0.9699399423098324, 'colsample_bytree': 0.899465584480253, 'subsample': 0.5274034664069657, 'max_depth': 3, 'min_child_weight': 10}. Best is trial 1 with value: 0.9288955145072946.
[I 2024-01-24 23:55:24,779] Trial 2 finished with value: 0.928268861312954 and parameters: {'n_estimators': 422, 'learning_rate': 0.07895108652901955, 'lambda': 0.043762556845569

KeyboardInterrupt: 

In [None]:
from sklearn.ensemble import VotingClassifier,BaggingClassifier
from sklearn.model_selection import train_test_split

def objective(trial):
    iris = load_iris()
    X, y = iris.data, iris.target
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    xgb_n_estimators = trial.suggest_int('rf_n_estimators', 200, 900)
    xgb_max_depth = trial.suggest_int('rf_max_depth', 4, 10)
    logreg_C = trial.suggest_loguniform('logreg_C', 1e-3, 1e3)
    
    # rf = RandomForestClassifier(n_estimators=rf_n_estimators, max_depth=rf_max_depth, random_state=42)
    # logreg = LogisticRegression(C=logreg_C, random_state=42)
    xgb=XGBClassifier()
    dtc=DecisionTreeClassifier()
    bag=BaggingClassifier()
    voting_clf = VotingClassifier(estimators=[('xgb', xgb), ('dt', dtc)], voting='soft')
    voting_clf.fit(X_train, y_train)
    
    acc = voting_clf.score(X_val, y_val)
    return acc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)


In [60]:
#voting with paras 
from sklearn.ensemble import VotingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
# 파라미터 설정
xgb_params = {'n_estimators': 353,          'learning_rate': 0.08163306443215815, 
          'reg_lambda': 0.014951498272501501,           'reg_alpha': 0.8023947837732857, 
          'colsample_bytree': 0.44473038620786254,           'subsample': 0.9921321619603104, 
          'max_depth': 8,          'min_child_weight': 10}

# 파라미터 설정
lgbm_params = {'num_leaves': 151,    'max_depth': 9, 
          'learning_rate': 0.01771521082629245,      'n_estimators': 199, 
          'min_child_weight': 3,  'min_child_samples': 8, 
          'subsample': 0.7287247706737471,  'colsample_bytree': 0.8848250935856582, 
          'reg_alpha': 0.5288165395694552,  'reg_lambda': 3.3416664166015955}

# 파라미터 설정
dtc_params = {'criterion': 'gini',  'splitter': 'best', 
          'max_depth': 20, 'min_samples_split': 5, 
          'min_samples_leaf': 2}
# 모델 정의
lgbm = LGBMClassifier(**lgbm_params)
xgb = XGBClassifier(**xgb_params)
dtc = DecisionTreeClassifier(**dtc_params)

# StratifiedKFold 정의
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 가중치 범위 설정
xgb_weights = np.arange(0.5, 0.75, 0.05)
dtc_weights = np.arange(0.2, 0.05, -0.05)

for xgb_weight in xgb_weights:
    for dtc_weight in dtc_weights:
        lgbm_weight = 1.0 - xgb_weight - dtc_weight # lgbm의 가중치는 나머지로 설정
        if lgbm_weight < 0: # 가중치 합이 1을 초과하면 스킵
            continue

        # VotingClassifier 생성
        voting_clf = VotingClassifier(
            estimators=[('lgbm', lgbm), ('xgb', xgb), ('dtc', dtc)],
            voting='soft', weights=[lgbm_weight, xgb_weight, dtc_weight]
        )

        scores = []
        for train_index, valid_index in skf.split(x, y):
            X_train, X_valid = x.iloc[train_index], x.iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
            # 모델 학습
            voting_clf.fit(X_train, y_train)
            # 예측
            pred_valid = voting_clf.predict(X_valid)

            # macro f1 score 계산
            score = f1_score(y_valid, pred_valid, average='macro')
            scores.append(score)
        print("-------------------------------------------------------------------------")
        print(f"xgb_weight: {xgb_weight}, dtc_weight: {dtc_weight}, lgbm_weight: {lgbm_weight}")
        print("CV scores: ", scores)
        print("Mean CV score: ", np.mean(scores))

# xgb_weight: 0.7000000000000002, dtc_weight: 0.10000000000000003, lgbm_weight: 0.1999999999999998
# CV scores:  [0.9339200181192654, 0.9391541865231544, 0.9345281684508818, 0.9373155689289511, 0.947064406546879]
# Mean CV score:  0.9383964697138264
#파라미터 아무것도 없:  xgb_weight: 0.7, dtc_weight: 0.15000000000000002, lgbm_weight: 0.15000000000000002 # Mean CV score:  0.9396220914803772

-------------------------------------------------------------------------
xgb_weight: 0.5, dtc_weight: 0.2, lgbm_weight: 0.3
CV scores:  [0.9293257515251347, 0.9355434863669104, 0.9258601665887695, 0.9378835741063993, 0.9445981427599752]
Mean CV score:  0.9346422242694379
-------------------------------------------------------------------------
xgb_weight: 0.5, dtc_weight: 0.15000000000000002, lgbm_weight: 0.35
CV scores:  [0.9294069962123965, 0.9345575179309373, 0.9278007412742211, 0.9377079672579952, 0.9430788605517424]
Mean CV score:  0.9345104166454584
-------------------------------------------------------------------------
xgb_weight: 0.5, dtc_weight: 0.10000000000000003, lgbm_weight: 0.39999999999999997
CV scores:  [0.9304859877037381, 0.937480293172517, 0.9290380057528927, 0.9348922786799697, 0.9431502549598967]
Mean CV score:  0.9350093640538029
-------------------------------------------------------------------------
xgb_weight: 0.5, dtc_weight: 0.050000000000000044, lgbm_wei

In [59]:
# 파라미터 설정 voting CV
xgb_params = {'n_estimators': 353, 
          'learning_rate': 0.08163306443215815, 
          'reg_lambda': 0.014951498272501501, 
          'reg_alpha': 0.8023947837732857, 
          'colsample_bytree': 0.44473038620786254, 
          'subsample': 0.9921321619603104, 
          'max_depth': 8, 
          'min_child_weight': 10}

# 파라미터 설정
lgbm_params = {'num_leaves': 151, 
          'max_depth': 9, 
          'learning_rate': 0.01771521082629245, 
          'n_estimators': 199, 
          'min_child_weight': 3, 
          'min_child_samples': 8, 
          'subsample': 0.7287247706737471, 
          'colsample_bytree': 0.8848250935856582, 
          'reg_alpha': 0.5288165395694552, 
          'reg_lambda': 3.3416664166015955}

# 파라미터 설정
dtc_params = {'criterion': 'gini', 
          'splitter': 'best', 
          'max_depth': 20, 
          'min_samples_split': 5, 
          'min_samples_leaf': 2}
# 모델 정의
lgbm = LGBMClassifier(**lgbm_params)
xgb = XGBClassifier(**xgb_params)
dtc = DecisionTreeClassifier(**dtc_params)

# VotingClassifier 생성
voting_clf = VotingClassifier(
    estimators=[('lgbm', lgbm), ('xgb', xgb), ('dtc', dtc) ],
    voting='soft',  weights=[0.38,0.5, 0.12]
)

# StratifiedKFold 정의
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []


for train_index, valid_index in skf.split(x, y):
    X_train, X_valid = x.iloc[train_index], x.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    # 모델 학습
    voting_clf.fit(X_train, y_train)
    # 예측
    pred_valid = voting_clf.predict(X_valid)

    # macro f1 score 계산
    score = f1_score(y_valid, pred_valid, average='macro')
    scores.append(score)
    
print("CV scores: ", scores)
print("Mean CV score: ", np.mean(scores))

# # 0120sub Mean CV score:  0.9294106085345734
# #0120high Mean CV score:  0.9303435456241373  0.94315
# # 0120high+minmax Mean CV score:  0.8112572177934949
# Mean CV score:  0.9351709674576272 0122 파라미터 형성 weights=[0.38,0.5, 0.12]
# # # 모델 학습
# # voting_clf.fit(x, y)

# # # 예측
# # pred = voting_clf.predict(test)

CV scores:  [0.9306073296381292, 0.9374277366421012, 0.9283484681604242, 0.9357816518389159, 0.943689651008565]
Mean CV score:  0.9351709674576272


# Voting

In [109]:
# VotingClassifier 생성'

# # 파라미터 설정
# xgb_params = {'n_estimators': 353, 
#           'learning_rate': 0.08163306443215815, 
#           'reg_lambda': 0.014951498272501501, 
#           'reg_alpha': 0.8023947837732857, 
#           'colsample_bytree': 0.44473038620786254, 
#           'subsample': 0.9921321619603104, 
#           'max_depth': 8, 
#           'min_child_weight': 10}

# # 파라미터 설정
# lgbm_params = {'num_leaves': 151, 
#           'max_depth': 9, 
#           'learning_rate': 0.01771521082629245, 
#           'n_estimators': 199, 
#           'min_child_weight': 3, 
#           'min_child_samples': 8, 
#           'subsample': 0.7287247706737471, 
#           'colsample_bytree': 0.8848250935856582, 
#           'reg_alpha': 0.5288165395694552, 
#           'reg_lambda': 3.3416664166015955}

# # 파라미터 설정
# dtc_params = {'criterion': 'gini', 
#           'splitter': 'best', 
#           'max_depth': 20, 
#           'min_samples_split': 5, 
#           'min_samples_leaf': 2}

lgbm= LGBMClassifier( random_state=42)
xgb =XGBClassifier(random_state =42)
dtc = DecisionTreeClassifier(random_state=42)
voting_clf = VotingClassifier(
    estimators=[('lgbm', lgbm), ('xgb', xgb), ('dtc', dtc) ],
    voting='soft',  weights=[0.15,0.7, 0.15]
)
# 모델 학습
voting_clf.fit(x, y)

# 예측
pred = voting_clf.predict(test)

In [116]:
sample_submission['대출등급'] = pred 

# 딕셔너리 뒤집기
inv_grade_dict = {v: k for k, v in grade_dict.items()}

# 'grade' 열을 다시 알파벳으로 변경
sample_submission['대출등급'] = sample_submission['대출등급'].map(inv_grade_dict)


In [117]:
sample_submission

Unnamed: 0,ID,대출등급
0,TEST_00000,B
1,TEST_00001,B
2,TEST_00002,A
3,TEST_00003,C
4,TEST_00004,C
...,...,...
64192,TEST_64192,D
64193,TEST_64193,D
64194,TEST_64194,D
64195,TEST_64195,C


In [118]:
sample_submission.to_csv('../data/0.9 up/newfeature2_paraxgb.csv', index= False)
# votinglxd_substitute_clust_MD_woe_group_percentile_normalize_42 