In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier

In [2]:
data = pd.read_csv(r'../data/final_data.csv')

In [3]:
X = data.iloc[:, 1:]
y = data["TARGET"]

In [4]:
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.3, stratify = y, random_state = 2021)

In [5]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = MinMaxScaler().fit(train_x)
scaled_train_x = pd.DataFrame(scaler.transform(train_x), columns=train_x.columns)

In [6]:
from imblearn.over_sampling import SMOTE

In [7]:
smote = SMOTE(k_neighbors=3, sampling_strategy=0.3, random_state=2021)
new_train_x, new_train_y = smote.fit_resample(scaled_train_x, train_y)

In [8]:
model = LGBMClassifier()

In [9]:
model.fit(train_x, train_y)

[LightGBM] [Info] Number of positive: 449, number of negative: 7000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 219481
[LightGBM] [Info] Number of data points in the train set: 7449, number of used features: 3924
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.060277 -> initscore=-2.746643
[LightGBM] [Info] Start training from score -2.746643


In [12]:
model = LGBMClassifier()
param_space = {'criterion': ['gini', 'entropy'],
               'max_depth': [3,5,7,10,15],
               'min_samples_leaf': [3,5,7,10,15]
               }

cv = StratifiedKFold(n_splits = 3)

grid_search = GridSearchCV(model, param_grid=param_space, cv=cv, refit=True, n_jobs=-1)
grid_search.fit(train_x, train_y)

[LightGBM] [Info] Number of positive: 449, number of negative: 7000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 219481
[LightGBM] [Info] Number of data points in the train set: 7449, number of used features: 3924
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.060277 -> initscore=-2.746643
[LightGBM] [Info] Start training from score -2.746643


In [None]:
recall_score(test_y,pred)

In [14]:
print('best parameters : ', grid_search.best_params_)
print('best score : ', grid_search.best_score_)
best_model = grid_search.best_estimator_
pred = best_model.predict(test_x)
accuracy_score(test_y, pred)

best_model.feature_importances_
bmfi = best_model.feature_importances_
print('bmfi : ', best_model.feature_importances_)

best parameters :  {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 5}
best score :  0.9444220700765203
bmfi :  [5 0 1 ... 0 0 6]


In [16]:
for i in best_model.feature_importances_:
    print(i)

5
0
1
0
0
0
5
11
0
14
3
23
0
0
1
0
0
1
0
1
1
0
0
0
0
0
0
0
1
1
1
0
0
3
1
4
0
0
1
1
0
0
1
0
0
0
0
0
0
1
0
1
0
1
1
0
1
0
0
0
1
0
0
0
1
0
5
2
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
2
1
0
1
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
1
1
1
2
1
0
0
0
0
0
0
0
0
4
1
0
0
0
0
1
0
0
0
1
1
1
1
1
0
0
0
0
1
0
0
1
1
0
1
0
1
0
2
1
0
0
3
0
0
0
0
1
0
3
0
0
0
0
0
0
0
0
0
2
2
0
0
0
1
0
0
0
0
0
0
1
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
1
0
1
0
4
0
0
0
0
0
0
0
0
1
0
0
0
0
1
0
0
2
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
3
0
0
1
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
2
0
1
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
2
1
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
1
0
3
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
2
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
2
0
0
0
0
0
0
0
0
0
1
1
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0

In [1]:
recall_score(test_y,pred)

NameError: name 'recall_score' is not defined

In [11]:
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Bagging 모델을 생성합니다. 여기서는 LightGBM 모델을 기반으로 Bagging을 진행합니다.
bagging_model = BaggingRegressor(base_estimator=LGBMClassifier(), n_estimators=10, random_state=42)

# Bagging 모델을 학습시킵니다.
bagging_model.fit(train_x, train_y)

# 학습된 Bagging 모델을 사용하여 테스트 데이터에 대한 예측을 수행합니다.
bagging_predictions = bagging_model.predict(test_x)

# 평가 메트릭스를 출력합니다.
mse = mean_squared_error(test_y, bagging_predictions)
r2 = r2_score(test_y, bagging_predictions)
print("Mean Squared Error (Bagging):", mse)
print("R-squared (Bagging):", r2)




[LightGBM] [Info] Number of positive: 449, number of negative: 7000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 219481
[LightGBM] [Info] Number of data points in the train set: 7449, number of used features: 3924
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.057592 -> initscore=-2.795062
[LightGBM] [Info] Start training from score -2.795062
[LightGBM] [Info] Number of positive: 449, number of negative: 7000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 219481
[LightGBM] [Info] Number of data points in the train set: 7449, number of used features: 3924
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.061082 -> initscore=-2.732510
[LightGBM] [Info] Start training from score -2.732510
[LightGBM] [Info] Number of positive: 449, number of negative: 7000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 219481
[LightGBM] [Info] Number of data points in the train set: 