In [1]:
import pandas as pd
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

In [20]:
credit = pd.read_csv('/content/creditcard.csv')
credit.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [21]:
# 결측치 있는 행 제거
credit.dropna(inplace=True)

In [22]:
# feature와 class 분류
X = credit.drop(['Class'],axis=1)
y = credit['Class']

In [23]:
# train, test 데이터 분리
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [24]:
# score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score

# LGBM
- Light라는 말 그대로 기존의 GradientBoosting보다 훨씬 빠름.
- 그리고 Grid Search가 지원되어서 최적의 하이퍼파리미터 튜닝이 가능함

- Parameter
  1. learning_rate : 학습률
  2. min_data_in_leaf : 큰 값 설정하면 깊이가 지나치게 깊어지는 걸 방지함
  3. num_leaves : 하나의 트리가 가질 수 있는 최대 리프의 수

In [25]:
lgbm = LGBMClassifier()
lgbm.fit(X_train, y_train)
y_pred = lgbm.predict(X_test)

print('LGBM F1 Score:', f1_score(y_test, y_pred))
print('LGBM AUC Score:', roc_auc_score(y_test, y_pred))
print('LGBM Recall Score:', recall_score(y_test, y_pred))

LGBM F1 Score: 0.31055900621118016
LGBM AUC Score: 0.8266257537143341
LGBM Recall Score: 0.6578947368421053


# XGboost
- Gradient Boosting과 CART(Classification and Regression tree) 기반의 Tree
- 기존의 Gradient Boosting과 다르게 Overfitting을 방지하기 위한 파라미터가 추가됨 (마치 Ridge,Lasso처럼 페널티가 부여됨)
- 다만 작은 데이터에 대해 과적합 가능성이 있음.
- Parameter
  1. min_child_weight : child에서 필요한 모든 관측치에 대한 가중치의 최소합
  2. subsample : 각 트리마다 데이터 샘플링 비율로 overfitting을 방지함
  3. n_estimators : 트리 모델의 수

In [26]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

print('XGb F1 Score:', f1_score(y_test, y_pred))
print('XGb AUC Score:', roc_auc_score(y_test, y_pred))
print('XGb Recall Score:', recall_score(y_test, y_pred))

XGb F1 Score: 0.8720379146919429
XGb AUC Score: 0.903469288686513
XGb Recall Score: 0.8070175438596491


# DecisionTree
- 나무가 가지를 치는 것처럼 학습을 통해 트리 기반의 분류 규칙을 만드는 모델
- 서브 트리가 추가될수록 과적합 가능성이 높아짐
- Parameters
  1. min_samples_split: 해당 노드가 가지고 있는 최소한의 샘플 갯수
  2. max_features : 최적의 분할을 위해 고려할 최대 feature 갯수

In [27]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

print('DT F1 Score:', f1_score(y_test, y_pred))
print('DT AUC Score:', roc_auc_score(y_test, y_pred))
print('DT Recall Score:', recall_score(y_test, y_pred))

DT F1 Score: 0.8050847457627118
DT AUC Score: 0.9164534571527844
DT Recall Score: 0.8333333333333334


# Voting Classifier로 LightGBM과 XGboost를 함께 사용해보기
- hard voting

In [28]:
 from sklearn.ensemble import VotingClassifier

In [29]:
clf1 = LGBMClassifier()
clf2 = XGBClassifier()
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2)],voting='hard')

eclf.fit(X_train, y_train)
y_pred = eclf.predict(X_test)

print('HardVoting (lgbm xgb) F1 Score:', f1_score(y_test, y_pred))
print('HardVoting (lgbm xgb) AUC Score:', roc_auc_score(y_test, y_pred))
print('HardVoting (lgbm xgb) Recall Score:', recall_score(y_test, y_pred))

HardVoting (lgbm xgb) F1 Score: 0.6892655367231639
HardVoting (lgbm xgb) AUC Score: 0.7675280663517983
HardVoting (lgbm xgb) Recall Score: 0.5350877192982456
