## Classification 종합실습

### 1. 환경 준비

In [1]:
#라이브러리들을 불러오자.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 전처리
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# 모델링
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import * 
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

import warnings    # used to silence warning messages
# NOTE(review): action='ignore' with no category filter hides every warning
# raised after this point, including any convergence warnings from the models
# below — confirm this is intended.
warnings.filterwarnings(action='ignore')

### 2. 데이터 준비

In [2]:
# Load the credit data set and fold Payment code 4 into code 3.
path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/credit_all.csv'
data = pd.read_csv(path)
data['Payment'] = data['Payment'].replace(4, 3)
data.head()

Unnamed: 0,Creditability,AccountBalance,Duration,Payment,Purpose,CreditAmount,Employment,SexMarital,CurrentAddress,MostValuableAsset,Age,Apartment,CreditCount,Occupation,Dependents,Telephone,ForeignWorker
0,1,3,24,2,0,1249,2,4,2,1,28,2,1,3,1,1,1
1,1,2,9,2,0,276,3,4,4,1,22,1,1,2,1,1,1
2,1,1,18,3,2,1049,2,2,4,2,21,1,1,3,1,1,1
3,1,1,24,3,1,6419,5,2,4,4,44,3,2,4,2,2,1
4,1,3,12,2,2,1424,5,2,4,1,55,2,1,4,1,2,1


In [3]:
data.shape

(1000, 17)

In [4]:
# Separate the target column (y) from the feature columns (x).
target = 'Creditability'
y = data[target]
x = data.drop(columns=[target])

In [None]:
# NA 조치

In [5]:
# One-hot encode the categorical columns, dropping the first level of each.
cat_cols = [
    'Employment', 'CurrentAddress', 'CreditCount', 'Dependents', 'Telephone',
    'AccountBalance', 'Payment', 'Purpose', 'SexMarital', 'MostValuableAsset',
    'Apartment', 'Occupation', 'ForeignWorker',
]
x = pd.get_dummies(data=x, columns=cat_cols, drop_first=True)

In [6]:
# Split into training and validation sets: 30% of the rows are held out,
# with a fixed random_state so the split is repeatable.
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.3, random_state=2022)

In [7]:
# Min-max scaling: learn the feature ranges from the training split only,
# then apply the same transformation to both splits.
scaler = MinMaxScaler().fit(x_train)
x_train_s = scaler.transform(x_train)
x_val_s = scaler.transform(x_val)

### 3. 모델링

- Logistic, DecisionTree, KNN, SVC 사용

#### (1) 로지스틱 회귀

In [8]:
# 아래 함수는 로지스틱 회귀를 위한 전진선택법 함수 입니다.
import statsmodels.api as sm

def forward_stepwise_logistic(x_train, y_train):
    """Select logistic-regression features by forward stepwise AIC search.

    Starting from an empty model, every step fits one ``sm.Logit`` model per
    remaining candidate (the already selected columns plus that candidate)
    and keeps the candidate whose model has the lowest AIC. The search stops
    as soon as the best AIC of a step is larger than the best AIC seen so
    far, or once every column has been selected.

    The models are fit on exactly the columns of ``x_train``; no intercept
    column is added by this function.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Candidate features, one column per feature. All columns must be
        convertible to float (numeric or boolean).
    y_train : array-like
        Binary target aligned with the rows of ``x_train``.

    Returns
    -------
    selected : list
        Column names in the order in which they were added.
    step_df : pandas.DataFrame
        One row per candidate model of every accepted step, with columns
        ``step`` (1-based step number), ``feature`` (list of the columns in
        that model) and ``aic``. Rows of a step are sorted by ascending AIC.
    """
    # Cast once up front: recent pandas versions emit boolean dummy columns,
    # and a frame mixing bool and numeric columns reaches statsmodels as an
    # object array, which Logit rejects.
    x_num = x_train.astype(float)

    remaining = list(x_num)   # candidates not yet in the model
    selected = []             # accepted features, in order of acceptance
    step_frames = []          # per-step result tables, concatenated at the end
    best_aic = float('inf')   # lowest AIC over all accepted steps so far

    for step in range(1, len(remaining) + 1):
        rows = {'step': [], 'feature': [], 'aic': []}

        # Try each remaining candidate on top of the current selection.
        for candidate in remaining:
            trial = selected + [candidate]
            fitted = sm.Logit(y_train, x_num[trial]).fit(disp=False)
            rows['step'].append(step)
            rows['feature'].append(trial)
            rows['aic'].append(fitted.aic)

        # Rank this step's models; row 0 is the best candidate.
        ranked = pd.DataFrame(rows).sort_values('aic').reset_index(drop=True)
        step_best = ranked['aic'].min()

        # Stop when adding any further feature makes the AIC worse.
        if best_aic < step_best:
            break
        # Comparison is False for NaN, so a NaN step never replaces the best.
        if step_best <= best_aic:
            best_aic = step_best
        step_frames.append(ranked)

        # The newly added feature is the last entry of the winning model.
        winner = ranked.loc[0, 'feature'][-1]
        remaining.remove(winner)
        selected.append(winner)

    # Concatenate once instead of growing a frame from an empty start, which
    # relies on deprecated empty-frame concat behaviour in pandas.
    if step_frames:
        step_df = pd.concat(step_frames, axis=0).reset_index(drop=True)
    else:
        step_df = pd.DataFrame({'step': [], 'feature': [], 'aic': []})

    return selected, step_df

In [9]:
# Run the forward stepwise search on the unscaled training features.
# NOTE(review): `vars` shadows the Python builtin of the same name; it is kept
# because later cells index the data with it.
vars, result = forward_stepwise_logistic(x_train, y_train)

In [10]:
# Selected features, in the order the forward search added them
vars

['AccountBalance_3',
 'Payment_3',
 'Purpose_1',
 'CreditAmount',
 'Payment_2',
 'Duration',
 'Employment_2',
 'AccountBalance_2',
 'ForeignWorker_2',
 'Purpose_3',
 'SexMarital_3',
 'CreditCount_2',
 'CurrentAddress_2',
 'MostValuableAsset_4',
 'Telephone_2',
 'Occupation_4']

In [11]:
# Baseline: logistic regression trained on all scaled features.
lr = LogisticRegression().fit(x_train_s, y_train)

# Score the validation split and print the per-class report.
pred_lr = lr.predict(x_val_s)
print(classification_report(y_true=y_val, y_pred=pred_lr))

              precision    recall  f1-score   support

           0       0.58      0.48      0.53        97
           1       0.77      0.83      0.80       203

    accuracy                           0.72       300
   macro avg       0.68      0.66      0.66       300
weighted avg       0.71      0.72      0.71       300



In [12]:
# Logistic regression using only the features chosen by forward selection.
# NOTE(review): unlike the full-feature model, this one is fit and evaluated
# on the unscaled x_train / x_val columns rather than x_train_s / x_val_s, so
# the two reports differ in more than the feature set — confirm this is
# intended.
lr_f = LogisticRegression()
lr_f.fit(x_train[vars], y_train)

pred_lr_f = lr_f.predict(x_val[vars])
print(classification_report(y_val, pred_lr_f))

              precision    recall  f1-score   support

           0       0.62      0.46      0.53        97
           1       0.77      0.86      0.81       203

    accuracy                           0.73       300
   macro avg       0.69      0.66      0.67       300
weighted avg       0.72      0.73      0.72       300



#### (2) Decision Tree

In [13]:
# Decision-tree search space: 9 depths (1-9) x 90 leaf sizes (10-99)
# = 810 parameter combinations.
params_dt = {
    'max_depth': range(1, 10),
    'min_samples_leaf': range(10, 100)
}

In [14]:
# Default-configured tree wrapped in an exhaustive 5-fold grid search over
# params_dt. No scoring argument is given, so the estimator's own default
# score is used.
# NOTE(review): no random_state is set on the tree, so results may vary
# between runs — confirm whether a fixed seed is wanted.
dt = DecisionTreeClassifier()

dt_gs = GridSearchCV(dt, params_dt, cv=5)

In [15]:
dt_gs.fit(x_train_s, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': range(1, 10),
                         'min_samples_leaf': range(10, 100)})

In [17]:
dt_gs.best_params_, dt_gs.best_score_

({'max_depth': 3, 'min_samples_leaf': 78}, 0.7428571428571429)

In [18]:
# Predict the validation split with the fitted search object and print the
# per-class precision / recall / f1 report.
pred_dt = dt_gs.predict(x_val_s)
print(classification_report(y_val, pred_dt))

              precision    recall  f1-score   support

           0       0.55      0.46      0.50        97
           1       0.76      0.82      0.79       203

    accuracy                           0.70       300
   macro avg       0.66      0.64      0.65       300
weighted avg       0.69      0.70      0.70       300



#### (3) KNN

In [19]:
# KNN search space: 14 odd neighbour counts (3, 5, ..., 29) x 2 distance
# metrics = 28 parameter combinations.
params_knn = {
    'n_neighbors': range(3, 31, 2),
    'metric': ['euclidean', 'manhattan']
}

In [20]:
# Default-configured KNN classifier wrapped in an exhaustive 5-fold grid
# search over params_knn.
knn = KNeighborsClassifier()

knn_gs = GridSearchCV(knn, params_knn, cv=5)

In [21]:
knn_gs.fit(x_train_s, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': range(3, 31, 2)})

In [22]:
knn_gs.best_params_, knn_gs.best_score_

({'metric': 'euclidean', 'n_neighbors': 5}, 0.7214285714285715)

In [23]:
# Predict the validation split with the fitted search object and print the
# per-class precision / recall / f1 report.
pred_knn = knn_gs.predict(x_val_s)
print(classification_report(y_val, pred_knn))

              precision    recall  f1-score   support

           0       0.49      0.29      0.36        97
           1       0.72      0.86      0.78       203

    accuracy                           0.67       300
   macro avg       0.60      0.57      0.57       300
weighted avg       0.64      0.67      0.65       300



#### (4) SVC

In [24]:
# SVC search space: 9 values of C (1-9) x 9 values of gamma (1-9)
# = 81 parameter combinations.
# NOTE(review): both ranges start at 1 and use integer steps only; the
# recorded best result (C=1, gamma=1) sits on the lower edge of both, so
# values below 1 may be worth adding — confirm.
params_svm = {
    'C': range(1, 10),
    'gamma': range(1, 10)
}

In [25]:
# Default-configured SVC wrapped in an exhaustive 5-fold grid search over
# params_svm.
svm = SVC()

svm_gs = GridSearchCV(svm, params_svm, cv=5)

In [26]:
svm_gs.fit(x_train_s, y_train)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': range(1, 10), 'gamma': range(1, 10)})

In [27]:
svm_gs.best_params_, svm_gs.best_score_

({'C': 1, 'gamma': 1}, 0.7142857142857143)

In [28]:
# Predict the validation split with the fitted search object and print the
# per-class precision / recall / f1 report.
pred_svm = svm_gs.predict(x_val_s)
print(classification_report(y_val, pred_svm))

              precision    recall  f1-score   support

           0       0.50      0.01      0.02        97
           1       0.68      1.00      0.81       203

    accuracy                           0.68       300
   macro avg       0.59      0.50      0.41       300
weighted avg       0.62      0.68      0.55       300

