## 성능튜닝

### 1. 환경준비

In [1]:
#라이브러리들을 불러오자.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 전처리
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# 모델링
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import * 

import warnings    # 경고메시지 제외
warnings.filterwarnings(action='ignore')

### 2. 데이터 준비

* 변수설명
    * COLLEGE : 대학 졸업여부
    * INCOME : 연수입
    * OVERAGE : 월평균 초과사용 시간(분)
    * LEFTOVER : 월평균 잔여시간비율(%)
    * HOUSE : 집값
    * HANDSET_PRICE : 스마트폰 가격
    * OVER_15MINS_CALLS_PER_MONTH : 월평균 장기통화(15분이상) 횟수
    * AVERAGE_CALL_DURATION : 평균 통화 시간
    * REPORTED_SATISFACTION : 만족도 설문조사 결과
    * REPORTED_USAGE_LEVEL : 사용도 자가진단 결과
    * CONSIDERING_CHANGE_OF_PLAN : 향후 변경계획 설문조사 결과
    * CHURN : 이탈(번호이동) 여부 (1-이탈, 0-잔류, Target 변수)

In [2]:
path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/mobile_cust_churn.csv'
data = pd.read_csv(path)
data = data.sample(5000, random_state = 2022)

# sklearn을 사용할때 y를 가변수화 할 필요는 없다.
# 반드시 필요할 때) statsmodels, tensorflow(keras)
data['CHURN'] = data['CHURN'].map({'LEAVE':1, 'STAY':0})
data.head()

Unnamed: 0,id,COLLEGE,INCOME,OVERAGE,LEFTOVER,HOUSE,HANDSET_PRICE,OVER_15MINS_CALLS_PER_MONTH,AVERAGE_CALL_DURATION,REPORTED_SATISFACTION,REPORTED_USAGE_LEVEL,CONSIDERING_CHANGE_OF_PLAN,CHURN
3178,3179,0,119512,51,31,248566,229,5,2,very_sat,very_high,considering,1
14926,14927,1,142144,192,15,774317,581,29,4,unsat,very_little,never_thought,1
15116,15117,1,142308,0,79,306426,497,1,1,sat,little,considering,0
12733,12734,1,113385,0,0,333599,819,1,6,very_unsat,very_high,considering,1
14032,14033,1,90348,209,10,637286,360,26,4,unsat,little,actively_looking_into_it,0


In [4]:
# id drop
drop_cols = ['id']
data.drop(drop_cols, axis=1, inplace=True)

In [5]:
# x, y 나누기
target = 'CHURN'
x = data.drop(target, axis=1)
y = data.loc[:, target]

In [None]:
# NA조치

In [6]:
# 가변수화
dumm_cols = ['REPORTED_SATISFACTION','REPORTED_USAGE_LEVEL','CONSIDERING_CHANGE_OF_PLAN']
x = pd.get_dummies(x, columns=dumm_cols, drop_first=True)

In [7]:
# train, valid 나누기
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.3, random_state=20)

In [8]:
# Scaling
scaler = MinMaxScaler()
x_train_s = scaler.fit_transform(x_train)
x_val_s = scaler.transform(x_val)

### 3. 선형모델 튜닝

#### Logistic Regression : 전진선택법
- 변수를 하나씩 늘려가면서 AIC를 가장 낮추는 모델 찾기

In [9]:
# 아래 함수는 로지스틱 회귀를 위한 전진선택법 함수 입니다.
import statsmodels.api as sm

def forward_stepwise_logistic(x_train, y_train):

    # 변수목록, 선택된 변수 목록, 단계별 모델과 AIC 저장소 정의
    features = list(x_train)
    selected = []
    step_df = pd.DataFrame({ 'step':[], 'feature':[],'aic':[]})

    # 
    for s in range(0, len(features)) :
        result =  { 'step':[], 'feature':[],'aic':[]}

        # 변수 목록에서 변수 한개씩 뽑아서 모델에 추가
        for f in features :
            vars = selected + [f]
            x_tr = x_train[vars]
            model = sm.Logit(y_train, x_tr).fit()
            result['step'].append(s+1)
            result['feature'].append(vars)
            result['aic'].append(model.aic)
        
        # 모델별 aic 집계
        temp = pd.DataFrame(result).sort_values('aic').reset_index(drop = True)

        # 만약 이전 aic보다 새로운 aic 가 크다면 멈추기
        if step_df['aic'].min() < temp['aic'].min() :
            break
        step_df = pd.concat([step_df, temp], axis = 0).reset_index(drop = True)

        # 선택된 변수 제거
        v = temp.loc[0,'feature'][s]
        features.remove(v)

        selected.append(v)
    
    # 선택된 변수와 step_df 결과 반환
    return selected, step_df

In [10]:
vars, result = forward_stepwise_logistic(x_train, y_train)

Optimization terminated successfully.
         Current function value: 0.693075
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.693062
         Iterations 2
Optimization terminated successfully.
         Current function value: 0.683528
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.693007
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.684909
         Iterations 2
Optimization terminated successfully.
         Current function value: 0.693042
         Iterations 2
Optimization terminated successfully.
         Current function value: 0.686899
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692565
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692806
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.693137
  

Optimization terminated successfully.
         Current function value: 0.633174
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.633137
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.633162
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.633092
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.633175
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.633193
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.633016
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.633178
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.633151
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.633158
  

In [11]:
# 선택된 변수
vars

['OVERAGE',
 'HOUSE',
 'HANDSET_PRICE',
 'LEFTOVER',
 'REPORTED_SATISFACTION_very_sat',
 'INCOME',
 'REPORTED_SATISFACTION_sat']

In [12]:
result

Unnamed: 0,step,feature,aic
0,1.0,[OVERAGE],4786.699456
1,1.0,[HOUSE],4796.363859
2,1.0,[OVER_15MINS_CALLS_PER_MONTH],4810.294604
3,1.0,[REPORTED_SATISFACTION_very_sat],4845.064834
4,1.0,[AVERAGE_CALL_DURATION],4849.951663
...,...,...,...
114,7.0,"[OVERAGE, HOUSE, HANDSET_PRICE, LEFTOVER, REPO...",4442.309949
115,7.0,"[OVERAGE, HOUSE, HANDSET_PRICE, LEFTOVER, REPO...",4442.315315
116,7.0,"[OVERAGE, HOUSE, HANDSET_PRICE, LEFTOVER, REPO...",4442.316816
117,7.0,"[OVERAGE, HOUSE, HANDSET_PRICE, LEFTOVER, REPO...",4442.448187


In [13]:
# 모델링
## 전체 변수
m1 = LogisticRegression()
m1.fit(x_train, y_train)
p1 = m1.predict(x_val)

print(accuracy_score(y_val, p1))
print(classification_report(y_val, p1))

0.6333333333333333
              precision    recall  f1-score   support

           0       0.62      0.68      0.65       738
           1       0.65      0.59      0.62       762

    accuracy                           0.63      1500
   macro avg       0.63      0.63      0.63      1500
weighted avg       0.64      0.63      0.63      1500



In [14]:
## 전진선택법 변수
m2 = LogisticRegression()
m2.fit(x_train[vars], y_train)
p2 = m2.predict(x_val[vars])

print(accuracy_score(y_val, p2))
print(classification_report(y_val, p2))

0.634
              precision    recall  f1-score   support

           0       0.62      0.68      0.65       738
           1       0.66      0.59      0.62       762

    accuracy                           0.63      1500
   macro avg       0.64      0.63      0.63      1500
weighted avg       0.64      0.63      0.63      1500



### 4. 하이퍼파라미터 튜닝

In [15]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

#### (1) Random Search

- 값의 범위를 지정한다.
- 모델 선언 (시도 횟수 지정)
- 모델링 (값의 범위 내에서 시도 횟수만큼 랜덤하게 선택해서 시도한다.
- 가장 성능이 좋은 값을 선정

In [16]:
# 값의 범위를 지정
# dictionary형태로 선언
params = {'n_neighbors' : range(1,51), 
          'metric' : ['euclidean', 'manhattan']}
params

{'n_neighbors': range(1, 51), 'metric': ['euclidean', 'manhattan']}

In [17]:
# 모델 선언
model = KNeighborsClassifier()

# Random Search 설정
model_rs = RandomizedSearchCV(model, params, cv=5, n_iter=5)

In [19]:
# 모델링
model_rs.fit(x_train_s, y_train)

RandomizedSearchCV(cv=5, estimator=KNeighborsClassifier(), n_iter=5,
                   param_distributions={'metric': ['euclidean', 'manhattan'],
                                        'n_neighbors': range(1, 51)})

In [20]:
# 튜닝 결과
model_rs.cv_results_

{'mean_fit_time': array([0.00136576, 0.00116186, 0.00087748, 0.00101352, 0.00118628]),
 'std_fit_time': array([6.58699519e-04, 4.20731435e-04, 1.46626503e-04, 1.84739786e-05,
        7.49888866e-04]),
 'mean_score_time': array([0.06402044, 0.06864343, 0.06620574, 0.06744781, 0.06602507]),
 'std_score_time': array([0.00795003, 0.0072078 , 0.00210912, 0.00189677, 0.00358689]),
 'param_n_neighbors': masked_array(data=[39, 32, 14, 29, 25],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_metric': masked_array(data=['manhattan', 'manhattan', 'manhattan', 'manhattan',
                    'euclidean'],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_neighbors': 39, 'metric': 'manhattan'},
  {'n_neighbors': 32, 'metric': 'manhattan'},
  {'n_neighbors': 14, 'metric': 'manhattan'},
  {'n_neighbors': 29, 'metric': 'manhattan'},
  {'n_neighbors': 25, 'metri

In [21]:
model_rs.cv_results_['params']

[{'n_neighbors': 39, 'metric': 'manhattan'},
 {'n_neighbors': 32, 'metric': 'manhattan'},
 {'n_neighbors': 14, 'metric': 'manhattan'},
 {'n_neighbors': 29, 'metric': 'manhattan'},
 {'n_neighbors': 25, 'metric': 'euclidean'}]

In [22]:
model_rs.cv_results_['mean_test_score']

array([0.62514286, 0.61771429, 0.59685714, 0.618     , 0.58457143])

In [23]:
# 최적의 파라미터
model_rs.best_params_

{'n_neighbors': 39, 'metric': 'manhattan'}

In [24]:
# 찾은 최적 파라미터의 성능
model_rs.best_score_

0.6251428571428571

In [25]:
# 예측 및 평가
pred = model_rs.predict(x_val_s)
print(classification_report(y_val, pred))

              precision    recall  f1-score   support

           0       0.59      0.74      0.66       738
           1       0.67      0.51      0.58       762

    accuracy                           0.62      1500
   macro avg       0.63      0.63      0.62      1500
weighted avg       0.63      0.62      0.62      1500



#### (2) Grid Search

- 값의 범위를 지정한다.
- 모델링 (값의 범위 내에서 모든 조합을 시도)
- 가장 성능이 좋은 값을 선정한다.

In [26]:
# 값의 범위를 지정
params = {'n_neighbors' : range(3, 31, 2), 
          'metric' : ['euclidean', 'manhattan']}
params

{'n_neighbors': range(3, 31, 2), 'metric': ['euclidean', 'manhattan']}

In [27]:
# 모델 선언
model = KNeighborsClassifier()

model_gs = GridSearchCV(model, params, cv=5)

In [28]:
# 모델링
model_gs.fit(x_train_s, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': range(3, 31, 2)})

In [29]:
# 튜닝 결과
model_gs.cv_results_

{'mean_fit_time': array([0.00135331, 0.00120363, 0.00099115, 0.00100484, 0.00120959,
        0.00122418, 0.00138717, 0.00162492, 0.00119658, 0.00118933,
        0.0012022 , 0.00120177, 0.00140662, 0.00118637, 0.00102091,
        0.0007906 , 0.0014154 , 0.00099721, 0.00100055, 0.00100985,
        0.00102053, 0.00102677, 0.00096154, 0.00121875, 0.00080733,
        0.00100117, 0.0009973 , 0.00102253]),
 'std_fit_time': array([4.23634256e-04, 4.19343854e-04, 1.31380536e-05, 5.37504838e-05,
        4.24210175e-04, 3.87101844e-04, 4.51628264e-04, 3.86699195e-04,
        3.99138007e-04, 4.03035042e-04, 3.67863893e-04, 3.96405763e-04,
        4.81211515e-04, 4.04750379e-04, 2.97066105e-05, 3.96766368e-04,
        4.74156920e-04, 7.41723227e-06, 2.90489729e-05, 2.43666121e-05,
        4.60883773e-05, 7.00529777e-05, 9.19844710e-05, 3.75008018e-04,
        4.04020227e-04, 7.05074325e-06, 1.50789149e-07, 5.77023007e-05]),
 'mean_score_time': array([0.06159868, 0.06662145, 0.06563096, 0.06188688, 

In [30]:
model_gs.cv_results_['params']

[{'metric': 'euclidean', 'n_neighbors': 3},
 {'metric': 'euclidean', 'n_neighbors': 5},
 {'metric': 'euclidean', 'n_neighbors': 7},
 {'metric': 'euclidean', 'n_neighbors': 9},
 {'metric': 'euclidean', 'n_neighbors': 11},
 {'metric': 'euclidean', 'n_neighbors': 13},
 {'metric': 'euclidean', 'n_neighbors': 15},
 {'metric': 'euclidean', 'n_neighbors': 17},
 {'metric': 'euclidean', 'n_neighbors': 19},
 {'metric': 'euclidean', 'n_neighbors': 21},
 {'metric': 'euclidean', 'n_neighbors': 23},
 {'metric': 'euclidean', 'n_neighbors': 25},
 {'metric': 'euclidean', 'n_neighbors': 27},
 {'metric': 'euclidean', 'n_neighbors': 29},
 {'metric': 'manhattan', 'n_neighbors': 3},
 {'metric': 'manhattan', 'n_neighbors': 5},
 {'metric': 'manhattan', 'n_neighbors': 7},
 {'metric': 'manhattan', 'n_neighbors': 9},
 {'metric': 'manhattan', 'n_neighbors': 11},
 {'metric': 'manhattan', 'n_neighbors': 13},
 {'metric': 'manhattan', 'n_neighbors': 15},
 {'metric': 'manhattan', 'n_neighbors': 17},
 {'metric': 'manha

In [31]:
model_gs.cv_results_['mean_test_score']

array([0.57885714, 0.57942857, 0.57714286, 0.57342857, 0.568     ,
       0.57457143, 0.58085714, 0.58257143, 0.57914286, 0.58057143,
       0.586     , 0.58457143, 0.57971429, 0.58028571, 0.57314286,
       0.59142857, 0.59171429, 0.59142857, 0.59828571, 0.59771429,
       0.59942857, 0.60142857, 0.60171429, 0.61057143, 0.61      ,
       0.61142857, 0.61857143, 0.618     ])

In [32]:
# 최적의 파라미터
model_gs.best_params_

{'metric': 'manhattan', 'n_neighbors': 27}

In [33]:
model_gs.best_score_

0.6185714285714285

In [34]:
# 예측 및 평가
pred = model_gs.predict(x_val_s)
print(classification_report(y_val, pred))

              precision    recall  f1-score   support

           0       0.59      0.72      0.65       738
           1       0.65      0.51      0.57       762

    accuracy                           0.61      1500
   macro avg       0.62      0.61      0.61      1500
weighted avg       0.62      0.61      0.61      1500

