#### 머신러닝 개념(ML)

- 데이터를 기반으로 패턴을 학습하고 결과를 추론하는 알고리즘 기법
- 지도학습(Supervised Learning) 과 비지도학습(Un-Supervised Learning) , 강화학습(Reinforcement Learning)
- 지도학습   - 분류, 회귀
- 비지도학습 - 군집화(클러스터링) , 차원축소(PCA) 

#### 머신러닝 용어?
- 피처(feature) : 데이터의 일반 속성
- 레이블, 클래스, 타겟 값, 결정 값 : 정답데이터를 의미한다

In [1]:
import numpy  as np
import pandas as pd

# 시각화
import matplotlib.pyplot as plt 
%matplotlib inline

import seaborn as sns

import json 

import warnings
warnings.filterwarnings(action='ignore')

from datetime import date, datetime, timedelta
from dateutil.parser import parse


# 한글 폰트 문제 해결
import platform

from matplotlib import font_manager, rc

if platform.system() == 'Darwin':
    plt.rc('font', family='AppleGothic')
elif platform.system() == 'Windows':
    path = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=path).get_name()
    plt.rc('font', family=font_name)
else:
    print('Unknown system... sorry~~~~') 


# 차트 축 <- 음수 부호 지원
import matplotlib
matplotlib.rcParams['axes.unicode_minus'] = False


# crawling
from bs4 import BeautifulSoup
from urllib.request import urlopen , urlretrieve
from urllib.error   import HTTPError
from urllib.error   import URLError

import requests 
import re

from selenium import webdriver

from time    import sleep , time 
from random  import randint
from IPython.core.display import clear_output

# 비정형 디비 
import pymongo as mongo

# print('numpy version  - ' , np.__version__)
# print('pandas version - ' , pd.__version__)


# ml 
import sklearn
from   sklearn.datasets import load_iris

from   sklearn.model_selection import train_test_split , KFold , StratifiedKFold, cross_val_score , cross_validate , GridSearchCV
from   sklearn.tree            import DecisionTreeClassifier
from   sklearn.metrics         import accuracy_score


sklearn.__version__


'0.24.2'

In [2]:
iris = load_iris()
iris

print('type -' ,type(iris)) # 번치 타입이란 ? 
print('keys - ' , iris.keys())

type - <class 'sklearn.utils.Bunch'>
keys -  dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])


In [3]:
iris.data

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [4]:
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [5]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [6]:
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [7]:
print('feature, target을 이용해서 데이터 프레임을 만들어보자 ')

feature, target을 이용해서 데이터 프레임을 만들어보자 


In [8]:

# 연습.... 

iris_frm =pd.DataFrame(data=iris.data , columns=iris.feature_names)
iris_frm['target'] = iris.target

iris_frm

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


- Estimator : classification(분류), regression(회귀)
- 분류 알고리즘 : DecisoinTreeClassifer , RandnomForestClassifer, xxxx clasifer, GNB  ,SVM ,  
- 회귀 알고리즘 : LinerRegression, Ridge , Lasso, RandomForestRegression 




##### classfication 절차 
 - step01. 데이터 분리(학습, 테스트)
 - step02. 학습데이터를 기반으로 ML알고리즘을 적용해 학습 모델을 생성 - fit()
 - step03. 분류예측 수행(테스트 ) - predict()
 - step04. 평가(정확도, 정밀도, 재현율, 조화평균)

In [9]:
# STEP1  - 데이터세트의 분리 
# x는 데이터 y는 타겟
X_train ,X_test , y_train ,y_test =train_test_split(iris.data, 
                 iris.target,
                 test_size = 0.2 , 
                        shuffle= True  , random_state=200 ) 



In [10]:
dt_model = DecisionTreeClassifier() # 어떤 모델을 선택할지는 본인이 결정한다. 
dt_model.fit(X_train , y_train)  # 모델 생성 

DecisionTreeClassifier()

In [11]:
# predict 함수에 테스트
dt_model.predict(X_test)
y_pred = dt_model.predict(X_test)
print('answer - ' , y_test)
print('pred - ' , y_pred)

answer -  [1 2 0 0 1 2 1 1 1 2 2 0 0 0 2 0 1 0 1 2 0 1 2 0 0 0 1 2 2 1]
pred -  [1 2 0 0 1 2 1 1 1 2 2 0 0 0 2 0 1 0 1 2 0 1 2 0 0 0 1 2 2 1]


In [12]:
# 성능평가 정확도 
print('예측 정확도 - ' , accuracy_score(Y_test , Y_pred))

NameError: name 'Y_test' is not defined

In [None]:
# 데이터프레임 이용한 학습데이터 테스트 데이터 분리 후 모델 생성 및 예측평가를 진행해 본다면 ? 

# 1 데이터 세트의 분리 
iris_feacture = iris_frm.iloc[: , :-1]
iris_feacture

#2 target 
iris_target = iris_frm.iloc[: , -1]
iris_target

In [None]:
x_train  , x_test , y_test,y_train = train_test_split(iris_feacture , 
                                                      iris_target ,
                                                      test_size=0.2 ,
                                                      shuffle=True ,
                                                      random_state=200)

In [None]:
dt_model = DecisionTreeClassifier()
dt_model.fit(x_train , y_train)

###### 교차 검증 
 - 과적합을 방지학 위한 작업 overfitting 
 - 데이터편중을 막기위한 ㅏㅇ법
 - KFold 
 - 분류는 사용할 수 있지만 , 회구 x 사용 불가 연속된 숫자값 사용시


In [None]:
features = iris.data
target = iris.target

target

In [None]:
features[1]

In [None]:
fold = KFold(n_splits=5)
cv_accuracy = [] 

cv_dt_model = DecisionTreeClassifier()
fold_idx = 0 

for train_idx , test_idx in fold.split(features) : 
    X_train, X_test = features[train_idx] , features[test_idx] # x_train 피쳐 레이블 데이터 값 
    y_train , y_test = target[train_idx] , target[test_idx] # y_train target 을 별도로 설정해주어야 한다. 
    
    cv_dt_model.fit(X_train, y_train)
    fold_pred = cv_dt_model.predict(X_test)
    
    fold_idx += 1 
    
    acc = accuracy_score(y_test, fold_pred)
    print('fold set : {} , 교차검증 정확도 : {} ,학습데이터 크기 : {} , 검증데이터 크기 {}'.format(fold_idx, acc, X_train.shape[0] , X_test.shape[0]))
    cv_accuracy.append(acc)
    print()
    print()
    
print()
print('평균 검증 정확도 : ', np.mean(cv_accuracy))

- 레이블에대한 분포를 고려한 뒤 이 분포와 동일하게 학습과 검증데이터 세트를 분배하는 교차검증이 있다.
- Stratifild KFold -> 기존 KFold 의 불균형한 분포를 개선

In [None]:
fold = KFold(n_splits = 3) 

fold_idx = 0 
for train_idx , test_idx in fold.split(iris_frm) :
    target_train = iris_frm['target'].iloc[train_idx]
    target_test  = iris_frm['target'].iloc[test_idx]
    print()
    fold_idx += 1 
    
    print('교차검증 {} set '.format(fold_idx) ) 
    print('train value count - ')
    print(target_train.value_counts())
    print('test  value count - ')
    print(target_test.value_counts())
    print()
    print()

In [None]:
skf = StratifiedKFold(n_splits = 3) 

fold_idx = 0 

print('레이블 데이터 세트도 필요함!!!')

for train_idx , test_idx in skf.split(iris_frm , iris_frm['target']) :
    
    target_train = iris_frm['target'].iloc[train_idx]
    target_test  = iris_frm['target'].iloc[test_idx]
    print()
    fold_idx += 1 
    
    print('교차검증 {} set '.format(fold_idx) ) 
    print('train value count - ')
    print(target_train.value_counts())
    print('test  value count - ')
    print(target_test.value_counts())
    print()
    print()

 - cross_val_score() 교차검증을 보다 간단하게 
 - cross_valdidate(estimator , X , y ,)
 - cross_val_score vs cross_vaildate 차이점은 무엇인지 명확하게 알아둘것. ! ! 차이점은 의외로..

In [None]:
# kfold 대신 cross_Val 을 사용하면 되는건가 ? 


In [None]:
features = iris.data
target = iris.target 


dt_model = DecisionTreeClassifier(random_state=100)
scores = cross_val_score(dt_model, features , target , scoring='accuracy' ,cv=4) # x는 피쳐 y는 타겟 


print('교차 검증 정확도 - ', scores)
print('평균 검증 정확도 - ', np.mean(scores))

In [None]:
features = iris.data
target = iris.target 


dt_model = DecisionTreeClassifier(random_state=100)
scores = cross_validate(dt_model, features , target , scoring='accuracy' ,cv=5) # x는 피쳐 y는 타겟 


print('교차 검증 정확도 - ', scores)
print('평균 검증 정확도 - ', np.mean(scores['test_score']))
print('교차 검증 시간-' , scores['fit_time'])

 - GridSearchCV : 교차검증과 하이퍼 파라미터 튜닝을 한번에  # cv cross_vaildate 약어 
    
    

##### DecisionTree 에서 사용할 수 있는 파라미터

- criterion : 분할 품질을 측정하는 기능 (default : gini)
- splitter : 각 노드에서 분할을 선택하는 데 사용되는 전략 (default : best)
- max_depth : 트리의 최대 깊이 (값이 클수록 모델의 복잡도가 올라간다.)
- min_samples_split : 자식 노드를 분할하는데 필요한 최소 샘플 수 (default : 2)
- min_samples_leaf : 리프 노드에 있어야 할 최소 샘플 수 (default : 1)
- min_weight_fraction_leaf : min_sample_leaf와 같지만 가중치가 부여된 샘플 수에서의 비율
- max_features : 각 노드에서 분할에 사용할 특징의 최대 수
- random_state : 난수 seed 설정
- max_leaf_nodes : 리프 노드의 최대수
- min_impurity_decrease : 최소 불순도
- min_impurity_split : 나무 성장을 멈추기 위한 임계치
- class_weight : 클래스 가중치
- presort : 데이터 정렬 필요 여부


class sklearn.model_selection.GridSearchCV(estimator, param_grid, *, scoring=None, n_jobs=None, refit=True, cv=None, 
verbose=0, pre_dispatch='2*n_jobs', error_score=nan, return_train_score=False)[source]

In [15]:
# 하이퍼 파라미터를 딕셔너리 형태로 변수에 저장하여 여기에 저장된 데이터 만큼 돌린다. 
params = {
    'criterion' : ['gini' , 'entropy'] , 
    'splitter'  : ['best' , 'random']  ,
    'max_depth' : [1 , 2 , 3 ] , 
    'min_samples_split' : [2 , 3]
}

iris = load_iris() 

X_train , X_test , y_train , y_test = train_test_split(iris.data   , 
                                                       iris.target ,
                                                       test_size    = 0.25 ,
                                                       shuffle      = True , 
                                                       random_state = 200 )


dt_model = DecisionTreeClassifier(random_state = 100)
# gridseachcv 자체가 가장 좋은 파라미터 를 선택한다고 보면될듯 ? 
#lass sklearn.model_selection.GridSearchCV(estimator, param_grid, *, scoring=None, n_jobs=None, refit=True, cv=None, 
   # verbose=0, pre_dispatch='2*n_jobs', error_score=nan, return_train_score=False)[source]
# refit -> 가장 좋은 파라미터 설정으로 재학습 시킴
grid_tree_model = GridSearchCV(dt_model , param_grid = params , cv = 3 , refit = True)
grid_tree_model.fit(X_train , y_train)
grid_tree_model.cv_results_

{'mean_fit_time': array([0.00066471, 0.00066503, 0.00066487, 0.00099794, 0.00066503,
        0.00066479, 0.00232712, 0.00099516, 0.00066503, 0.00099715,
        0.00099492, 0.00067528, 0.00099723, 0.        , 0.00099754,
        0.00066487, 0.00033363, 0.00065541, 0.00033267, 0.00033251,
        0.0009973 , 0.        , 0.00066479, 0.00099707]),
 'std_fit_time': array([4.70021695e-04, 4.70246800e-04, 4.70528890e-04, 2.97360213e-07,
        4.70246599e-04, 4.70077860e-04, 2.61755522e-03, 3.39415177e-06,
        4.70246438e-04, 8.14490916e-04, 2.86763804e-06, 4.77613546e-04,
        1.12391596e-07, 0.00000000e+00, 8.16248375e-04, 4.70201791e-04,
        4.71819920e-04, 4.63606424e-04, 4.70471221e-04, 4.70246438e-04,
        3.37174788e-07, 0.00000000e+00, 4.70077860e-04, 3.37174788e-07]),
 'mean_score_time': array([0.00033236, 0.00033275, 0.00033251, 0.0003322 , 0.00033228,
        0.00033228, 0.00033259, 0.00033243, 0.        , 0.00033243,
        0.00033458, 0.0003225 , 0.        , 0.00

In [19]:
info_frm = pd.DataFrame(grid_tree_model.cv_results_)
info_frm

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_split,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000665,0.0004700217,0.000332,0.0004700217,gini,1,2,best,"{'criterion': 'gini', 'max_depth': 1, 'min_sam...",0.684211,0.675676,0.648649,0.669512,0.015158,17
1,0.000665,0.0004702468,0.000333,0.0004705836,gini,1,2,random,"{'criterion': 'gini', 'max_depth': 1, 'min_sam...",0.684211,0.675676,0.648649,0.669512,0.015158,17
2,0.000665,0.0004705289,0.000333,0.0004702464,gini,1,3,best,"{'criterion': 'gini', 'max_depth': 1, 'min_sam...",0.684211,0.675676,0.648649,0.669512,0.015158,17
3,0.000998,2.973602e-07,0.000332,0.0004697969,gini,1,3,random,"{'criterion': 'gini', 'max_depth': 1, 'min_sam...",0.684211,0.675676,0.648649,0.669512,0.015158,17
4,0.000665,0.0004702466,0.000332,0.0004699093,gini,2,2,best,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.868421,0.945946,0.891892,0.902086,0.03246,5
5,0.000665,0.0004700779,0.000332,0.0004699093,gini,2,2,random,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.710526,0.72973,0.675676,0.705311,0.022374,13
6,0.002327,0.002617555,0.000333,0.0004703588,gini,2,3,best,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.868421,0.945946,0.891892,0.902086,0.03246,5
7,0.000995,3.394152e-06,0.000332,0.000470134,gini,2,3,random,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.710526,0.72973,0.675676,0.705311,0.022374,13
8,0.000665,0.0004702464,0.0,0.0,gini,3,2,best,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",0.947368,0.945946,0.945946,0.94642,0.000671,1
9,0.000997,0.0008144909,0.000332,0.000470134,gini,3,2,random,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",0.815789,0.837838,0.837838,0.830488,0.010394,9


In [20]:
print('최적 파라미터 - ' , grid_tree_model.best_params_ )
print('최고 점수     - ' , grid_tree_model.best_score_ )

최적 파라미터 -  {'criterion': 'gini', 'max_depth': 3, 'min_samples_split': 2, 'splitter': 'best'}
최고 점수     -  0.9464201043148411


In [22]:
estimator = grid_tree_model.best_estimator_
estimator

DecisionTreeClassifier(max_depth=3, random_state=100)

In [None]:
y_pred = estimator.predict(X_test)

print('GridSearchCV 테스트 정확도 - ' , accuracy_score(y_test , y_pred)) 

print()

print('answer - ' , y_test)
print()
print('pred   - ' , y_pred)
