In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
#평가할 머신러닝 모델을 선정
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier

In [2]:
iris = load_iris()
x = iris.data
y = iris.target

In [3]:
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=100)

In [13]:
#5번실행해서 평균 성능
from sklearn.model_selection import cross_validate

In [19]:
import numpy as np
pipeline = Pipeline([
    ('scaler',StandardScaler()),
    ('classifier',LogisticRegression())    
])
scores = cross_validate(pipeline,x_train,y_train,n_jobs=-1)
# pipeline.fit(x_train,y_train)
np.mean(scores['test_score'])

0.9470355731225297

In [11]:
pipeline.score(x_test,y_test)

0.9473684210526315

In [21]:
# 1. pipline을 이용해서 전처리부터->모델학습하는 과정까지 일괄처리
# 2. 교차검증을 통해 우연히 좋은 성능을 발위하는 모델을 방지
# 1번과 2번의 과정을 지정한 머신러닝 모델에대해 각각 적용하고 가장 최상의 모델을 선택
score_list = []
for model in [LogisticRegression(), DecisionTreeClassifier(), SGDClassifier()]:
  pipeline = Pipeline([
    ('scaler',StandardScaler()),
    ('classifier',model)    
  ])
  scores = cross_validate(pipeline,x_train,y_train,n_jobs=-1)
  m = np.mean(scores["test_score"])
  score_list.append(m)
  print(f'model : {model} mean test score : {m}')


model : LogisticRegression() mean test score : 0.9470355731225297
model : DecisionTreeClassifier() mean test score : 0.9644268774703558
model : SGDClassifier() mean test score : 0.9201581027667984


In [22]:
# 최상의 모델을 선택한 후..... 하이퍼 파라메터 튜닝을 통해 다시 최상의 점수를 획득
# 이렇게 해도 별도 낳아지거나 향상된 기미가 안보일때는
# 처음데이터 수집 및 처리과정을 다시 한번 검토(피처 엔지니어링을 통해)

# 피처 엔지니어링
# p-value, VIF 계수 확인을 통한 적절한 feature만 골라서 다시 학습.
# 도메인지식을 활용
# 트리계열 머신러닝을 먼저적용해서. feature 선택
# 범주형데이터는 one hot표현하고, 기존 컬럼을 분석해서 새로운 컬럼을 도출...나이(연속된 데이터)->범주형 데이터로 새로 생성
# 기존에 없는 모델이나 라이브러리를 각종 논문이나. 대회 및 기타등등..... 찾아내서 적용

AutoML
  - 사이킷런 계열 : TPOT

In [23]:
!pip install tpot

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tpot
  Downloading TPOT-0.12.0-py3-none-any.whl (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.4/87.4 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting deap>=1.2 (from tpot)
  Downloading deap-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.9/139.9 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting update-checker>=0.16 (from tpot)
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Collecting stopit>=1.1.1 (from tpot)
  Downloading stopit-1.1.2.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: stopit
  Building wheel for stopit (setup.py) ... [?25l[?25hdone
  Created wheel for stopit: filename=stopit-1.1.2-py3-none-any.whl size=11938

In [25]:
from tpot import TPOTClassifier
automl = TPOTClassifier(generations = 10)
automl.fit(x_train,y_train)

타 계열의 AutoML
  - pycaret

In [None]:
!pip install pycaret

In [27]:
from pycaret import classification
classification.setup(data = x_train, target = y_train)

Unnamed: 0,Description,Value
0,Session id,3117
1,Target,target
2,Target type,Multiclass
3,Original data shape,"(112, 5)"
4,Transformed data shape,"(112, 5)"
5,Transformed train set shape,"(78, 5)"
6,Transformed test set shape,"(34, 5)"
7,Numeric features,4
8,Preprocess,True
9,Imputation type,simple


<pycaret.classification.oop.ClassificationExperiment at 0x7fba00325420>

In [34]:
best = classification.compare_models(n_select=5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
qda,Quadratic Discriminant Analysis,0.95,0.99,0.95,0.9662,0.9453,0.9233,0.9344,0.061
lda,Linear Discriminant Analysis,0.95,1.0,0.95,0.9662,0.9453,0.9233,0.9344,0.107
xgboost,Extreme Gradient Boosting,0.95,0.98,0.95,0.9625,0.9477,0.9233,0.9315,0.077
lightgbm,Light Gradient Boosting Machine,0.95,0.9779,0.95,0.9673,0.9457,0.9238,0.9347,0.089
lr,Logistic Regression,0.9375,0.995,0.9375,0.9469,0.9348,0.9038,0.9101,0.152
nb,Naive Bayes,0.9375,0.9808,0.9375,0.9531,0.934,0.9038,0.9141,0.06
rf,Random Forest Classifier,0.9375,1.0,0.9375,0.9531,0.934,0.9038,0.9141,0.213
gbc,Gradient Boosting Classifier,0.9375,0.9704,0.9375,0.9531,0.934,0.9038,0.9141,0.464
dt,Decision Tree Classifier,0.925,0.9417,0.925,0.9375,0.9227,0.8852,0.8934,0.061
et,Extra Trees Classifier,0.925,0.9954,0.925,0.9475,0.9188,0.8848,0.8999,0.209


Processing:   0%|          | 0/69 [00:00<?, ?it/s]

In [39]:
best[0].predict_proba(x_test)[0], best[1].predict_proba(x_test)[0]



(array([1.05143065e-106, 1.68708584e-006, 9.99998313e-001]),
 array([2.63373853e-46, 4.00516452e-05, 9.99959948e-01]))

In [50]:

models_proba =  np.array([model.predict_proba(x_test) for model in best])
models_mean = np.mean(models_proba,axis=0)
print(models_proba.shape, models_mean.shape)
predict = np.argmax(models_mean,axis=1)
predict

(5, 38, 3) (38, 3)




array([2, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2, 1, 1, 2, 2, 2, 2, 0,
       2, 0, 1, 2, 1, 0, 1, 2, 1, 1, 1, 0, 0, 1, 0, 1])

In [52]:
!wget https://drive.google.com/u/0/uc?id=1ySNDrK4ahhGwgix5QHLaBNExu6GQFfXb&export=download

--2023-05-29 06:28:29--  https://drive.google.com/u/0/uc?id=1ySNDrK4ahhGwgix5QHLaBNExu6GQFfXb
Resolving drive.google.com (drive.google.com)... 142.251.167.102, 142.251.167.139, 142.251.167.100, ...
Connecting to drive.google.com (drive.google.com)|142.251.167.102|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://drive.google.com/uc?id=1ySNDrK4ahhGwgix5QHLaBNExu6GQFfXb [following]
--2023-05-29 06:28:29--  https://drive.google.com/uc?id=1ySNDrK4ahhGwgix5QHLaBNExu6GQFfXb
Reusing existing connection to drive.google.com:443.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘uc?id=1ySNDrK4ahhGwgix5QHLaBNExu6GQFfXb’

          uc?id=1yS     [<=>                 ]       0  --.-KB/s               uc?id=1ySNDrK4ahhGw     [ <=>                ]   2.20K  --.-KB/s    in 0s      

2023-05-29 06:28:29 (19.1 MB/s) - ‘uc?id=1ySNDrK4ahhGwgix5QHLaBNExu6GQFfXb’ saved [2250]

