# Iris Flower Classification

## Step 1. Download Iris Dataset

In [1]:
from sklearn import datasets
import pandas as pd
from pandas import DataFrame

iris = datasets.load_iris()
X = iris.data #X는 iris의 data darray
y = iris.target #y는 labels
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [2]:
#Do preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import OneHotEncoder
num_pipeline = Pipeline([
        ('imputer', Imputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])
X_prepared = num_pipeline.fit_transform(X) #정제된 X



## Step 2. Split Training and Testing Dataset

In [3]:
#training set & test set 나누기
from sklearn.model_selection import train_test_split
import numpy as np
X_train , X_test , y_train , y_test = train_test_split(X_prepared,y,test_size=0.2,random_state=42)

## Step 3. Generating Classifier Model

In [4]:
#첫번째 classification model : SGDClassifier
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(max_iter=3, tol=-np.infty,random_state=42)
sgd_clf.fit(X_train,y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=3,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=42, shuffle=True, tol=-inf,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [5]:
#두번째 classification model : RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(n_estimators=10,random_state=42)
forest_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [6]:
#세번째 classification model : k-NeighborsClassifier
#k-NeighborsClassifier : 새로운 data에 대한 k개의 근접한 이웃을 찾는 알고리즘
from sklearn.neighbors import KNeighborsClassifier
knb_clf = KNeighborsClassifier(n_neighbors=1) #1개의 가장 근접한 이웃을 찾는다
knb_clf.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=1, p=2,
           weights='uniform')

## Step 4. Get Accuracy using cross validation

In [7]:
#Get Accuracy using cross validation
from sklearn.model_selection import cross_val_score
#accuracy 평가하는 함수
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [8]:
#SGDclassifier cross_val
sgd_score=cross_val_score(sgd_clf, X_train, y_train, cv=10, scoring="accuracy")
display_scores(sgd_score)

Scores: [0.76923077 0.83333333 1.         0.91666667 0.66666667 0.83333333
 0.75       1.         0.83333333 0.90909091]
Mean: 0.851165501165501
Standard deviation: 0.10192679193198635


In [9]:
#RandomForestclassifier cross_val
forest_score=cross_val_score(forest_clf,X_train, y_train, cv=10, scoring="accuracy")
display_scores(forest_score)

Scores: [0.92307692 1.         1.         1.         0.66666667 0.75
 1.         1.         1.         0.90909091]
Mean: 0.9248834498834497
Standard deviation: 0.1146157810557605


In [10]:
#k-NeighborsClassifier cross_val
knb_score=cross_val_score(knb_clf,X_train, y_train, cv=10, scoring="accuracy")
display_scores(knb_score)

Scores: [0.92307692 1.         1.         1.         0.75       0.83333333
 1.         1.         1.         0.90909091]
Mean: 0.9415501165501166
Standard deviation: 0.08384241861847685


## Step5. Find Hyper-Parameter using GridSearch

In [11]:
#Grid Search
from sklearn.model_selection import GridSearchCV
param_grid_SGD=[
    {'max_iter':[2,3],
     'alpha':[0.0001,0.001,0.01,0.1,1]}
]
param_grid_forest=[
    {'n_estimators':[1,10,100,1000,10000],
     'max_depth':[1,3]}
]
param_grid_knb=[
    {'n_neighbors':[1,5,10,15,20,25,30,35,40,45] }
     #n_neighbors : int, optional (default = 5) -->Number of neighbors to use by default for kneighbors queries.
]

In [12]:
#Accuracy를 출력하는 함수
def getAccuracy(grid):
    cvres =grid.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(mean_score, params)
    print('##########################################################')
    print('Best hyperparameter: {}'.format(grid.best_params_))
    print('Best accuracy of hyperparameter : {}'.format(grid.best_score_))

In [13]:
#SGD Classifier의 accuracy using Grid Search
sgd_clf_1 = SGDClassifier(tol=-np.infty,random_state=42)
grid_SGD =GridSearchCV(sgd_clf_1, param_grid_SGD, scoring='accuracy', cv=10, n_jobs=-1)
grid_SGD.fit(X_train,y_train)
getAccuracy(grid_SGD)

0.8 {'alpha': 0.0001, 'max_iter': 2}
0.85 {'alpha': 0.0001, 'max_iter': 3}
0.875 {'alpha': 0.001, 'max_iter': 2}
0.85 {'alpha': 0.001, 'max_iter': 3}
0.8666666666666667 {'alpha': 0.01, 'max_iter': 2}
0.8333333333333334 {'alpha': 0.01, 'max_iter': 3}
0.8166666666666667 {'alpha': 0.1, 'max_iter': 2}
0.8166666666666667 {'alpha': 0.1, 'max_iter': 3}
0.6916666666666667 {'alpha': 1, 'max_iter': 2}
0.7 {'alpha': 1, 'max_iter': 3}
##########################################################
Best hyperparameter: {'alpha': 0.001, 'max_iter': 2}
Best accuracy of hyperparameter : 0.875




In [14]:
#RandomForest Classifier의 accuracy using Grid Search
forest_clf_1 =RandomForestClassifier(random_state=42)
grid_forest = GridSearchCV(forest_clf_1, param_grid_forest, scoring='accuracy', cv=10, n_jobs=-1)
grid_forest.fit(X_train,y_train)
getAccuracy(grid_forest)

0.6666666666666666 {'max_depth': 1, 'n_estimators': 1}
0.8 {'max_depth': 1, 'n_estimators': 10}
0.8916666666666667 {'max_depth': 1, 'n_estimators': 100}
0.9083333333333333 {'max_depth': 1, 'n_estimators': 1000}
0.9083333333333333 {'max_depth': 1, 'n_estimators': 10000}
0.9416666666666667 {'max_depth': 3, 'n_estimators': 1}
0.9333333333333333 {'max_depth': 3, 'n_estimators': 10}
0.9416666666666667 {'max_depth': 3, 'n_estimators': 100}
0.9416666666666667 {'max_depth': 3, 'n_estimators': 1000}
0.9416666666666667 {'max_depth': 3, 'n_estimators': 10000}
##########################################################
Best hyperparameter: {'max_depth': 3, 'n_estimators': 1}
Best accuracy of hyperparameter : 0.9416666666666667




In [15]:
#KNeighbors Classifier의 accuracy using Grid Search
knb_clf_1 = KNeighborsClassifier()
grid_knb =GridSearchCV(knb_clf_1, param_grid_knb, scoring='accuracy', cv=10, n_jobs=-1)
grid_knb.fit(X_train,y_train)
getAccuracy(grid_knb)

0.9416666666666667 {'n_neighbors': 1}
0.9416666666666667 {'n_neighbors': 5}
0.9416666666666667 {'n_neighbors': 10}
0.95 {'n_neighbors': 15}
0.9416666666666667 {'n_neighbors': 20}
0.925 {'n_neighbors': 25}
0.875 {'n_neighbors': 30}
0.8583333333333333 {'n_neighbors': 35}
0.8583333333333333 {'n_neighbors': 40}
0.8416666666666667 {'n_neighbors': 45}
##########################################################
Best hyperparameter: {'n_neighbors': 15}
Best accuracy of hyperparameter : 0.95




## Step6. Find Hyper-Parameter using RandomizedSearch

In [25]:
#randomize search
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
import random
param_distribs_SGD = {
        'max_iter':randint(low=1,high=20),
    }
param_distribs_forest = {
        'n_estimators': randint(low=1, high=200),
        'max_depth': randint(low=1, high=3),
    }
param_distribs_knb = {
        'n_neighbors':randint(low=1, high=95) }
     #n_neighbors : int, optional (default = 5) -->Number of neighbors to use by default for kneighbors queries.

In [22]:
#SGD Classifier의 accuracy using Randomize Search
rnd_SGD = RandomizedSearchCV(sgd_clf_1, param_distributions=param_distribs_SGD,
                                n_iter=10, cv=5, scoring='accuracy', random_state=42)
rnd_SGD.fit(X_train,y_train)
getAccuracy(rnd_SGD)

0.875 {'max_iter': 7}
0.9166666666666666 {'max_iter': 15}
0.875 {'max_iter': 11}
0.9166666666666666 {'max_iter': 8}
0.875 {'max_iter': 7}
0.9333333333333333 {'max_iter': 19}
0.875 {'max_iter': 11}
0.875 {'max_iter': 11}
0.9 {'max_iter': 4}
0.9166666666666666 {'max_iter': 8}
##########################################################
Best hyperparameter: {'max_iter': 19}
Best accuracy of hyperparameter : 0.9333333333333333




In [23]:
#RandomForest Classifier의 accuracy using Randomize Search
rnd_forest = RandomizedSearchCV(forest_clf_1, param_distributions=param_distribs_forest,
                                n_iter=10, cv=5, scoring='accuracy', random_state=42)
rnd_forest.fit(X_train,y_train)
getAccuracy(rnd_forest)

0.8583333333333333 {'max_depth': 1, 'n_estimators': 180}
0.9333333333333333 {'max_depth': 1, 'n_estimators': 15}
0.8916666666666667 {'max_depth': 1, 'n_estimators': 72}
0.8583333333333333 {'max_depth': 1, 'n_estimators': 21}
0.8583333333333333 {'max_depth': 1, 'n_estimators': 122}
0.8583333333333333 {'max_depth': 1, 'n_estimators': 75}
0.8583333333333333 {'max_depth': 1, 'n_estimators': 88}
0.8583333333333333 {'max_depth': 1, 'n_estimators': 100}
0.9333333333333333 {'max_depth': 2, 'n_estimators': 152}
0.9 {'max_depth': 1, 'n_estimators': 150}
##########################################################
Best hyperparameter: {'max_depth': 1, 'n_estimators': 15}
Best accuracy of hyperparameter : 0.9333333333333333




In [26]:
#KNeighbors Classifier의 accuracy using Randomize Search
rnd_knb = RandomizedSearchCV(knb_clf_1, param_distributions=param_distribs_knb,
                                n_iter=10, cv=5, scoring='accuracy', random_state=42)
rnd_knb.fit(X_train,y_train)
getAccuracy(rnd_knb)

0.8083333333333333 {'n_neighbors': 52}
0.4166666666666667 {'n_neighbors': 93}
0.9583333333333334 {'n_neighbors': 15}
0.6666666666666666 {'n_neighbors': 72}
0.8166666666666667 {'n_neighbors': 61}
0.9166666666666666 {'n_neighbors': 21}
0.4583333333333333 {'n_neighbors': 83}
0.43333333333333335 {'n_neighbors': 87}
0.5666666666666667 {'n_neighbors': 75}
0.5666666666666667 {'n_neighbors': 75}
##########################################################
Best hyperparameter: {'n_neighbors': 15}
Best accuracy of hyperparameter : 0.9583333333333334




## Step7. Calculate Precision and Recall

In [57]:
#Calculate Precision and Recall
from sklearn.metrics import precision_score,recall_score
def getPR(classifier):
    y_score = classifier.predict(X_test)
    precision = precision_score(y_test, y_score,average='micro')
    #Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.
    recall =  recall_score(y_test, y_score,average='micro')
    print('precision score: {}'.format(precision))
    print('recall score: {}'.format(recall))

In [58]:
getPR(sgd_clf)

precision score: 0.9666666666666667
recall score: 0.9666666666666667


In [59]:
getPR(forest_clf)

precision score: 1.0
recall score: 1.0


In [61]:
getPR(knb_clf)

precision score: 0.9666666666666667
recall score: 0.9666666666666667
