In [1]:
import pandas as pd 
import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
df_train = pd.read_csv('../static/data/iris_train.csv')
df_train.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.3,1.7,0.5,0
1,6.1,3.0,4.9,1.8,2
2,5.1,3.4,1.5,0.2,0
3,5.1,3.5,1.4,0.3,0
4,5.2,3.4,1.4,0.2,0


In [3]:
X_train = df_train.iloc[:, :-1].values
y_train = df_train.iloc[:, -1].values
X_train.shape, y_train.shape

((112, 4), (112,))

In [4]:
df_test = pd.read_csv('../static/data/iris_test.csv')
df_test.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,4.8,3.0,1.4,0.3,0
1,4.6,3.6,1.0,0.2,0
2,4.8,3.0,1.4,0.1,0
3,4.7,3.2,1.6,0.2,0
4,4.9,2.4,3.3,1.0,1


In [5]:
X_test = df_test.iloc[:, :-1].values
y_test = df_test.iloc[:, -1].values
X_test.shape, y_test.shape

((38, 4), (38,))

In [6]:
scaler = MinMaxScaler()
scaler.fit(df_train.iloc[:, :-1])

MinMaxScaler()

In [7]:
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train_scaled.shape, X_test_scaled.shape

((112, 4), (38, 4))

In [8]:
joblib.dump(scaler, '../static/model/iris_scaler.pkl')

['../static/model/iris_scaler.pkl']

### 1. Logistic Regression

In [9]:
lr_clf = LogisticRegression()
lr_clf.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [17]:
params = {
    # 'C': [3, 4, 5, 6, 7]
    'C': [3, 4, 5, 6, 7, 9, 10]
}

In [18]:
grid_cv = GridSearchCV(lr_clf, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train_scaled, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9739
최적 파라미터: {'C': 4}


In [19]:
best_lr = grid_cv.best_estimator_ # best_estimator_ : 최적의 파라미터로 모델 생성
pred = best_lr.predict(X_test_scaled)
acc_lr = accuracy_score(y_test, pred)
print(f'평균 정확도 : {acc_lr:.4f}')

평균 정확도 : 0.8947


In [20]:
joblib.dump(best_lr, '../static/model/iris_lr.pkl')

['../static/model/iris_lr.pkl']

### 2.Decision Tree

In [21]:
dt_clf = DecisionTreeClassifier()
dt_clf.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': None,
 'splitter': 'best'}

In [22]:
params = {
    'max_depth': [0.9, 1, 2, 5, 8, 10],
    'min_samples_split': [1, 2, 3, 4]
}

In [23]:
grid_cv = GridSearchCV(dt_clf, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train_scaled, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 1.0000
최적 파라미터: {'max_depth': 5, 'min_samples_split': 2}


In [24]:
best_dt = grid_cv.best_estimator_ # best_estimator_ : 최적의 파라미터로 모델 생성
pred = best_dt.predict(X_test_scaled)
acc_dt = accuracy_score(y_test, pred)
print(f'평균 정확도 : {acc_dt:.4f}')

평균 정확도 : 0.8947


In [25]:
joblib.dump(best_dt, '../static/model/iris_dt.pkl')

['../static/model/iris_dt.pkl']

### 3. SVM

In [26]:
sv_clf = SVC()
sv_clf.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [38]:
params = {
    # 'C': [90, 91, 92, 93, 94, 95]
     'C': [0.9, 1, 3, 5, 10]
}

In [39]:
grid_cv = GridSearchCV(sv_clf, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train_scaled, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9826
최적 파라미터: {'C': 3}


In [40]:
best_sv = grid_cv.best_estimator_ # best_estimator_ : 최적의 파라미터로 모델 생성
pred = best_sv.predict(X_test_scaled)
acc_sv = accuracy_score(y_test, pred)
print(f'평균 정확도 : {acc_sv:.4f}')

평균 정확도 : 0.8947


In [41]:
joblib.dump(best_sv, '../static/model/iris_sv.pkl')

['../static/model/iris_sv.pkl']

### 4.랜덤 포레스트(Random Forest)

In [42]:
rf_clf = RandomForestClassifier()
rf_clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [43]:
params = {
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 13],
    'min_samples_split': [1, 3, 4, 5, 6, 7, 8, 9, 10]
}

In [44]:
grid_cv = GridSearchCV(rf_clf, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train_scaled, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9913
최적 파라미터: {'max_depth': 6, 'min_samples_split': 3}


In [45]:
best_rf = grid_cv.best_estimator_ # best_estimator_ : 최적의 파라미터로 모델 생성
pred = best_rf.predict(X_test_scaled)
acc_rf = accuracy_score(y_test, pred)
print(f'평균 정확도 : {acc_rf:.4f}')

평균 정확도 : 0.8947


In [46]:
joblib.dump(best_rf, '../static/model/iris_rf.pkl')

['../static/model/iris_rf.pkl']

### 5.K 최근접 이웃(K Nearest Neighbor)

In [47]:
kn_clf = KNeighborsClassifier()
kn_clf.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [48]:
params = {
    'n_neighbors': [1, 5, 10, 11, 12]
}

In [49]:
grid_cv = GridSearchCV(kn_clf, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train_scaled, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9739
최적 파라미터: {'n_neighbors': 1}


In [50]:
best_kn = grid_cv.best_estimator_ # best_estimator_ : 최적의 파라미터로 모델 생성
pred = best_kn.predict(X_test_scaled)
acc_kn = accuracy_score(y_test, pred)
print(f'평균 정확도 : {acc_kn:.4f}')

평균 정확도 : 0.9211


In [51]:
joblib.dump(best_kn, '../static/model/iris_kn.pkl')

['../static/model/iris_kn.pkl']

### Test

In [62]:
index = 3

In [63]:
test_data = (df_test.iloc[index, :-1].values).reshape(1,-1)
label = df_test.iloc[index, -1]
test_data, label

(array([[4.7, 3.2, 1.6, 0.2]]), 0)

In [64]:
new_scaler = joblib.load('../static/model/iris_scaler.pkl')

In [65]:
test_scaled = new_scaler.transform(test_data)

In [66]:
pred_lr = best_lr.predict(test_scaled)
pred_sv = best_sv.predict(test_scaled)
pred_dt = best_dt.predict(test_scaled)
pred_rf = best_rf.predict(test_scaled)
pred_kn = best_kn.predict(test_scaled)

In [67]:
label, pred_lr[0], pred_sv[0], pred_dt[0], pred_rf[0], pred_kn[0]

(0, 0, 0, 0, 0, 0)

In [58]:
iris_dict = {'label':label, 'pred_lr':pred_lr[0], 'area_ratio': pred_sv[0], 'per_person':pred_dt[0], 'pred_rf': pred_rf[0], 'pred_kn': pred_kn[0]}
iris_dict

{'label': 2,
 'pred_lr': 2,
 'area_ratio': 2,
 'per_person': 2,
 'pred_rf': 2,
 'pred_kn': 2}

In [59]:
species = ['Setosa', 'Versicolor', 'Virginica']

In [68]:
result = {'index':index, 'label':f'{label} ({species[label]})',
                  'pred_lr':f'{pred_lr[0]} ({species[pred_lr[0]]})', 
                  'pred_sv':f'{pred_sv[0]} ({species[pred_sv[0]]})', 
                  'pred_rf':f'{pred_rf[0]} ({species[pred_rf[0]]})'}

In [69]:
result

{'index': 3,
 'label': '0 (Setosa)',
 'pred_lr': '0 (Setosa)',
 'pred_sv': '0 (Setosa)',
 'pred_rf': '0 (Setosa)'}