In [1]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.model_selection import train_test_split
#from sklearn.metrics import accuracy_score, confusion_matrix, plot_confusion_matrix
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA

# Load the iris dataset
iris = load_iris()

# Get the dataset content
x = iris.data # the data
feature_names = iris.feature_names # the feature names
y = iris.target # the labels {0, 1, 2}
target_names = iris.target_names # the labels names
labels = list(range(len(target_names))) # a list of labels [0, 1, 2]

#pca = PCA(n_components=2)
#x = pca.fit_transform(x)

In [2]:
# Split the data
x_train_and_val, x_test, y_train_and_val, y_test = train_test_split(x, y, test_size = 15, random_state=5)
x_train, x_val, y_train, y_val = train_test_split(x_train_and_val, y_train_and_val, test_size = 15, random_state=7)

# Fit a k-nn model
n_neighbors = 5
knn = KNeighborsClassifier(n_neighbors)
knn.fit(x_train, y_train)
acc_knn = knn.score(x_val, y_val)
print('Accuracy, knn: {}'.format(acc_knn))

# Fit a logistic regression
lr = LogisticRegression()
lr.fit(x_train, y_train)
acc_lr = lr.score(x_val, y_val)
print('Accuracy, logistic regression: {}'.format(acc_lr))

# Fit a ridge classifier
rc = RidgeClassifier(alpha=0.1)
rc.fit(x_train, y_train)
acc_rc = rc.score(x_val, y_val)
print('Accuracy, ridge classifier: {}'.format(acc_rc))

# Fit a SVM classifier
svm = svm.SVC(C=1.0,kernel='rbf')
svm.fit(x_train, y_train)
acc_svm = svm.score(x_val, y_val)
print('Accuracy, SVM: {}'.format(acc_svm))

Accuracy, knn: 1.0
Accuracy, logistic regression: 1.0
Accuracy, ridge classifier: 0.8
Accuracy, SVM: 1.0


In [3]:
from sklearn.model_selection import cross_val_score

cv_scores_knn = cross_val_score(knn, x_train_and_val, y_train_and_val, cv=5)
cv_scores_lr = cross_val_score(lr, x_train_and_val, y_train_and_val, cv=5)
cv_scores_rc = cross_val_score(rc, x_train_and_val, y_train_and_val, cv=5)
cv_scores_svm = cross_val_score(svm, x_train_and_val, y_train_and_val, cv=5)

print('Accuracy, k-fold cross validation, knn: {}'.format(np.mean(cv_scores_knn)))
print('Scores for all folds: {}\n'.format(cv_scores_knn))
print('Accuracy, k-fold cross validation, logistic regression: {}'.format(np.mean(cv_scores_lr)))
print('Scores for all folds: {}\n'.format(cv_scores_lr))
print('Accuracy, k-fold cross validation, ridge classifier: {}'.format(np.mean(cv_scores_rc)))
print('Scores for all folds: {}\n'.format(cv_scores_rc))
print('Accuracy, k-fold cross validation, SVM: {}'.format(np.mean(cv_scores_svm)))
print('Scores for all folds: {}\n'.format(cv_scores_svm))

Accuracy, k-fold cross validation, knn: 0.962962962962963
Scores for all folds: [0.96296296 1.         0.92592593 1.         0.92592593]

Accuracy, k-fold cross validation, logistic regression: 0.9703703703703704
Scores for all folds: [1.         1.         0.92592593 1.         0.92592593]

Accuracy, k-fold cross validation, ridge classifier: 0.8666666666666666
Scores for all folds: [0.92592593 0.88888889 0.88888889 0.81481481 0.81481481]

Accuracy, k-fold cross validation, SVM: 0.962962962962963
Scores for all folds: [1.         0.96296296 0.92592593 1.         0.92592593]



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [4]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(x_train_and_val)
x_train_and_val_scaled = scaler.transform(x_train_and_val)
x_test_scaled = scaler.transform(x_test)

cv_scores_knn = cross_val_score(knn, x_train_and_val_scaled, y_train_and_val, cv=5)
cv_scores_lr = cross_val_score(lr, x_train_and_val_scaled, y_train_and_val, cv=5)
cv_scores_rc = cross_val_score(rc, x_train_and_val_scaled, y_train_and_val, cv=5)
cv_scores_svm = cross_val_score(svm, x_train_and_val_scaled, y_train_and_val, cv=5)

print('Accuracy, k-fold cross validation, knn: {}'.format(np.mean(cv_scores_knn)))
print('Scores for all folds: {}\n'.format(cv_scores_knn))
print('Accuracy, k-fold cross validation, logistic regression: {}'.format(np.mean(cv_scores_lr)))
print('Scores for all folds: {}\n'.format(cv_scores_lr))
print('Accuracy, k-fold cross validation, ridge classifier: {}'.format(np.mean(cv_scores_rc)))
print('Scores for all folds: {}\n'.format(cv_scores_rc))
print('Accuracy, k-fold cross validation, SVM: {}'.format(np.mean(cv_scores_svm)))
print('Scores for all folds: {}\n'.format(cv_scores_svm))

Accuracy, k-fold cross validation, knn: 0.962962962962963
Scores for all folds: [1.         1.         0.92592593 1.         0.88888889]

Accuracy, k-fold cross validation, logistic regression: 0.9777777777777779
Scores for all folds: [1.         1.         0.92592593 1.         0.96296296]

Accuracy, k-fold cross validation, ridge classifier: 0.8666666666666666
Scores for all folds: [0.92592593 0.88888889 0.88888889 0.81481481 0.81481481]

Accuracy, k-fold cross validation, SVM: 0.962962962962963
Scores for all folds: [1.         1.         0.92592593 1.         0.88888889]



In [5]:
# Tune hyperparameters
parameters_knn = {'n_neighbors':[1, 3, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]}
knn_hyp = GridSearchCV(knn, parameters_knn)
knn_result = knn_hyp.fit(x_train_and_val_scaled, y_train_and_val)
cv_scores_knn = cross_val_score(knn_result, x_train_and_val_scaled, y_train_and_val, cv=5)

parameters_rc = {'alpha':[0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}
rc_hyp = GridSearchCV(rc, parameters_rc)
rc_result = rc_hyp.fit(x_train_and_val_scaled, y_train_and_val)
cv_scores_rc = cross_val_score(rc_result, x_train_and_val_scaled, y_train_and_val, cv=5)

parameters_svm = {'kernel':('linear', 'poly', 'rbf', 'sigmoid'), 'C':[0.001, 0.01, 0.1, 1, 10, 100], 'degree':[2, 3, 4], 'gamma':('scale', 'auto')}
svm_hyp = GridSearchCV(svm, parameters_svm)
svm_result = svm_hyp.fit(x_train_and_val_scaled, y_train_and_val)
cv_scores_svm = cross_val_score(svm_result, x_train_and_val_scaled, y_train_and_val, cv=5)

print('Accuracy, k-fold cross validation, knn: {}'.format(np.mean(cv_scores_knn)))
print('Scores for all folds: {}'.format(cv_scores_knn))
print('Best parameters set found on development set: {}\n'.format(knn_result.best_params_))
print('Accuracy, k-fold cross validation, logistic regression: {}'.format(np.mean(cv_scores_lr)))
print('Scores for all folds: {}\n'.format(cv_scores_lr))
print('Accuracy, k-fold cross validation, ridge classifier: {}'.format(np.mean(cv_scores_rc)))
print('Scores for all folds: {}'.format(cv_scores_rc))
print('Best parameters set found on development set: {}\n'.format(rc_result.best_params_))
print('Accuracy, k-fold cross validation, SVM: {}'.format(np.mean(cv_scores_svm)))
print('Scores for all folds: {}'.format(cv_scores_svm))
print('Best parameters set found on development set: {}\n'.format(svm_result.best_params_))

Accuracy, k-fold cross validation, knn: 0.9703703703703702
Scores for all folds: [0.96296296 1.         0.96296296 1.         0.92592593]
Best parameters set found on development set: {'n_neighbors': 10}

Accuracy, k-fold cross validation, logistic regression: 0.9777777777777779
Scores for all folds: [1.         1.         0.92592593 1.         0.96296296]

Accuracy, k-fold cross validation, ridge classifier: 0.8518518518518519
Scores for all folds: [0.92592593 0.81481481 0.88888889 0.81481481 0.81481481]
Best parameters set found on development set: {'alpha': 1}

Accuracy, k-fold cross validation, SVM: 0.9703703703703704
Scores for all folds: [0.96296296 1.         0.92592593 1.         0.96296296]
Best parameters set found on development set: {'C': 0.1, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear'}



In [6]:
lr = LogisticRegression()
lr.fit(x_train_and_val_scaled, y_train_and_val)
acc_lr_test = lr.score(x_test_scaled, y_test)
print('Accuracy in the test set, logistic regression: {}'.format(acc_lr_test))

Accuracy in the test set, logistic regression: 0.8


In [7]:
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(estimators=[('lr',lr),('knn',knn),('svm',svm)], voting='hard')
cv_scores_voting_clf = cross_val_score(voting_clf, x_train_and_val_scaled, y_train_and_val, cv=5)
print('Accuracy, k-fold cross validation on test data, voting classifier: {}'.format(np.mean(cv_scores_voting_clf)))
print('Scores for all folds: {}'.format(cv_scores_voting_clf))

Accuracy, k-fold cross validation on test data, voting classifier: 0.9703703703703704
Scores for all folds: [1.         1.         0.92592593 1.         0.92592593]


In [8]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=3)
cv_scores_tree = cross_val_score(tree, x_train_and_val_scaled, y_train_and_val, cv=5)
print('Accuracy, k-fold cross validation on test data, voting classifier: {}'.format(np.mean(cv_scores_tree)))
print('Scores for all folds: {}'.format(cv_scores_tree))

Accuracy, k-fold cross validation on test data, voting classifier: 0.9703703703703704
Scores for all folds: [1.         1.         0.92592593 1.         0.92592593]


In [9]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16)
cv_scores_forest = cross_val_score(forest, x_train_and_val_scaled, y_train_and_val, cv=5)
print('Accuracy, k-fold cross validation on test data, voting classifier: {}'.format(np.mean(cv_scores_forest)))
print('Scores for all folds: {}'.format(cv_scores_forest))

Accuracy, k-fold cross validation on test data, voting classifier: 0.9703703703703704
Scores for all folds: [1.         1.         0.92592593 1.         0.92592593]
