## Iris Dataset

In [4]:
from sklearn.datasets import load_iris
# Load the Iris dataset; later cells read its .data, .target, .target_names and .feature_names attributes.
iris = load_iris()

In [16]:
# Quick look at the dataset: class labels, feature names, then the shape of the feature matrix
for dataset_attr in (iris.target_names, iris.feature_names, iris.data.shape):
    print(dataset_attr)

['setosa' 'versicolor' 'virginica']
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
(150, 4)


In [9]:
# Features: only the first two columns of the data matrix; labels: the class targets
X, y = iris.data[:, :2], iris.target

### Train and test

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Hold out a quarter of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

In [13]:
# Report the shape of each piece of the split, features first and then labels
for split_array in (X_train, X_test, y_train, y_test):
    print(split_array.shape)

(112, 2)
(38, 2)
(112,)
(38,)


## Using SVM

In [17]:
from sklearn.svm import SVC

  return f(*args, **kwds)


In [20]:
# Build a linear-kernel SVC and train it on the training split in one step.
# The bare name on the last line makes the notebook display the fitted estimator.

clf = SVC(random_state=1, kernel='linear').fit(X_train, y_train)
clf

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=1,
    shrinking=True, tol=0.001, verbose=False)

In [21]:
# Predict class labels for the held-out test features with the fitted model

y_pred = clf.predict(X_test)

In [22]:
# Fraction of test samples whose predicted label matches the true label

from sklearn.metrics import accuracy_score
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7631578947368421

## Using Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression

In [24]:
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [25]:
# Same inputs as before: the first two feature columns and the class targets
X, y = iris.data[:, :2], iris.target

In [26]:
# 75/25 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

In [27]:
# Fit a logistic regression classifier on the training split; only the seed is set explicitly.
# The bare name on the last line makes the notebook display the fitted estimator.

clf = LogisticRegression(random_state=1).fit(X_train, y_train)
clf



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=1, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [29]:
# Predict class labels for the held-out test features with the fitted logistic regression model

y_pred = clf.predict(X_test)

In [30]:
# Fraction of test samples the logistic regression model labels correctly

accuracy_score(y_true=y_test, y_pred=y_pred)

0.6052631578947368

## Using cross-validation to choose between models

In [32]:
# imports and load the iris dataset

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [33]:
#load the classifying models

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [34]:
# Reload the dataset, then take the first two feature columns as inputs and the class targets as labels
iris = load_iris()
X, y = iris.data[:, :2], iris.target

In [37]:
# Hold out 25% of the samples as the final test set (seeded for a repeatable split)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=7
)

In [38]:
# Carve a second hold-out subset from the training data; the original test set is left untouched for now

from sklearn.model_selection import train_test_split
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    X_train, y_train, test_size=0.25, random_state=7
)

In [39]:
# Linear-kernel SVC trained on the reduced training subset.
# The bare name on the last line makes the notebook display the fitted estimator.

svc_clf = SVC(random_state=7, kernel='linear').fit(X_train_2, y_train_2)
svc_clf

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=7,
    shrinking=True, tol=0.001, verbose=False)

In [40]:
# Logistic regression trained on the same reduced training subset as the SVC.
# The bare name on the last line makes the notebook display the fitted estimator.

lr_clf = LogisticRegression(random_state=7).fit(X_train_2, y_train_2)
lr_clf



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=7, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [42]:
# Predict on the second hold-out subset with both models, then report each model's accuracy

svc_pred = svc_clf.predict(X_test_2)
lr_pred = lr_clf.predict(X_test_2)

for metric_label, model_pred in (
    ('Accuracy score of SVC: ', svc_pred),
    ('Accuracy score of LR: ', lr_pred),
):
    print(metric_label, accuracy_score(y_test_2, model_pred))

Accuracy score of SVC:  0.8571428571428571
Accuracy score of LR:  0.7142857142857143


In [43]:
# The SVC scored higher on the second hold-out subset; now score it on the original test data,
# which has not been used so far.

svc_test_pred = svc_clf.predict(X_test)
print('Accuracy of SVC on original test data: ', accuracy_score(y_test, svc_test_pred))

Accuracy of SVC on original test data:  0.6842105263157895


In [44]:
from sklearn.model_selection import cross_val_score

In [45]:
# Cross-validate the SVC on the full training split using four folds; one score per fold

svc_scores = cross_val_score(estimator=svc_clf, X=X_train, y=y_train, cv=4)
svc_scores

array([0.82758621, 0.85714286, 0.92857143, 0.77777778])

In [46]:
# Summarise the SVC fold scores: mean first, then standard deviation
for stat_label, stat_value in (
    ('Average SVC scores: ', svc_scores.mean()),
    ('Standard Deviation of SVC Scores: ', svc_scores.std()),
):
    print(stat_label, stat_value)

Average SVC scores:  0.8477695675971537
Standard Deviation of SVC Scores:  0.054596286469649064


In [49]:
# Same four-fold cross-validation on the training split, this time for logistic regression
lr_scores = cross_val_score(estimator=lr_clf, X=X_train, y=y_train, cv=4)
lr_scores



array([0.82758621, 0.75      , 0.71428571, 0.7037037 ])

In [50]:
# Summarise the logistic regression fold scores: mean first, then standard deviation
for stat_label, stat_value in (
    ('Average LR scores: ', lr_scores.mean()),
    ('Standard Deviation of LR Scores: ', lr_scores.std()),
):
    print(stat_label, stat_value)

Average LR scores:  0.7488939062214924
Standard Deviation of LR Scores:  0.048563316869946194


In [51]:
# This confirms our selection of SVC over logistic regression.

### Using kNN classifier

In [52]:
from sklearn.neighbors import KNeighborsClassifier
# New split for the kNN experiments, stratified on the class labels and seeded for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=0
)

In [53]:
# Two kNN classifiers that differ only in the number of neighbours consulted (3 and 5)

knn_3_clf, knn_5_clf = (KNeighborsClassifier(n_neighbors=k) for k in (3, 5))

In [54]:
# Score both kNN models with 10-fold cross-validation, a commonly used number of folds

knn_3_scores, knn_5_scores = (
    cross_val_score(knn_model, X_train, y_train, cv=10)
    for knn_model in (knn_3_clf, knn_5_clf)
)

In [55]:
# Print the mean and standard deviation of the fold scores for each kNN model

for model_tag, fold_scores in (('knn_3', knn_3_scores), ('knn_5', knn_5_scores)):
    print(f'{model_tag} mean scores: ', fold_scores.mean(), f'{model_tag} std:', fold_scores.std())

knn_3 mean scores:  0.7983333333333333 knn_3 std: 0.09081421817216852
knn_5 mean scores:  0.8066666666666666 knn_5 std: 0.05593205754956987


In [57]:
# Both kNN models reach a similar mean accuracy, but n_neighbors=5 has the lower standard deviation across folds, so it is a bit more stable.
# Choosing between such settings is an example of hyperparameter optimization.

In [58]:
# Sweep n_neighbors over 3..8, record each model's mean 10-fold CV accuracy,
# then list the (n_neighbors, mean accuracy) pairs from best to worst.

all_scores = []
for n_neighbors in range(3, 9):
    knn_clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    mean_accuracy = cross_val_score(knn_clf, X_train, y_train, cv=10).mean()
    all_scores.append((n_neighbors, mean_accuracy))
sorted(all_scores, key=lambda pair: pair[1], reverse=True)

[(4, 0.8511111111111112),
 (7, 0.8261111111111111),
 (6, 0.8233333333333335),
 (5, 0.8066666666666666),
 (3, 0.7983333333333333),
 (8, 0.7983333333333333)]

In [60]:
# The output suggests that n_neighbors=4 is a good choice: it has the highest mean cross-validation accuracy of the values tried (3 through 8).