In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Notebook-wide plot appearance: ggplot theme, a larger base font and an
# 8 x 6 default figure size.
plt.style.use("ggplot")
plt.rcParams.update({
    "font.size": 14,
    "figure.figsize": (8, 6),
})

In [9]:
# Load the iris table and split the label column off the features:
# X keeps the remaining columns, y holds the "Species" labels.
X = pd.read_csv("iris.csv")
y = X.pop("Species")

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; sklearn.model_selection is its replacement and exposes the same
# train_test_split / cross_val_score helpers. The module is bound to the
# historical name `cross_validation` so that the cells further down, which
# call cross_validation.cross_val_score, keep working unchanged.
try:
    from sklearn import model_selection as cross_validation
except ImportError:  # scikit-learn < 0.18 only ships the legacy module
    from sklearn import cross_validation

# Hold out 10% of the rows for testing. A fixed seed makes the shuffle, and
# therefore every score computed from this split, repeatable across kernel
# restarts (the original split changed on every run).
SPLIT_SEED = 0
x_train, x_test, y_train, y_test = cross_validation.train_test_split(
    X.values, y.values, test_size=0.1, random_state=SPLIT_SEED)

In [10]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 3-nearest-neighbours classifier on the training split. The whole
# chain is kept inside one assignment so the cell produces no output.
knn = (
    KNeighborsClassifier(n_neighbors=3)
    .fit(x_train, y_train)
)

In [11]:
# Print each test-set prediction next to the actual label, one pair per line.
predicted_labels = knn.predict(x_test)
for predicted, actual in zip(predicted_labels, y_test):
    print(predicted, actual)

virginica virginica
virginica versicolor
versicolor versicolor
versicolor versicolor
setosa setosa
setosa setosa
versicolor versicolor
setosa setosa
virginica virginica
virginica virginica
versicolor versicolor
virginica virginica
setosa setosa
versicolor versicolor
setosa setosa


In [12]:
# Score the fitted kNN model on the held-out split and print the result.
test_accuracy = knn.score(x_test, y_test)
print(test_accuracy)

0.933333333333


In [13]:
# 5-fold cross-validation of a 3-nearest-neighbours classifier on the full
# data set; report the average of the per-fold scores.
scores = cross_validation.cross_val_score(
    KNeighborsClassifier(n_neighbors=3), X, y, cv=5)
mean_score = np.mean(scores)
print(mean_score)

0.966666666667


In [14]:
# Sweep the neighbourhood size k = 1..10 and print the mean 5-fold
# cross-validation score obtained for each value.
for k in range(1, 11):
    scores = cross_validation.cross_val_score(
        KNeighborsClassifier(n_neighbors=k), X, y, cv=5)
    mean_score = np.mean(scores)
    print(k, mean_score)

1 0.96
2 0.946666666667
3 0.966666666667
4 0.973333333333
5 0.973333333333
6 0.98
7 0.98
8 0.966666666667
9 0.973333333333
10 0.98


In [15]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian naive Bayes on the training split, then print its per-class
# probability estimates for every test row.
gnb = GaussianNB()
gnb.fit(x_train, y_train)
print(gnb.predict_proba(x_test))

[[  1.48194346e-253   9.90614193e-007   9.99999009e-001]
 [  1.77850116e-148   8.03952588e-002   9.19604741e-001]
 [  1.30627216e-071   9.99981590e-001   1.84096329e-005]
 [  8.08288182e-071   9.99989339e-001   1.06611739e-005]
 [  1.00000000e+000   2.42377666e-010   3.47133799e-018]
 [  1.00000000e+000   8.98413519e-019   7.18958894e-026]
 [  5.79278247e-156   5.16741348e-002   9.48325865e-001]
 [  1.00000000e+000   1.32905415e-019   1.19662491e-026]
 [  6.94353867e-176   3.82107028e-003   9.96178930e-001]
 [  1.25705760e-170   5.59664370e-001   4.40335630e-001]
 [  1.57134276e-092   9.99923353e-001   7.66470063e-005]
 [  2.52060971e-285   1.04675035e-011   1.00000000e+000]
 [  1.00000000e+000   9.81000901e-018   3.48997992e-025]
 [  2.24013141e-081   9.99852979e-001   1.47020564e-004]
 [  1.00000000e+000   5.44656213e-013   4.78341006e-020]]


In [16]:
from sklearn.linear_model import LogisticRegression

# Multinomial logistic regression using the L-BFGS solver, fitted on the
# training split.
lr = LogisticRegression(solver="lbfgs", multi_class="multinomial")
lr.fit(x_train, y_train)
# Left as the cell's final bare expression so the notebook displays the array.
lr.predict_proba(x_test)

array([[  8.70658052e-07,   2.29085144e-02,   9.77090615e-01],
       [  2.40652124e-03,   3.91867616e-01,   6.05725863e-01],
       [  2.68654280e-02,   9.51411191e-01,   2.17233813e-02],
       [  1.74305092e-02,   9.61918007e-01,   2.06514836e-02],
       [  9.70725141e-01,   2.92745687e-02,   2.90123826e-07],
       [  9.71632735e-01,   2.83672050e-02,   5.96577559e-08],
       [  6.49643165e-04,   4.44816984e-01,   5.54533373e-01],
       [  9.80041103e-01,   1.99588650e-02,   3.16591090e-08],
       [  2.67129626e-04,   1.46158717e-01,   8.53574154e-01],
       [  7.99100491e-05,   2.08109127e-01,   7.91810962e-01],
       [  9.25891977e-03,   8.93686737e-01,   9.70543430e-02],
       [  5.13172183e-07,   1.11152660e-02,   9.88884221e-01],
       [  9.60106480e-01,   3.98933852e-02,   1.34727042e-07],
       [  1.84427284e-02,   9.49342844e-01,   3.22144277e-02],
       [  9.58294933e-01,   4.17045949e-02,   4.72135707e-07]])

In [17]:
# Candidate classifiers to compare, keyed by a short display name.
# NOTE(review): k=6 presumably comes from the k sweep earlier in the notebook
# -- confirm.
models = {
    "knn": KNeighborsClassifier(n_neighbors=6),
    "gnb": GaussianNB(),
    "lr": LogisticRegression(solver="lbfgs", multi_class="multinomial"),
}

# Print the mean 5-fold cross-validation score of each candidate.
for name in models:
    model = models[name]
    score = np.mean(cross_validation.cross_val_score(model, X, y, cv=5))
    print(name, score)

lr 0.973333333333
gnb 0.953333333333
knn 0.98
