This notebook follows the scikit-learn nearest-neighbors classification example:
https://scikit-learn.org/stable/auto_examples/neighbors/plot_classification.html

In [1]:
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

## IRIS Dataset:
- 50 samples of 3 different species of iris (150 samples total)
- Measurements: sepal length, sepal width, petal length, petal width

In [2]:
from IPython.display import IFrame

# Embed the raw UCI iris data file in a 300x200 frame to preview its contents.
IFrame(src='http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
       width=300,
       height=200)

In [3]:
# Load the raw iris file into a pandas DataFrame. header=None because the file
# has no header row, so the columns get the integer labels 0-4.
data = pd.read_csv(
    filepath_or_buffer='http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
    header=None,
)

In [4]:
data.shape

(150, 5)

In [5]:
data.head()

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [6]:
# Replace the default integer column labels with descriptive names
# (four measurements followed by the species name).
colnames = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
    'species',
]
data.columns = colnames

In [8]:
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [9]:
data.species.value_counts()

Iris-setosa        50
Iris-virginica     50
Iris-versicolor    50
Name: species, dtype: int64

#### *** Machine learning algorithms don't take names in feature or response data matrices or arrays, so we must use numbers to represent the species.

In [10]:
# Encode each species name as an integer class label in a new 'Species' column
# (capital S); the original string column 'species' is left untouched.
species_codes = {
    'Iris-setosa': 0,
    'Iris-versicolor': 1,
    'Iris-virginica': 2,
}
data['Species'] = data['species'].map(species_codes).astype(int)

In [11]:
data.Species.value_counts()

2    50
1    50
0    50
Name: Species, dtype: int64

---

# MODEL TRAINING DATA:
    X - all the data in columns sepal_length through petal_width
    y - 0s, 1s, and 2s from the Species column. Defines species. 
    
We will train a model with this data so that it can correctly predict the species of an iris from its sepal length, sepal width, petal length, and petal width measurements.

In [12]:
# Feature matrix X: the four measurement columns, one row per flower.
X = data[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
# Response vector y: the integer-encoded species label for each row of X.
y = data['Species']

In [13]:
X.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [14]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Species, dtype: int64

In [15]:
# Confirm that features and labels line up: one row of X per entry of y.
# Written as two statements on purpose: `print(a), print(b)` builds a tuple of
# the two None return values, which the notebook then echoes as `(None, None)`.
print(X.shape)
print(y.shape)

(150, 4)
(150,)


(None, None)

---
---

# 1. Nearest Neighbors Classification

Documentation: https://scikit-learn.org/stable/modules/neighbors.html#nearest-neighbors-classification

The first example we will provide is for Nearest Neighbors Classification. 

* The k-neighbors classification in KNeighborsClassifier is the most commonly used technique. The optimal choice of the value k is highly data-dependent: in general a larger k suppresses the effects of noise, but makes the classification boundaries less distinct.

In [16]:
from sklearn.neighbors import KNeighborsClassifier

In [17]:
# k = 1: each prediction is decided by the single nearest training sample.
knn = KNeighborsClassifier(n_neighbors=1)

In [18]:
knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

In [19]:
# Train on all 150 samples. fit() returns the estimator itself, which is why the
# cell echoes the classifier's repr.
knn.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

Predict the species from a set of measurements:

In [20]:
# Even a single observation is passed as a 2-D list (a list of rows). Values
# follow the training column order:
# sepal_length, sepal_width, petal_length, petal_width.
knn.predict([[3, 5, 4, 2]])

array([2])

In [21]:
knn.predict([[5, 4, 1.3, .2]])

array([0])

In [22]:
knn.predict([[6, 4, 5, 1]])

array([1])

In [23]:
knn.classes_

array([0, 1, 2])

In [24]:
# Several observations can be classified in one call: pass one row per
# observation and get back one predicted label per row.
knn.predict([[3, 5, 4, 2], [5, 4, 3, 2], [6, 4, 5, 1]])

array([2, 1, 1])

### using a different number for k neighbors

In [25]:
# Rebuild the classifier with k = 5. This rebinds `knn` to a new, unfitted
# estimator, so it has to be fit again before predicting.
knn = KNeighborsClassifier(n_neighbors=5)

In [26]:
knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [27]:
knn.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [28]:
# Same three queries as with k = 1; with k = 5 the first prediction changes
# from class 2 to class 1.
knn.predict([[3, 5, 4, 2], [5, 4, 3, 2], [6, 4, 5, 1]])

array([1, 1, 1])

In [29]:
knn.n_neighbors

5

In [30]:
# Sparse 150x150 graph linking each sample to its 5 nearest neighbors
# (150 * 5 = 750 stored entries).
knn.kneighbors_graph(X)

<150x150 sparse matrix of type '<class 'numpy.float64'>'
	with 750 stored elements in Compressed Sparse Row format>

In [31]:
?knn.kneighbors

In [32]:
# For every row of X, get the distances to and the row indices of its 5 nearest
# training samples (both arrays have shape (150, 5)). Because X is also the
# training set, a sample usually finds itself first at distance 0; exact
# duplicates in the data can appear ahead of it.
distances, indices = knn.kneighbors(X)

In [33]:
distances

array([[0.        , 0.1       , 0.14142136, 0.14142136, 0.14142136],
       [0.        , 0.14142136, 0.14142136, 0.17320508, 0.17320508],
       [0.        , 0.14142136, 0.24494897, 0.26457513, 0.26457513],
       [0.        , 0.14142136, 0.17320508, 0.2236068 , 0.24494897],
       [0.        , 0.14142136, 0.17320508, 0.17320508, 0.2236068 ],
       [0.        , 0.33166248, 0.34641016, 0.36055513, 0.37416574],
       [0.        , 0.2236068 , 0.26457513, 0.3       , 0.31622777],
       [0.        , 0.1       , 0.14142136, 0.17320508, 0.2       ],
       [0.        , 0.14142136, 0.3       , 0.31622777, 0.34641016],
       [0.        , 0.        , 0.        , 0.17320508, 0.17320508],
       [0.        , 0.1       , 0.28284271, 0.3       , 0.33166248],
       [0.        , 0.2236068 , 0.2236068 , 0.28284271, 0.3       ],
       [0.        , 0.14142136, 0.17320508, 0.17320508, 0.17320508],
       [0.        , 0.24494897, 0.31622777, 0.34641016, 0.47958315],
       [0.        , 0.41231056, 0.

In [35]:
indices

array([[  0,  17,   4,  39,  27],
       [  1,  45,  12,  34,  37],
       [  2,  47,   3,  12,   6],
       [  3,  47,  29,  30,   2],
       [  4,   0,  17,  40,   7],
       [  5,  18,  10,  48,  44],
       [  6,  47,   2,  11,  42],
       [  7,  39,  49,   0,  17],
       [  8,  38,   3,  42,  13],
       [ 34,   9,  37,   1,  30],
       [ 10,  48,  27,  36,  19],
       [ 11,  29,   7,  26,  24],
       [ 12,   1,   9,  37,  34],
       [ 13,  38,  42,   8,  47],
       [ 14,  33,  16,  15,  18],
       [ 15,  33,  14,   5,  16],
       [ 16,  10,  48,  33,  19],
       [ 17,   0,  40,   4,  39],
       [ 18,   5,  10,  48,  20],
       [ 19,  21,  46,  48,   4],
       [ 20,  31,  27,  28,  10],
       [ 21,  19,  46,  17,   4],
       [ 22,   6,   2,  40,  42],
       [ 23,  26,  43,  39,   7],
       [ 24,  11,  29,  26,  30],
       [ 25,   9,  34,  37,   1],
       [ 26,  23,  43,   7,  39],
       [ 27,  28,   0,  39,  17],
       [ 28,  27,   0,  39,  17],
       [ 29,  

In [36]:
distances[0]

array([0.        , 0.1       , 0.14142136, 0.14142136, 0.14142136])

In [37]:
indices[0]

array([ 0, 17,  4, 39, 27])

In [38]:
distances.shape

(150, 5)

In [39]:
indices.shape

(150, 5)

## Resources

- [Nearest Neighbors](http://scikit-learn.org/stable/modules/neighbors.html) (user guide), [KNeighborsClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html) (class documentation)
- [Logistic Regression](http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression) (user guide), [LogisticRegression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) (class documentation)
- [Videos from An Introduction to Statistical Learning](http://www.dataschool.io/15-hours-of-expert-machine-learning-videos/)
    - Classification Problems and K-Nearest Neighbors (Chapter 2)
    - Introduction to Classification (Chapter 4)
    - Logistic Regression and Maximum Likelihood (Chapter 4)

In [40]:
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,Species
0,5.1,3.5,1.4,0.2,Iris-setosa,0
1,4.9,3.0,1.4,0.2,Iris-setosa,0
2,4.7,3.2,1.3,0.2,Iris-setosa,0
3,4.6,3.1,1.5,0.2,Iris-setosa,0
4,5.0,3.6,1.4,0.2,Iris-setosa,0


# Unsupervised Learning Version of Nearest Neighbors:

In [42]:
from sklearn.neighbors import NearestNeighbors

In [43]:
# Unsupervised neighbor search: fit on the features only. No label vector is
# passed, so the species information is not used at all.
nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(X)

In [45]:
nbrs

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=2, p=2, radius=1.0)

In [57]:
nbrs.kneighbors_graph(X)

<150x150 sparse matrix of type '<class 'numpy.float64'>'
	with 300 stored elements in Compressed Sparse Row format>

In [58]:
# Dense view of the neighbor graph. In the rows displayed the diagonal is 1,
# i.e. each of those samples is listed among its own 2 nearest neighbors.
nbrs.kneighbors_graph(X).toarray()

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [46]:
# NOTE: this rebinds `distances` and `indices`, replacing the 5-neighbor results
# obtained earlier from knn.kneighbors(X) with the 2-neighbor results of `nbrs`.
distances, indices = nbrs.kneighbors(X)

In [None]:
knn.fit(X, y)

In [None]:
knn.predict([[3, 5, 4, 2], [5, 4, 3, 2], [6, 4, 5, 1]])

In [61]:
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,Species
0,5.1,3.5,1.4,0.2,Iris-setosa,0
1,4.9,3.0,1.4,0.2,Iris-setosa,0
2,4.7,3.2,1.3,0.2,Iris-setosa,0
3,4.6,3.1,1.5,0.2,Iris-setosa,0
4,5.0,3.6,1.4,0.2,Iris-setosa,0


In [63]:
data.loc[:, :'petal_length']

Unnamed: 0,sepal_length,sepal_width,petal_length
0,5.1,3.5,1.4
1,4.9,3.0,1.4
2,4.7,3.2,1.3
3,4.6,3.1,1.5
4,5.0,3.6,1.4
5,5.4,3.9,1.7
6,4.6,3.4,1.4
7,5.0,3.4,1.5
8,4.4,2.9,1.4
9,4.9,3.1,1.5


In [None]:
# Decision-boundary plots for k-nearest-neighbors on the iris data, adapted from
# the scikit-learn "Nearest Neighbors Classification" example.
#
# NOTE: this cell rebinds X and y to NumPy arrays holding only the first two
# iris features, replacing the DataFrame-based X / y defined earlier in the
# notebook. Re-run the earlier cells before reusing the 4-feature classifier.
#
# The example script's `print(__doc__)` line is intentionally omitted: inside a
# notebook __doc__ is the kernel's auto-generated module docstring, not a
# description of this example.

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets

n_neighbors = 15

# import some data to play with
iris = datasets.load_iris()

# we only take the first two features so the decision regions can be drawn
# in a 2-D plane
X = iris.data[:, :2]
y = iris.target

h = .02  # step size in the mesh

# Create color maps: light shades for the decision regions, bold shades for
# the training points
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# The evaluation mesh [x_min, x_max] x [y_min, y_max] depends only on X, so it
# is built once here rather than once per weighting scheme.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
mesh_points = np.c_[xx.ravel(), yy.ravel()]

for weights in ['uniform', 'distance']:
    # we create an instance of Neighbours Classifier and fit the data.
    clf = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)
    clf.fit(X, y)

    # Classify every mesh point, then restore the grid shape so the result can
    # be drawn as a color plot of the decision regions.
    Z = clf.predict(mesh_points).reshape(xx.shape)

    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold,
                edgecolor='k', s=20)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    # Label the axes with the names of the two features actually plotted.
    plt.xlabel(iris.feature_names[0])
    plt.ylabel(iris.feature_names[1])
    plt.title("3-Class classification (k = %i, weights = '%s')"
              % (n_neighbors, weights))

plt.show()