In [None]:
%pylab inline
# Default figure size (width, height) used by every plot in this notebook.
rcParams['figure.figsize'] = (10, 4)

## Machine Learning

[Scikit Learn](http://scikit-learn.org/stable/index.html)

In [None]:
from sklearn.datasets import load_iris

# most examples here are based on examples from the sklearn docs

In [None]:
# Load the iris dataset that ships with scikit-learn.
data = load_iris()

In [None]:
# Check what kind of object load_iris() returned.
type(data)

In [None]:
# Display the whole object to see everything it holds.
data

In [None]:
# Same check as before: the container type of the dataset.
type(data)

In [None]:
print(data['DESCR']) # DESCR: the text header that describes the data


- [linearly separable](https://en.wikipedia.org/wiki/Linear_separability)
- [perceptron](https://en.wikipedia.org/wiki/Perceptron)

In [None]:
# Shape of the 'data' entry, reached by key.
data['data'].shape

In [None]:
# Container type again, as a lead-in to attribute-style access.
type(data)

For sklearn "Bunches", you can use key names as members of the object too:

In [None]:
# The 'data' entry again, this time reached as an attribute.
data.data.shape

In [None]:
data.data.shape, data['data'].shape # attribute access and key access give the same thing

In [None]:
# Names of the features stored in the columns of data.data.
data.feature_names

In [None]:
data.target # the class label of each sample

In [None]:
# Shape of the class-label array.
data.target.shape

In [None]:
# Species names; the position of a name in this array is its class label.
data.target_names # 'setosa'==0, 'versicolor'==1, 'virginica'==2

To find a set:

In [None]:
# Row indices of the samples whose class label is 'virginica'.
virginicas = argwhere(
    data.target == list(data.target_names).index('virginica')
).ravel()
print(virginicas)

In [None]:
# give me the index of the first occurrence of 'virginica' in the list of data.target_names
# (that index is the numeric class label used in data.target)
list(data.target_names).index('virginica')

In [None]:
# Boolean mask: True wherever the class label is 2.
data.target == 2

In [None]:
# argwhere gives one row of indices per True entry; [:,0] keeps the first index column as a flat array.
argwhere(data.target == 2)[:,0]

In [None]:
# The pieces from the previous cells combined: look up the label for
# 'virginica', compare it with every target, and keep the matching row indices.
virginicas = argwhere(
    data.target == list(data.target_names).index('virginica')
).ravel()
print(virginicas)

In [None]:
# Pick a feature by column index and show its name.
feature = 1
data.feature_names[feature]

In [None]:
# Indexing with the array of row indices selects just the virginica samples.
data.data[virginicas].shape

In [None]:
# All feature values of the virginica samples.
data.data[virginicas]

In [None]:
data.data[virginicas][:,1].mean() # mean of column 1, 'sepal width (cm)', over the virginica samples

In [None]:
data.data[virginicas][:,1].var() # variance of column 1, 'sepal width (cm)', over the virginica samples

In [None]:
stem(data.data[virginicas][:,1]) # 'sepal width (cm)'
pass  # end on a statement so the cell does not echo stem()'s return value

In [None]:
# Row indices of the setosa samples.
setosas = argwhere(data.target == list(data.target_names).index('setosa')).ravel()

# Column 1 is 'sepal width (cm)'; one line per species, samples in dataset order.
plot(data.data[virginicas, 1], label="virginicas")
plot(data.data[setosas, 1], label="setosas")
legend();

Do you see a trend? . . . . . . . . . . . . . These are similar. They are not totally different, but they are not exactly the same either. Also, there's no special relationship between the 10th virginica and the 10th setosa, so plotting them this way is, perhaps, misleading... or, at least, not ideal.

In [None]:
# Row indices of the setosa samples.
setosas = argwhere(data.target == list(data.target_names).index('setosa')).ravel()

# Column 1 is 'sepal width (cm)'. Sorting each species first removes the
# meaningless sample order, so the curves compare the value distributions.
plot(sort(data.data[virginicas, 1]), label="virginicas")
plot(sort(data.data[setosas, 1]), label="setosas")
legend();

In [None]:
# Row indices of the versicolor samples.
versicolors = argwhere(data.target == list(data.target_names).index('versicolor')).ravel()

# Sorted values of column 1 for each of the three species.
plot(sort(data.data[virginicas, 1]), label='virginica')
plot(sort(data.data[setosas, 1]), label='setosa')
plot(sort(data.data[versicolors, 1]), label='versicolor')

title("Feature: " + data.feature_names[1])
legend();

In [None]:
# Same comparison for another feature column; change the index to explore others.
feature = 3

plot(sort(data.data[virginicas, feature]), label='virginica')
plot(sort(data.data[setosas, feature]), label='setosa')
plot(sort(data.data[versicolors, feature]), label='versicolor')

title("Feature: " + data.feature_names[feature])
legend();

Each seems somewhat correlated, but we cannot be sure.

Taking the first two features for all samples:

In [None]:
from matplotlib.colors import ListedColormap
# Three-colour map; used below with c=<class labels> to give each class its own colour.
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF']) # R, G, B

In [None]:
# Scatter the first two feature columns against each other.
# c=data.target gives every sample its class label, and cmap=cmap_bold
# turns that label into a colour.
scatter(*data.data[:, :2].T, c=data.target, cmap=cmap_bold)
xlabel(data.feature_names[0])
ylabel(data.feature_names[1]);

Red seems linearly separable, but remember that we're still just looking at 2 of the 4 features!

# Baseline Classifiers

http://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html#sklearn.dummy.DummyClassifier

In [None]:
from sklearn.dummy import DummyClassifier
# Baseline classifier to compare the real methods against.
clf = DummyClassifier(strategy='uniform') # random classification..

In [None]:
# Training set (all the samples, but just the first two features)
X = data.data[:,:2]
# Known class label of every sample
y = data.target

In [None]:
# Sanity check on the training arrays before fitting.
X.shape, y.shape

In [None]:
# training step, given training data which has already been classified.
# X holds the feature values, y the known class of each sample.
clf.fit(X, y) 

In [None]:
# Reminder of the feature order; X uses the first two of these.
data.feature_names

In [None]:
# Ask for the class of one new sample, given as a 2-D array with one row of two feature values.
clf.predict(array([[7.2, 2.5]]))

In [None]:
# Same query again: with the random 'uniform' strategy the answer may differ between calls.
clf.predict(array([[7.2, 2.5]]))

### Steps for Machine Learning...
0. have data
1. create classifier
2. train on data (aka 'fit')
3. classify new data (aka 'predict')
4. compare to a baseline

`Dummy` is a random classifier to compare your method with.

# k-Nearest Neighbor Classifiers

In [None]:
from sklearn import neighbors

In [None]:
# Training set: the first two features of every sample, plus the known classes.
X = data.data[:, :2]
y = data.target

In [None]:
# Number of nearest training samples that vote on each prediction.
n_neighbors = 15
clf = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
clf.fit(X, y)

In [None]:
# Class predicted for a new sample with the first two features equal to 7.2 and 2.5.
clf.predict(array([[7.2, 2.5]]))

In [None]:
# A second query point, with the first two features equal to 5.0 and 3.5.
clf.predict(array([[5.0, 3.5]]))

In [None]:
# All samples on the first two features, coloured by class.
scatter(data.data[:, 0], data.data[:, 1], c=data.target, cmap=cmap_bold)
xlabel(data.feature_names[0])
ylabel(data.feature_names[1])

# show where we asked for predictions...
# zip regroups the two (x, y) points into (x values), (y values); the * passes
# those two tuples as scatter's x and y arguments.
scatter(*zip((7.2, 2.5),(5.0, 3.5)), c='purple', marker='x', lw=8)
pass

# * https://stackoverflow.com/questions/2921847/what-does-the-star-operator-mean

Showing the decision boundary

In [None]:
h = .02  # step size in the mesh

# Extend the range of the first two features by 1 on each side so the mesh
# reaches past the outermost samples.
x_min, y_min = X[:, :2].min(axis=0) - 1
x_max, y_max = X[:, :2].max(axis=0) + 1

# xx holds the x coordinate and yy the y coordinate of every mesh point.
xs = np.arange(x_min, x_max, h)
ys = np.arange(y_min, y_max, h)
xx, yy = np.meshgrid(xs, ys)

xx

In [None]:
# meshgrid(range, range) is like a 2-D linspace

Now get the prediction for each point in the mesh

In [None]:
# r_ is by row, c_ is by column
# ravel? https://docs.scipy.org/doc/numpy-1.12.0/reference/generated/numpy.ravel.html

In [None]:
# Flatten the mesh into one (x, y) row per point, predict a class for each
# point, then fold the predictions back into the shape of the mesh.
Z = clf.predict(c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

# Draw the predicted class of every mesh point as a colour.
pcolormesh(xx, yy, Z, cmap=cmap_bold);

The number of neighbors affects the classification boundaries

In [None]:
# Values of k to compare, one subplot each.
n_neighbors = [5, 10, 15, 50]

X = data.data[:, :2]  # first two features only, so the regions can be drawn in 2-D
y = data.target
h = .02  # step size in the mesh

# The mesh covers [x_min, x_max] x [y_min, y_max]. It depends only on X and h,
# not on k, so it is built once here rather than once per classifier.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
mesh_points = np.c_[xx.ravel(), yy.ravel()]  # one (x, y) row per mesh point

# sp is the 1-based subplot position; the number of panels follows the list
# length, so adding or removing a k value needs no other change.
for sp, n in enumerate(n_neighbors, start=1):
    # Create a nearest-neighbours classifier for this k and fit the data.
    clf = neighbors.KNeighborsClassifier(n, weights='distance')
    clf.fit(X, y)

    # Predict a class for every mesh point and put the result back in mesh shape.
    Z = clf.predict(mesh_points).reshape(xx.shape)

    # Plot the decision regions: one colour per predicted class.
    subplot(1, len(n_neighbors), sp)
    pcolormesh(xx, yy, Z, cmap=cmap_bold)

    xlim(xx.min(), xx.max())
    ylim(yy.min(), yy.max())
    title("k = %i" % n)

Distance can be measured in many ways!

http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.DistanceMetric.html

In [None]:
n_neighbors = 5

X = data.data[:, :2]  # first two features only, so the regions can be drawn in 2-D
y = data.target
h = .02  # step size in the mesh

# The mesh covers [x_min, x_max] x [y_min, y_max]. It does not depend on the
# weighting scheme, so it is built once here instead of inside the loop.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
mesh_points = np.c_[xx.ravel(), yy.ravel()]  # one (x, y) row per mesh point

# One figure per weighting scheme.
for weights in ['uniform', 'distance']:
    # Create a nearest-neighbours classifier with this weighting and fit the data.
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(X, y)

    # Predict a class for every mesh point and put the result back in mesh shape.
    Z = clf.predict(mesh_points).reshape(xx.shape)

    # Plot the decision regions: one colour per predicted class.
    figure()
    pcolormesh(xx, yy, Z, cmap=cmap_bold)

    # Plot also the training points. They share the colormap of the regions,
    # so a black edge keeps correctly classified points visible.
    scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolors='k')
    xlim(xx.min(), xx.max())
    ylim(yy.min(), yy.max())
    title("3-Class classification (k = %i, weights = '%s')"
             % (n_neighbors, weights))

How do I ask for a prediction using 4 features?

In [None]:
n_neighbors = 15

X = data.data[:, :]  # this time keep every feature column (the [:, :]
                      # slice selects all rows and all columns)
y = data.target
clf = neighbors.KNeighborsClassifier(n_neighbors, weights='uniform')
clf.fit(X, y)

In [None]:
# A query must now give one value per feature the classifier was fitted on: four numbers per sample.
clf.predict([[7.2, 2.5, 3.0, 3.0]])

In [None]:
# Same first two values as before but different last two, to see whether the prediction changes.
clf.predict([[7.2, 2.5, 5.0, 2.4]])

In [None]:
data.data[120, :] # feature values of one known sample, for comparison; class == 2

To show a plot of the space of all 4 features would be difficult. We cannot directly visualize it.

# Generative probabilistic approaches

## Naive Bayes Classifier

In [None]:
# Back to the first two features, so predictions can be drawn on the 2-D mesh (xx, yy) again.
X = data.data[:, :2]

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
c_[xx.ravel(), yy.ravel()] # c_ puts the flattened x and y coordinates side by side: one (x, y) row per mesh point. Swap in r_ to compare.

In [None]:
clf = GaussianNB()

clf.fit(X, y) # supervised learning: the known classes y are used

# Predict a class for every mesh point and draw the predictions as colours.
Z = clf.predict(c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
pcolormesh(xx, yy, Z, cmap=cmap_bold)

# Overlay the training samples. They share the colormap of the regions, so a
# black edge keeps correctly classified points visible.
scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolors='k')
xlim(xx.min(), xx.max())
ylim(yy.min(), yy.max())
pass  # end on a statement so the cell does not echo ylim()'s return value

In [None]:
from sklearn.mixture import GaussianMixture

This assumes that the data takes the form of superpositions (piles) of Gaussian distributions... which may be a bad assumption. In fact, each of these techniques makes assumptions which may be terrible. 

In [None]:
# n_components is the number of clusters to look for (here: the number of classes).
# random_state fixes the initialisation so that re-running the cell gives the
# same picture; change the seed to see a different outcome.
clf = GaussianMixture(n_components=3, random_state=0)

clf.fit(X) # no y... it ignores the given classes. you just tell it the number of classes (unsupervised).

# Predict a component for every mesh point and draw the predictions as colours.
Z = clf.predict(c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
pcolormesh(xx, yy, Z, cmap=cmap_bold)

# Overlay the samples coloured by their TRUE class, with a black edge so they
# stay visible on same-coloured regions. The component numbers found by the
# mixture are arbitrary, so a region's colour need not match the colour of the
# class it covers.
scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolors='k')
xlim(xx.min(), xx.max())
ylim(yy.min(), yy.max())
pass  # end on a statement so the cell does not echo ylim()'s return value

# Decision Hyperplanes

http://scikit-learn.org/stable/modules/classes.html#module-sklearn.svm

In [None]:
from sklearn.svm import SVC # Support Vector Classification

In [None]:
# Support vector classifier restricted to a linear kernel.
clf = SVC(kernel='linear')

In [None]:
clf.fit(X, y) # supervised: trained on the features X together with the known classes y

In [None]:
# Predict a class for every mesh point and draw the predictions as colours.
Z = clf.predict(c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
pcolormesh(xx, yy, Z, cmap=cmap_bold)

# Overlay the training samples. They share the colormap of the regions, so a
# black edge keeps correctly classified points visible.
scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolors='k')
xlim(xx.min(), xx.max())
ylim(yy.min(), yy.max())
pass  # end on a statement so the cell does not echo ylim()'s return value

It creates regions using just lines. We're just using 2 features here.

In [None]:
# Support vector classifier with the library's default settings (no kernel specified).
clf = SVC()

In [None]:
clf.fit(data.data, data.target) # use all features, all data (not the two-feature X)

Predict the actual data which we know

In [None]:
# Predictions for samples 0-49 of the training data itself.
# Presumably these all belong to one class - compare with data.target[0:50].
clf.predict(data.data[0:50])

In [None]:
# Predictions for samples 50-99; compare with data.target[50:100] to spot any misclassified samples.
clf.predict(data.data[50:100])

What? What's wrong?

In [None]:
# Predictions for samples 100-149; compare with data.target[100:150].
clf.predict(data.data[100:150])

# By: Andrés Cabrera mantaraya36@gmail.com

For Course MAT 201A at UCSB

Adapted by Karl Yerkes

This ipython notebook is licensed under the CC-BY-NC-SA license: http://creativecommons.org/licenses/by-nc-sa/4.0/

![http://i.creativecommons.org/l/by-nc-sa/3.0/88x31.png](http://i.creativecommons.org/l/by-nc-sa/3.0/88x31.png)