# DAT210x - Programming with Python for DS

## Module5- Lab5

In [71]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

from sklearn import preprocessing
from sklearn.decomposition import PCA

matplotlib.style.use('ggplot') # Look Pretty

### A Convenience Function

In [72]:
def plotDecisionBoundary(model, X, y):
    fig = plt.figure()
    ax = fig.add_subplot(111)

    padding = 0.6
    resolution = 0.0025
    colors = ['royalblue','forestgreen','ghostwhite']

    # Calculate the boundaris
    x_min, x_max = X[:, 0].min(), X[:, 0].max()
    y_min, y_max = X[:, 1].min(), X[:, 1].max()
    x_range = x_max - x_min
    y_range = y_max - y_min
    x_min -= x_range * padding
    y_min -= y_range * padding
    x_max += x_range * padding
    y_max += y_range * padding

    # Create a 2D Grid Matrix. The values stored in the matrix
    # are the predictions of the class at at said location
    xx, yy = np.meshgrid(np.arange(x_min, x_max, resolution),
                       np.arange(y_min, y_max, resolution))

    # What class does the classifier say?
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # Plot the contour map
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.terrain)

    # Plot the test original points as well...
    for label in range(len(np.unique(y))):
        indices = np.where(y == label)
        plt.scatter(X[indices, 0], X[indices, 1], c=colors[label], label=str(label), alpha=0.8)

    p = model.get_params()
    plt.axis('tight')
    plt.title('K = ' + str(p['n_neighbors']))

### The Assignment

Load up the dataset into a variable called `X`. Check `.head` and `dtypes` to make sure you're loading your data properly--don't fail on the 1st step!

In [73]:
X = pd.read_csv('Datasets/wheat.data', index_col=0)
print(X.head(10))
print('---------------------------------------------------------------------------------')
print(X.dtypes)
print('---------------------------------------------------------------------------------')
print(X.describe())

     area  perimeter  compactness  length  width  asymmetry  groove wheat_type
id                                                                            
0   15.26      14.84       0.8710   5.763  3.312      2.221   5.220       kama
1   14.88      14.57       0.8811   5.554  3.333      1.018   4.956       kama
2   14.29      14.09       0.9050   5.291  3.337      2.699   4.825       kama
3   13.84      13.94       0.8955   5.324  3.379      2.259   4.805       kama
4   16.14      14.99       0.9034   5.658  3.562      1.355   5.175       kama
5   14.38      14.21       0.8951   5.386  3.312      2.462   4.956       kama
6   14.69      14.49       0.8799   5.563  3.259      3.586   5.219       kama
7   14.11      14.10       0.8911   5.420  3.302      2.700     NaN   canadian
8   16.63      15.46       0.8747   6.053  3.465      2.040   5.877       kama
9   16.44      15.25       0.8880   5.884  3.505      1.969   5.533       kama
----------------------------------------------------

Copy the `wheat_type` series slice out of `X`, and into a series called `y`. Then drop the original `wheat_type` column from the `X`:

In [74]:
y = X.wheat_type
X = X.drop(labels=['wheat_type'], axis=1)

Do a quick, "ordinal" conversion of `y`. In actuality our classification isn't ordinal, but just as an experiment...

In [75]:
y = y.astype('category').cat.codes
y

id
0      1
1      1
2      1
3      1
4      1
5      1
6      1
7      0
8      1
9      1
10     1
11     1
12     1
13     1
14     1
15     1
16     1
17     1
18     1
19     1
20     1
21     1
22     1
23     1
24     1
25     1
26     1
27     1
28     1
29     1
      ..
180    0
181    0
182    0
183    0
184    0
185    0
186    0
187    0
188    0
189    0
190    0
191    0
192    0
193    0
194    0
195    0
196    0
197    0
198    0
199    0
200    0
201    0
202    0
203    0
204    0
205    0
206    0
207    0
208    0
209    0
dtype: int8

Do some basic nan munging. Fill each row's nans with the mean of the feature:

In [76]:
X.compactness.fillna(X.compactness.mean(), inplace = True)
X.width.fillna(X.width.mean(), inplace = True)
X.groove.fillna(X.groove.mean(), inplace = True)
X

Unnamed: 0_level_0,area,perimeter,compactness,length,width,asymmetry,groove
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,15.26,14.84,0.8710,5.763,3.312,2.2210,5.220000
1,14.88,14.57,0.8811,5.554,3.333,1.0180,4.956000
2,14.29,14.09,0.9050,5.291,3.337,2.6990,4.825000
3,13.84,13.94,0.8955,5.324,3.379,2.2590,4.805000
4,16.14,14.99,0.9034,5.658,3.562,1.3550,5.175000
5,14.38,14.21,0.8951,5.386,3.312,2.4620,4.956000
6,14.69,14.49,0.8799,5.563,3.259,3.5860,5.219000
7,14.11,14.10,0.8911,5.420,3.302,2.7000,5.407529
8,16.63,15.46,0.8747,6.053,3.465,2.0400,5.877000
9,16.44,15.25,0.8880,5.884,3.505,1.9690,5.533000


Split `X` into training and testing data sets using `train_test_split()`. Use `0.33` test size, and use `random_state=1`. This is important so that your answers are verifiable. In the real world, you wouldn't specify a random_state:

In [77]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 1)

Create an instance of SKLearn's Normalizer class and then train it using its .fit() method against your _training_ data. The reason you only fit against your training data is because in a real-world situation, you'll only have your training data to train with! In this lab setting, you have both train+test data; but in the wild, you'll only have your training data, and then unlabeled data you want to apply your models to.

In [78]:
T = preprocessing.Normalizer()
T.fit_transform(X_train)
T.fit_transform(y_train)
T.transform(X_test)
T.transform(y_test)



array([[ 0.10101525,  0.        ,  0.10101525,  0.        ,  0.        ,
         0.20203051,  0.10101525,  0.        ,  0.        ,  0.        ,
         0.20203051,  0.10101525,  0.10101525,  0.10101525,  0.10101525,
         0.        ,  0.        ,  0.10101525,  0.10101525,  0.10101525,
         0.10101525,  0.10101525,  0.20203051,  0.        ,  0.10101525,
         0.        ,  0.10101525,  0.        ,  0.10101525,  0.20203051,
         0.20203051,  0.20203051,  0.20203051,  0.10101525,  0.10101525,
         0.20203051,  0.        ,  0.20203051,  0.10101525,  0.        ,
         0.10101525,  0.        ,  0.        ,  0.        ,  0.20203051,
         0.20203051,  0.        ,  0.        ,  0.20203051,  0.        ,
         0.20203051,  0.        ,  0.20203051,  0.20203051,  0.10101525,
         0.10101525,  0.        ,  0.        ,  0.10101525,  0.        ,
         0.20203051,  0.10101525,  0.20203051,  0.        ,  0.10101525,
         0.        ,  0.10101525,  0.        ,  0.1

With your trained pre-processor, transform both your training AND testing data. Any testing data has to be transformed with your preprocessor that has ben fit against your training data, so that it exist in the same feature-space as the original data used to train your models.

Just like your preprocessing transformation, create a PCA transformation as well. Fit it against your training data, and then project your training and testing features into PCA space using the PCA model's `.transform()` method. This has to be done because the only way to visualize the decision boundary in 2D would be if your KNN algo ran in 2D as well:

In [80]:
pca = PCA(n_components = 2)
pca.fit_transform(X_train, y_train)
pca.transform(X_test, y_test)

array([[  1.21246288e-01,  -5.37511966e-01],
       [ -4.53301239e+00,   3.50532042e+00],
       [  6.62116608e-01,   1.96891257e+00],
       [ -2.92562862e+00,   2.26622439e-02],
       [ -1.87092486e-03,   1.71045019e+00],
       [  6.40941204e+00,   1.88538287e+00],
       [ -1.02684213e+00,  -1.62086845e+00],
       [ -3.03822681e+00,  -2.33708934e-01],
       [ -3.19456402e+00,  -2.96874205e-01],
       [ -4.01713091e+00,   3.77112352e-01],
       [  2.26595599e+00,   2.55605763e-01],
       [  1.38195483e+00,  -2.21852691e+00],
       [  2.74882469e-02,  -7.11700071e-01],
       [ -3.87598157e-01,  -1.98203869e+00],
       [ -3.01593995e+00,  -2.42747936e+00],
       [ -4.28660476e+00,   1.84109871e+00],
       [ -4.59113386e+00,   1.19001481e+00],
       [ -9.04537779e-01,  -1.51124092e+00],
       [ -1.46522010e-01,  -5.55940935e-01],
       [  2.40248602e-01,  -1.50596686e+00],
       [  5.61468230e-01,  -1.62572359e+00],
       [  6.19934596e-01,  -2.01638414e-01],
       [  

Create and train a KNeighborsClassifier. Start with `K=9` neighbors. Be sure train your classifier against the pre-processed, PCA- transformed training data above! You do not, of course, need to transform your labels.

In [81]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 9)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=9, p=2,
           weights='uniform')

Display the accuracy score of your test data/labels, computed by your KNeighbors model. You do NOT have to run `.predict` before calling `.score`, since `.score` will take care of running your predictions for you automatically.

In [82]:
print(knn.score(X_test, y_test))

0.871428571429


In [83]:
# I hope your KNeighbors classifier model from earlier was named 'knn'
# If not, adjust the following line:
plotDecisionBoundary(knn, X_train, y_train)

TypeError: unhashable type: 'slice'

### Bonus

Instead of the ordinal conversion, try and get this assignment working with a proper Pandas get_dummies for feature encoding. You might have to update some of the `plotDecisionBoundary()` code.

In [None]:
plt.show()

In [None]:
for i in range(9, 0, -1):
    knn = KNeighborsClassifier(n_neighbors = i)
    knn.fit(X_train, y_train)
    print('K=', i, ', score=', knn.score(X_test, y_test))