Example of model validation using the wheat seeds dataset with a k-nearest-neighbours (KNN) classifier.

    Method 1: Use the whole dataset to both train and test the model
    Method 2: Split the dataset - one part to train the model (train set) and the other to test it (test set)

In [19]:
import pandas as pd  # DataFrame handling and text/CSV loading
from sklearn.neighbors import KNeighborsClassifier  # k-nearest-neighbours classifier
from sklearn import metrics  # scoring helpers such as accuracy_score

# train_test_split lives in sklearn.model_selection. The older
# sklearn.cross_validation module was deprecated in scikit-learn 0.18 and
# removed in 0.20, so it is kept only as a fallback for very old installs.
try:
    from sklearn.model_selection import train_test_split  # splits data into train/test parts
except ImportError:
    from sklearn.cross_validation import train_test_split

In [13]:
# Column names are supplied explicitly via `names=`: seven kernel
# measurements followed by the class label ('type').
col_names = ['area','perimeter','compactness','length_kernel','width_kernel','assym','length_groove','type']
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00236/seeds_dataset.txt"
# Fields are separated by runs of whitespace. sep=r"\s+" is the supported
# equivalent of delim_whitespace=True, which recent pandas releases deprecate.
data = pd.read_csv(url,names=col_names,sep=r"\s+")

In [14]:
# Predictors are the seven kernel measurements; the target is the 'type' column.
features_col = [
    'area',
    'perimeter',
    'compactness',
    'length_kernel',
    'width_kernel',
    'assym',
    'length_groove',
]
X = data.loc[:, features_col]
y = data.loc[:, 'type']

In [15]:
# Method 1 model: KNN classifier that votes among the 50 nearest samples.
# It is only configured here; fitting happens in a later cell.
knn = KNeighborsClassifier(n_neighbors=50)

In [16]:
# Method 1: fit on every row of the dataset (no hold-out set).
knn.fit(X=X, y=y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=50, p=2,
           weights='uniform')

In [17]:
# Predict labels for the very same rows the model was fitted on.
y_predicted = knn.predict(X=X)

In [18]:
# Score the in-sample predictions with metrics.accuracy_score, once with the
# labels first and once with the predictions first.
for _first, _second in ((y, y_predicted), (y_predicted, y)):
    print (metrics.accuracy_score(_first, _second))

0.890476190476
0.890476190476


------------------------- Method 2 -------------------------------------

In [32]:
# Fixing random_state makes the shuffle reproducible, so every run yields
# the same train/test partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=99,
)

In [31]:
# Sanity-check the split: for the full data and then each partition, show the
# first two rows followed by the shape (features first, then labels).
for _part in (X, X_train, X_test, y, y_train, y_test):
    print (_part.head(2))
    print (_part.shape)

    area  perimeter  compactness  length_kernel  width_kernel  assym  \
0  15.26      14.84       0.8710          5.763         3.312  2.221   
1  14.88      14.57       0.8811          5.554         3.333  1.018   

   length_groove  
0          5.220  
1          4.956  
(210, 7)
     area  perimeter  compactness  length_kernel  width_kernel  assym  \
18  14.70      14.21       0.9153          5.205         3.466  1.767   
64  12.78      13.57       0.8716          5.262         3.026  1.176   

    length_groove  
18          4.649  
64          4.782  
(157, 7)
      area  perimeter  compactness  length_kernel  width_kernel  assym  \
36   16.20      15.27       0.8734          5.826         3.464  2.823   
142  13.34      13.95       0.8620          5.389         3.074  5.995   

     length_groove  
36           5.527  
142          5.307  
(53, 7)
0    1
1    1
Name: type, dtype: int64
(210,)
18    1
64    1
Name: type, dtype: int64
(157,)
36     1
142    3
Name: type, dtype: int64
(53,)

In [33]:
# Method 2 model: a separate KNN classifier, also using 50 neighbours.
# It is only configured here; fitting happens in a later cell.
knn_two = KNeighborsClassifier(n_neighbors=50)

In [34]:
# Method 2: fit on the training partition only; the test rows stay unseen.
knn_two.fit(X=X_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=50, p=2,
           weights='uniform')

In [35]:
# Predict labels for the held-out test rows.
y_predicted_two = knn_two.predict(X=X_test)

In [37]:
# Display the predicted class labels for the test rows (bare expression, so
# the notebook renders the array as the cell output).
y_predicted_two


array([2, 1, 1, 3, 2, 1, 2, 2, 3, 2, 1, 2, 2, 1, 2, 2, 3, 2, 1, 1, 1, 1, 3,
       1, 1, 1, 3, 2, 1, 1, 1, 1, 2, 3, 1, 1, 2, 2, 3, 2, 3, 2, 3, 3, 1, 2,
       2, 3, 2, 1, 2, 3, 2])

In [39]:
# Score the held-out predictions, once with the true labels first and once
# with the predictions first.
for _first, _second in ((y_test, y_predicted_two), (y_predicted_two, y_test)):
    print (metrics.accuracy_score(_first, _second))

0.867924528302
0.867924528302


Note:
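    - Using the whole dataset to both train and test the model, we get an accuracy of about 89%. When we split the dataset, we get about 87% on the held-out test set. Not bad. Note that the first figure is measured on the same rows the model was fitted on, so it is optimistic; the held-out figure is the better estimate of performance on new data.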
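    - For accuracy_score the order of the two arguments does not change the result, as the identical outputs above show. This is because accuracy is symmetric; the documented order is still (y_true, y_pred), and other metrics (e.g. precision, recall, confusion_matrix) do depend on it.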