# Predict Grain Type using NN

This exercise is taken and modified from https://github.com/benjaminwilson/python-clustering-exercises

This is a class to choose a good number of clusters for a dataset using the k-means inertia graph.  You are given a dataset of the measurements of samples of grain.  What's a good number of clusters in this case?

This dataset was obtained from the [UCI](https://archive.ics.uci.edu/ml/datasets/seeds).


In [10]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_iris

In [1]:
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)

ImportError: cannot import name 'set_random_seed' from 'tensorflow' (C:\Users\tangkaiv\Anaconda3\lib\site-packages\tensorflow\__init__.py)

**Step 1:** Load the dataset _(written for you)_.

In [3]:
import pandas as pd

seeds_df = pd.read_csv('./data/seeds.csv')
seeds_df

Unnamed: 0,area,perimeter,compactness,length,width,asymmetry_coefficient,groove_length,grain_variety
0,15.26,14.84,0.8710,5.763,3.312,2.221,5.220,Kama wheat
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,Kama wheat
2,14.29,14.09,0.9050,5.291,3.337,2.699,4.825,Kama wheat
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805,Kama wheat
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175,Kama wheat
...,...,...,...,...,...,...,...,...
205,12.19,13.20,0.8783,5.137,2.981,3.631,4.870,Canadian wheat
206,11.23,12.88,0.8511,5.140,2.795,4.325,5.003,Canadian wheat
207,13.20,13.66,0.8883,5.236,3.232,8.315,5.056,Canadian wheat
208,11.84,13.21,0.8521,5.175,2.836,3.598,5.044,Canadian wheat


In [5]:
dataset = seeds_df.values
X = dataset[:,0:7].astype(float)
Y = dataset[:,7]

**Step 2:** Encode output

In [12]:
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)
dummy_y

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1

**Step 3:** Define NN Model

In [21]:
# define baseline model
def baseline_model():
	# create model
	model = Sequential()
	model.add(Dense(14, input_dim=7, activation='relu'))
	model.add(Dense(3, activation='softmax'))
	# Compile model
	model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
	return model

In [22]:
estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=0)

  estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=0)


**Step 4:** Evaluate model with k-Fold CV

In [19]:
kfold = KFold(n_splits=10, shuffle=True)


In [20]:
results = cross_val_score(estimator, X, dummy_y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Baseline: 90.48% (6.73%)


**Optional:** Find the best param

In [32]:
# define baseline model
def baseline_model(batch_size=10):
	# create model
	model = Sequential()
	model.add(Dense(units=batch_size, input_dim=7, activation='relu'))
	model.add(Dense(3, activation='softmax'))
	# Compile model
	model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
	return model

In [33]:
estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=0)

  estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=0)


In [34]:
batch_size = [8, 9, 10, 11, 12, 13, 14]

In [35]:
param_grid = dict(batch_size=batch_size)

In [36]:
kfold = KFold(n_splits=10, shuffle=True)

In [37]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=kfold)

In [38]:
grid_result = grid.fit(X, dummy_y)



In [39]:
# print results
print(f'Best Accuracy for {grid_result.best_score_} using {grid_result.best_params_}')
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f' mean={mean:.4}, std={stdev:.4} using {param}')

Best Accuracy for 0.9047619104385376 using {'batch_size': 9}
 mean=0.8905, std=0.07392 using {'batch_size': 8}
 mean=0.9048, std=0.07678 using {'batch_size': 9}
 mean=0.8857, std=0.06801 using {'batch_size': 10}
 mean=0.8381, std=0.2078 using {'batch_size': 11}
 mean=0.9, std=0.08371 using {'batch_size': 12}
 mean=0.8857, std=0.05303 using {'batch_size': 13}
 mean=0.8952, std=0.05948 using {'batch_size': 14}
