## Install the packages

In [None]:
!pip install sklearn
!pip install tensorflow
!pip install hdfs

In [None]:
import tensorflow as tf
import numpy as np
import csv
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from hdfs import InsecureClient

## Load raw data from DataLake

In [None]:
# Read the iris CSV from the data lake and split every record into its four
# feature values and its class label.
train = []       # one list of four floats per sample
categories = []  # class label (fifth CSV column) per sample, same order as `train`
client = InsecureClient('http://datalake:50070')
with client.read('/worm/iris/iris.csv', encoding='utf-8') as reader:
    readCSV = csv.reader(reader, delimiter=',')
    next(readCSV, None)  # skip header
    for row in readCSV:
        if not row:
            # csv.reader yields [] for blank lines (e.g. a trailing newline)
            continue
        # Convert the features to numbers here so later cells never see strings.
        train.append([float(value) for value in row[:4]])
        categories.append(row[4])

# print(train)
# print(categories)

## Prepare training data
* Scale inputs between 0 and 1
* Split into train and test set

In [None]:
# Hold out a third of the samples for testing; stratifying on the label keeps
# the class proportions the same in both splits.
X_train, X_test, Y_train, Y_test = train_test_split(
    train, categories, test_size=0.33, random_state=42, stratify=categories
)

# Scale every feature into [0, 1]. The scaler is fitted on the training split
# only and then reused for the test split, so both splits are scaled with the
# same min/max and no test-set statistics leak into preprocessing.
# (Test values may therefore fall slightly outside [0, 1].)
X_scaler = MinMaxScaler(feature_range=(0, 1))
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# One-hot encode the labels, again fitting on the training split only so the
# column order is identical for train and test.
# The dense array is obtained with .toarray() rather than the `sparse=False`
# constructor argument, which newer scikit-learn releases renamed to
# `sparse_output`; .toarray() works with either.
onehot_encoder = OneHotEncoder()
Y_train_enc = onehot_encoder.fit_transform(np.array(Y_train).reshape(-1, 1)).toarray()
Y_test_enc = onehot_encoder.transform(np.array(Y_test).reshape(-1, 1)).toarray()
# print(Y_test_enc[:5])

## Create a 2-layer network and train for 100 epochs
* achieves approx. 96% accuracy on the test set

In [None]:
# Training pipeline. Shuffle individual samples *before* batching: shuffling
# after batch() would only permute whole batches, so every batch would always
# contain the same samples. repeat() makes the dataset infinite; the number of
# batches per epoch is set by `steps_per_epoch` in model.fit.
dataset = tf.data.Dataset.from_tensor_slices((X_train_scaled, Y_train_enc))
dataset = dataset.shuffle(1000)
dataset = dataset.batch(32)
dataset = dataset.repeat()

# Evaluation pipeline. No shuffling: sample order does not affect the
# aggregated loss/accuracy. repeat() is kept because model.evaluate is called
# with an explicit `steps` count that exceeds one pass over the test split.
dataset_test = tf.data.Dataset.from_tensor_slices((X_test_scaled, Y_test_enc))
dataset_test = dataset_test.batch(32)
dataset_test = dataset_test.repeat()

In [None]:
# Two dense layers: a 16-unit hidden layer on the 4 input features, followed
# by a 3-way softmax output (one probability per iris class).
# NOTE(review): the hidden layer is created without an `activation`, so it is
# linear; consider adding a nonlinearity such as ReLU.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(16, input_dim=4),
    tf.keras.layers.Dense(3, activation=tf.nn.softmax),
])

# SGD with Nesterov momentum. The deprecated `lr=` / `decay=` keyword
# arguments are replaced by `learning_rate=` with an InverseTimeDecay
# schedule, which computes the same rate: 0.01 / (1 + 1e-6 * step).
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.01, decay_steps=1, decay_rate=1e-6
)
sgd = tf.keras.optimizers.SGD(learning_rate=lr_schedule, momentum=0.9, nesterov=True)
model.compile(optimizer=sgd, loss="categorical_crossentropy", metrics=["accuracy"])

# The training dataset repeats forever, so the epoch length is given explicitly.
model.fit(dataset, steps_per_epoch=32, epochs=100, verbose=1)

# Same for evaluation: draw 32 batches from the repeating test dataset.
loss, accuracy = model.evaluate(dataset_test, steps=32)
print("loss:%f" % (loss))
print("accuracy: %f" % (accuracy))

## Evaluate

In [None]:
# model.predict returns one row of class probabilities per sample; argmax over
# the last axis picks the index of the most likely class. This replaces
# Sequential.predict_classes, which newer TensorFlow releases no longer provide.
y_predict = np.argmax(model.predict(X_test_scaled), axis=-1)

# Print scaled features, one-hot encoded true label and predicted class index.
for features, label, predicted in zip(X_test_scaled, Y_test_enc, y_predict):
    print("X=%s, Label=%s Predicted=%s" % (features, label, predicted))