In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing

<h2>Iris Classification Dataset</h2>
<h3>The target class is a string label. It must be encoded as a numeric value because XGBoost requires all-numeric input.</h3>
<h4>Hands-on: Classification with AWS Machine Learning Service</h4>
Input Features: sepal_length,sepal_width,petal_length,petal_width<br>
Target Feature: encoded_class<br>
Objective: Predict encoded_class for a given sepal_length,sepal_width,petal_length,petal_width<br>
<h4>Data source: https://archive.ics.uci.edu/ml/datasets/iris</h4>

In [None]:
# Column order for the output files: the encoded target first, then the input features
columns = [
    'encoded_class',
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
]

In [None]:
# Set up a label encoder fitted on the three Iris class names so the
# string labels can later be transformed to integers
le = preprocessing.LabelEncoder()
le.fit([
    'Iris-setosa',
    'Iris-versicolor',
    'Iris-virginica',
])

In [None]:
# Show the class labels the encoder was fitted on
le.classes_

In [None]:
# Load the Iris data from a CSV file; the relative path is resolved against the working directory
df = pd.read_csv('iris_all.csv')

In [None]:
# Count how many rows carry each distinct value of the 'class' column
df['class'].value_counts()

In [None]:
# Preview the first rows of the loaded data
df.head()

In [None]:
# Convert Classes to numeric value: add an 'encoded_class' column holding the
# encoder's output for each row's 'class' label (the 'class' column is kept)
df['encoded_class'] = le.transform(df['class'])

In [None]:
# Preview the first rows again, now with the added encoded_class column
df.head()

In [None]:
# Preview the last rows, including the encoded_class column
df.tail()

In [None]:
# Visualize: build one boolean row mask per class label, used to select
# each species' rows when plotting
setosa = df['class'].eq('Iris-setosa')
versicolor = df['class'].eq('Iris-versicolor')
virginica = df['class'].eq('Iris-virginica')

In [None]:
# Sepal length against sepal width, one colored scatter series per species
for species_mask, species_label, marker_color in (
    (setosa, 'setosa', 'g'),
    (versicolor, 'versicolor', 'r'),
    (virginica, 'virginica', 'b'),
):
    species_rows = df[species_mask]
    plt.scatter(species_rows.sepal_length, y=species_rows.sepal_width,
                label=species_label, color=marker_color)
plt.title('sepal')
plt.xlabel('length')
plt.ylabel('width')
plt.grid(True)
plt.legend()

In [None]:
# Petal length against petal width, one colored scatter series per species
for species_mask, species_label, marker_color in (
    (setosa, 'setosa', 'g'),
    (versicolor, 'versicolor', 'r'),
    (virginica, 'virginica', 'b'),
):
    species_rows = df[species_mask]
    plt.scatter(species_rows.petal_length, y=species_rows.petal_width,
                label=species_label, color=marker_color)
plt.title('petal')
plt.xlabel('length')
plt.ylabel('width')
plt.grid(True)
plt.legend()

## Training and Validation Set
### Target Variable as first column followed by input features:
encoded_class,sepal_length,sepal_width,petal_length,petal_width
### The training and validation files do not have a column header row

In [None]:
# Training = 70% of the data
# Validation = 30% of the data
# Randomize the dataset
# Seed NumPy's global RNG immediately before shuffling so the order is repeatable
np.random.seed(5)
# Take the current index labels and shuffle that list in place
l = list(df.index)
np.random.shuffle(l)
# Reorder the rows, using the shuffled labels as row positions
# (assumes the index is the default 0..n-1 range from read_csv — TODO confirm)
# NOTE(review): df is rebound here, so re-running this cell on its own
# shuffles the already-shuffled frame and yields a different order
df = df.iloc[l]

In [None]:
# Split sizes: the first `train` rows go to the training file and every
# remaining row goes to the validation file
rows = df.shape[0]
train = int(.7 * rows)
# Use the remainder so the reported validation count matches the rows
# actually written (df[train:]); int(.3 * rows) can come up one short
# when the 70/30 split is not exact, e.g. 151 rows -> 105 + 45
test = rows - train

In [None]:
# Show the total row count alongside the training and validation split sizes
rows, train, test

In [None]:
# Write Training Set: the first `train` rows, without index or header row,
# with columns in the order given by `columns`
df[:train].to_csv(
    'iris_train.csv',
    columns=columns,
    header=False,
    index=False,
)

In [None]:
# Write Validation Set: every row after the first `train` rows, without
# index or header row, with columns in the order given by `columns`
df[train:].to_csv(
    'iris_validation.csv',
    columns=columns,
    header=False,
    index=False,
)

In [None]:
# Write Column List: the column names as a single comma-separated line
# with no trailing newline
with open('iris_train_column_list.txt', 'w') as f:
    print(','.join(columns), end='', file=f)