# [Python Reference Link](http://www.data8.org/sp20/python-reference.html)
*Run the cell below so that we can set our modules up*

In [None]:
import numpy as np
from datascience import *
import math as m

# These lines do some fancy plotting magic.
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import warnings
warnings.simplefilter('ignore', FutureWarning)

# Classification - Examination through ScatterPlots

In [None]:
# Location of the iris measurements CSV file
link = "https://raw.githubusercontent.com/humdskyline/data/main/IRIS.csv"

# Read the CSV into a table, then display it as the cell output
iris = Table.read_table(link)
iris

In [None]:
# Exercise: replace each `...` below with one of the column labels shown above
# to draw a scatter plot of those two measurements.
# Passing group = 'species' gives each of the three species its own color on the
# plot, which helps us differentiate the iris flower species.

iris.scatter(... , ... , group = 'species')

# Identifying key variables for classification

In [None]:
# Now let's compare ALL of the numeric labels to each other in pairwise comparisons.
# Run the code below.

from itertools import combinations

# Every column label except the class label 'species'. Dropping the column by
# name avoids relying on 'species' being the last column of the table.
labels_without_species = list(iris.drop('species').labels)

# combinations(..., 2) yields each pair of distinct labels exactly once, in label
# order, so no "have we plotted this pair already?" bookkeeping is needed.
# Each pair is recorded as a set because the order within a pair does not matter.
plotted_pairs = []
for x, y in combinations(labels_without_species, 2):
    iris.scatter(x, y, group='species')
    plotted_pairs.append({x, y})

print(plotted_pairs)

# Creating Training and Testing Data Sets

In [None]:
# Report how many rows (flowers) the full table contains
print('The iris data set has', iris.num_rows, 'flowers represented in the sample')

Let's separate 100 rows of data and use them for the training set, and then use the remaining 50 rows of data for the test set. Since the table is organized nicely by species (which is what we are classifying), we want to mix these rows up. We can do this by using the `sample` method to shuffle the rows, and then using `take` to select the first 100 shuffled rows for training and the last 50 rows for testing.

In [None]:
# Shuffle the rows: the table is grouped by species, so an unshuffled split
# would put whole species in only one of the two sets.
shuffled_iris = iris.sample(with_replacement = False)

# The first `num_train` shuffled rows form the training set; every remaining
# row forms the test set (50 rows for the 150-row iris table).
num_train = 100
train_iris = shuffled_iris.take(np.arange(num_train))
test_iris = shuffled_iris.take(np.arange(num_train, shuffled_iris.num_rows))

print("Training set:\t",   train_iris.num_rows, "examples")
print("Test set:\t",       test_iris.num_rows, "examples")

# Preview the first five rows of each set
train_iris.show(5)
test_iris.show(5)

# The Distance Formula

#### The function below is defined for you in Homework 12

In [None]:
# row (input): a row from the table
# features (input): an array of column labels. These labels are the attributes that will help us classify individuals.
# Note: the attributes must be numerical so that the resulting arrays can be passed
# through the distance function defined later in this notebook.

def row_to_array(row, features):
    """Converts a row to an array of its features.

    row: an object exposing `item(label)`, which returns the value stored
        under that column label.
    features: an iterable of column labels, read in the order given.

    Returns a one-dimensional NumPy array holding the row's value for each
    label in `features`; an empty array when `features` is empty.
    """
    values = [row.item(feature) for feature in features]
    # Append all values in one call instead of growing the array one element
    # at a time (which copies the array on every iteration). Seeding with an
    # empty NumPy array keeps integer-valued features promoted to floats.
    return np.append(np.array([]), values)

In [None]:
# Feature labels: every column label except the class label 'species'
iris_features = iris.drop('species').labels
print(iris_features)

# Feature arrays for three example flowers: table rows 0, 1 and the last row
first_setosa, second_setosa = (
    row_to_array(iris.row(index), iris_features) for index in (0, 1)
)
last_virginica = row_to_array(iris.row(-1), iris_features)

# Distance between the first two flowers, written out by hand:
# square the element-wise differences, add them up, take the square root
squared_differences = (first_setosa - second_setosa) ** 2
sum(squared_differences) ** 0.5

Let's convert the above process into a single function:

In [None]:
def distance(array_one, array_two):
    """Return the Euclidean distance between two numeric sequences.

    array_one, array_two: NumPy arrays (or lists/tuples of numbers) of the
        same length, e.g. the feature arrays of two rows.

    Returns the square root of the sum of squared element-wise differences.
    """
    # asarray lets plain lists/tuples be subtracted element-wise as well
    difference = np.asarray(array_one) - np.asarray(array_two)
    # np.sum adds the elements in NumPy rather than in a Python-level loop
    return np.sqrt(np.sum(difference ** 2))

In [None]:
# Distance between the first two rows of the table
distance(first_setosa, second_setosa)

In [None]:
# Distance between the first row and the last row of the table
distance(first_setosa, last_virginica)

In [None]:
# Distance between the second row and the last row of the table
distance(second_setosa, last_virginica)

# Let's classify these irises

## Finding the `k` Nearest Neighbors

### Some pre-formatting:

In [None]:
# Take the first row of the test set and use it as the example to classify;
# the last line displays that row as the cell output.
row_to_test = test_iris.row(0)
row_to_test

In [None]:
# Exercise: replace `...` with the arguments that convert the test row into an
# array of its feature values (row_to_array takes a row and the feature labels).
test_features_array = row_to_array(...)
test_features_array

### Find the distance between the example (i.e. test row) and each example in the training set

In [None]:
# We will store the distance between the test row and each row of the training set.
distances = make_array()

# Iterate through the training set row by row.
for train_row in train_iris.rows:
    # Exercise: convert train_row into an array of its features as well
    train_row_array = ...

    # Exercise: compute the distance between the test row array and the train row array
    distance_to_test_row = ...

    # Exercise: append the distance between these two arrays (test row and train row)
    # to the distances array
    distances = ...


# Display the collected distances as the cell output
distances

### Augment the training data table with a column containing all the distances

In [None]:
# Exercise: replace `...` with the training table extended by one extra column
# that holds the distances computed above.
train_with_distances = ...
train_with_distances

### Sort the augmented table in increasing order of the distances

In [None]:
# Exercise: replace `...` with the augmented table sorted by its distances column.
# By default, sort orders rows in ascending order, so the nearest rows come first.
sorted_training = ...
sorted_training

### Take the top `k` rows of the sorted table

In [None]:
# Exercise: choose k = 15, then replace the second `...` with the first k rows
# of the sorted table (the k nearest neighbors of the test row).
k = ...
top_k_training = ...
top_k_training

## The Classifier

### Take a majority vote of the `k` nearest neighbors to see which of the three species appears most often (visually)

In [None]:
# Use the first row of the test set as the example to run through the classifier
row_to_test = test_iris.row(0)

In [None]:
# The two measurements used as the axes of the plot
x_axis_label = "sepal_length"
y_axis_label = "petal_width"

# Plot the k nearest training rows, colored by species
top_k_training.scatter(x_axis_label, y_axis_label, group = "species")

# Overlay the test row on the same axes as a large 'X' marker
test_point = (row_to_test.item(x_axis_label), row_to_test.item(y_axis_label))
plt.scatter(*test_point, marker='X', s=100)

### Take a majority vote of the `k` nearest neighbors to see which of the three species appears most often (algorithm)

In [None]:
# Exercise: replace the first `...` with a tally of the k nearest neighbors
# per species, so the vote counts can be printed and inspected.
groups = ...
print(groups)
# Exercise: replace `...` with the species that wins the majority vote.
decision = ...
decision

# Evaluating your classifier (Accuracy)
What we did for the first row of the `test` table, we must repeat for the rest of the table. 

In [None]:
# Display the test set: each of these rows has to be classified
test_iris

As daunting as that sounds, a for-loop can sufficiently handle this. We repeat the above steps for each row of the test table: 

In [None]:
# Predicted species for each test row, collected in test-set order
species_column = make_array()

for test_row in test_iris.rows:
    # Exercise: convert the test row into an array of its features
    test_row_array = ...
    # Start a fresh distances array for this test row
    distances = make_array()
    ####### distance loop: fill in the three steps below #######
    for train_row in train_iris.rows:
        # Exercise: convert the train row into an array also
        ...
        # Exercise: compute the distance between the test row array and the train row array
        ...
        # Exercise: save the distance between these two arrays (test row and train row)
        # into the distances array (no placeholder line is provided for this step;
        # add your own line here)

    ####### end of distance loop #######
    # Attach the distances to the training table as a new 'distances' column
    train_with_distances = train_iris.with_column('distances',distances)
    # Exercise: sort by distance, keep the top k rows, and take the majority species
    sorted_training = ...
    top_k_training = ...
    species= ...
    # Record the prediction for this test row
    species_column = np.append(species_column, species)

# Sanity check: there should be exactly one prediction per test row
len(species_column) == test_iris.num_rows 

In [None]:
# Display the array of predicted species collected by the loop above
species_column

In [None]:
# Accuracy: the fraction of test rows whose predicted species equals the
# species recorded in the test table
is_correct = test_iris.column('species') == species_column
np.count_nonzero(is_correct) / len(species_column)