In [None]:
#
# hw4pr1iris:  iris classification via nearest neighbors
#

In [None]:
#
# We don't need any data at all to create a predictive model!
#
import random

def predictive_model( Features ):
    """ Guess an iris species from its measurements alone (no dataset involved).

        input: Features, a list of exactly four numbers
                [ sepallen, sepalwid, petallen, petalwid ]
        output: a string naming the predicted species, one of
                  'setosa (0)', 'versicolor (1)', 'virginica (2)'
    """
    _sepallen, _sepalwid, _petallen, petalwid = Features   # unpacking (needs exactly four values)

    # only the petal width is consulted: narrow petals are called setosa
    if petalwid < 1.0:
        return 'setosa (0)'

    # this rule can't tell the other two species apart, so it picks one at random
    return random.choice( ['versicolor (1)', 'virginica (2)'] )
    
#
# Try it!
#
# To enter measurements by hand, uncomment the next line.
#   NOTE(review): eval() executes whatever is typed as Python code;
#   ast.literal_eval would parse a typed-in list without running code.
# Features = eval(input("Enter new Features: "))
#
Features = [ 4.6, 3.6, 3.0, 0.2 ]      # [ sepallen, sepalwid, petallen, petalwid ]
result = predictive_model( Features )   # a species string, e.g. 'setosa (0)'
print(f"I predict {result} from Features {Features}")

In [None]:
#
# (now, to explore how we _can_ use data to do better... :-) 
#

In [None]:
# libraries!
import numpy as np      # numpy is Python's "array" library
import pandas as pd     # Pandas is Python's "data" library ("dataframe" == spreadsheet)

In [None]:
# let's read in our flower data...
#
# the file name is relative, so iris.csv needs to be in the notebook's working directory
#
# for read_csv, use header=0 when row 0 is a header row
#   (for a file in an unusual text encoding, read_csv also accepts encoding="latin1" et al.)
#
filename = 'iris.csv'
df = pd.read_csv(filename, header=0)   # df holds the raw, not-yet-cleaned data
print(f"{filename} : file read into a pandas dataframe.")

In [None]:
#
# a dataframe is a "spreadsheet in Python"   (seems to have an extra column!)
#
# these two options control how many rows appear when a dataframe is displayed
pd.set_option('display.max_rows', 10)  # longer dataframes get truncated; None for no limit; pandas' default: 60
pd.set_option('display.min_rows', 10)  # rows shown in a truncated view; None means "follow max_rows"; pandas' default: 10
# let's view it!  (a cell's last expression is displayed as a table)
df

In [None]:
#
# let's look at our pandas dataframe   (Aargh: that extra column!)
#
# .info() prints one line per column: its name, its non-null count, and its dtype
df.info()

In [None]:
#
# let's drop that last column (dropping is usually by _name_):
#
#   if you want a list of the column names use df.columns
#
# NOTE(review): this assumes the unwanted column sits at index 5 (the sixth column)
#   of iris.csv -- check df.columns if the file's layout is different
col5name = df.columns[5]  # get column name at index 5
df_clean = df.drop(columns=[col5name])  # drop by name is typical; df itself is left unchanged
df_clean.info()                         # should be happier!

In [None]:
#
# let's keep our column names in variables, for reference
#
# COLUMNS is a pandas Index (pandas' own kind of "list" of column names);
#   it can be indexed and looped over just like a Python list of strings
COLUMNS = df_clean.columns
print(f"COLUMNS is {COLUMNS}\n")
print(f"COLUMNS[0] is {COLUMNS[0]}\n")

# COL_INDEX goes the other way: give it a column name (the key)
#   and it hands back that column's position (the value)
COL_INDEX = {name: i for i, name in enumerate(COLUMNS)}
print(f"COL_INDEX is {COL_INDEX}")

In [None]:
#
# let's look at our cleaned-up dataframe...
#
df_clean.info()   # compare the "Non-Null Count" of each column
#
# notice that the non-null is _different_ for irisname!
#   (a lower non-null count means some rows have no value in that column)
df_clean   # show a table! (the problem rows are the last two...)

In [None]:
#
# typically, after dropping columns we don't want,
#   we drop rows with missing data (other approaches are possible, too)
#
# dropna() hands back a new dataframe, so df_clean is still available unchanged
#
df_full = df_clean.dropna()   # this removes all rows with nan items
df_full.info()                # it's "full" because it has no nan items
df_full                       # displayed as a table (only comments follow it in this cell)
#
# notice that _all_ of the rows now have 142 non-null items
#    also, the last row isn't real data... we'll handle it next

In [None]:
# 
# get rid of last row!
#
# iloc[0:-1] selects every row except the last one.
# .copy() makes df_final its own independent dataframe instead of a slice of df_full,
#   so assigning to one of its columns later does not trigger pandas' SettingWithCopyWarning
df_final = df_full.iloc[0:-1].copy()
# this cell is safe to re-run: df_final is always rebuilt from df_full
#   (re-running it does undo any later changes made to df_final, though)
print(df_final.shape)
df_final

In [None]:
# all of scikit-learn's ML routines need numbers, not strings
#   ... even for categories/classifications (like species!)
#   so, we will convert the flower-species to numbers:

SPECIES = ['setosa', 'versicolor', 'virginica']   # int to str: a name's position is its number
SPECIES_INDEX = {species: number for number, species in enumerate(SPECIES)}   # str to int

def convert_species(speciesname):
    """ Look up the integer category for a species name.

        input: speciesname, one of the strings in SPECIES
        output: that species' index (a unique integer), from SPECIES_INDEX
        raises: KeyError if speciesname is not a key of SPECIES_INDEX
    """
    return SPECIES_INDEX[speciesname]

# Let's try it out...
for name in SPECIES:
    print(f"{name} maps to {convert_species(name)}")

In [None]:
#
# we can "apply" to a whole column
#   it may give a warning, but this is ok...
#
# .apply calls convert_species once per value in the column;
#   the results (the integers 0, 1, 2) then replace the species-name strings

df_final['irisname'] = df_final['irisname'].apply(convert_species)

# Don't run this twice!   Why?!  What's "KeyError: 0"?
#   After the first run the column holds integers, not names. On a second run,
#   convert_species looks up the integer 0 in SPECIES_INDEX, whose keys are
#   the species-name strings -- so the lookup fails with KeyError: 0.
#   (for sure, you can always go back and re-establish definitions)

# don't worry about the (possible)  "SettingWithCopyWarning" here...

In [None]:
#
# let's see it!  (this is safe to run many times...)
#
df_final         # print(df_final.to_string())  # for _all_ rows...

In [None]:
#
# let's convert our dataframe to a numpy array, named A
#    Our ML library, scikit-learn operates entirely on numpy arrays.
#
# A has one row per flower; later cells read the four measurements
#    from columns 0-3 and the species number from column 4
#
A = df_final.values    # .values gets the numpy array  (df_final.to_numpy() is the newer spelling)
print(A)

In [None]:
#
# let's make sure it's all floating-point, so we can multiply and divide
#
# astype returns a new array, so re-running this cell is harmless
# note that the species column becomes floating-point, too (0.0, 1.0, 2.0)
#
A = A.astype('float64')  # so many:  www.tutorialspoint.com/numpy/numpy_data_types.htm
print(A)

In [None]:
#
# nice to have NUM_ROWS and NUM_COLS around
#
# A.shape is the pair (number of rows, number of columns), unpacked here into two names
NUM_ROWS, NUM_COLS = A.shape
print(f"\nThe dataset has {NUM_ROWS} rows and {NUM_COLS} cols")

In [None]:
# let's use all of our variables, to reinforce names...

# choose a row index, n:
n = 132
flower = A[n]                        # one row of A: the measurements, then the species number
print(f"flower #{n} is {flower}")

for i, colname in enumerate(COLUMNS):
    value = flower[i]
    if colname == 'irisname':
        # the species is stored as a float in A: turn it back into an int, then into a name
        species_num = int(value)
        species = SPECIES[species_num]
        print(f"  Its {colname} is {species} ({species_num})")
    else:
        print(f"  Its {colname} is {value}")

In [None]:
#
# We don't have to use scikit-learn to implement n.n.!
#

#
# data-driven predictive model (1-nearest-neighbor)
#
dist = np.linalg.norm  # built in to numpy: the length of a vector, so dist(a-b) is the distance from a to b
NUM_ROWS, NUM_COLS = A.shape  # data size

def predictive_model( Features ):
    """ 1-nearest-neighbor prediction: find the flower in A whose four
        measurements are closest (in straight-line distance) to Features,
        and predict that flower's species.

        input: a list of four features
                [ sepallen, sepalwid, petallen, petalwid ]
        output: the predicted species of iris, as a string, from
                  setosa (0), versicolor (1), virginica (2)

        Reads the notebook-level names A (columns 0-3 are the features,
        column 4 is the species number), SPECIES, and dist.
    """
    our_features = np.asarray(Features)   # make a numpy array

    # row 0 is the best candidate so far, simply because it's the first one
    closest_flower   = A[0]
    closest_distance = dist(our_features - closest_flower[0:4])

    # Compare against every remaining row. We loop over A itself rather than over
    # range(NUM_ROWS): NUM_ROWS is a notebook-wide name that later cells reassign,
    # and this function should always search exactly the rows that A has.
    for current_flower in A[1:]:
        current_distance = dist(our_features - current_flower[0:4])

        if current_distance < closest_distance:   # strict <, so the earliest row wins a tie
            closest_distance = current_distance   # remember closest!
            closest_flower = current_flower

    # done comparing with every flower in the dataset
    predicted_species = int(round(closest_flower[4]))   # stored as a float; round protects from fp error
    name = SPECIES[predicted_species]
    return f"{name} ({predicted_species})"
    
#
# Try it!
#
# (to enter measurements by hand, uncomment the next line)
# Features = eval(input("Enter new Features: "))
#
Features = [ 4.6, 3.6, 3.0, 1.2 ]      # [ sepallen, sepalwid, petallen, petalwid ]
result = predictive_model( Features )   # the species of the closest flower in A
print(f"I predict {result} from Features {Features}")

In [None]:
#
# but, we don't have to write our own ... because
#
#     we want knn for any k!
#     we want an already-debugged algorithm!
#     we want to ask iris q'ns instead of implementation ones... (?)
#

In [None]:
print("+++ Start of data definitions +++\n")

# Slicing a numpy array gives a "view" that shares memory with the original, so
# changing X_all in place (as re-weighting does) would silently change A as well.
# .copy() gives X_all and y_all their own data and leaves A untouched.
X_all = A[:,0:4].copy()  # X (features) ... is all rows, columns 0, 1, 2, 3
y_all = A[:,4].copy()    # y (labels) ... is all rows, column 4 only

print(f"X_all (just features) is \n {X_all}")
print(f"y_all (just labels)   is \n {y_all}")

In [None]:
#
# we can re-weight different features here...
#

COL_WEIGHT = {              # could be called Feature weight...
    'sepallen':1.0,
    'sepalwid':1.0,
    'petallen':1.0,
    'petalwid':1.0,
}

# Start from a fresh, unweighted copy of the feature columns each time this cell runs:
#   + A[:,0:4] on its own is a numpy "view" of A, so multiplying it in place would
#     silently rescale A too (and A is what the 1-nearest-neighbor model searches)
#   + without a fresh start, re-running this cell would apply the weights a second time
X_all = A[:,0:4].copy()

for colname in COL_WEIGHT:
    i = COL_INDEX[colname]    # get the column index, i, of the colname
    weight = COL_WEIGHT[colname]  # from the dictionary above
    print("Weighting", colname, "by", weight)
    # weighting == "multiplying"
    X_all[:,i] *= weight   # multiply this column ("feature") by its weight

# NOTE(review): with any weight other than 1.0, new Features given to a model trained
#   on X_all need the same weights applied first -- the predictive_model cells that
#   call knn_model.predict / knn_model_final.predict pass their Features unweighted.

In [None]:
#
# we scramble the data, to give a different TRAIN/TEST split each time...
#
# no random seed is set, so each run produces a different ordering
#   (calling np.random.seed(some_number) first would make the run repeatable)
#
indices = np.random.permutation(len(y_all))  # indices is a permutation-list: 0..len-1 in random order

# we scramble both X and y, necessarily with the same permutation
#   (so that each row of features stays paired with its own label)
X_labeled = X_all[indices]              # we apply the _same_ permutation to each!
y_labeled = y_all[indices]              # again...
print(X_labeled)
print(y_labeled)

In [None]:
#
# We next separate into test data and training data ... 
#    + We will train on the training data...
#    + We will _not_ look at the testing data to build the model
#
# Then, afterward, we will test on the testing data -- and see how well we do!
#

#
# a common convention:  train on 80%, test on 20%    Let's define the TEST_PERCENT
#
NUM_ROWS = len(X_labeled)                  # the number of labeled rows
TEST_PERCENT = 0.20
TEST_SIZE = int(TEST_PERCENT*NUM_ROWS)     # int() drops the fraction: no harm in rounding down

# the first TEST_SIZE rows are held out for testing; everything after them is for training
X_test, X_train = X_labeled[:TEST_SIZE], X_labeled[TEST_SIZE:]
y_test, y_train = y_labeled[:TEST_SIZE], y_labeled[TEST_SIZE:]

print(f"training with {len(y_train)} rows;  testing with {len(y_test)} rows" )

In [None]:
#
# +++ This is the "Model-building and Model-training Cell"
#
# Create a kNN model and train it!
#
from sklearn.neighbors import KNeighborsClassifier

# k is the number of nearest training flowers consulted for each prediction
k = 84   # we don't know what k to use, so we guess!  (this will _not_ be a good value)
knn_model = KNeighborsClassifier(n_neighbors=k)       # here, k is the "k" in kNN

# we train the model (it's one line!)
#   only the training rows are used; the test rows stay unseen
knn_model.fit(X_train, y_train)                              # yay!  trained!
print("Created and trained a knn classifier with k =", k)

In [None]:
#
# +++ This is the "Model-testing Cell"
#
# Now, let's see how well we did on our "held-out data" (the testing data)
#

# We run our test set!
predicted_labels = knn_model.predict(X_test)   # the model's guess for each test row
actual_labels = y_test                         # the true species numbers for those rows

# Let's print them so we can compare...
print("Predicted labels:", predicted_labels)
print("Actual  labels  :", actual_labels)

# And, some overall results
#   comparing the two arrays with == gives one True/False per row,
#   and summing those counts the True values (the matches)
num_correct = sum(predicted_labels == actual_labels)
total = len(actual_labels)
print(f"\nResults on test set:  {num_correct} correct out of {total} total.")

In [None]:
#
# Let's print these more helpfully, in a vertical table
#

def compare_labels(predicted_labels, actual_labels):
    """ Print a row-by-row table of predicted vs. actual species names,
        marking each mismatch as "incorrect".

        input: predicted_labels and actual_labels, two sequences of
               species numbers (indices into SPECIES), compared position by position
        output: the number of positions at which the two agree
                (the table and a summary line are printed along the way)
    """
    NUM_LABELS = len(predicted_labels)
    num_correct = 0

    for i, raw_prediction in enumerate(predicted_labels):
        # labels may arrive as floats: round protects from fp error
        p = int(round(raw_prediction))
        a = int(round(actual_labels[i]))

        if p == a:
            result = ""           # a match: nothing to flag
            num_correct += 1      # and we count it!
        else:
            result = "incorrect"

        print(f"row {i:>3d} : {SPECIES[p]:>12s} {SPECIES[a]:<12s}   {result}")   

    print()
    print("Correct:", num_correct, "out of", NUM_LABELS)
    return num_correct

In [None]:
#
# let's try it out!
#
# the table is printed; the returned count of correct rows is also
#   displayed, since this call is the cell's last expression

compare_labels(predicted_labels,actual_labels)

In [None]:
#
# Ok!  We have our knn model, we could just use it...
#

#
# data-driven predictive model (k-nearest-neighbor), using scikit-learn
#

def predictive_model( Features ):
    """ Predict a species by asking the trained scikit-learn model, knn_model.

        input: a list of four features
                [ sepallen, sepalwid, petallen, petalwid ]
        output: the predicted species of iris, as a string, from
                  setosa (0), versicolor (1), virginica (2)
    """
    # predict expects a 2-D array with one row per flower, hence the extra brackets
    feature_rows = np.asarray([Features])
    predictions = knn_model.predict(feature_rows)

    # one flower in, one prediction out: take it, and make it an int
    species_num = int(round(predictions[0]))
    return f"{SPECIES[species_num]} ({species_num})"
    
#
# Try it!
#
# (to enter measurements by hand, uncomment the next line)
# Features = eval(input("Enter new Features: "))
#
# other feature lists to try are in the comment at the end of the next line
Features = [6.7,3.3,5.7,2.1]  # [5.8,2.7,4.1,1.0] [4.6,3.6,3.0,2.2] [6.7,3.3,5.7,2.1]
result = predictive_model( Features )   # this now uses knn_model
print(f"I predict {result} from Features {Features}")

In [None]:
#
# Except, we didn't really explore whether this was the BEST model we could build!
#
#
# We used k = 84  (a neighborhood size of 84 flowers)
# In a dataset of only 140ish flowers, with three species, this seems like a bad idea!
#
# Perhaps we should try ALL the neighborhood sizes in their own TRAIN/TEST split
# and see which neighborhood size works the best, for irises, at least...
#

In [None]:
#
# to do this, we use "cross validation"
#

from sklearn.model_selection import cross_val_score

#
# cross-validation splits the training set into two pieces:
#   + model-building and model-validation. We'll use "build" and "validate"
#
best_k = None           # the k with the highest average cv accuracy seen so far
best_accuracy = -1.0    # below any real accuracy, so the first k tried always replaces it

for k in range(1,85):
    knn_cv_model = KNeighborsClassifier(n_neighbors=k)   # build knn_model for every k!
    cv_scores = cross_val_score( knn_cv_model, X_train, y_train, cv=5 )  # 5 means 80/20 split
    # print(cv_scores)  # just to see the five scores... 
    average_cv_accuracy = cv_scores.mean()  # mean() is numpy's built-in average function 
    print(f"k: {k:2d}  cv accuracy: {average_cv_accuracy:7.4f}")

    # remember the best k so far
    #   the strict > means that, among tied values of k, the smallest one is kept
    if average_cv_accuracy > best_accuracy:
        best_accuracy = average_cv_accuracy
        best_k = k

# best_k now holds the neighborhood size with the highest average cv accuracy
#   (not simply the last k tried, which is what k itself holds after the loop)
print(f"best_k = {best_k}   yields the highest average cv accuracy.")  # print the best one

In [None]:
#
# Now, we re-create and re-run the  "Model-building and -training Cell"
#
# Now, using best_k instead of the original, randomly-guessed value    How does it do?!
#
# (this import repeats the one in the earlier model-building cell)
from sklearn.neighbors import KNeighborsClassifier
knn_model_tuned = KNeighborsClassifier(n_neighbors=best_k)   # here, we use the best_k

# we train the model (one line!)
#   again on the training rows only, so the test rows remain unseen
knn_model_tuned.fit(X_train, y_train)                              # yay!  trained!
print(f"Created + trained a knn classifier, now tuned with a (best) k of {best_k}")

In [None]:
#
# Re-create and re-run the  "Model-testing Cell"     How does it do with best_k?!
#
# note: this reuses (and so overwrites) the names predicted_labels and actual_labels
#
predicted_labels = knn_model_tuned.predict(X_test)   # the tuned model's guess for each test row
actual_labels = y_test                               # the true species numbers for those rows

# Let's print them so we can compare...
print("Predicted labels:", predicted_labels)
print("Actual labels:", actual_labels)
print()
# and, we'll print our nicer table...
compare_labels(predicted_labels,actual_labels)

In [None]:
#
# Ok!  We have tuned knn to use the "best" value of k...
#
# And, we should really use ALL available data to train our final predictive model:
#
# (X_all and y_all include the rows that were held out for testing above, so
#  this final model can no longer be scored fairly against X_test / y_test)
#

knn_model_final = KNeighborsClassifier(n_neighbors=best_k)   # here, we use the best_k
knn_model_final.fit(X_all, y_all)                              # yay!  trained!
print(f"Created + trained a 'final' knn classifier, with a (best) k of {best_k}")

In [None]:
#
# final predictive model (k-nearest-neighbor), with tuned k + ALL data incorporated
#

def predictive_model( Features ):
    """ Predict a species by asking the final trained model, knn_model_final.

        input: a list of four features
                [ sepallen, sepalwid, petallen, petalwid ]
        output: the predicted species of iris, as a string, from
                  setosa (0), versicolor (1), virginica (2)
    """
    # predict expects a 2-D array with one row per flower, hence the extra brackets
    feature_rows = np.asarray([Features])
    predictions = knn_model_final.predict(feature_rows)

    # one flower in, one prediction out: take it, and make it an int
    species_num = int(round(predictions[0]))
    return f"{SPECIES[species_num]} ({species_num})"
    
#
# Try it!
#
# (to enter measurements by hand, uncomment the next line)
# Features = eval(input("Enter new Features: "))
#
# other feature lists to try are in the comment at the end of the next line
Features = [6.7,3.3,5.7,2.1]  # [5.8,2.7,4.1,1.0] [4.6,3.6,3.0,2.2] [6.7,3.3,5.7,2.1]
result = predictive_model( Features )   # this now uses knn_model_final
print(f"I predict {result} from Features {Features}")

In [None]:
#
# try it on new data!  (grab it from the problem statement)
#
#
# TO DO for hw4pr1:
#       write a loop that will handle _multiple_ new flowers and predict their species...
#

In [None]:
#
# Be sure your results from trying this on the unknown data are here - or above!
#

In [None]:
#
# That's it!  Welcome to the world of model-building workflows!!    
#
#             Our prediction?  We'll be back for more ML! 
#

#
# In fact, the rest of the hw is to run more ML workflows:   Digits, Titanic, Housing, ...
#