In [None]:
#
# hw4pr1iris_cleaner:  data-cleaning for iris modeling and classification
#

In [None]:
#
# We don't need any data at all to create a predictive model!
#
import random

def predictive_model( Features ):
    """ Guess an iris species from four measurements -- no training data used.

        input:  Features, a list of exactly four numbers
                  [ sepallen, sepalwid, petallen, petalwid ]
        output: a string naming the predicted species, one of
                  'setosa (0)', 'versicolor (1)', 'virginica (2)'
    """
    # unpacking all four names also rejects a list of the wrong length
    _sepal_length, _sepal_width, _petal_length, petal_width = Features

    # the one rule we have: a narrow petal means setosa
    if petal_width < 1.0:
        return 'setosa (0)'

    # the rule says nothing about the other two species: pick one at random
    remaining_species = ['versicolor (1)', 'virginica (2)']
    return random.choice( remaining_species )
    
#
# Try it!
#
# For interactive input, uncomment the next line.
# NOTE(review): eval() runs whatever is typed in; ast.literal_eval would
#   parse a typed list without executing arbitrary code -- consider it here
# Features = eval(input("Enter new Features: "))
#
Features = [ 4.6, 3.6, 3.0, 0.2 ] 
result = predictive_model( Features )   # petalwid is 0.2 here, below the 1.0 cutoff
print(f"I predict {result} from Features {Features}")

In [None]:
#
# (now, to explore how we _can_ use data to do better... :-) 
#

In [None]:
# libraries!
import numpy as np      # numpy is Python's "array" library
import pandas as pd     # Pandas is Python's "data" library ("dataframe" == spreadsheet)

In [None]:
# let's read in our flower data...
#
# for read_csv, use header=0 when row 0 is a header row
#   (df is the raw dataframe; the cells below clean it up step by step)
#
filename = 'iris.csv'                  # a relative path: looked up in the current working directory
df = pd.read_csv(filename, header=0)   # encoding="latin1" et al.
print(f"{filename} : file read into a pandas dataframe.")

In [None]:
#
# a dataframe is a "spreadsheet in Python"   (seems to have an extra column!)
#
# limit how many rows are printed when a dataframe is displayed
pd.set_option('display.max_rows', 10)  # None for no limit; pandas' default: 60
pd.set_option('display.min_rows', 10)  # rows shown once max_rows is exceeded; default: 10
# let's view it!
df

In [None]:
#
# let's look at our pandas dataframe   (Aargh: that extra column!)
#   df.info() lists every column with its non-null count and dtype
#
df.info()

In [None]:
#
# let's drop that last column (dropping is usually by _name_):
#
#   if you want a list of the column names use df.columns
# NOTE(review): position 5 is assumed to be the unwanted extra column -- confirm against df.info()
name_to_drop = df.columns[5]  # get column name at index 5
print(f"dropping {name_to_drop}")
df_clean = df.drop(columns=[name_to_drop])  # drop by name is typical; the result is bound to a new name
df_clean.info()                         # should be happier!

In [None]:
#
# keep the column names -- and where each one sits -- handy for later
#
COLUMNS = df_clean.columns            # a pandas Index: use it like a Python list of strings
print(f"COLUMNS is {COLUMNS}\n")
print(f"COLUMNS[0] is {COLUMNS[0]}\n")

# name -> position lookup, built in one pass over the column names
COL_INDEX = {column_name: position for position, column_name in enumerate(COLUMNS)}
print(f"COL_INDEX is {COL_INDEX}")

In [None]:
#
# let's look at our cleaned-up dataframe...
#
df_clean.info()   
#
# notice that the non-null count is _different_ for irisname!
df_clean   # show a table! (the problem rows are the last two...)

In [None]:
#
# typically, after dropping columns we don't want, 
#   we drop rows with missing data (other approaches are possible, too)
#
df_full = df_clean.dropna()   # this removes all rows with nan items (df_clean itself is kept)
df_full.info()                # it's "full" because it has no nan items
df_full
#
# notice that _all_ of the columns now report the same non-null count (142)
#    also, the last row isn't real data... we'll handle it next

In [None]:
#
# get rid of last 2 rows!
#
# iloc[0:-2] keeps every row except the final two
df_final = df_full.iloc[0:-2]   # not the syntax I would choose
# re-running this cell rebuilds df_final from df_full, so the result is the
#   same each time (any column added to df_final later has to be re-added)
# NOTE(review): df_final is a slice of df_full; appending .copy() here would
#   make it an independent dataframe before columns are added to it
print(df_final.shape)
df_final

In [None]:

# scikit-learn's ML routines work on numbers, not strings
#   ... and that includes category labels such as the species,
#   so each species name gets an integer code:

SPECIES = ['setosa','versicolor','virginica']   # int to str
SPECIES_INDEX = {species: code for code, species in enumerate(SPECIES)}  # str to int

def convert_species(speciesname):
    """ look up the integer category for a species name
        (a name that is not in SPECIES_INDEX raises KeyError)
    """
    species_code = SPECIES_INDEX[speciesname]
    return species_code

# quick check of every known species
for name in SPECIES:
    print(f"{name} maps to {convert_species(name)}")

In [None]:
#
# we can "apply" a function to a whole column
#
# .assign returns a _new_ dataframe that includes the extra column, so
#   nothing is written into a slice of another dataframe
#   (which is what makes pandas raise its SettingWithCopyWarning)
#
df_final = df_final.assign(
    irisname_numeric=df_final['irisname'].apply(convert_species)
)

# This is safe to run more than once: 'irisname' keeps its original strings
#   and 'irisname_numeric' is simply recomputed from them.
#   (A "KeyError: 0" would show up only if the numbers were written back into
#    'irisname' itself: a second run would then look up 0 instead of 'setosa'.)

In [None]:
#
# let's see it!  (this is safe to run many times...)
#
df_final         # print(df_final.to_string())  # for _all_ rows...

In [None]:
# the species is now also stored as a number (irisname_numeric), so the string column can go
df_tidy = df_final.drop(columns=["irisname"])  # drop the named species
df_tidy

In [None]:
#
# That's it!  Let's write the tidy data to a new csv file
#
cleaned_filename = filename[:-4] + "_cleaned.csv"  # name-creating: replaces the last 4 characters (".csv")
print(f"cleaned_filename is {cleaned_filename}")

# Now, save
#   index=False leaves the row labels out of the file entirely, so the header
#   and every data row have the same number of fields.
#   (index_label=False only omits the index's header field: the row labels
#    are still written, which leaves the header one field short.)
df_tidy.to_csv(cleaned_filename, index=False)  # no "index" column...

In [None]:
#
# Let's make sure this worked!
#

# read the cleaned file back in, to check the round trip
#
# for read_csv, use header=0 when row 0 is a header row
#
df_tidy = pd.read_csv(cleaned_filename, header=0)   # encoding="utf-8" et al.
# report the file that was actually read (the cleaned one, not the original)
print(f"{cleaned_filename} : file read into a pandas dataframe.")
df_tidy

In [None]:
#
# Let's make sure we have all of our helpful variables in one place
#      (for the next file, the modeler! ...)
#

#
# column names -- and where each one sits -- rebuilt from the tidy dataframe
#
COLUMNS = df_tidy.columns            # a pandas Index: use it like a Python list of strings
print(f"COLUMNS is {COLUMNS}\n")
print(f"COLUMNS[0] is {COLUMNS[0]}\n")

# name -> position lookup, built in one pass over the column names
COL_INDEX = {column_name: position for position, column_name in enumerate(COLUMNS)}
print(f"COL_INDEX is {COL_INDEX}\n\n")


#
# and our species names
#


# scikit-learn's ML routines work on numbers, not strings
#   ... and that includes category labels such as the species,
#   so each species name gets an integer code:

SPECIES = ['setosa','versicolor','virginica']   # int to str
SPECIES_INDEX = {species: code for code, species in enumerate(SPECIES)}  # str to int

def convert_species(speciesname):
    """ look up the integer category for a species name
        (a name that is not in SPECIES_INDEX raises KeyError)
    """
    species_code = SPECIES_INDEX[speciesname]
    return species_code

# quick check of every known species
for name in SPECIES:
    print(f"{name} maps to {convert_species(name)}")

In [None]:
#
# now, we're starting to model...
#

#
# So this cell -- and the two cells above -- should be in the next notebook!
#

#
# let's convert our dataframe to a numpy array, named A
#    Our ML library, scikit-learn, operates entirely on numpy arrays.
#
# df_tidy is the cleaned dataframe read back in above
#   (the name df_more_final is never defined in this notebook,
#    so using it here raised a NameError)
A = df_tidy.values    # .values gets the numpy array
print(A)

In [None]:
#
# That's it!  Welcome to the world of data-cleaning workflows!!    
#
#             Our prediction?  You'll be headed to the "modeler" next! 
#

#
# And, the rest of the hw is to run more ML workflows:   Digits, Titanic, Housing, ...
#