In [1]:
#
# hw4pr1iris_cleaner:  data-cleaning for iris modeling and classification
#

#
# Here, our goal is to
# [1] look over the iris.csv data...
# [2] clean it up, removing rows and columns we don't want to use
# [3] save the "cleaned-up data" to a new filename, iris_cleaned.csv

#
# This way, we can use iris_cleaned.csv for _ALL_ of our iris-modeling from here...
#

In [2]:
#
# Side note only!
# # don't copy this cell...
# 
# It's worth noting that we don't _need_ any data at all to create a predictive model!
# # Here is a model that is half hand-built and half random:
#
import random

def predictive_model( Features ):
    """ Guess an iris species from four measurements -- no data required!

        input:  Features, a list of exactly four numbers
                  [ sepallen, sepalwid, petallen, petalwid ]
        output: a string naming the predicted species, one of
                  'setosa (0)', 'versicolor (1)', 'virginica (2)'

        Only petalwid is consulted: a petal width below 1.0 means setosa;
        anything else is a coin flip between the other two species.
    """
    # unpack all four, so that a wrong-length input fails right here
    sepallen, sepalwid, petallen, petalwid = Features

    # the one hand-built rule
    if petalwid < 1.0:
        return 'setosa (0)'

    # no rule for the remaining flowers: choose at random
    return random.choice( ['versicolor (1)', 'virginica (2)'] )
    
#
# Try it!
# 
# Features = eval(input("Enter new Features: "))
#
# A sample flower, as [ sepallen, sepalwid, petallen, petalwid ].
# Its petalwid (1.2) is not below 1.0, so predictive_model chooses randomly
# between versicolor and virginica: the printed species can change per run.
Features = [ 4.6, 3.6, 3.0, 1.2 ] 
result = predictive_model( Features )
print(f"from Features {Features} I predict {result} ")

from Features [4.6, 3.6, 3.0, 1.2] I predict versicolor (1) 


In [3]:
#
# (Next, let's explore how we _can_ use data to do better... :-) 
#

In [4]:
# libraries!
import os               # os.path splits a filename into base name + extension
import numpy as np      # numpy is Python's "array" library
import pandas as pd     # Pandas is Python's "data" library ("dataframe" == spreadsheet)

In [5]:
# let's read in our flower data...
# 
# for read_csv, use header=0 when row 0 is a header row
#   (it is not passed below: read_csv's default, header='infer', already
#    treats the first row as the column names when no names= are given)
# 
filename = 'iris.csv'             # a relative path: looked up in the current working directory
df = pd.read_csv(filename)        # encoding="utf-8" et al.
print(f"{filename} : file read into a pandas dataframe.")

iris.csv : file read into a pandas dataframe.


In [6]:
#
# a dataframe is a "spreadsheet in Python"   (seems to have an extra column!)
#
# let's view it!
df

Unnamed: 0,sepallen,sepalwid,petallen,petalwid,irisname,adapted from https://en.wikipedia.org/wiki/Iris_flower_data_set
0,4.6,3.6,1.000,0.200,setosa,
1,4.3,3.0,1.100,0.100,setosa,
2,5.0,3.2,1.200,0.200,setosa,
3,5.8,4.0,1.200,0.200,setosa,
4,4.4,3.0,1.300,0.200,setosa,
...,...,...,...,...,...,...
140,7.7,2.6,6.900,2.300,virginica,
141,7.4,2.4,,4.200,virginica,
142,4.7,4.7,4.747,4.747,alieniris,
143,4.2,4.2,4.242,4.242,aliiieniris,


In [7]:
#
# Looking at the result, above, we see some things that need to be "tidied":
#
# [1] there's an extra column (holding the reference url)
# [2] there are some flowers not in our three species: setosa, versicolor, virginica
# [3] there is a flower without a species name (irisname)
# [4] there is a virginica flower without a petallen
#

In [8]:
#
# let's look at the dataframe's "info":
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145 entries, 0 to 144
Data columns (total 6 columns):
sepallen                                                           145 non-null float64
sepalwid                                                           145 non-null float64
petallen                                                           144 non-null float64
petalwid                                                           145 non-null float64
irisname                                                           144 non-null object
adapted from https://en.wikipedia.org/wiki/Iris_flower_data_set    0 non-null float64
dtypes: float64(5), object(1)
memory usage: 6.9+ KB


In [9]:
# Let's look at the dataframe's columns:
df.columns

Index(['sepallen', 'sepalwid', 'petallen', 'petalwid', 'irisname',
       'adapted from https://en.wikipedia.org/wiki/Iris_flower_data_set'],
      dtype='object')

In [10]:
# we can drop a series of data (a row or a column)
# they're indicated by numeric value, row~0, col~1, but let's use readable names instead:
ROW, COLUMN = 0, 1     # the two axis numbers, under readable names

# the reference-url column is empty (0 non-null values), so remove it;
# drop hands back a NEW dataframe and leaves df itself as it was
df_clean1 = df.drop(
    labels=['adapted from https://en.wikipedia.org/wiki/Iris_flower_data_set'],
    axis=COLUMN,
)
df_clean1

# df_clean1 is a new dataframe, without that unwanted column

Unnamed: 0,sepallen,sepalwid,petallen,petalwid,irisname
0,4.6,3.6,1.000,0.200,setosa
1,4.3,3.0,1.100,0.100,setosa
2,5.0,3.2,1.200,0.200,setosa
3,5.8,4.0,1.200,0.200,setosa
4,4.4,3.0,1.300,0.200,setosa
...,...,...,...,...,...
140,7.7,2.6,6.900,2.300,virginica
141,7.4,2.4,,4.200,virginica
142,4.7,4.7,4.747,4.747,alieniris
143,4.2,4.2,4.242,4.242,aliiieniris


In [11]:
# and, let's drop the unwanted rows:
ROW = 0
COLUMN = 1

# Rather than hard-coding row numbers (which silently go wrong as soon as
# iris.csv gains, loses, or reorders rows), let the data tell us which rows
# are unwanted: every row whose irisname is not one of our three species.
KNOWN_SPECIES = ['setosa', 'versicolor', 'virginica']

# isin gives True/False per row; a missing (NaN) irisname also yields False,
# so the flower with no species name is caught here, too
is_known = df_clean1['irisname'].isin(KNOWN_SPECIES)

# the row labels of the unwanted flowers (for this file: 142, 143, 144)
unwanted_rows = df_clean1[~is_known].index

df_clean2 = df_clean1.drop(unwanted_rows, axis=ROW)   # a new dataframe; df_clean1 is unchanged
df_clean2

Unnamed: 0,sepallen,sepalwid,petallen,petalwid,irisname
0,4.6,3.6,1.0,0.2,setosa
1,4.3,3.0,1.1,0.1,setosa
2,5.0,3.2,1.2,0.2,setosa
3,5.8,4.0,1.2,0.2,setosa
4,4.4,3.0,1.3,0.2,setosa
...,...,...,...,...,...
137,7.6,3.0,6.6,2.1,virginica
138,7.7,3.8,6.7,2.2,virginica
139,7.7,2.8,6.7,2.0,virginica
140,7.7,2.6,6.9,2.3,virginica


In [12]:
#
# let's re-look at our cleaned-up dataframe's info:
#
df_clean2.info()   
#
# notice that the non-null count is _different_ for petallen...
#

<class 'pandas.core.frame.DataFrame'>
Int64Index: 142 entries, 0 to 141
Data columns (total 5 columns):
sepallen    142 non-null float64
sepalwid    142 non-null float64
petallen    141 non-null float64
petalwid    142 non-null float64
irisname    142 non-null object
dtypes: float64(4), object(1)
memory usage: 6.7+ KB


In [13]:
#
# let's drop _all_ rows with data that is missing/NaN (not-a-number)
# the defaults, spelled out: axis=0 drops ROWS, how='any' drops a row if ANY value is NaN
df_clean3 = df_clean2.dropna(axis=0, how='any')
df_clean3.info()     # the summary first...
# ...and then the table itself, as the cell's displayed value:
df_clean3

# Tidy!

<class 'pandas.core.frame.DataFrame'>
Int64Index: 141 entries, 0 to 140
Data columns (total 5 columns):
sepallen    141 non-null float64
sepalwid    141 non-null float64
petallen    141 non-null float64
petalwid    141 non-null float64
irisname    141 non-null object
dtypes: float64(4), object(1)
memory usage: 6.6+ KB


Unnamed: 0,sepallen,sepalwid,petallen,petalwid,irisname
0,4.6,3.6,1.0,0.2,setosa
1,4.3,3.0,1.1,0.1,setosa
2,5.0,3.2,1.2,0.2,setosa
3,5.8,4.0,1.2,0.2,setosa
4,4.4,3.0,1.3,0.2,setosa
...,...,...,...,...,...
136,7.9,3.8,6.4,2.0,virginica
137,7.6,3.0,6.6,2.1,virginica
138,7.7,3.8,6.7,2.2,virginica
139,7.7,2.8,6.7,2.0,virginica


In [14]:
#
# let's keep our column names in variables, for reference
#
# the column labels, in left-to-right order
#   (only rows were dropped after df_clean1, so the later frames share them)
#   It's a pandas Index, usable just like a Python list of strings:
COLUMNS = df_clean1.columns
print(f"COLUMNS is {COLUMNS}")
print(f"COLUMNS[0] is {COLUMNS[0]}\n")

# the reverse lookup: column name (key) -> column position (value)
COL_INDEX = {label: position for position, label in enumerate(COLUMNS)}
print(f"COL_INDEX is {COL_INDEX}")
print(f"COL_INDEX[ 'petallen' ] is {COL_INDEX[ 'petallen' ]}")

COLUMNS is Index(['sepallen', 'sepalwid', 'petallen', 'petalwid', 'irisname'], dtype='object')
COLUMNS[0] is sepallen

COL_INDEX is {'sepallen': 0, 'sepalwid': 1, 'petallen': 2, 'petalwid': 3, 'irisname': 4}
COL_INDEX[ 'petallen' ] is 2


In [15]:

# all of scikit-learn's ML routines need numbers, not strings
#   ... even for categories/classifications (like species!)
#   so, we will convert the flower-species to numbers

#
# First, let's map our different species to numeric values:

# position -> name   (int to str)
SPECIES = ['setosa','versicolor','virginica']
# name -> position   (str to int), built from SPECIES so the two always agree
SPECIES_INDEX = {species: number for number, species in enumerate(SPECIES)}

def convert_species(speciesname):
    """ look up the integer category (0, 1, or 2) for one species name;
        a name that is not a key of SPECIES_INDEX raises a KeyError
    """
    species_number = SPECIES_INDEX[speciesname]
    return species_number

# Let's try it out...
for name in SPECIES:
    print(f"{name} maps to {convert_species(name)}")

setosa maps to 0
versicolor maps to 1
virginica maps to 2


In [16]:
convert_species( 'virginica')

2

In [17]:
#
# we can "apply" to a whole column and create a new column
#   it may give a warning, but this is ok...
#

# assign builds a brand-new dataframe: every column of df_clean3, plus a new
# 'irisnum' column holding convert_species(...) of each row's irisname
df_clean4 = df_clean3.assign(irisnum=df_clean3['irisname'].apply(convert_species))

# let's see...
df_clean4

Unnamed: 0,sepallen,sepalwid,petallen,petalwid,irisname,irisnum
0,4.6,3.6,1.0,0.2,setosa,0
1,4.3,3.0,1.1,0.1,setosa,0
2,5.0,3.2,1.2,0.2,setosa,0
3,5.8,4.0,1.2,0.2,setosa,0
4,4.4,3.0,1.3,0.2,setosa,0
...,...,...,...,...,...,...
136,7.9,3.8,6.4,2.0,virginica,2
137,7.6,3.0,6.6,2.1,virginica,2
138,7.7,3.8,6.7,2.2,virginica,2
139,7.7,2.8,6.7,2.0,virginica,2


In [18]:
#
# different versions vary on how to see all rows (adapt to suit your system!)
#   un-comment the first pair below to show up to 150 rows at once;
#   un-comment the second pair to shrink the display back to 10 rows
#
# pd.options.display.max_rows = 150   # None for no limit; default: 60
# pd.options.display.min_rows = 150   # None for no limit; default: 10
# pd.options.display.max_rows = 10   # None for no limit; default: 60
# pd.options.display.min_rows = 10   # None for no limit; default: 10
df_clean4


Unnamed: 0,sepallen,sepalwid,petallen,petalwid,irisname,irisnum
0,4.6,3.6,1.0,0.2,setosa,0
1,4.3,3.0,1.1,0.1,setosa,0
2,5.0,3.2,1.2,0.2,setosa,0
3,5.8,4.0,1.2,0.2,setosa,0
4,4.4,3.0,1.3,0.2,setosa,0
...,...,...,...,...,...,...
136,7.9,3.8,6.4,2.0,virginica,2
137,7.6,3.0,6.6,2.1,virginica,2
138,7.7,3.8,6.7,2.2,virginica,2
139,7.7,2.8,6.7,2.0,virginica,2


In [19]:
#
# let's call it df_tidy 
#   (a plain assignment: df_tidy is a second name for the very same
#    dataframe object as df_clean4 -- no copy is made)
#
df_tidy =  df_clean4


In [20]:
#
# That's it!  Now, let's write it out to iris_cleaned.csv

# We'll construct the new filename:
#   os.path.splitext splits off the extension whatever its length, so this
#   keeps working for names like "iris.tsv" or "iris.data" (slicing off the
#   last 4 characters would only be right for 3-letter extensions)
old_basename = os.path.splitext(filename)[0]      # remove the ".csv"
cleaned_filename = old_basename + "_cleaned.csv"  # name-creating
print(f"cleaned_filename is {cleaned_filename}")

# Now, save
#   index_label=False leaves the index column's header field out; the row
#   labels themselves ARE still written, as the first field of every data row
df_tidy.to_csv(cleaned_filename, index_label=False)

cleaned_filename is iris_cleaned.csv


In [21]:
#
# Let's make sure this worked!
#

# let's read in our flower data...
# 
# for read_csv, use header=0 when row 0 is a header row
# 
df_tidy_reread = pd.read_csv(cleaned_filename)   # encoding="utf-8" et al.
# report the file that was actually just read: the cleaned one, not the original
print(f"{cleaned_filename} : file read into a pandas dataframe.")
df_tidy_reread

iris.csv : file read into a pandas dataframe.


Unnamed: 0,sepallen,sepalwid,petallen,petalwid,irisname,irisnum
0,4.6,3.6,1.0,0.2,setosa,0
1,4.3,3.0,1.1,0.1,setosa,0
2,5.0,3.2,1.2,0.2,setosa,0
3,5.8,4.0,1.2,0.2,setosa,0
4,4.4,3.0,1.3,0.2,setosa,0
...,...,...,...,...,...,...
136,7.9,3.8,6.4,2.0,virginica,2
137,7.6,3.0,6.6,2.1,virginica,2
138,7.7,3.8,6.7,2.2,virginica,2
139,7.7,2.8,6.7,2.0,virginica,2


In [22]:
#
# Let's make sure we have all of our helpful variables in one place 
#       To be adapted if we drop/add more columns...
#

#
# let's keep our column names in variables, for reference
#
# the tidy dataframe's column labels, in left-to-right order
#   It's a pandas Index, usable just like a Python list of strings:
COLUMNS = df_tidy.columns
print(f"COLUMNS is {COLUMNS}\n")
print(f"COLUMNS[0] is {COLUMNS[0]}\n")

# pair each column name (key) with its position (value) for lookups by name
COL_INDEX = dict(zip(COLUMNS, range(len(COLUMNS))))
print(f"COL_INDEX is {COL_INDEX}\n\n")


#
# and our "species" names
#

# all of scikit-learn's ML routines need numbers, not strings
#   ... even for categories/classifications (like species!)
#   so, we will convert the flower-species to numbers:

SPECIES = ['setosa','versicolor','virginica']             # number -> name  (int to str)
SPECIES_INDEX = dict(zip(SPECIES, range(len(SPECIES))))   # name -> number  (str to int)

def convert_species(speciesname):
    """ translate a species name into its integer category:
          'setosa' -> 0, 'versicolor' -> 1, 'virginica' -> 2
        (any other name raises a KeyError from the dictionary lookup)
    """
    return SPECIES_INDEX[speciesname]

# Let's try it out...
for name in SPECIES:
    print(f"{name} maps to {convert_species(name)}")

COLUMNS is Index(['sepallen', 'sepalwid', 'petallen', 'petalwid', 'irisname', 'irisnum'], dtype='object')

COLUMNS[0] is sepallen

COL_INDEX is {'sepallen': 0, 'sepalwid': 1, 'petallen': 2, 'petalwid': 3, 'irisname': 4, 'irisnum': 5}


setosa maps to 0
versicolor maps to 1
virginica maps to 2


In [23]:
#
# That's it!  Welcome to the world of data-cleaning workflows!!    
#
#             Our prediction?  You'll be headed to the "modeler" next! 
#

#
# And, the rest of the hw is to run more ML workflows:   Births, Digits, Titanic, Housing, ...
#