Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
84 lines (63 sloc) 2.54 KB
'''
SOLUTIONS: "Human Learning" with iris data
'''
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# load the famous iris data
iris = load_iris()
# what do you think these attributes represent?
# NOTE: each bare expression below only displays a value when it is evaluated
# in an interactive session; run as a script, these lines produce no output.
iris.data               # feature values; used further down as the DataFrame's data
iris.data.shape         # dimensions of the feature array
iris.feature_names      # labels for the columns of iris.data (used as column names below)
iris.target             # integer code per observation; used below to index iris.target_names
iris.target_names       # species names, looked up by the integer codes in iris.target
# intro to numpy
type(iris.data)         # presumably numpy.ndarray -- confirm interactively
## PART 1: Read data into pandas and explore
# read iris.data into a pandas DataFrame (df), including column names
df = pd.DataFrame(iris.data, columns=iris.feature_names)
# clean up column names: drop the ' (cm)' unit explicitly (instead of slicing
# off a fixed number of characters, which would silently truncate any name
# that lacks the unit) and use underscores so that columns can be accessed as
# attributes, e.g. df.sepal_length
features = [name.replace(' (cm)', '').replace(' ', '_')
            for name in iris.feature_names]
# read into pandas again, with better column names
df = pd.DataFrame(iris.data, columns=features)
# create a list of species (150 elements) using iris.target and iris.target_names
species = [iris.target_names[num] for num in iris.target]
# vectorized alternative: index the array of names with the array of integer
# codes; unlike a nested np.where this works for any number of classes
species = np.asarray(iris.target_names)[iris.target]
# add the species list as a new DataFrame column
df['species'] = species
# explore data numerically, looking for differences between species
# (bare expressions: results are only displayed in an interactive session)
df.describe()
df.groupby('species').sepal_length.mean()
# select several columns with a list (double brackets); selecting with a bare
# tuple of names is no longer supported by pandas
df.groupby('species')[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']].mean()
# pass aggregations by name; handing numpy callables to agg() is deprecated
df.groupby('species').agg('mean')
df.groupby('species').agg(['min', 'max'])
df.groupby('species').describe()
# explore data by sorting, looking for differences between species
# (sort_values replaces the removed DataFrame.sort_index(by=...) spelling;
# .values shows the sorted rows as a plain array, species in the last column)
df.sort_values(by='sepal_length').values
df.sort_values(by='sepal_width').values
df.sort_values(by='petal_length').values
df.sort_values(by='petal_width').values
# explore data visually, looking for differences between species
# one histogram of petal width per species, on a shared x axis for comparison
df.petal_width.hist(by=species, sharex=True)
# box plot of petal width grouped by species
df.boxplot(column='petal_width', by='species')
# box plots of every numeric column grouped by species
df.boxplot(by='species')
# scatter plot of petal length vs. petal width, colored by the integer species code
df.plot(x='petal_length', y='petal_width', kind='scatter', c=iris.target)
# pairwise scatter plots of all numeric columns, colored by species code
# (scatter_matrix lives in pandas.plotting; the top-level pd.scatter_matrix
# alias has been removed)
pd.plotting.scatter_matrix(df, c=iris.target)
## PART 2: Write a function to predict the species for each observation
# map each column name to its positional index so that the values of a row
# (a plain array) can be looked up by column name
col_ix = dict(zip(df.columns, range(len(df.columns))))
# define function that takes in a row of data and returns a predicted species
def classify_iris(data):
    '''
    Predict the iris species for a single observation using two hand-picked
    thresholds.

    data: one row of values, indexable by position; positions are resolved
          through the module-level col_ix mapping (column name -> index),
          which must contain 'petal_length' and 'petal_width'.

    Returns 'setosa', 'versicolor' or 'virginica'.
    '''
    # petal length below 3 is classified as setosa
    if data[col_ix['petal_length']] < 3:
        return 'setosa'
    # otherwise petal width decides between the two remaining species
    elif data[col_ix['petal_width']] < 1.8:
        return 'versicolor'
    else:
        return 'virginica'
# run the classifier over every row of the DataFrame and collect the
# predicted labels in a numpy array
preds = np.array(list(map(classify_iris, df.values)))
# accuracy: fraction of rows whose predicted label matches the true species
np.mean(df.species.values == preds)