Data from https://www.kaggle.com/arshid/iris-flower-dataset
License: CC0, Public Domain

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# You may not need this, it depends on how you show this notebook - but it won't hurt to run even if you don't need it
# It makes sure the charts are shown between the cells
%matplotlib inline

In [None]:
# Read the iris measurements from the CSV file into a dataframe,
# then preview its first 5 rows
df = pd.read_csv('IRIS.csv')
df.head(5)

In [None]:
# Preview the last 5 rows - note that the rows are sorted by species
df.tail(5)

In [None]:
# Some quick summary statistics for each column of measurements
# count: number of values, mean: average
# std: standard deviation (how much the values deviate from the average)
# min, max: smallest and largest value
# 25%, 50%, 75%: percentiles - if you sort the values of a column, about a quarter of them
# lie at or below the 25% value (roughly the 37th of our 150 measurements), and so on
df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Histogram of sepal_length: every flower goes into a 'bin' based on its sepal length
# (4-4.5, 4.5-5, 5-5.5, etc), and the chart shows how many flowers ended up in each bin
# The bin edges run from 4 to 8 in steps of 0.5
df['sepal_length'].hist(bins=[edge / 2.0 for edge in range(8, 17)])

In [None]:
# Scatter (x y) plot of all 150 flowers
# x coordinate (left-right): the sepal length
# y coordinate (up-down): the sepal width
# It looks like there might be a few different groups
df.plot.scatter(x='sepal_length', y='sepal_width')

In [None]:
# Ditto, but using a different colour for each type of iris
# Wrapped into a function so we can use it for different combinations of columns

def compare_columns(column_a, column_b):
    """Scatter-plot two columns of ``df``, using one colour per species.

    Opens a new figure and draws one scatter series for every distinct value
    in ``df['species']``. The axes are labelled with the column names and a
    legend shows which colour belongs to which species.

    Args:
        column_a: Name of the ``df`` column used for the x axis (left-right).
        column_b: Name of the ``df`` column used for the y axis (up-down).
    """
    # Start a new, empty figure for this chart
    plt.figure()

    # For each different type of species
    for species in df['species'].unique():

        # Create a dataframe with the data just for this species
        df_for_species = df[df['species'] == species]

        # Show this as a scatter plot - each call to .scatter uses a different colour,
        # and the label tells the legend which species that colour stands for
        plt.scatter(column_a, column_b, data=df_for_species, label=species)

    # Without axis names and a legend the colours and directions can't be read
    plt.xlabel(column_a)
    plt.ylabel(column_b)
    plt.legend(title='species')
    
# Note that each time .scatter is called, it will choose a different colour



In [None]:
compare_columns(column_a='sepal_length', column_b='sepal_width')
# Note that there seem to be two or three different point 'clouds'

In [None]:
compare_columns(column_a='petal_length', column_b='petal_width')
# Even more obvious - the 3 different types of irises give us different measurements

## Automatic categorisation
When we put all four measurements (petal length/width, sepal length/width) together we get a 4 dimensional point cloud, but there is no way to visualise that.

Our charts above 'flatten' the 4 dimensions into 2. It looks like the clouds overlap, but maybe one is completely behind the other.

When we get beyond 2 or 3 dimensions it becomes difficult for humans to see any groupings.
Python's scikit-learn library has some functions to group points for us, as we'll see next.

In [None]:
# Let's pretend we don't yet know which species each iris is - all we've got is the measurements
# Build a new dataframe that holds our four measurements but leaves out the 'species' column,
# as if we had just gone out into the field and taken the measurements ourselves
measurements = df.drop(columns='species')

In [None]:
# Preview the first 5 rows of the measurements
measurements.head(5)

In [None]:
# First a quick peek - it looks like there might be two different groups
measurements.plot.scatter(x='sepal_length', y='sepal_width')

In [None]:
# We'll use the k-means clustering algorithm to try to group (cluster) the measurements
# K-means needs to know how many groups we want. We'll say 2
# K-means starts from randomly chosen points, so we fix random_state to get the same
# groups (and the same label numbers) every time the notebook is run
# n_init is set explicitly because its default differs between scikit-learn versions
from sklearn.cluster import KMeans
model = KMeans(n_clusters=2, n_init=10, random_state=0)

In [None]:
# "Fit" the clustering model to our measurements, so that it works out the groups
model.fit(X=measurements)

In [None]:
# Our model has given each set of measurements (each row) a label: 0 or 1
# The label says which group (cluster) that row was put in
# The numbers are only names for the groups - which group is called 0 and which is called 1
# has no special meaning
model.labels_

In [None]:
# Here is another scatter plot, using a different colour for each group
fig = plt.figure()

# For each cluster the model was asked to find
# (taken from the model itself, so this keeps working if the number of clusters changes)
for cluster in range(model.n_clusters):

    # Create a dataframe with just the rows that were given this cluster label
    cluster_measurements = measurements[model.labels_ == cluster]

    # Show this as a scatter plot; the label is picked up by the legend below
    plt.scatter('petal_length', 'petal_width', data=cluster_measurements,
                label='cluster {}'.format(cluster))

# Name the axes and show which colour belongs to which cluster
plt.xlabel('petal_length')
plt.ylabel('petal_width')
plt.legend()
plt.show()

In [None]:
# Note how the chart above doesn't look quite right -
# some points seem to have been put into the wrong group
# We'll try this again, but now aim for 3 groups instead of 2
# random_state is fixed so that every run of the notebook gives the same groups,
# and n_init is set explicitly because its default differs between scikit-learn versions
model = KMeans(n_clusters=3, n_init=10, random_state=0)
model.fit(measurements)

In [None]:
# And the plot again, with one colour per cluster
fig = plt.figure()

# Loop over however many clusters the model was asked to find
for cluster in range(model.n_clusters):
    # Keep only the rows that were given this cluster label
    cluster_measurements = measurements[model.labels_ == cluster]
    plt.scatter('petal_length', 'petal_width', data=cluster_measurements,
                label='cluster {}'.format(cluster))

# Name the axes and show which colour belongs to which cluster
plt.xlabel('petal_length')
plt.ylabel('petal_width')
plt.legend()
plt.show()

In [None]:
# The same kind of plot, but for different measurements: sepal length against sepal width
fig = plt.figure()

# Loop over however many clusters the model was asked to find
for cluster in range(model.n_clusters):
    # Keep only the rows that were given this cluster label
    cluster_measurements = measurements[model.labels_ == cluster]
    plt.scatter('sepal_length', 'sepal_width', data=cluster_measurements,
                label='cluster {}'.format(cluster))

# Name the axes and show which colour belongs to which cluster
plt.xlabel('sepal_length')
plt.ylabel('sepal_width')
plt.legend()
plt.show()

In [None]:
# The charts above seem to be better, although there may still be some overlap
# Remember that this should be a 4 dimensional chart, so maybe one point cloud is behind the other
# If we could rotate this (in 4D?) we could see this better