# Load and Explore Iris Dataset
Load the Iris dataset using sklearn, and explore its structure and contents.

In [None]:
# Import necessary libraries
from sklearn.datasets import load_iris
import numpy as np

# Load the Iris dataset as a feature matrix (one row per sample) and a
# vector with one class label per sample
irisX, irisy = load_iris(return_X_y=True)

# Print the shape and type of the dataset
print(f'irisX is a {type(irisX)} with shape {irisX.shape}')
print(f'irisy is a {type(irisy)} with shape {irisy.shape}')

# Define feature names.
# NOTE: the order must match the column order of the sklearn Iris data:
# sepal length, sepal width, petal length, petal width. Every plot label
# in this notebook is looked up from this list by column index, so a wrong
# order silently mislabels the axes.
feature_names = ['sepal length', 'sepal width', 'petal length', 'petal width']

# Print the first 5 rows of Iris data
print('Each row of irisX is a sample with the following features:')
print(feature_names)
print(irisX[:5])

# Create Simple Scatter Plot
Create a basic 2D scatter plot using matplotlib to visualize the first two features of the Iris dataset.

In [None]:
# Create Simple Scatter Plot

# Import matplotlib for plotting (at the top of the cell so the dependency
# is visible before any plotting code)
import matplotlib.pyplot as plt

# Select the features to show (column indices into irisX)
x_axis_feature = 0  # column 0 of the sklearn Iris data: sepal length
y_axis_feature = 1  # column 1 of the sklearn Iris data: sepal width
x = irisX[:, x_axis_feature]
y = irisX[:, y_axis_feature]

# Create the scatter plot and title; each point is coloured by its class
# label from irisy
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(x, y, c=irisy, cmap='viridis', s=50, marker='o')
title = fig.suptitle("Scatter Plot for Iris Data", fontsize=10)

# Add some axis labels, looked up by the same column indices used for the data
ax.set_xlabel(feature_names[x_axis_feature], fontsize=10)
ax.set_ylabel(feature_names[y_axis_feature], fontsize=10)

# Show the plot
plt.show()

# Enhance Scatter Plot with Titles and Labels
Enlarge the title and axis labels and add grid lines to make the scatter plot easier to read.

In [None]:
# Enhance Scatter Plot with Titles and Labels

# Build the figure first, then ask it for a single set of axes
fig = plt.figure(figsize=(10, 5))
ax = fig.subplots()

# One point per sample for the two selected features, coloured by irisy
ax.scatter(x, y, s=50, marker='o', c=irisy, cmap='viridis')

# Figure-level title, larger than in the basic plot
title = fig.suptitle("Enhanced Scatter Plot for Iris Data", fontsize=15)

# Name each axis after the feature it displays
ax.set_ylabel(feature_names[y_axis_feature], fontsize=12)
ax.set_xlabel(feature_names[x_axis_feature], fontsize=12)

# Grid lines make individual values easier to read off
ax.grid(True)

# Render the finished figure
plt.show()

# Activity 1: Experiment with Different Features and Formatting
Experiment with different combinations of features and formatting options for the scatter plot.

In [None]:
# Activity 1: Experiment with Different Features and Formatting

# Experiment with different combinations of features and formatting options for the scatter plot

# Every unordered pair of the four feature columns, in ascending order:
# (0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)
combinations = [(i, j) for i in range(4) for j in range(i + 1, 4)]

# Draw one figure per feature pair
for x_axis_feature, y_axis_feature in combinations:
    # Pull out the two columns being compared
    x, y = irisX[:, x_axis_feature], irisX[:, y_axis_feature]

    # Fresh figure with a single set of axes
    fig = plt.figure(figsize=(10, 5))
    ax = fig.subplots()

    # Points are coloured by the class label in irisy
    ax.scatter(x, y, s=50, marker='o', c=irisy, cmap='viridis')

    # The title names the pair of features on display
    title = fig.suptitle(f"Scatter Plot for Iris Data: {feature_names[x_axis_feature]} vs {feature_names[y_axis_feature]}", fontsize=15)

    # Axis labels come from the same column indices as the data
    ax.set_ylabel(feature_names[y_axis_feature], fontsize=12)
    ax.set_xlabel(feature_names[x_axis_feature], fontsize=12)

    # Grid lines for better readability
    ax.grid(True)

    # Render this figure before moving on to the next pair
    plt.show()

# Experiment with different formatting options
x_axis_feature = 0  # column 0 of the sklearn Iris data: sepal length
y_axis_feature = 1  # column 1 of the sklearn Iris data: sepal width
x = irisX[:, x_axis_feature]
y = irisX[:, y_axis_feature]

# Different marker styles and (edge) colors.
# Every marker here is a *filled* style. For an unfilled marker such as 'x'
# matplotlib warns and ignores the edge color, so the plot would not show
# the color named in its title; 'X' is the filled equivalent.
markers = ['o', '^', 's', 'v', 'X']
colors = ['red', 'blue', 'green', 'purple', 'orange']

# Iterate over each marker and color combination
for marker, color in zip(markers, colors):
    # Create the scatter plot: the fill is still driven by the class label
    # (c=irisy), the experimental color is applied to the marker outline
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.scatter(x, y, c=irisy, cmap='viridis', s=50, marker=marker, edgecolor=color)

    # Add a title to the plot
    title = fig.suptitle(f"Scatter Plot for Iris Data with Marker '{marker}' and Color '{color}'", fontsize=15)

    # Add axis labels
    ax.set_xlabel(feature_names[x_axis_feature], fontsize=12)
    ax.set_ylabel(feature_names[y_axis_feature], fontsize=12)

    # Add grid lines for better readability
    ax.grid(True)

    # Show the plot
    plt.show()

# Implement k-Means Clustering
Use the k-Means algorithm from sklearn to cluster the Iris dataset and explore the cluster centers.

In [None]:
# Implement k-Means Clustering

# Import the KMeans class from the sklearn library
from sklearn.cluster import KMeans

# Build a 3-cluster k-Means model and fit it to the data in one chained
# call (fit returns the fitted model itself)
cluster_model = KMeans(n_init=10, n_clusters=3).fit(irisX)

# Report where the fitted model placed its cluster centers
cluster_centers = cluster_model.cluster_centers_
print(f'The cluster centers are:\n{cluster_centers}')

# Ask the fitted model which cluster each sample belongs to
cluster_labels = cluster_model.predict(irisX)
print(f'The cluster labels for each data point are:\n{cluster_labels}')

# Plot the first two feature columns, coloured by assigned cluster
fig = plt.figure(figsize=(10, 5))
ax = fig.subplots()
ax.scatter(irisX[:, 0], irisX[:, 1], s=50, marker='o', c=cluster_labels, cmap='viridis')
title = fig.suptitle("k-Means Clustering of Iris Data", fontsize=15)
ax.set_ylabel(feature_names[1], fontsize=12)
ax.set_xlabel(feature_names[0], fontsize=12)
ax.grid(True)
plt.show()

# Activity 2: Visualize Clusters with k-Means
Combine clustering and visualization code to create scatter plots colored by cluster assignments.

In [None]:
# Activity 2: Visualize Clusters with k-Means

# Combine clustering and visualization code to create scatter plots colored by cluster assignments

# Number of clusters to look for
K = 3

# Build the K-cluster k-Means model and fit it to the data in one chained
# call (fit returns the fitted model itself)
cluster_model = KMeans(n_init=10, n_clusters=K).fit(irisX)

# Cluster index assigned to every sample
cluster_labels = cluster_model.predict(irisX)

# Every unordered pair of the four feature columns, in ascending order:
# (0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)
combinations = [(i, j) for i in range(4) for j in range(i + 1, 4)]

# One figure per feature pair, with points coloured by cluster assignment
for x_axis_feature, y_axis_feature in combinations:
    # Pull out the two columns being compared
    x, y = irisX[:, x_axis_feature], irisX[:, y_axis_feature]

    # Fresh figure with a single set of axes
    fig = plt.figure(figsize=(10, 5))
    ax = fig.subplots()
    ax.scatter(x, y, s=50, marker='o', c=cluster_labels, cmap='viridis')

    # The title names the pair of features on display
    title = fig.suptitle(f"k-Means Clustering of Iris Data: {feature_names[x_axis_feature]} vs {feature_names[y_axis_feature]}", fontsize=15)

    # Axis labels come from the same column indices as the data
    ax.set_ylabel(feature_names[y_axis_feature], fontsize=12)
    ax.set_xlabel(feature_names[x_axis_feature], fontsize=12)

    # Grid lines for better readability
    ax.grid(True)

    # Render this figure before moving on to the next pair
    plt.show()

# Activity 3: Scatter Plot Matrix with Histograms
Create a scatter plot matrix with histograms on the diagonal to visualize the clustered data.

In [None]:
# Activity 3: Scatter Plot Matrix with Histograms

# Define a function that produces a set of scatter plots with histograms on the diagonal
def show_scatterplot_matrix(X, y, feature_names, title=None):
    """Show a grid of pairwise scatter plots with histograms on the diagonal.

    The subplot in row ``r`` and column ``c`` plots feature ``c`` on the
    horizontal axis against feature ``r`` on the vertical axis, so each
    panel agrees with the column labels (along the top) and the row labels
    (down the left-hand side). Diagonal panels show a 20-bin histogram of
    the corresponding feature instead.

    Args:
        X: 2-D array; rows are samples and columns are features.
        y: One value per row of ``X``, used to colour the scatter points.
        feature_names: Labels indexed by column of ``X``.
        title: Optional figure title; no title is drawn when ``None``.

    Returns:
        None. When ``len(y)`` differs from the number of rows in ``X`` an
        error message is printed and nothing is plotted.
    """
    # Find the number of features
    num_feat = X.shape[1]
    if len(y) != X.shape[0]:
        print("Error, the y array must have the same length as there are rows in X")
        return

    # squeeze=False keeps `ax` a 2-D array even for a single feature, so the
    # ax[row, col] indexing below works for every num_feat >= 1
    fig, ax = plt.subplots(num_feat, num_feat, figsize=(12, 12), squeeze=False)

    for row in range(num_feat):
        # Row labels go down the first column, column labels along the top row
        ax[row, 0].set_ylabel(feature_names[row])
        ax[0, row].set_xlabel(feature_names[row])
        ax[0, row].xaxis.set_label_position('top')

        for col in range(num_feat):
            if row == col:
                # Diagonal: distribution of a single feature
                ax[row, col].hist(X[:, row], bins=20, color='gray', edgecolor='black')
            else:
                # Column feature on x, row feature on y, matching the labels.
                # The colour map is passed per call rather than changing the
                # global matplotlib default.
                ax[row, col].scatter(X[:, col], X[:, row], c=y, cmap='viridis')

    # Add a title
    if title is not None:
        fig.suptitle(title, fontsize=16, y=0.925)

    plt.show()

# Run this cell to draw the scatter plot matrix for the Iris data, with each
# point coloured by its k-Means cluster label
show_scatterplot_matrix(
    irisX,
    cluster_labels,
    feature_names,
    title="Scatter Plot Matrix with Histograms for Clustered Iris Data",
)

# Activity 4: General Clustering and Visualization Function
Develop a general function to read data, perform k-Means clustering, and visualize the results.

In [None]:
# Activity 4: General Clustering and Visualization Function

# Define the general function to read data, perform k-Means clustering, and visualize the results
def cluster_and_visualise(datafile_name: str, K: int, feature_names: list):
    """Cluster a CSV dataset with k-Means and plot its first two columns.

    The file is read as comma-separated numeric values, clustered into ``K``
    groups, and the first two columns are drawn as a scatter plot coloured
    by cluster. The figure is also saved to ``myVisualisation.jpg`` in the
    current working directory.

    Args:
        datafile_name: Path of the comma-separated data file.
        K: Number of clusters to find.
        feature_names: Axis labels; entries 0 and 1 label the x and y axes.

    Returns:
        The ``(fig, ax)`` pair of the scatter plot.

    Raises:
        ValueError: If the file does not yield a 2-D table, or if it
            contains missing or non-numeric fields.
        IndexError: If fewer than two feature names are supplied.
    """
    # Read the data from the file
    data = np.genfromtxt(datafile_name, delimiter=',')

    # Fail fast with a clear message instead of an error from deep inside
    # the clustering code, or after a figure has already been created.
    if data.ndim != 2:
        raise ValueError(
            f"Expected a 2-D table of samples x features in {datafile_name!r}, "
            f"got an array with shape {data.shape}"
        )
    # genfromtxt turns header rows, missing fields and non-numeric text into NaN
    if np.isnan(data).any():
        raise ValueError(
            f"{datafile_name!r} contains missing or non-numeric values "
            "(for example a header row); clean the file before clustering"
        )
    if len(feature_names) < 2:
        raise IndexError("feature_names must provide labels for at least two features")

    # Create a KMeans model with K clusters
    cluster_model = KMeans(n_clusters=K, n_init=10)

    # Fit the model and get the cluster of each data point in one step
    cluster_labels = cluster_model.fit_predict(data)

    # Create the scatter plot and title
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.scatter(data[:, 0], data[:, 1], c=cluster_labels, cmap='viridis', s=50, marker='o')
    title = fig.suptitle(f"Visualisation of {K} clusters by <your_username>", fontsize=15)

    # Add axis labels
    ax.set_xlabel(feature_names[0], fontsize=12)
    ax.set_ylabel(feature_names[1], fontsize=12)

    # Add grid lines for better readability
    ax.grid(True)

    # Save the plot to a file
    fig.savefig('myVisualisation.jpg')

    return fig, ax

# Try the function out on the Iris data file, asking for three clusters
cluster_and_visualise(datafile_name='iris_data.csv', K=3, feature_names=feature_names)