# Iris Dataset

The Iris dataset contains measurements of iris flowers (sepal length, sepal width, petal length, and petal width) along with their species labels. We first visualize the relationships between these features and the species labels.

Then, we build a pairwise distance (dissimilarity) matrix to see whether it reveals any patterns that we did not notice in the scatter plots.

## Scatter Plots

In [None]:
# Most of the code here is in designing the plots

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Load the iris table; the columns read below are Species, PetalWidthCm,
# SepalLengthCm and PetalLengthCm.
df = pd.read_csv("../data/iris.csv")

# One fixed colour per species so both panels use the same colour coding.
colors = {
    'Iris-setosa': 'blue',
    'Iris-versicolor': 'orange',
    'Iris-virginica': 'red',
}

# Two stacked panels: a cartesian (2D) subplot on top, a 3D scene underneath.
fig = make_subplots(
    rows=2,
    cols=1,
    specs=[[{"type": "xy"}], [{"type": "scene"}]],
    subplot_titles=["2D Scatter Plot", "3D Scatter Plot"],
)

# Species in order of first appearance; one trace per species per panel.
species_names = df['Species'].unique()

# Top panel: petal width against sepal length.
for species in species_names:
    subset = df[df['Species'] == species]
    marker_2d = {'size': 8, 'opacity': 0.8, 'color': colors[species]}
    trace_2d = go.Scatter(
        x=subset['PetalWidthCm'],
        y=subset['SepalLengthCm'],
        mode='markers',
        name=species,
        legendgroup='2d',  # all 2D traces share one legend group
        marker=marker_2d,
    )
    fig.add_trace(trace_2d, row=1, col=1)

# Bottom panel: same two features plus petal length on the z axis.
for species in species_names:
    subset = df[df['Species'] == species]
    marker_3d = {
        'size': 6,
        'opacity': 0.9,
        'line': {'width': 1, 'color': 'white'},  # thin white outline per marker
        'color': colors[species],
    }
    trace_3d = go.Scatter3d(
        x=subset['PetalWidthCm'],
        y=subset['SepalLengthCm'],
        z=subset['PetalLengthCm'],
        mode='markers',
        name=species,
        legendgroup='3d',  # all 3D traces share one legend group
        marker=marker_3d,
    )
    fig.add_trace(trace_3d, row=2, col=1)

# Figure-wide styling. The wide right margin leaves room for the legend,
# which is anchored just outside the plotting area and centred vertically.
page_margin = {'l': 50, 'r': 200, 't': 50, 'b': 50}
legend_settings = {
    'x': 1.02,
    'y': 0.5,
    'xanchor': 'left',
    'yanchor': 'middle',
    'tracegroupgap': 20,  # vertical gap between the 2D and 3D legend groups
}
fig.update_layout(
    template='plotly_white',
    font_family='Arial',
    title_font_size=20,
    width=1000,
    height=1200,
    margin=page_margin,
    legend=legend_settings,
)

# Axis titles for the 2D panel.
fig.update_xaxes(title_text="Petal Width (cm)", row=1, col=1)
fig.update_yaxes(title_text="Sepal Length (cm)", row=1, col=1)

# Axis titles for the 3D panel; 'cube' draws all three axes at equal length.
fig.update_scenes(
    xaxis_title="Petal Width (cm)",
    yaxis_title="Sepal Length (cm)",
    zaxis_title="Petal Length (cm)",
    aspectmode='cube',
    row=2, col=1
)

# Settings for the mode-bar "download image" button: same pixel size as the
# figure, scaled up 5x for a high-resolution PNG.
download_options = {
    'format': 'png',
    'filename': 'iris_plots',
    'height': 1200,
    'width': 1000,
    'scale': 5,
}
fig.show(config={'toImageButtonOptions': download_options})


## Distance Heatmap

In [17]:

import numpy as np
import pandas as pd
import plotly.express as px
import sys

sys.path.append('..')
from distances.distance_matrix import make_distance_matrix

# Load the dataset and use the four numeric measurement columns as features.
df = pd.read_csv("../data/iris.csv")
X = df[['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm']].to_numpy()

# Build the pairwise distance matrix (one row and one column per sample)
D = make_distance_matrix(X, metric='gower')  # consider 'euclidean' too

# Order by species to reveal blocks. The sort must be stable: the default
# argsort algorithm may shuffle rows that share a species, which would
# scramble the sample ids shown on hover within each block.
order = np.argsort(df['Species'].values, kind='stable')
D_ordered = D[np.ix_(order, order)]  # apply the same permutation to rows and columns

# Labels for hover (sample id + species); hide tick labels to avoid clutter
sample_ids = np.arange(len(df))
hover_labels = (df['Species'] + " | id=" + sample_ids.astype(str)).values[order]

fig = px.imshow(
    D_ordered,
    x=hover_labels, y=hover_labels,
    color_continuous_scale='Viridis',
    origin='lower',
    labels=dict(color='Distance'),
)
fig.update_layout(
    title='Iris pairwise distances (Gower, ordered by species)',
    width=800, height=800
)
# Hide dense ticks, keep hover informative
fig.update_xaxes(showticklabels=False)
fig.update_yaxes(showticklabels=False)

# Draw species boundary lines. A boundary sits wherever two consecutive
# ordered rows have different species; the +0.5 places the line on the edge
# between the two heatmap cells rather than through a cell centre.
species_ordered = df['Species'].values[order]
bounds = np.where(species_ordered[:-1] != species_ordered[1:])[0] + 0.5
n = len(df)
for b in bounds:
    # One vertical and one horizontal line per boundary, spanning the full matrix.
    fig.add_shape(type='line', x0=b, x1=b, y0=-0.5, y1=n-0.5, line=dict(color='white', width=1))
    fig.add_shape(type='line', y0=b, y1=b, x0=-0.5, x1=n-0.5, line=dict(color='white', width=1))

fig.show()

print("Notice how coordinates in the form (n,n) are darker, and, the further away they are from the diagonal, the lighter they become. Right now there are 3 species, so this forms 9 squares.")

Notice how coordinates in the form (n,n) are darker, and, the further away they are from the diagonal, the lighter they become. Right now there are 3 species, so this forms 9 squares.


## Making Predictions

### Using k-nearest neighbours

Use Matrix `D` from the previous cell to make a prediction engine for species, given a new sample's features.


In [None]:
import numpy as np
import pandas as pd
from collections import Counter

# Ground-truth species label for each row of df. Used below both as the
# neighbour labels for voting and as the reference when scoring predictions.
species = df['Species'].to_numpy()

def knn_predictor(D, species, k=5, target_idx=None, new_distances=None):
    """
    Predict a species label by majority vote among the k nearest neighbours.

    Three modes, selected by the optional arguments:

    - ``new_distances`` given: classify a new point from its distances to
      every known sample (``D`` and ``target_idx`` are not used).
    - ``target_idx`` given: leave-one-out prediction for that existing
      sample, using row ``target_idx`` of ``D`` with the sample itself
      excluded from the vote.
    - neither given: leave-one-out prediction for every row of ``D``.

    Args:
    - D: Precomputed square pairwise distance matrix (one row per sample).
    - species: Sequence of labels, aligned with the rows of D.
    - k: Number of neighbours that vote; must be at least 1.
    - target_idx: Index of the existing sample to predict (may be negative).
    - new_distances: 1D sequence of distances from a new point to all samples.

    Returns:
    - A single label, or an array of labels when predicting for all samples.
      When several labels tie for the most votes, the one whose first voter
      is closest to the target wins.

    Raises:
    - ValueError: if k is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    # Accept lists / Series as well as arrays; fancy indexing below needs an array.
    species = np.asarray(species)

    # Neither a target nor external distances: leave-one-out over every sample.
    if new_distances is None and target_idx is None:
        return np.array([
            knn_predictor(D, species, k, target_idx=i) for i in range(len(D))
        ])

    if new_distances is not None:
        distances = np.asarray(new_distances)
        self_idx = None  # a new point is not among the known samples
    else:
        distances = np.asarray(D[target_idx])
        # Normalise negative indices so the comparison below matches.
        self_idx = target_idx if target_idx >= 0 else target_idx + len(distances)

    # Stable sort: equal distances keep their original index order, so the
    # neighbour set is deterministic.
    ranked = np.argsort(distances, kind='stable')
    if self_idx is not None:
        # Drop the target by index rather than assuming it sorts first: a
        # duplicate sample at distance 0 can otherwise take its place and
        # leave the target voting for itself.
        ranked = ranked[ranked != self_idx]

    neighbour_species = species[ranked[:k]]
    # Majority vote
    return Counter(neighbour_species).most_common(1)[0][0]

# Leave-one-out evaluation: predict every sample from the precomputed
# matrix D and compare against the true labels.
neighbour_count = 10

predictions = knn_predictor(D, species, k=neighbour_count)

accuracy = np.mean(predictions == species)
print(f"kNN (k={neighbour_count}) Leave-One-Out Accuracy: {accuracy*100:.10f}%")

# Single leave-one-out prediction for the first sample.
pred_for_0 = knn_predictor(D, species, k=neighbour_count, target_idx=0)
print(f"Prediction for sample 0 ({species[0]}): {pred_for_0}")

# Classify two synthetic points: the feature-wise mean of rows 0-99 of X,
# then the feature-wise mean of rows 100-109. Each point is appended to X as
# an extra last row so that make_distance_matrix yields its distance to every
# real sample, once with the Gower metric (the metric D was built with) and
# once with the Minkowski metric.
for row_span in (slice(None, 100), slice(100, 110)):
    new_point = X[row_span].mean(axis=0)

    # Gower distance.
    combined_data = np.vstack([X, new_point[np.newaxis, :]])
    full_D = make_distance_matrix(combined_data, metric='gower')
    new_distances = full_D[-1, :-1]  # last row, minus the point's distance to itself
    pred_new = knn_predictor(D, species, k=5, new_distances=new_distances)
    print(f"Prediction for new point (with Gower distance): {pred_new}")

    # Minkowski distance (a different metric from the one used for D).
    combined_data = np.vstack([X, new_point[np.newaxis, :]])
    full_D = make_distance_matrix(combined_data, metric='minkowski')
    new_distances = full_D[-1, :-1]  # last row, minus the point's distance to itself
    pred_new = knn_predictor(D, species, k=5, new_distances=new_distances)
    print(f"Prediction for new point (with Minkowski distance): {pred_new}")

print("In this case, the choice of distance metric seems to not matter all that much.")



kNN (k=10) Leave-One-Out Accuracy: 95.3333333333%
Prediction for sample 0 (Iris-setosa): Iris-setosa
Prediction for new point (with Gower distance): Iris-versicolor
Prediction for new point (with Minkowski distance): Iris-versicolor
Prediction for new point (with Gower distance): Iris-virginica
Prediction for new point (with Minkowski distance): Iris-virginica
