Animal Clustering

In [21]:
# Load libraries
import pandas as pd
import numpy as np
import re
import sklearn
import os
import json
import matplotlib
import plotly

In [2]:
# Load in data
# Reads the raw dataset into `df`; the path is relative to the notebook's working directory.
df = pd.read_csv("20AnimalsDataset.csv")
# Last expression of the cell: shows the first rows as the cell output.
df.head()

Unnamed: 0,ID,animal,paragraph
0,OBS001,dogs,Dogs come in a wide range of sizes and breeds....
1,OBS002,dogs,They are known for their loyalty and companion...
2,OBS003,dogs,They are known for their loyalty and companion...
3,OBS004,dogs,Many families around the world keep them as pe...
4,OBS005,dogs,Many families around the world keep them as pe...


In [3]:
# Extract question
# Function to extract the question (first sentence ending with a '?')
def extract_question(text):
    """Return the first question found in ``text``, or ``None`` if there is none.

    A question is matched as a run of characters containing no sentence
    terminator (``.``, ``?`` or ``!``) that is immediately followed by ``?``.

    Args:
        text: Paragraph to search. Non-string values (for example the float
            ``NaN`` pandas uses for missing cells, or ``None``) are treated
            as containing no question instead of raising ``TypeError``.

    Returns:
        The first matched question with surrounding whitespace stripped,
        or ``None`` when ``text`` is not a string or contains no ``?``.
    """
    # Missing cells reach this function as NaN/None when applied over a column.
    if not isinstance(text, str):
        return None

    # Only the first question is needed, so stop at the first match.
    first_match = re.search(r'[^.?!]*\?', text)
    if first_match is None:
        return None

    # .strip() removes the space left over from the end of the previous sentence.
    return first_match.group(0).strip()

# Apply to DataFrame
# Adds a 'question' column in place; rows for which extract_question finds no '?' get None.
df['question'] = df['paragraph'].apply(extract_question)

# View result
# Prints the plain-text repr of the two columns side by side.
print(df[['paragraph', 'question']])

                                            paragraph  \
0   Dogs come in a wide range of sizes and breeds....   
1   They are known for their loyalty and companion...   
2   They are known for their loyalty and companion...   
3   Many families around the world keep them as pe...   
4   Many families around the world keep them as pe...   
5   Many people adore them for their mysterious be...   
6   Many people adore them for their mysterious be...   
7   Many people adore them for their mysterious be...   
8   Their purring can be quite soothing to humans....   
9   They often nap for long hours during the day. ...   
10  They communicate through vocalizations and ges...   
11  Monkeys live in forests, mountains, and savann...   
12  Monkeys live in forests, mountains, and savann...   
13  Monkeys are playful and curious by nature. Are...   
14  They live in social groups with hierarchies. M...   
15  They are known for their strength and grace. M...   
16  They often form strong bond

In [4]:
# Cluster ambiguous questions
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

# 1. Convert questions to a list
# NOTE(review): entries are None for rows where extract_question found no question —
# confirm every row has one before encoding.
questions = df['question'].tolist()

# 2. Convert to embeddings using SBERT
# `embeddings` is kept as a notebook-level variable and reused by the PCA cells further down.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(questions, show_progress_bar=True)

# 3. Apply KMeans clustering
# n_clusters arg specifies how many clusters
# random_state ensures reproducible results
kmeans = KMeans(n_clusters=4, random_state=42)
# fit_predict returns one cluster ID per embedding row; stored alongside the questions in `df`.
df['ambig_cluster'] = kmeans.fit_predict(embeddings)
df.head()

  from .autonotebook import tqdm as notebook_tqdm
  return forward_call(*args, **kwargs)
Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.75it/s]


Unnamed: 0,ID,animal,paragraph,question,ambig_cluster
0,OBS001,dogs,Dogs come in a wide range of sizes and breeds....,Are they more vocal when they're hungry or exc...,1
1,OBS002,dogs,They are known for their loyalty and companion...,How sensitive are they to environmental changes?,0
2,OBS003,dogs,They are known for their loyalty and companion...,What kind of enrichment helps them feel more s...,2
3,OBS004,dogs,Many families around the world keep them as pe...,How do they show signs of trust toward their c...,2
4,OBS005,dogs,Many families around the world keep them as pe...,What do they typically do when they feel threa...,2


In [5]:
# Use APIs to disambiguate clusters
# Load libraries
from openai import OpenAI     # OpenAI client to interact with API
from dotenv import load_dotenv     # Loads .env file (with API key) into Python environment

# Load API key from .env into environment
load_dotenv()

# Retrieve API key from environment
# os.getenv returns None when OPENAI_API_KEY is not set.
api_key = os.getenv("OPENAI_API_KEY")

# Initialize OpenAI client with API key
# Used to send requests to the OpenAI API
client = OpenAI(api_key=api_key)

# Stores name of model used for API calls
# Read by disambiguate_question below.
MODEL = "gpt-4o-mini"

# System prompt: general disambiguation
# Sent as the "system" message by disambiguate_question, which calls .strip() on it first,
# so the leading/trailing newlines of this triple-quoted string are not transmitted.
SYSTEM_PROMPT = """
Rewrite the QUESTION so all ambiguous pronouns or references are replaced 
with their explicit referents from the CONTEXT. 
Keep meaning, tone, and tense the same.
Return only JSON: {"disambiguated_question": "..."}.
"""

# JSON schema for clean output
# Passed as `response_format` to the chat-completions call in disambiguate_question.
# The schema describes an object with exactly one required string field,
# "disambiguated_question", which is the key disambiguate_question reads from the reply.
RESPONSE_FORMAT = {
    "type": "json_schema",
    "json_schema": {
        "name": "disambiguated_question",
        "schema": {
            "type": "object",
            "properties": {
                "disambiguated_question": {"type": "string"}
            },
            "required": ["disambiguated_question"],
            "additionalProperties": False
        },
        "strict": True
    }
}

# Define function to disambiguate a question
def disambiguate_question(context, question):
    """Call the chat-completions API to rewrite a question using the given context.

    Args:
        context: Paragraph the question was taken from; sent under the
            ``CONTEXT:`` heading of the user message.
        question: Question whose pronouns/references should be made explicit;
            sent under the ``QUESTION:`` heading. Non-string or blank values
            (e.g. ``None``/NaN for rows where no question was extracted) are
            skipped without calling the API.

    Returns:
        The value of ``"disambiguated_question"`` from the model's JSON reply,
        or ``None`` when ``question`` is missing or blank.

    Raises:
        ValueError: If the reply carries no content (for example a refusal),
            so the failure is reported clearly instead of as a ``TypeError``
            from ``json.loads(None)``.
    """
    # Without this guard a missing question would be formatted into the prompt as "None".
    if not isinstance(question, str) or not question.strip():
        return None

    prompt = f"CONTEXT:\n{context}\n\nQUESTION:\n{question}"
    resp = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT.strip()},
            {"role": "user", "content": prompt}
        ],
        response_format=RESPONSE_FORMAT,
        temperature=0
    )

    message = resp.choices[0].message
    if message.content is None:
        # getattr keeps this working on client versions without a `refusal` attribute.
        refusal = getattr(message, "refusal", None)
        raise ValueError(
            f"Model returned no content for question {question!r}"
            + (f" (refusal: {refusal})" if refusal else "")
        )

    # Gets model's reply, converts JSON string to dict, then extracts disambiguated question
    return json.loads(message.content)["disambiguated_question"]

# Apply function to each row of dataset
# axis=1 passes one row at a time, so disambiguate_question (and therefore the API)
# is called once per row, using that row's paragraph as context.
df["disambiguated_question"] = df.apply(
    lambda row: disambiguate_question(row["paragraph"], row["question"]), axis=1
)

In [8]:
# Cluster the disambiguated questions
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

# 1. Convert disambiguated questions to a list, in order to convert to embeddings
dq = df['disambiguated_question'].tolist()

# 2. Convert to embeddings using SBERT
# Rebinds `model` to a freshly constructed instance of the same 'all-MiniLM-L6-v2' model
# used for the ambiguous questions.
model = SentenceTransformer('all-MiniLM-L6-v2')
# `dq_embeddings` is kept as a notebook-level variable and reused by the PCA cells further down.
dq_embeddings = model.encode(dq, show_progress_bar=True)

# 3. Apply KMeans clustering
# Same n_clusters and random_state as the ambiguous-question clustering.
kmeans_dq = KMeans(n_clusters=4, random_state=42)
df['disambig_cluster'] = kmeans_dq.fit_predict(dq_embeddings)

df.head()

  return forward_call(*args, **kwargs)
Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.01it/s]


Unnamed: 0,ID,animal,paragraph,question,ambig_cluster,disambiguated_question,disambig_cluster
0,OBS001,dogs,Dogs come in a wide range of sizes and breeds....,Are they more vocal when they're hungry or exc...,1,Are dogs more vocal when dogs are hungry or ex...,2
1,OBS002,dogs,They are known for their loyalty and companion...,How sensitive are they to environmental changes?,0,How sensitive are dogs to environmental changes?,2
2,OBS003,dogs,They are known for their loyalty and companion...,What kind of enrichment helps them feel more s...,2,What kind of enrichment helps dogs feel more s...,2
3,OBS004,dogs,Many families around the world keep them as pe...,How do they show signs of trust toward their c...,2,How do dogs show signs of trust toward their c...,2
4,OBS005,dogs,Many families around the world keep them as pe...,What do they typically do when they feel threa...,2,What do dogs typically do when dogs feel threa...,2


In [7]:
# Export final dataset
# index=False leaves the DataFrame's row index out of the file.
# NOTE(review): this cell's execution count (7) is lower than the clustering cell above (8),
# so the saved file may predate the 'disambig_cluster' column — re-run top to bottom to confirm.
df.to_csv('20AnimalsDataset_Final.csv', index=False)

In [11]:
# Import dataset with adjusted column names in Excel
# This file is not written by any cell in this notebook; it is read from the working directory.
# The later plotting cells read its cluster-name, question and 'Weight' columns.
df_clean = pd.read_csv("20AnimalsDataset_FinalWithNames.csv")
df_clean.head()

Unnamed: 0,ID,Animal,Paragraph,Ambiguous Question,Ambiguous Cluster ID,Ambiguous Cluster Name,Disambiguous Question,Disambiguous Cluster ID,Disambiguous Cluster Name,Weight
0,OBS001,dogs,Dogs come in a wide range of sizes and breeds....,Are they more vocal when they're hungry or exc...,1,Causes of behavior,Are dogs more vocal when dogs are hungry or ex...,2,Questions about dogs,1
1,OBS002,dogs,They are known for their loyalty and companion...,How sensitive are they to environmental changes?,0,Behavior in different environments,How sensitive are dogs to environmental changes?,2,Questions about dogs,1
2,OBS003,dogs,They are known for their loyalty and companion...,What kind of enrichment helps them feel more s...,2,Comfort and trust around others,What kind of enrichment helps dogs feel more s...,2,Questions about dogs,1
3,OBS004,dogs,Many families around the world keep them as pe...,How do they show signs of trust toward their c...,2,Comfort and trust around others,How do dogs show signs of trust toward their c...,2,Questions about dogs,1
4,OBS005,dogs,Many families around the world keep them as pe...,What do they typically do when they feel threa...,2,Comfort and trust around others,What do dogs typically do when dogs feel threa...,2,Questions about dogs,1


In [57]:
# Principal Component Analysis (PCA) Visualization
## Reduces high-dimensional data by finding X directions in data that capture the most variation
'''
- PC1 is the single direction that captures the most variance
- PC2 is the next best direction, perpendicular to PC1, and so on
- Scatterplot shows high-dimensional sentence embeddings in 2D space (shows clustering at a glance)
- “Explained variance ratio” tells you how much of the total variation in the original data is captured by each principal component
  - (e.g., PC1 = 28% means PC1 alone explains 28% of total variance)

"Ambiguous questions - PCA" interpretation
- Clusters are less compact and more spread out, with overlaps
- Some points from different clusters are close together
- Top 2 principal components don't strongly capture distinct group separation
- Suggests that in the original embedding space, ambiguous questions might be harder for KMeans to separate

"Disambiguated questions - PCA" interpretation
- Clusters are tighter and more separated in 2D, forming 4 clear groupings
- PC1 and PC2 each explain more of the total variation in the embeddings than they do in "Ambiguous questions - PCA"
- Suggests that after disambiguation, questions became semantically more distinct, and the clustering algorithm grouped them more cleanly
'''

import plotly.express as px
from sklearn.decomposition import PCA
# import plotly.io as pio
# pio.renderers.default = "notebook_connected"

def pca_scatter_interactive(X, labels, questions, title):
    """Project embeddings onto their first two principal components and plot them.

    Args:
        X: a 2-D array of shape (n_samples, n_features) — your embeddings.
        labels: 1-D array, list or Series of cluster IDs/names, one per row of X.
        questions: list/Series of text to show on hover (same length as X).
        title: string for the plot title; " — PCA" is appended to it.

    Returns:
        The Plotly figure. It is returned rather than shown so the caller can
        call ``.show()`` or ``.write_html()`` on it.
    """
    pca = PCA(n_components=2, random_state=42)   # Creates PCA object that finds 2 directions of maximum variance -> reduces data to 2 components
    X2 = pca.fit_transform(X)                    # Results in 2D coordinates to plot
    var = pca.explained_variance_ratio_ * 100    # Variance % captured by PC1 (var[0]) and by PC2 (var[1])

    # Build DataFrame for Plotly.
    # labels/questions are converted to plain positional sequences so that
    # (a) lists work as well as arrays/Series, and (b) a Series with a
    # non-default index is matched to the rows of X by position, not by index label.
    df_plot = pd.DataFrame({
        "PC1": X2[:, 0],                             # x-coordinate
        "PC2": X2[:, 1],                             # y-coordinate
        "Cluster": np.asarray(labels).astype(str),   # Categorical labels instead of numeric gradient
        "Question": list(questions)
    })

    fig = px.scatter(
        df_plot,
        x="PC1",
        y="PC2",
        color="Cluster",                         # Different colors per cluster
        hover_data={                             # Controls tooltip contents
            "Cluster": True,
            "PC1": False,
            "PC2": False,
            "Question": True
        },
        title=f"{title} — PCA",
        labels={
            "PC1": f"PC1 ({var[0]:.1f}%)",       # var[0] gets variance captured by PC1, {var[0]:.1f} rounds to 1 decimal place
            "PC2": f"PC2 ({var[1]:.1f}%)"        # var[1] gets variance captured by PC2
        }
    )
    fig.update_traces(marker=dict(size=9))       # Controls size of points
    return fig

# Prepare labels and questions
# Cluster names and question text come from the Excel-edited file (df_clean), while the
# embeddings passed below were computed earlier in this notebook from `df`.
# NOTE(review): this assumes df_clean has the same rows in the same order as `df` —
# confirm the file was not sorted or filtered in Excel.
ambig_names = df_clean['Ambiguous Cluster Name']
disambig_names = df_clean['Disambiguous Cluster Name']

ambig_questions = df_clean['Ambiguous Question'].astype(str)
disambig_questions = df_clean['Disambiguous Question'].astype(str)

# Plot interactive PCA
# Each call fits its own PCA, so the axes of fig1 and fig2 are not the same projection.
fig1 = pca_scatter_interactive(embeddings, ambig_names, ambig_questions, "Ambiguous questions")
fig1.show()
fig2 = pca_scatter_interactive(dq_embeddings, disambig_names, disambig_questions, "Disambiguated questions")
fig2.show()

# Export plots as .html
# fig1.write_html("pca_1.html", include_plotlyjs="cdn")
# fig2.write_html("pca_2.html", include_plotlyjs="cdn")

In [63]:
# Interactive PCA
## Fit once on the combined embeddings from both ambiguous and disambiguated questions, projected into the same PC1 and PC2 axes
## Shows comparative changes of how points move after disambiguation
## Points noticeably shift after disambiguation into more distinct regions

def paired_pca_interactive(embeddings_A, labels_A, questions_A,
                           embeddings_B, labels_B, questions_B,
                           title_left="Ambiguous", title_right="Disambiguated"):
    """Plot two embedding sets side by side in one shared 2-D PCA projection.

    A single PCA is fitted on the row-wise stack of both embedding sets, and
    each set is then projected with it, so the two facets share the same axes.

    Args:
        embeddings_A, embeddings_B: 2-D arrays of embeddings for each stage.
        labels_A, labels_B: cluster labels per row; must support ``.astype(str)``.
        questions_A, questions_B: hover text per row; must support ``.astype(str)``.
        title_left, title_right: values written to the "Stage" column, which
            is used as the facet column.

    Returns:
        The faceted Plotly scatter figure.
    """
    def _stage_frame(coords, cluster_labels, hover_text, stage_name):
        # One tidy frame per stage; column order matches the hover/facet settings below.
        return pd.DataFrame({
            "PC1": coords[:, 0],
            "PC2": coords[:, 1],
            "Cluster": cluster_labels.astype(str),
            "Question": hover_text.astype(str),
            "Stage": stage_name,
        })

    # 1) One PCA fitted on both sets together
    stacked = np.vstack([embeddings_A, embeddings_B])
    shared_pca = PCA(n_components=2, random_state=42).fit(stacked)
    pct = shared_pca.explained_variance_ratio_ * 100

    proj_A = shared_pca.transform(embeddings_A)
    proj_B = shared_pca.transform(embeddings_B)

    # 2) Stack the two stages into a single frame for a faceted scatter
    frames = [
        _stage_frame(proj_A, labels_A, questions_A, title_left),
        _stage_frame(proj_B, labels_B, questions_B, title_right),
    ]
    df_plot = pd.concat(frames, ignore_index=True)

    axis_labels = {
        "PC1": f"PC1 ({pct[0]:.1f}%)",
        "PC2": f"PC2 ({pct[1]:.1f}%)",
    }
    hover_cfg = {"Cluster": True, "PC1": False, "PC2": False, "Question": True}

    fig = px.scatter(
        df_plot,
        x="PC1",
        y="PC2",
        color="Cluster",
        hover_data=hover_cfg,
        facet_col="Stage",
        title="Shared PCA space — Ambiguous vs. Disambiguated",
        labels=axis_labels,
    )
    fig.update_traces(marker=dict(size=9), selector=dict(type="scatter"))
    # Link axis ranges across facets so they are directly comparable
    fig.update_xaxes(matches='x')
    fig.update_yaxes(matches='y')
    fig.update_layout(margin=dict(l=40, r=10, t=60, b=40))
    return fig

# Build the shared-axes comparison: stage A is the original (ambiguous) questions,
# stage B is the same rows after disambiguation. Labels are the KMeans cluster IDs in `df`.
fig = paired_pca_interactive(
    embeddings_A=embeddings,
    labels_A=df["ambig_cluster"].to_numpy(),
    questions_A=df["question"],
    embeddings_B=dq_embeddings,
    labels_B=df["disambig_cluster"].to_numpy(),
    questions_B=df["disambiguated_question"],
)
fig.show()

In [115]:
# Alluvial diagram (Sankey workaround)
import plotly.graph_objects as go
import matplotlib.colors as mcolors  

# Step 1: Create node lists and mappings
left = df_clean["Ambiguous Cluster Name"].unique().tolist()
right = df_clean["Disambiguous Cluster Name"].unique().tolist()
labels = left + right                                              # List of node labels in order they appear
# One mapping per side: a cluster name that occurs on both sides still gets two distinct
# nodes (left nodes are 0..len(left)-1, right nodes follow), instead of collapsing into one.
left_idx = {lab: i for i, lab in enumerate(left)}
right_idx = {lab: len(left) + i for i, lab in enumerate(right)}

# Step 2: Create color mapping for left nodes
palette = px.colors.qualitative.Pastel
node_colors = {lab: palette[i % len(palette)] for i, lab in enumerate(left)}

# Step 3: Map source nodes and target nodes
sources = df_clean["Ambiguous Cluster Name"].map(left_idx)
targets = df_clean["Disambiguous Cluster Name"].map(right_idx)

# Step 4: Assign link colors based on source node's color
# This must run before the transparency conversion below, which reads `link_colors`.
link_colors = df_clean["Ambiguous Cluster Name"].map(node_colors)

# Make link colors transparent
alpha = 0.5  # 50% opacity
# The string rewrite below only works on "rgb(r, g, b)" color strings: it turns them into
# "rgba(r, g, b, alpha)". Any other color format would pass through unchanged (fully opaque).
link_colors_transparent = [c.replace("rgb", "rgba").replace(")", f", {alpha})") for c in link_colors]

# Step 5: Create custom data for tooltips, with Ambiguous Question (Column 1) and Disambiguous Question (Column 2)
custom = np.stack([df_clean["Ambiguous Question"], df_clean["Disambiguous Question"]], axis=1)

# Step 6: Create custom tooltip for each node
### Compute incoming/outgoing sums
out_by_left  = df_clean.groupby("Ambiguous Cluster Name")["Weight"].sum()
in_by_right  = df_clean.groupby("Disambiguous Cluster Name")["Weight"].sum()

### Build a hovertemplate string for each node (same order as `labels`)
### Side is decided by position in `labels`, not by name, so duplicated names are handled.
node_text = []
for pos, lab in enumerate(labels):
    if pos < len(left):
        node_text.append(f"<b>{lab}</b><br>Outgoing flow count: {float(out_by_left.get(lab, 0))}")
    else:
        node_text.append(f"<b>{lab}</b><br>Incoming flow count: {float(in_by_right.get(lab, 0))}")

### Left nodes use their palette color; right nodes use neutral grey
node_color_list = [node_colors[lab] for lab in left] + ["#cccccc"] * len(right)

# Step 7: Build alluvial diagram
fig = go.Figure(go.Sankey(
    arrangement="snap",
    node=dict(                                                      # Customize node appearance
        label=labels,
        pad=10,
        thickness=30,
        color=node_color_list,
        customdata=node_text,
        hovertemplate="%{customdata}<extra></extra>"                # Removes value display for nodes
    ),
    link=dict(
        source=sources,                                             # List of starting node indices
        target=targets,                                             # List of ending node indices
        value=df_clean["Weight"],                                   # Thickness for each link
        customdata=custom,
        color=link_colors_transparent,                              # Link colors
        hovertemplate=(
            "<b>%{source.label}</b> → <b>%{target.label}</b><br>"
            "Ambiguous: %{customdata[0]}<br>"
            "Disambiguated: %{customdata[1]}<br>"
            "<extra></extra>"
        )
    )
))

# Add title to figure
fig.update_layout(
    title_text="Alluvial Diagram of Ambiguous → Disambiguated Questions"
)

fig.show()
fig.write_html("alluvial_diagram.html", include_plotlyjs="cdn")

In [80]:
# List every attribute name of plotly's qualitative-color module
# (palette names appear alongside module dunder attributes)
vars(px.colors.qualitative).keys()

dict_keys(['__name__', '__doc__', '__package__', '__loader__', '__spec__', '__file__', '__cached__', '__builtins__', '_swatches', 'swatches', 'Plotly', 'D3', 'G10', 'T10', 'Alphabet', 'Dark24', 'Light24', 'Alphabet_r', 'D3_r', 'Dark24_r', 'G10_r', 'Light24_r', 'Plotly_r', 'T10_r', 'Set1', 'Pastel1', 'Dark2', 'Set2', 'Pastel2', 'Set3', 'Set1_r', 'Pastel1_r', 'Dark2_r', 'Set2_r', 'Pastel2_r', 'Set3_r', 'Antique', 'Bold', 'Pastel', 'Prism', 'Safe', 'Vivid', 'Antique_r', 'Bold_r', 'Pastel_r', 'Prism_r', 'Safe_r', 'Vivid_r', '__all__'])