Animal Clustering

In [33]:
# Load libraries
import pandas as pd
import numpy as np
import re
import sklearn
import os
import json

In [24]:
# Read the animal observations CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer="20AnimalsDataset.csv")
# Show the first five rows as a quick sanity check
df.head(n=5)

Unnamed: 0,ID,animal,paragraph
0,OBS001,dogs,Dogs come in a wide range of sizes and breeds....
1,OBS002,dogs,They are known for their loyalty and companion...
2,OBS003,dogs,They are known for their loyalty and companion...
3,OBS004,dogs,Many families around the world keep them as pe...
4,OBS005,dogs,Many families around the world keep them as pe...


In [25]:
# Extract question
# Function to extract the question (first sentence ending with a '?')
def extract_question(text):
    """Return the first question found in ``text``, or ``None``.

    A question is a run of characters containing no '.', '?' or '!' that is
    immediately followed by a '?'.

    Parameters
    ----------
    text : str
        Paragraph to scan. Non-string values (e.g. ``None`` or a float NaN
        standing in for a missing cell) are treated as having no question.

    Returns
    -------
    str or None
        The first match with leading/trailing whitespace removed, or
        ``None`` when nothing matches or ``text`` is not a string.
    """
    if not isinstance(text, str):
        # The regex functions raise TypeError on non-strings; a missing
        # paragraph simply has no question.
        return None
    # search() stops at the first match instead of collecting all of them
    match = re.search(r'[^.?!]*\?', text)
    if match is None:
        return None
    # .strip() removes the leading space left over from the previous sentence
    return match.group(0).strip()

# Derive the 'question' column by running the extractor over each paragraph
df['question'] = df['paragraph'].map(extract_question)

# Print each paragraph next to the question pulled out of it
print(df.loc[:, ['paragraph', 'question']])

                                            paragraph  \
0   Dogs come in a wide range of sizes and breeds....   
1   They are known for their loyalty and companion...   
2   They are known for their loyalty and companion...   
3   Many families around the world keep them as pe...   
4   Many families around the world keep them as pe...   
5   Many people adore them for their mysterious be...   
6   Many people adore them for their mysterious be...   
7   Many people adore them for their mysterious be...   
8   Their purring can be quite soothing to humans....   
9   They often nap for long hours during the day. ...   
10  They communicate through vocalizations and ges...   
11  Monkeys live in forests, mountains, and savann...   
12  Monkeys live in forests, mountains, and savann...   
13  Monkeys are playful and curious by nature. Are...   
14  They live in social groups with hierarchies. M...   
15  They are known for their strength and grace. M...   
16  They often form strong bond

In [26]:
# Cluster ambiguous questions
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

# 1. Convert questions to a list of plain strings.
#    extract_question returns None for a paragraph without a question, and
#    model.encode expects text inputs, so missing values are replaced with
#    '' first (the same cleaning the disambiguated-question clustering uses).
questions = df['question'].fillna('').astype(str).tolist()

# 2. Convert to embeddings using SBERT
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(questions, show_progress_bar=True)

# 3. Apply KMeans clustering (fixed random_state so the run is repeatable)
kmeans = KMeans(n_clusters=4, random_state=42)
df['ambig_cluster'] = kmeans.fit_predict(embeddings)

  return forward_call(*args, **kwargs)
Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.74it/s]


In [36]:
# Preview the first five rows of the working DataFrame
df.head(n=5)

Unnamed: 0,ID,animal,paragraph,question,ambig_cluster,disambiguated_question,disambig_cluster
0,OBS001,dogs,Dogs come in a wide range of sizes and breeds....,Are they more vocal when they're hungry or exc...,1,Are dogs more vocal when dogs are hungry or ex...,2
1,OBS002,dogs,They are known for their loyalty and companion...,How sensitive are they to environmental changes?,0,How sensitive are dogs to environmental changes?,2
2,OBS003,dogs,They are known for their loyalty and companion...,What kind of enrichment helps them feel more s...,2,What kind of enrichment helps dogs feel more s...,2
3,OBS004,dogs,Many families around the world keep them as pe...,How do they show signs of trust toward their c...,2,How do dogs show signs of trust toward their c...,2
4,OBS005,dogs,Many families around the world keep them as pe...,What do they typically do when they feel threa...,2,What do dogs typically do when dogs feel threa...,2


In [27]:
# Use APIs to disambiguate clusters
from openai import OpenAI
from dotenv import load_dotenv
from textwrap import dedent

# Pull settings from the .env file into the process environment
load_dotenv()

# Chat model used for every disambiguation request
MODEL = "gpt-4o-mini"

# Read the API key from the environment and build the OpenAI client with it
api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

# System prompt: general-purpose disambiguation
# Tells the model to replace ambiguous pronouns/references in the QUESTION
# with their referents from the CONTEXT and to answer with a single JSON
# object. disambiguate_question sends this text after .strip(), so the
# leading and trailing newlines of the literal are not transmitted.
SYSTEM_PROMPT = """
Rewrite the QUESTION so all ambiguous pronouns or references are replaced 
with their explicit referents from the CONTEXT. 
Keep meaning, tone, and tense the same.
Return only JSON: {"disambiguated_question": "..."}.
"""

# JSON schema for clean output
# Passed as `response_format` on each request. The schema describes an object
# whose only (and required) property is the string "disambiguated_question" —
# the same key disambiguate_question reads from the parsed reply.
RESPONSE_FORMAT = {
    "type": "json_schema",
    "json_schema": {
        "name": "disambiguated_question",
        "schema": {
            "type": "object",
            "properties": {
                "disambiguated_question": {"type": "string"}
            },
            "required": ["disambiguated_question"],
            # No keys other than the one declared above are allowed
            "additionalProperties": False
        },
        "strict": True
    }
}

def disambiguate_question(context, question):
    """Rewrite ``question`` so its ambiguous references are made explicit.

    Sends one chat-completion request (model ``MODEL``, system prompt
    ``SYSTEM_PROMPT``, response format ``RESPONSE_FORMAT``, temperature 0)
    and returns the ``"disambiguated_question"`` field of the JSON reply.

    Parameters
    ----------
    context : str
        Paragraph supplying the referents for the question.
    question : str or None
        Question to rewrite. A missing value (``None``/NaN) or a blank
        string short-circuits: no request is sent.

    Returns
    -------
    str or None
        The rewritten question, or ``None`` when there was no question to
        rewrite.

    Raises
    ------
    json.JSONDecodeError
        If the reply content is not valid JSON.
    KeyError
        If the reply JSON lacks ``"disambiguated_question"``.
    """
    # Without this guard a missing question would be interpolated as the
    # literal text "None"/"nan" and sent to the API anyway.
    if not isinstance(question, str) or not question.strip():
        return None

    prompt = f"CONTEXT:\n{context}\n\nQUESTION:\n{question}"
    resp = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT.strip()},
            {"role": "user", "content": prompt}
        ],
        response_format=RESPONSE_FORMAT,
        temperature=0  # keep rewrites as repeatable as the API allows
    )
    # Only the first choice is used; its content is the JSON object
    return json.loads(resp.choices[0].message.content)["disambiguated_question"]

# Rewrite every row's question using that row's paragraph as the context
df["disambiguated_question"] = df.apply(
    lambda r: disambiguate_question(context=r["paragraph"], question=r["question"]),
    axis="columns",
)

        ID   animal                                          paragraph  \
0   OBS001     dogs  Dogs come in a wide range of sizes and breeds....   
1   OBS002     dogs  They are known for their loyalty and companion...   
2   OBS003     dogs  They are known for their loyalty and companion...   
3   OBS004     dogs  Many families around the world keep them as pe...   
4   OBS005     dogs  Many families around the world keep them as pe...   
5   OBS006     cats  Many people adore them for their mysterious be...   
6   OBS007     cats  Many people adore them for their mysterious be...   
7   OBS008     cats  Many people adore them for their mysterious be...   
8   OBS009     cats  Their purring can be quite soothing to humans....   
9   OBS010     cats  They often nap for long hours during the day. ...   
10  OBS011  monkeys  They communicate through vocalizations and ges...   
11  OBS012  monkeys  Monkeys live in forests, mountains, and savann...   
12  OBS013  monkeys  Monkeys live in f

In [28]:
# Inspect the rewritten questions
df.loc[:, 'disambiguated_question']

0     Are dogs more vocal when dogs are hungry or ex...
1      How sensitive are dogs to environmental changes?
2     What kind of enrichment helps dogs feel more s...
3     How do dogs show signs of trust toward their c...
4     What do dogs typically do when dogs feel threa...
5           Are sudden noises always startling to cats?
6           Why do cats sometimes follow people around?
7              Why might cats pace in new environments?
8     Is the pacing of cats a sign of anticipation o...
9     Do cats show preference for certain people ove...
10    What might cause monkeys to vocalize more at n...
11       Why might monkeys isolate themselves at times?
12    Why do monkeys often repeat that behavior in t...
13    Are monkeys expressing contentment when monkey...
14    Can the posture of monkeys indicate interest o...
15      Why do horses retreat when approached suddenly?
16            Do horses rely on routine to feel stable?
17    Are horses influenced by other horses in t

In [37]:
# Cluster the disambiguated questions
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

# Step 1: turn the rewritten questions into a list of strings,
# with missing values replaced by the empty string
dq = df['disambiguated_question'].fillna('')
dq = dq.astype(str).tolist()

# Step 2: embed each question with the SBERT model
model = SentenceTransformer('all-MiniLM-L6-v2')
dq_embeddings = model.encode(dq, show_progress_bar=True)

# Step 3: fit a 4-cluster KMeans (fixed seed) and store each row's label
kmeans_dq = KMeans(n_clusters=4, random_state=42).fit(dq_embeddings)
df['disambig_cluster'] = kmeans_dq.labels_

df.head()

  return forward_call(*args, **kwargs)
Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.15it/s]


Unnamed: 0,ID,animal,paragraph,question,ambig_cluster,disambiguated_question,disambig_cluster
0,OBS001,dogs,Dogs come in a wide range of sizes and breeds....,Are they more vocal when they're hungry or exc...,1,Are dogs more vocal when dogs are hungry or ex...,2
1,OBS002,dogs,They are known for their loyalty and companion...,How sensitive are they to environmental changes?,0,How sensitive are dogs to environmental changes?,2
2,OBS003,dogs,They are known for their loyalty and companion...,What kind of enrichment helps them feel more s...,2,What kind of enrichment helps dogs feel more s...,2
3,OBS004,dogs,Many families around the world keep them as pe...,How do they show signs of trust toward their c...,2,How do dogs show signs of trust toward their c...,2
4,OBS005,dogs,Many families around the world keep them as pe...,What do they typically do when they feel threa...,2,What do dogs typically do when dogs feel threa...,2


In [41]:
# Write the enriched DataFrame to disk, leaving out the row index
df.to_csv(path_or_buf='20AnimalsDataset_Final.csv', index=False)