In [1]:
# 1. Setup and Imports
%pip install openai python-dotenv scikit-learn scipy matplotlib scikit-fuzzy --quiet plotly

import os
import numpy as np
import pandas as pd
import openai
from openai import OpenAI
from dotenv import load_dotenv
from sklearn.metrics import pairwise_distances
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import matplotlib.pyplot as plt
import skfuzzy as fuzz  # for fuzzy c-means

# Load environment variables
load_dotenv()
client = OpenAI()  # Updated API client initialization




[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
# 2. Read the animal names from a text file
with open('animal_names.txt', 'r', encoding='utf-8') as f:
    animals = [line.strip() for line in f if line.strip()]

# Remove duplicates while preserving order
unique_animals = list(dict.fromkeys(animals))

print(f"Number of animal names: {len(unique_animals)}")
unique_animals[:10]  # Show the first 10 for reference


Number of animal names: 574


['Dog',
 'Cat',
 'Bird',
 'Penguin',
 'Eagle',
 'Cockatoo',
 'Camel',
 'Giraffe',
 'Zebra',
 'Lion']

In [3]:
# 3. Define a function to obtain OpenAI embeddings using the latest API
def get_embedding(text, model="text-embedding-3-small"):
    """Fetch embedding for a single piece of text from OpenAI's latest API."""
    text = text.replace("\n", " ")  # Ensure clean input
    response = client.embeddings.create(input=[text], model=model)
    return response.data[0].embedding  # Extract embedding vector


In [4]:
# 4. Generate embeddings for each animal name
embeddings = np.array([get_embedding(animal) for animal in unique_animals])

print("Embeddings shape:", embeddings.shape)


Embeddings shape: (574, 1536)


In [5]:
# 5. Compute the distance matrix using cosine distance
distance_matrix = pairwise_distances(embeddings, metric='cosine')
print("Distance matrix shape:", distance_matrix.shape)


Distance matrix shape: (574, 574)


In [21]:
# 6. Perform hierarchical clustering using Ward’s method
Z = linkage(distance_matrix, method='ward')

# Define the number of clusters
NUM_CLUSTERS = 10
cluster_assignments = fcluster(Z, t=NUM_CLUSTERS, criterion='maxclust')

# Store results in a DataFrame
df_hard_clusters = pd.DataFrame({'Animal': unique_animals, 'Cluster': cluster_assignments})
df_hard_clusters.sort_values('Cluster', inplace=True)
df_hard_clusters.reset_index(drop=True, inplace=True)

df_hard_clusters


  Z = linkage(distance_matrix, method='ward')


Unnamed: 0,Animal,Cluster
0,Orca,1
1,Sharks,1
2,sea lion,1
3,Tiger shark,1
4,Krill,1
...,...,...
569,Rhinoceros,10
570,tortoises,10
571,Iguana,10
572,arowana,10


In [28]:
# Create the interactive dendrogram as before
import plotly.figure_factory as ff

fig = ff.create_dendrogram(
    embeddings,
    orientation='left',
    labels=unique_animals,
    distfun=lambda x: pairwise_distances(x, metric='cosine'),
    linkagefun=lambda x: linkage(x, method='ward')
)
fig.update_layout(width=1200, height=800)

# Save the interactive plot as an HTML file
fig.write_html("interactive_dendrogram.html")



scipy.cluster: The symmetric non-negative hollow observation matrix looks suspiciously like an uncondensed distance matrix



In [10]:
# # 8. Fuzzy C-Means Clustering
# # Transpose the embeddings to (features, samples) format
# data_for_fuzzy = embeddings.T

# # Run fuzzy c-means
# cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
#     data_for_fuzzy,
#     c=40,          # Number of clusters
#     m=2.0,        # Fuzziness parameter
#     error=0.005,  # Stopping criterion
#     maxiter=1000,
#     init=None
# )

# # Assign each animal to its most probable cluster
# fuzzy_labels = np.argmax(u, axis=0)

# df_fuzzy_clusters = pd.DataFrame({'Animal': unique_animals, 'Cluster': fuzzy_labels})
# df_fuzzy_clusters.sort_values('Cluster', inplace=True)
# df_fuzzy_clusters.reset_index(drop=True, inplace=True)

# df_fuzzy_clusters.head(20)


Unnamed: 0,Animal,Cluster
0,chow chow,0
1,polar,0
2,porcupine,0
3,Chipmunk,1
4,tortoise,1
5,lemur,1
6,Chameleon,1
7,armadillo,1
8,sloth,1
9,Bees,2


In [11]:
# 9. Inspect membership degrees for soft clustering
membership_df = pd.DataFrame(u.T, columns=[f"Cluster_{i}" for i in range(u.shape[0])])
membership_df.insert(0, 'Animal', unique_animals)
membership_df.head(20)


Unnamed: 0,Animal,Cluster_0,Cluster_1,Cluster_2,Cluster_3,Cluster_4,Cluster_5,Cluster_6,Cluster_7,Cluster_8,...,Cluster_30,Cluster_31,Cluster_32,Cluster_33,Cluster_34,Cluster_35,Cluster_36,Cluster_37,Cluster_38,Cluster_39
0,Dog,0.025001,0.025001,0.024999,0.025,0.024998,0.025001,0.025001,0.025002,0.025,...,0.024999,0.025,0.024999,0.024999,0.025,0.025001,0.025,0.024999,0.025002,0.025
1,Cat,0.025001,0.025004,0.025002,0.025001,0.025,0.025,0.025,0.025005,0.024999,...,0.024999,0.025,0.024996,0.024998,0.024999,0.025004,0.024996,0.024998,0.025003,0.024998
2,Bird,0.025,0.025002,0.025002,0.025002,0.025003,0.025001,0.024999,0.025003,0.025,...,0.025,0.024999,0.024996,0.025,0.024997,0.025005,0.024999,0.024999,0.025001,0.024997
3,Penguin,0.025002,0.025001,0.024998,0.024999,0.024995,0.025001,0.025,0.024999,0.024999,...,0.024999,0.025002,0.025003,0.025003,0.025001,0.024999,0.025003,0.024997,0.025002,0.025
4,Eagle,0.025001,0.025002,0.025001,0.024999,0.024997,0.025001,0.025002,0.025002,0.025001,...,0.024998,0.024999,0.025,0.024999,0.024999,0.025003,0.025002,0.024999,0.025001,0.025
5,Cockatoo,0.024998,0.025003,0.025001,0.025001,0.025,0.025003,0.024996,0.025002,0.024999,...,0.025002,0.025003,0.024998,0.025001,0.025,0.025002,0.024999,0.024998,0.025,0.025
6,Camel,0.025001,0.025004,0.025,0.024999,0.024994,0.025001,0.025001,0.025002,0.025,...,0.024999,0.025002,0.025,0.024999,0.025002,0.025001,0.025002,0.024998,0.025002,0.025001
7,Giraffe,0.024998,0.025004,0.025003,0.024999,0.02499,0.025002,0.024998,0.025001,0.025001,...,0.024999,0.025005,0.025003,0.025002,0.025004,0.025,0.025003,0.024997,0.025002,0.025001
8,Zebra,0.025001,0.025003,0.025001,0.024999,0.024993,0.025001,0.025001,0.025002,0.025,...,0.024998,0.025003,0.025001,0.025,0.025002,0.025,0.025003,0.024998,0.025002,0.025
9,Lion,0.025001,0.025005,0.025001,0.024998,0.024988,0.025001,0.025003,0.025003,0.025001,...,0.024995,0.025004,0.025004,0.025,0.025003,0.025,0.025003,0.024996,0.025003,0.025001
