In [37]:
import chromadb

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
from huggingface_hub import InferenceClient

from dotenv import load_dotenv
load_dotenv()
pd.set_option('display.max_columns', None) # display all columns

In [39]:
df = pd.read_csv('../data/superheroes_cleaned.csv')
df.head()

Unnamed: 0,id,name,intelligence,strength,speed,durability,power,combat,full-name,alter-egos,aliases,place-of-birth,first-appearance,publisher,alignment,gender,race,height,weight,eye-color,hair-color,occupation,base,group-affiliation,relatives,url
0,1,A-Bomb,38.0,100.0,17.0,80.0,24.0,64.0,Richard Milhouse Jones,No alter egos found.,['Rick Jones'],"Scarsdale, Arizona","Hulk Vol 2 #2 (April, 2008) (as A-Bomb)",Marvel Comics,good,Male,Human,"[""6'8"", '203 cm']","['980 lb', '441 kg']",Yellow,No Hair,"Musician, adventurer, author; formerly talk sh...",-,"Hulk Family; Excelsior (sponsor), Avengers (ho...",Marlo Chandler-Jones (wife); Polly (aunt); Mrs...,https://www.superherodb.com/pictures2/portrait...
1,2,Abe Sapien,88.0,28.0,35.0,65.0,100.0,85.0,Abraham Sapien,No alter egos found.,"['Langdon Everett Caul', 'Abraham Sapien', 'La...",-,Hellboy: Seed of Destruction (1993),Dark Horse Comics,good,Male,Icthyo Sapien,"[""6'3"", '191 cm']","['145 lb', '65 kg']",Blue,No Hair,Paranormal Investigator,-,Bureau for Paranormal Research and Defense,"Edith Howard (wife, deceased)",https://www.superherodb.com/pictures2/portrait...
2,3,Abin Sur,50.0,90.0,53.0,64.0,99.0,65.0,Unknown,No alter egos found.,['Lagzia'],Ungara,"Showcase #22 (October, 1959)",DC Comics,good,Male,Ungaran,"[""6'1"", '185 cm']","['200 lb', '90 kg']",Blue,No Hair,"Green Lantern, former history professor",Oa,"Green Lantern Corps, Black Lantern Corps","Amon Sur (son), Arin Sur (sister), Thaal Sines...",https://www.superherodb.com/pictures2/portrait...
3,4,Abomination,63.0,80.0,53.0,90.0,62.0,95.0,Emil Blonsky,No alter egos found.,"['Agent R-7', 'Ravager of Worlds']","Zagreb, Yugoslavia",Tales to Astonish #90,Marvel Comics,bad,Male,Human / Radiation,"[""6'8"", '203 cm']","['980 lb', '441 kg']",Green,No Hair,Ex-Spy,Mobile,former member of the crew of the Andromeda Sta...,"Nadia Dornova Blonsky (wife, separated)",https://www.superherodb.com/pictures2/portrait...
4,5,Abraxas,88.0,63.0,83.0,100.0,100.0,55.0,Abraxas,No alter egos found.,['-'],Within Eternity,Fantastic Four Annual #2001,Marvel Comics,bad,Male,Cosmic Entity,"['-', '0 cm']","['- lb', '0 kg']",Blue,Black,Dimensional destroyer,-,Cosmic Beings,"Eternity (""Father"")",https://www.superherodb.com/pictures2/portrait...


In [40]:
df.columns

Index(['id', 'name', 'intelligence', 'strength', 'speed', 'durability',
       'power', 'combat', 'full-name', 'alter-egos', 'aliases',
       'place-of-birth', 'first-appearance', 'publisher', 'alignment',
       'gender', 'race', 'height', 'weight', 'eye-color', 'hair-color',
       'occupation', 'base', 'group-affiliation', 'relatives', 'url'],
      dtype='object')

In [41]:
exclude_cols = ["id", "url", "context"]

def build_context(row):
    return (
        f"Name: {row['name']}. "
        f"Full Name: {row['full-name']}. "
        f"Alter Egos: {row['alter-egos']}. "
        f"Aliases: {row['aliases']}. "
        f"Publisher: {row['publisher']}. "
        f"Alignment: {row['alignment']}. "
        f"Gender: {row['gender']}. "
        f"Race: {row['race']}. "
        f"Place of Birth: {row['place-of-birth']}. "
        f"First Appearance: {row['first-appearance']}. "
        f"Intelligence: {row['intelligence']}. "
        f"Strength: {row['strength']}. "
        f"Speed: {row['speed']}. "
        f"Durability: {row['durability']}. "
        f"Power: {row['power']}. "
        f"Combat: {row['combat']}. "
        f"Height: {row['height']}. "
        f"Weight: {row['weight']}. "
        f"Eye Color: {row['eye-color']}. "
        f"Hair Color: {row['hair-color']}. "
        f"Occupation: {row['occupation']}. "
        f"Base: {row['base']}. "
        f"Group Affiliation: {row['group-affiliation']}. "
        f"Relatives: {row['relatives']}."
    )

# Apply the function to create the context column
df["context"] = df.apply(build_context, axis=1)

# Preview a single context
print(df["context"].iloc[0][:500])  # First 500 characters of first row


Name: A-Bomb. Full Name: Richard Milhouse Jones. Alter Egos: No alter egos found.. Aliases: ['Rick Jones']. Publisher: Marvel Comics. Alignment: good. Gender: Male. Race: Human. Place of Birth: Scarsdale, Arizona. First Appearance: Hulk Vol 2 #2 (April, 2008) (as A-Bomb). Intelligence: 38.0. Strength: 100.0. Speed: 17.0. Durability: 80.0. Power: 24.0. Combat: 64.0. Height: ["6'8", '203 cm']. Weight: ['980 lb', '441 kg']. Eye Color: Yellow. Hair Color: No Hair. Occupation: Musician, adventurer, a


In [None]:
client = InferenceClient(
    provider="auto",
    api_key=os.environ["HF_TOKEN"],
)

texts = df["context"].tolist()
embeddings = []

# Batch to avoid rate limits (e.g., 32 per call)
for i in range(0, len(texts), 32):
    batch = texts[i:i+32]
    batch_embeddings = client.feature_extraction(
        batch,
        model="sentence-transformers/all-mpnet-base-v2"
    )
    embeddings.extend(batch_embeddings)

print(f"Generated {len(embeddings)} embeddings.")

Generated 731 embeddings.


In [43]:
CHROMA_HOST = os.getenv("CHROMA_HOST")
CHROMA_PORT = int(os.getenv("CHROMA_PORT"))

client = chromadb.HttpClient(host=CHROMA_HOST, port=CHROMA_PORT)
collection = client.get_or_create_collection("superheroes")

collection.add(
    documents=texts,
    embeddings=embeddings,
    ids=df["id"].astype(str).tolist(),
    metadatas=[{"name": n, "publisher": p} for n, p in zip(df["name"], df["publisher"])]
)

print("Data successfully stored in Chroma!")

Data successfully stored in Chroma!


In [44]:
collection = client.get_collection("superheroes")

# Count all documents
print("Total docs:", len(collection.get()["ids"]))

# Preview a few
preview = collection.get(limit=3)
for doc, meta in zip(preview["documents"], preview["metadatas"]):
    print(meta["name"], "→", doc[:120], "...")


Total docs: 731
A-Bomb → Name: A-Bomb. Intelligence: 38.0. Strength: 100.0. Speed: 17.0. Durability: 80.0. Power: 24.0. Combat: 64.0. Publisher:  ...
Abe Sapien → Name: Abe Sapien. Intelligence: 88.0. Strength: 28.0. Speed: 35.0. Durability: 65.0. Power: 100.0. Combat: 85.0. Publish ...
Abin Sur → Name: Abin Sur. Intelligence: 50.0. Strength: 90.0. Speed: 53.0. Durability: 64.0. Power: 99.0. Combat: 65.0. Publisher: ...
