In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from dotenv import load_dotenv
load_dotenv()

# Lift pandas' column cap so wide frames are rendered without elision.
pd.options.display.max_columns = None

In [2]:
# Read the raw superheroes CSV (path is relative to the notebook's directory).
df = pd.read_csv(
    filepath_or_buffer='../data/superheroes_data.csv',
)

In [3]:
# Preview the first five rows to eyeball the columns and value formats.
df.head(n=5)

Unnamed: 0,id,name,intelligence,strength,speed,durability,power,combat,full-name,alter-egos,aliases,place-of-birth,first-appearance,publisher,alignment,gender,race,height,weight,eye-color,hair-color,occupation,base,group-affiliation,relatives,url
0,1,A-Bomb,38.0,100.0,17.0,80.0,24.0,64.0,Richard Milhouse Jones,No alter egos found.,['Rick Jones'],"Scarsdale, Arizona","Hulk Vol 2 #2 (April, 2008) (as A-Bomb)",Marvel Comics,good,Male,Human,"[""6'8"", '203 cm']","['980 lb', '441 kg']",Yellow,No Hair,"Musician, adventurer, author; formerly talk sh...",-,"Hulk Family; Excelsior (sponsor), Avengers (ho...",Marlo Chandler-Jones (wife); Polly (aunt); Mrs...,https://www.superherodb.com/pictures2/portrait...
1,2,Abe Sapien,88.0,28.0,35.0,65.0,100.0,85.0,Abraham Sapien,No alter egos found.,"['Langdon Everett Caul', 'Abraham Sapien', 'La...",-,Hellboy: Seed of Destruction (1993),Dark Horse Comics,good,Male,Icthyo Sapien,"[""6'3"", '191 cm']","['145 lb', '65 kg']",Blue,No Hair,Paranormal Investigator,-,Bureau for Paranormal Research and Defense,"Edith Howard (wife, deceased)",https://www.superherodb.com/pictures2/portrait...
2,3,Abin Sur,50.0,90.0,53.0,64.0,99.0,65.0,,No alter egos found.,['Lagzia'],Ungara,"Showcase #22 (October, 1959)",DC Comics,good,Male,Ungaran,"[""6'1"", '185 cm']","['200 lb', '90 kg']",Blue,No Hair,"Green Lantern, former history professor",Oa,"Green Lantern Corps, Black Lantern Corps","Amon Sur (son), Arin Sur (sister), Thaal Sines...",https://www.superherodb.com/pictures2/portrait...
3,4,Abomination,63.0,80.0,53.0,90.0,62.0,95.0,Emil Blonsky,No alter egos found.,"['Agent R-7', 'Ravager of Worlds']","Zagreb, Yugoslavia",Tales to Astonish #90,Marvel Comics,bad,Male,Human / Radiation,"[""6'8"", '203 cm']","['980 lb', '441 kg']",Green,No Hair,Ex-Spy,Mobile,former member of the crew of the Andromeda Sta...,"Nadia Dornova Blonsky (wife, separated)",https://www.superherodb.com/pictures2/portrait...
4,5,Abraxas,88.0,63.0,83.0,100.0,100.0,55.0,Abraxas,No alter egos found.,['-'],Within Eternity,Fantastic Four Annual #2001,Marvel Comics,bad,Male,Cosmic Entity,"['-', '0 cm']","['- lb', '0 kg']",Blue,Black,Dimensional destroyer,-,Cosmic Beings,"Eternity (""Father"")",https://www.superherodb.com/pictures2/portrait...


Check for missing values

In [4]:
# Per-column count of missing entries.
df.isnull().sum()

id                     0
name                   0
intelligence         165
strength             102
speed                165
durability           165
power                165
combat               165
full-name            101
alter-egos             0
aliases                0
place-of-birth         0
first-appearance       0
publisher             15
alignment              0
gender                 0
race                 302
height                 0
weight                 0
eye-color              0
hair-color             0
occupation             0
base                   0
group-affiliation      0
relatives              0
url                    0
dtype: int64

**Step 1:** convert all columns to string (`object` dtype)  
**Step 2:** replace all missing values with "Unknown"

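Since our embedding processing does not depend on the original datatypes, we convert all columns to strings and replace missing values with "Unknown" to ensure consistent and robust model input. In the code below, the fill is applied before the cast so that the placeholder is inserted while the gaps are still recognised as missing.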

In [5]:
# Insert the "Unknown" placeholder while gaps are still recognised as missing,
# then cast every column to str.
df = (
    df
    .fillna("Unknown")
    .astype(str)
)

# Sanity check: print the resulting dtypes followed by the per-column NaN counts.
print(df.dtypes, df.isna().sum(), sep="\n")

id                   object
name                 object
intelligence         object
strength             object
speed                object
durability           object
power                object
combat               object
full-name            object
alter-egos           object
aliases              object
place-of-birth       object
first-appearance     object
publisher            object
alignment            object
gender               object
race                 object
height               object
weight               object
eye-color            object
hair-color           object
occupation           object
base                 object
group-affiliation    object
relatives            object
url                  object
dtype: object
id                   0
name                 0
intelligence         0
strength             0
speed                0
durability           0
power                0
combat               0
full-name            0
alter-egos           0
aliases              0
place

Save the cleaned dataset to CSV

In [6]:
# Persist the cleaned frame; index=False keeps the row index out of the file.
df.to_csv(
    path_or_buf='../data/superheroes_cleaned.csv',
    index=False,
)

In [None]:
# DISABLED DRAFT: this whole cell is commented out and has never been executed
# (its execution count is empty), so nothing below affects the saved CSV.
#
# What it would do if re-enabled: build_context(row) formats one row into a
# single sentence-style string of "Label: value. " pairs, and the final line
# stores that string for every row in a new df["context"] column.
#
# The inner clean() helper joins list values with ", " (skipping falsy items)
# and falls back to str() for everything else.
# NOTE(review): if this runs after the cell that casts every column to str,
# no value will be a list, so the isinstance(val, list) branch would never
# fire -- confirm whether list-like strings such as aliases need parsing first.
#
# def build_context(row):
#     def clean(val):
#         if isinstance(val, list):
#             return ", ".join(str(v) for v in val if v)
#         return str(val)

#     return (
#         f"Name: {clean(row['name'])}. "
#         f"Full Name: {clean(row['full-name'])}. "
#         f"Alter Egos: {clean(row['alter-egos'])}. "
#         f"Aliases: {clean(row['aliases'])}. "
#         f"Publisher: {clean(row['publisher'])}. "
#         f"Alignment: {clean(row['alignment'])}. "
#         f"Gender: {clean(row['gender'])}. "
#         f"Race: {clean(row['race'])}. "
#         f"Place of Birth: {clean(row['place-of-birth'])}. "
#         f"First Appearance: {clean(row['first-appearance'])}. "
#         f"Intelligence: {clean(row['intelligence'])}. "
#         f"Strength: {clean(row['strength'])}. "
#         f"Speed: {clean(row['speed'])}. "
#         f"Durability: {clean(row['durability'])}. "
#         f"Power: {clean(row['power'])}. "
#         f"Combat: {clean(row['combat'])}. "
#         f"Height: {clean(row['height'])}. "
#         f"Weight: {clean(row['weight'])}. "
#         f"Eye Color: {clean(row['eye-color'])}. "
#         f"Hair Color: {clean(row['hair-color'])}. "
#         f"Occupation: {clean(row['occupation'])}. "
#         f"Base: {clean(row['base'])}. "
#         f"Group Affiliation: {clean(row['group-affiliation'])}. "
#         f"Relatives: {clean(row['relatives'])}."
#     )

# df["context"] = df.apply(build_context, axis=1)
