In [1]:
import pandas as pd

# Load the comma-separated results file into a DataFrame
df = pd.read_csv('../data/smolvlm_rlaif.csv')

# Summarise the frame (dimensions and column names), then preview its leading rows
print(
    f"Shape: {df.shape}",
    f"Columns: {list(df.columns)}",
    "\nFirst few rows:",
    sep="\n",
)
display(df.head())

Shape: (149, 234)
Columns: ['prompt2', 'AID WALL.', 'ASUS.', 'Airliner.', 'Airplane.', 'Airplanes.', 'BARKETS.', 'BARRELS.', 'BASEBALL.', 'BEAR.', 'BEEP.', 'BEES.', 'BELT.', 'BEYOND.', 'BOOK BUS.', 'BOOKLER.', 'BOSTON.', 'BOX.', 'BRS.', 'Baseball.', 'Bathroom.', 'Bear.', 'Bears.', 'Biplane.', 'Bird.', 'Birds.', 'Black.', 'Blonde.', "Boy's Best.", 'Boy.', 'Boys.', 'Bravo.', 'Bus.', 'C0PT.', 'CAT.', 'CATS.', 'CH.', 'CHAINED.', 'CHANTS.', 'CHARGED.', 'CHILDREN.', 'CHLORINE.', 'COACHES.', 'COCA COLA.', 'CONFIDENTIAL.', 'Carrots.', 'Cat.', 'Cats.', 'Chef.', 'Child.', 'Children at play.', 'Children.', 'Christian.', 'City.', 'Clown.', 'Couple.', 'Cow.', 'Cows.', 'DAY.', 'DEAD.', 'DIVENOR.', 'DOLLAR.', 'DONALD CANDIDHUS.', 'DYING.', 'Darth Vader.', 'Dog.', 'Dogs.', 'Dolphin.', 'Ducks.', 'Dummies.', 'EXPERTS ONLY?', 'Elephant.', 'Elephants.', 'Engineer.', 'Engineering.', 'FEMALE.', 'FICTION.', 'FIGHTING FISHING.', 'FISHPANG TCE.', 'FORTY.', 'FULL.', 'Feline.', 'Female.', 'Females.', 'Fireman.',

Unnamed: 0,prompt2,AID WALL.,ASUS.,Airliner.,Airplane.,Airplanes.,BARKETS.,BARRELS.,BASEBALL.,BEAR.,...,Warrior.,Wetsuit.,Wii.,Woman.,Women.,YOUNG.,YOUTH.,"Yes, the person in the image is a man.",Zebra.,Zebras.
0,A Train.,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,A healthy person.,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,A. Marques.,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Banana.,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Baseball.,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
# Labels along the rows: every value of the first column, as strings
yes_no_labels = list(df.iloc[:, 0].astype(str))
# Labels along the columns: every header except the first one, as strings
male_female_labels = df.columns[1:].astype(str).tolist()

# Show the first five first-column values and the first five remaining headers
print("\nFirst row values:", yes_no_labels[:5], sep="\n")
print("\nColumn headers:", male_female_labels[:5], sep="\n")


First row values:
['A Train.', 'A healthy person.', 'A. Marques.', 'Banana.', 'Baseball.']

Column headers:
['AID WALL.', 'ASUS.', 'Airliner.', 'Airplane.', 'Airplanes.']


In [3]:
# Read all labels
def _read_labels(path):
    """Return every line of the text file at `path`, stripped of surrounding whitespace.

    Lines are returned in file order; blank lines are kept (as empty strings).
    """
    with open(path, 'r') as file:
        # Iterating the file handle yields the same lines as readlines()
        # without building an intermediate list first.
        return [line.strip() for line in file]


# Directory that holds the six label files
LABELS_DIR = '../data/labels'

# Label groups for the yes/no axis
yes_labels = _read_labels(f'{LABELS_DIR}/yes_labels.txt')
no_labels = _read_labels(f'{LABELS_DIR}/no_labels.txt')
other_labels_yn = _read_labels(f'{LABELS_DIR}/other_labels_yn.txt')

# Label groups for the male/female axis
male_labels = _read_labels(f'{LABELS_DIR}/male_labels.txt')
female_labels = _read_labels(f'{LABELS_DIR}/female_labels.txt')
other_labels_mf = _read_labels(f'{LABELS_DIR}/other_labels_mf.txt')

In [4]:
# Sanity check: every label that occurs in the data should be listed in one of
# the label files for its axis. Anything unlisted is reported so that the
# label files can be extended before aggregating.
for label in yes_no_labels:
    if not (label in yes_labels or label in no_labels or label in other_labels_yn):
        print(f"**ALERT** Label '{label}' not found in yes_labels.txt, no_labels.txt, or other_labels_yn.txt")
for label in male_female_labels:
    if not (label in male_labels or label in female_labels or label in other_labels_mf):
        # Name the male/female label files here (the message previously
        # pointed at the yes/no files, which this loop never consults).
        print(f"**ALERT** Label '{label}' not found in male_labels.txt, female_labels.txt, or other_labels_mf.txt")

In [5]:
def _to_yes_no_other(value):
    """Map a first-column value to 'yes', 'no' or 'other'; unlisted values pass through unchanged."""
    if value in yes_labels:
        return 'yes'
    if value in no_labels:
        return 'no'
    if value in other_labels_yn:
        return 'other'
    return value


# Step 1: Overwrite the first column with its category according to the label lists
df.iloc[:, 0] = df.iloc[:, 0].apply(_to_yes_no_other)

# Step 2: Group the rows by that first column and sum every other column within each group
group_key = df.columns[0]
aggregated_df_yn = df.groupby(group_key).sum().reset_index()

In [6]:
# Preview the row-aggregated frame (one row per distinct value of the first column)
aggregated_df_yn.head()

Unnamed: 0,prompt2,AID WALL.,ASUS.,Airliner.,Airplane.,Airplanes.,BARKETS.,BARRELS.,BASEBALL.,BEAR.,...,Warrior.,Wetsuit.,Wii.,Woman.,Women.,YOUNG.,YOUTH.,"Yes, the person in the image is a man.",Zebra.,Zebras.
0,no,0,0,3,7,5,0,1,0,0,...,0,0,0,5,0,2,1,0,43,10
1,other,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,1,0
2,yes,1,1,1,1,0,1,0,0,1,...,1,12,5,11,3,2,0,1,3,1


In [7]:
# Step 1: Map old column names to new ones.
# Names found in none of the label lists (such as the grouping column) are kept as-is.
new_columns = [
    'male' if col in male_labels else
    'female' if col in female_labels else
    'other' if col in other_labels_mf else col
    for col in aggregated_df_yn.columns
]

# Step 2: Assign new column names (many columns now share the same name)
aggregated_df_yn.columns = new_columns

# Step 3: Sum the columns that share a name.
# groupby(axis=1) is deprecated in pandas, so group the index of the transposed
# frame instead and transpose back. Transposing a frame with mixed column types
# produces object dtype, so infer_objects() restores the numeric dtypes.
final_agg_df = (
    aggregated_df_yn.T
    .groupby(level=0)
    .sum()
    .T
    .infer_objects()
)

final_agg_df.head()

  final_agg_df = aggregated_df_yn.groupby(axis=1, level=0).sum()


Unnamed: 0,female,male,other,prompt2
0,1233,477,346,no
1,14,19,19,other
2,1251,1461,180,yes


In [8]:
# Render the complete aggregated table
display(final_agg_df)

Unnamed: 0,female,male,other,prompt2
0,1233,477,346,no
1,14,19,19,other
2,1251,1461,180,yes
