In [1]:
import pandas as pd

# Read TSV file into DataFrame
df = pd.read_csv('../data/smolvlm_m1_t0.csv')

# Display basic info about the DataFrame
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst few rows:")
display(df.head())

Shape: (45, 51)
Columns: ['prompt2', 'ARMS.', 'BAR.', 'BEVERLY HILFIGHER.', 'BIRD.', 'BOOK.', 'Biceps.', 'Blonde.', 'Boy.', 'Cat.', 'Dog.', 'FEMALE.', 'Female.', 'Females.', 'GIRL.', 'GUY.', 'Implied, IGNORE.', 'Light is red.', 'MALE.', 'MAN.', 'Male.', 'Masculine.', 'Men.', 'Military.', 'Mom.', 'NO  The person(s) in this image are not present.', 'NO Answer: Female.', 'NO The person(s) in this image are not specified.', 'NO.', 'No one is pictured in the image.', 'No one is present in the image.', 'No one is present in this image.', 'No one.', 'No person(s) in this picture.', 'No.', 'One.', 'PERSONAL.', 'PERSONNEL 5.', 'PERSONS.', 'PETE.', 'SAILORS.', 'SOMETHING.', 'STANLEY.', 'STARSHIP.', 'STOP.', 'TERMINAL NOT INGREDIENTS.', 'UNAUTHORIZED.', 'VIDEO.', 'WOMEN.', 'Wetsuit.', 'Zebra.']

First few rows:


Unnamed: 0,prompt2,ARMS.,BAR.,BEVERLY HILFIGHER.,BIRD.,BOOK.,Biceps.,Blonde.,Boy.,Cat.,...,SOMETHING.,STANLEY.,STARSHIP.,STOP.,TERMINAL NOT INGREDIENTS.,UNAUTHORIZED.,VIDEO.,WOMEN.,Wetsuit.,Zebra.
0,0.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,14.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,31.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
yes_no_labels = df.iloc[:, 0].astype(str).tolist()
male_female_labels = df.columns.astype(str).tolist()[1:]
# Print the first row values and column headers
print("\nFirst row values:")
print(yes_no_labels[:5])
print("\nColumn headers:")
print(male_female_labels[:5])


First row values:
['0.', '1.', '14.', '31.', '8.']

Column headers:
['ARMS.', 'BAR.', 'BEVERLY HILFIGHER.', 'BIRD.', 'BOOK.']


In [3]:
# Read all labels
with open('../data/labels/yes_labels.txt', 'r') as file:
    yes_labels = [line.strip() for line in file.readlines()]

with open('../data/labels/no_labels.txt', 'r') as file:
    no_labels = [line.strip() for line in file.readlines()]

with open('../data/labels/other_labels_yn.txt', 'r') as file:
    other_labels_yn = [line.strip() for line in file.readlines()]

with open('../data/labels/male_labels.txt', 'r') as file:
    male_labels = [line.strip() for line in file.readlines()]

with open('../data/labels/female_labels.txt', 'r') as file:
    female_labels = [line.strip() for line in file.readlines()]

with open('../data/labels/other_labels_mf.txt', 'r') as file:
    other_labels_mf = [line.strip() for line in file.readlines()]

In [4]:
for label in yes_no_labels: 
    if not(label in yes_labels or label in no_labels or label in other_labels_yn):
        print(f"**ALERT** Label '{label}' not found in yes_labels.txt, no_labels.txt, or other_labels_yn.txt")
for label in male_female_labels: 
    if not(label in male_labels or label in female_labels or label in other_labels_mf):
        print(f"**ALERT** Label '{label}' not found in yes_labels.txt, no_labels.txt, or other_labels_yn.txt")

In [5]:
# Step 1: Replace values in the first column based on label lists
df.iloc[:, 0] = df.iloc[:, 0].apply(
    lambda x: 'yes' if x in yes_labels else 'no' if x in no_labels else 'other' if x in other_labels_yn else x
)

# Step 2: Group by that first column and aggregate (default is count per column)
aggregated_df_yn = df.groupby(df.columns[0]).sum().reset_index()

In [6]:
aggregated_df_yn.head()

Unnamed: 0,prompt2,ARMS.,BAR.,BEVERLY HILFIGHER.,BIRD.,BOOK.,Biceps.,Blonde.,Boy.,Cat.,...,SOMETHING.,STANLEY.,STARSHIP.,STOP.,TERMINAL NOT INGREDIENTS.,UNAUTHORIZED.,VIDEO.,WOMEN.,Wetsuit.,Zebra.
0,no,1,1,1,1,1,1,1,2,1,...,0,1,1,1,1,2,0,18,0,6
1,other,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,yes,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,1,5,1,0


In [7]:
# Step 1: Map old column names to new ones
new_columns = [
    'male' if col in male_labels else
    'female' if col in female_labels else
    'other' if col in other_labels_mf else col
    for col in aggregated_df_yn.columns
]

# Step 2: Assign new column names
aggregated_df_yn.columns = new_columns

# Step 3: Group and aggregate columns with the same new name
final_agg_df = aggregated_df_yn.groupby(axis=1, level=0).sum()

final_agg_df.head()

  final_agg_df = aggregated_df_yn.groupby(axis=1, level=0).sum()


Unnamed: 0,female,male,other,prompt2
0,2853,648,100,no
1,14,15,1,other
2,627,728,14,yes


In [8]:
display(final_agg_df)

Unnamed: 0,female,male,other,prompt2
0,2853,648,100,no
1,14,15,1,other
2,627,728,14,yes
