In [None]:
import pandas as pd

# Load the results table from disk; read_csv is called with its default
# (comma) delimiter.
df = pd.read_csv('../data/smolvlm_m2_sft.csv')

# Summarise the frame: dimensions, column names, then a preview of the top rows.
print(
    f"Shape: {df.shape}",
    f"Columns: {list(df.columns)}",
    "\nFirst few rows:",
    sep="\n",
)
display(df.head())

In [None]:
# Row labels: the values of the first column, cast to str so they can be
# compared against labels read from plain-text files.
yes_no_labels = df.iloc[:, 0].astype(str).tolist()
# Column labels: every header except the first one (the first column holds
# the row labels extracted above).
male_female_labels = df.columns.astype(str).tolist()[1:]

# Preview the first five entries of each list. The first list comes from the
# first *column* of the frame (not the first row), so the heading says so.
print("\nFirst column values:")
print(yes_no_labels[:5])
print("\nColumn headers:")
print(male_female_labels[:5])

In [None]:
# Read all labels
LABELS_DIR = '../data/labels'


def read_label_file(filename):
    """Return the lines of ``LABELS_DIR/filename`` with surrounding whitespace stripped.

    Args:
        filename: Name of a text file inside ``LABELS_DIR``; each line of the
            file becomes one entry of the result.

    Returns:
        list[str]: One stripped string per line of the file, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(f'{LABELS_DIR}/{filename}', 'r') as file:
        # Iterating the handle yields the same lines as readlines() without
        # building an intermediate list first.
        return [line.strip() for line in file]


# Label lists used for the first column of the data (yes / no / other).
yes_labels = read_label_file('yes_labels.txt')
no_labels = read_label_file('no_labels.txt')
other_labels_yn = read_label_file('other_labels_yn.txt')

# Label lists used for the column headers of the data (male / female / other).
male_labels = read_label_file('male_labels.txt')
female_labels = read_label_file('female_labels.txt')
other_labels_mf = read_label_file('other_labels_mf.txt')

In [None]:
# Check that every label found in the data appears in one of the label lists,
# and print an alert for each occurrence that does not.
# Sets give constant-time membership tests instead of scanning three lists
# for every label.
known_yn_labels = set(yes_labels) | set(no_labels) | set(other_labels_yn)
known_mf_labels = set(male_labels) | set(female_labels) | set(other_labels_mf)

for label in yes_no_labels:
    if label not in known_yn_labels:
        print(f"**ALERT** Label '{label}' not found in yes_labels.txt, no_labels.txt, or other_labels_yn.txt")
for label in male_female_labels:
    if label not in known_mf_labels:
        # This loop checks the male/female lists, so the message names those files.
        print(f"**ALERT** Label '{label}' not found in male_labels.txt, female_labels.txt, or other_labels_mf.txt")

In [None]:
# Step 1: Collapse the values of the first column into 'yes' / 'no' / 'other'.
# The lookup is built lowest-priority first, so a label that appears in more
# than one list resolves as yes > no > other (the order of a chained if/else).
yn_lookup = {
    **dict.fromkeys(other_labels_yn, 'other'),
    **dict.fromkeys(no_labels, 'no'),
    **dict.fromkeys(yes_labels, 'yes'),
}
key_col = df.columns[0]
# Match on str(value): the label lists hold strings and the earlier validation
# step compares the str-cast column, so a non-string cell (e.g. a number) must
# be matched the same way. Values with no match are kept unchanged.
# Assigning by column name replaces the column instead of writing strings
# into the existing (possibly non-object) array. NOTE: this overwrites the
# first column of `df` in place.
df[key_col] = df[key_col].apply(lambda value: yn_lookup.get(str(value), value))

# Step 2: Group rows by the collapsed first column and sum the remaining
# columns within each group; reset_index() turns the group key back into a
# regular first column.
aggregated_df_yn = df.groupby(key_col).sum().reset_index()

In [None]:
# Preview the first five rows of the row-aggregated table.
aggregated_df_yn.head(n=5)

In [None]:
# Step 1: Map old column names to new ones.
# Headers found in none of the three lists (normally including the first
# column, which holds the group key) keep their original name.
male_set = set(male_labels)
female_set = set(female_labels)
other_mf_set = set(other_labels_mf)
new_columns = [
    'male' if col in male_set else
    'female' if col in female_set else
    'other' if col in other_mf_set else col
    for col in aggregated_df_yn.columns
]

# Step 2: Assign new column names (this renames aggregated_df_yn in place).
aggregated_df_yn.columns = new_columns

# Step 3: Sum columns that now share the same name.
# groupby(axis=1) is deprecated in pandas 2.1 and removed in 3.0; the
# supported equivalent is to transpose, group the (former) column labels on
# the index, and transpose back. The key column is split off by position
# first so its string values are not mixed into the numeric aggregation.
key_values = aggregated_df_yn.iloc[:, 0]
count_cols = aggregated_df_yn.iloc[:, 1:]
final_agg_df = count_cols.T.groupby(level=0).sum().T
# Put the key column back as the first column; allow_duplicates covers the
# unlikely case where its name equals one of the aggregated column names.
final_agg_df.insert(0, key_values.name, key_values, allow_duplicates=True)

final_agg_df.head()

In [None]:
# Render the complete aggregated table (all rows, not just the head).
display(final_agg_df)