In [None]:
import pandas as pd

# Load the two tab-separated result files that the rest of the notebook compares
df_one = pd.read_csv('../data/smolvlm_m2_baseline.tsv', sep='\t')
df_two = pd.read_csv('../data/smolvlm_m2_t1_results_0_to_5000.tsv', sep='\t')

# Print the same summary (shape, column names, first rows) for each frame,
# df_one first and df_two second
for frame in (df_one, df_two):
    print(f"Shape: {frame.shape}")
    print(f"Columns: {list(frame.columns)}")
    print("\nFirst few rows:")
    display(frame.head())

In [None]:
# Flag rows whose prompt2 or prompt3 answer differs between the two frames.
# A plain `!=` reports NaN != NaN as True, so a row whose answer is missing in
# BOTH frames would be counted as a difference; such rows are excluded here.
# NOTE: the element-wise comparison needs df_one and df_two to carry identical
# index labels -- pandas raises ValueError otherwise.
def _column_differs(column):
    """Return a boolean Series: True where `column` differs between df_one and df_two."""
    left, right = df_one[column], df_two[column]
    both_missing = left.isna() & right.isna()
    return (left != right) & ~both_missing


mask = _column_differs('prompt2') | _column_differs('prompt3')

# Filter both dataframes to get only the differing rows
df_one_filtered = df_one[mask]
df_two_filtered = df_two[mask]

# Side-by-side view of the differing answers; 'index' keeps the original row
# label so each row can be traced back to df_one / df_two.
comparison_df = pd.DataFrame({
    'index': df_one_filtered.index,
    'prompt2_df_one': df_one_filtered['prompt2'],
    'prompt2_df_two': df_two_filtered['prompt2'],
    'prompt3_df_one': df_one_filtered['prompt3'],
    'prompt3_df_two': df_two_filtered['prompt3']
})

print(f"Found {len(df_one_filtered)} rows with differences")
print("\nComparison of differing rows:")
display(comparison_df)

In [None]:
# Read all labels
def _read_label_file(path):
    """Return every line of the text file at `path`, whitespace-stripped, as a list."""
    with open(path, 'r') as handle:
        return [raw_line.strip() for raw_line in handle]


# Label lists for the yes/no question
yes_labels = _read_label_file('../data/labels/yes_labels.txt')
no_labels = _read_label_file('../data/labels/no_labels.txt')
other_labels_yn = _read_label_file('../data/labels/other_labels_yn.txt')

# Label lists for the male/female question
male_labels = _read_label_file('../data/labels/male_labels.txt')
female_labels = _read_label_file('../data/labels/female_labels.txt')
other_labels_mf = _read_label_file('../data/labels/other_labels_mf.txt')

In [None]:
def replace_label(prompt_text, yn=True):
    """Collapse a raw answer into its canonical category.

    Looks `prompt_text` up in the module-level label lists: `yes_labels`,
    `no_labels`, `other_labels_yn` when `yn` is true, otherwise `male_labels`,
    `female_labels`, `other_labels_mf`.

    Args:
        prompt_text: The raw answer to classify.
        yn: True to classify as yes/no/other, False for male/female/other.

    Returns:
        'yes' / 'no' / 'other' (or 'male' / 'female' / 'other') when the text
        is found in one of the lists; otherwise `prompt_text` unchanged, after
        printing an ALERT line.
    """
    if yn:
        family = 'yes/no'
        buckets = (
            ('yes', yes_labels),
            ('no', no_labels),
            ('other', other_labels_yn),
        )
    else:
        family = 'male/female'
        buckets = (
            ('male', male_labels),
            ('female', female_labels),
            ('other', other_labels_mf),
        )

    # The label lists are whitespace-stripped when they are read, so an answer
    # carrying leading/trailing whitespace could never match as-is. Try the
    # exact text first (original behaviour), then its stripped form.
    candidates = [prompt_text]
    if isinstance(prompt_text, str):
        trimmed = prompt_text.strip()
        if trimmed != prompt_text:
            candidates.append(trimmed)

    for candidate in candidates:
        for canonical, known_labels in buckets:
            if candidate in known_labels:
                return canonical

    print(f"ALERT: {prompt_text} not found in {family} labels")
    return prompt_text

# Normalise the df_one prompt2 answers in place (yes/no mode, spelled out explicitly)
comparison_df['prompt2_df_one'] = (
    comparison_df['prompt2_df_one'].apply(replace_label, yn=True)
)

In [None]:
# Normalise the remaining answer columns: prompt2 uses the yes/no lists,
# prompt3 uses the male/female lists.
for answer_column, use_yes_no in (
    ('prompt2_df_two', True),
    ('prompt3_df_one', False),
    ('prompt3_df_two', False),
):
    comparison_df[answer_column] = comparison_df[answer_column].apply(replace_label, yn=use_yes_no)

In [None]:
# Preview the first five rows after label normalisation
comparison_df.head(n=5)

In [None]:
def assign_yn_group(row):
    """Return the numeric group for a row's prompt2 transition.

    Reads `row['prompt2_df_one']` and `row['prompt2_df_two']`, formats them as
    "<one> → <two>" and looks that string up in the table below.

    Returns:
        The group number (1-9), or None when the transition is not listed.
    """
    before = row['prompt2_df_one']
    after = row['prompt2_df_two']

    # A transition's position in this tuple, counted from 1, is its group number.
    ordered_transitions = (
        'no → no',
        'yes → yes',
        'other → other',
        'no → yes',
        'no → other',
        'yes → no',
        'yes → other',
        'other → yes',
        'other → no',
    )
    group_of = {
        label: number
        for number, label in enumerate(ordered_transitions, start=1)
    }

    return group_of.get("{} → {}".format(before, after))

# Tag each row with its yes/no transition group, then preview the result
comparison_df['yn_groups'] = comparison_df.apply(assign_yn_group, axis='columns')
comparison_df.head(n=5)

In [None]:
def assign_gender_group(row):
    """Return the numeric group for a row's prompt3 transition.

    Reads `row['prompt3_df_one']` and `row['prompt3_df_two']`, joins them as
    "<one> → <two>" and maps that string to a number from 1 to 9.

    Returns:
        The group number, or None when the transition is not in the table.
    """
    source_label = row['prompt3_df_one']
    target_label = row['prompt3_df_two']

    # Listed in group order: the first entry is group 1, the last is group 9.
    transitions_in_order = [
        'female → female',
        'male → male',
        'other → other',
        'female → male',
        'female → other',
        'male → female',
        'male → other',
        'other → male',
        'other → female',
    ]
    numbering = dict(zip(transitions_in_order, range(1, 10)))

    lookup_key = "{} → {}".format(source_label, target_label)
    return numbering.get(lookup_key)

# Tag each row with its male/female transition group, then preview the result
comparison_df['mf_groups'] = comparison_df.apply(assign_gender_group, axis='columns')
comparison_df.head(n=5)

In [None]:
def convert_yn_group_back(group_num):
    """Turn a yes/no group number back into its "<from> → <to>" label.

    Args:
        group_num: A group number from 1 to 9.

    Returns:
        The transition label for `group_num`, or `group_num` itself when it is
        not one of the known group numbers.
    """
    # Tuple order defines the numbering: first entry is group 1.
    labels_in_order = (
        'no → no',
        'yes → yes',
        'other → other',
        'no → yes',
        'no → other',
        'yes → no',
        'yes → other',
        'other → yes',
        'other → no',
    )
    label_of = dict(enumerate(labels_in_order, start=1))

    if group_num in label_of:
        return label_of[group_num]
    return group_num

def convert_mf_group_back(group_num):
    """Turn a male/female group number back into its "<from> → <to>" label.

    Args:
        group_num: A group number from 1 to 9.

    Returns:
        The transition label for `group_num`; any other value is returned
        unchanged.
    """
    # Listed in group order, so zipping with 1..9 rebuilds the number -> label table.
    readable = [
        'female → female',
        'male → male',
        'other → other',
        'female → male',
        'female → other',
        'male → female',
        'male → other',
        'other → male',
        'other → female',
    ]
    by_number = dict(zip(range(1, 10), readable))

    try:
        return by_number[group_num]
    except KeyError:
        # Unknown group numbers pass through untouched
        return group_num

In [None]:
# Count how often each (yes/no transition, male/female transition) pairing occurs.
group_columns = ['yn_groups', 'mf_groups']

# Numeric-coded counts via groupby
pairing_counts = comparison_df.groupby(group_columns).size().reset_index(name='count')

# Optional filter (disabled): keep only rows where neither transition is in
# groups 1-3, i.e. where both answers changed category.
# comparison_df = comparison_df[
#    (~comparison_df['yn_groups'].isin([1, 2, 3])) & 
#    (~comparison_df['mf_groups'].isin([1, 2, 3]))
# ]

# Same counts via value_counts, which orders them most frequent first.
# NOTE(review): with pandas defaults, rows whose group is missing (a
# transition not in the group tables) are left out of both tables -- confirm
# that is intended.
pairing_frequency = comparison_df[group_columns].value_counts().reset_index(name='count')

# Convert back to readable labels
pairing_frequency['yn_groups'] = pairing_frequency['yn_groups'].apply(convert_yn_group_back)
pairing_frequency['mf_groups'] = pairing_frequency['mf_groups'].apply(convert_mf_group_back)

# Rich table rendering, consistent with the display() calls in the earlier cells
display(pairing_frequency)

: 