In [1]:
import pandas as pd
import json

In [2]:
# Read the combined partition outputs from disk into a DataFrame
combined_csv_path = './data/partitions_combined_output.csv'
output_file = pd.read_csv(combined_csv_path)

In [19]:
# Tally how often each distinct raw predicted-answer string occurs
output_file['predicted_answer'].value_counts()

predicted_answer
a      15902
b      15394
c       9547
d       3267
C        502
B        183
D        118
A         70
The       17
Name: count, dtype: int64

In [12]:
# Display the frame as loaded from the CSV (the rendered output elides the middle rows)
output_file

Unnamed: 0,questionID,question_text,question_image,answer_choices,context_image,context_image_caption,text_context,label,predicted_answer,sorted_logits
0,DQ_000001,How many actions are depicted in the diagram?,question_images/erosion_6843.png,"a. 6, b. 4, c. 8, d. 7",teaching_images/erosion_6859.png,The diagram represents the coastal Erosion of ...,Most fossils are preserved by one of five proc...,d,b,"[{'letter': 'b', 'logit': 26.125, 'probability..."
1,DQ_000001,How many actions are depicted in the diagram?,question_images/erosion_6843.png,"a. 6, b. 4, c. 8, d. 7",textbook_images/erosion_and_deposition_by_flow...,FIGURE 10.5 How a Waterfall Forms and Moves. W...,Most fossils are preserved by one of five proc...,d,d,"[{'letter': 'd', 'logit': 26.25, 'probability'..."
2,DQ_000001,How many actions are depicted in the diagram?,question_images/erosion_6843.png,"a. 6, b. 4, c. 8, d. 7",teaching_images/cycle_rock_6723.png,The diagram shows types of rocks and rock form...,Most fossils are preserved by one of five proc...,d,a,"[{'letter': 'a', 'logit': 27.25, 'probability'..."
3,DQ_000001,How many actions are depicted in the diagram?,question_images/erosion_6843.png,"a. 6, b. 4, c. 8, d. 7",teaching_images/erosion_6859.png,The diagram represents the coastal Erosion of ...,1. What is the traditional definition of gravi...,d,d,"[{'letter': 'd', 'logit': 24.0, 'probability':..."
4,DQ_000001,How many actions are depicted in the diagram?,question_images/erosion_6843.png,"a. 6, b. 4, c. 8, d. 7",textbook_images/erosion_and_deposition_by_flow...,FIGURE 10.5 How a Waterfall Forms and Moves. W...,1. What is the traditional definition of gravi...,d,a,"[{'letter': 'a', 'logit': 21.125, 'probability..."
...,...,...,...,...,...,...,...,...,...,...
44995,DQ_010654,Which label represents the smoke?,abc_question_images/nuclear_energy_17093.png,"a. L, b. K, c. W, d. J",textbook_images/nuclear_fission_22987.png,FIGURE 1.4,The two types of air pollutants are primary po...,d,a,"[{'letter': 'a', 'logit': 26.375, 'probability..."
44996,DQ_010654,Which label represents the smoke?,abc_question_images/nuclear_energy_17093.png,"a. L, b. K, c. W, d. J",textbook_images/nuclear_energy_22236.png,FIGURE 11.14 This diagram shows the main parts...,The two types of air pollutants are primary po...,d,a,"[{'letter': 'a', 'logit': 26.375, 'probability..."
44997,DQ_010654,Which label represents the smoke?,abc_question_images/nuclear_energy_17093.png,"a. L, b. K, c. W, d. J",teaching_images/nuclear_energy_7093.png,This Diagram shows how a Nuclear plant Work. H...,"Cigarette smoking can cause serious diseases, ...",d,a,"[{'letter': 'a', 'logit': 25.25, 'probability'..."
44998,DQ_010654,Which label represents the smoke?,abc_question_images/nuclear_energy_17093.png,"a. L, b. K, c. W, d. J",textbook_images/nuclear_fission_22987.png,FIGURE 1.4,"Cigarette smoking can cause serious diseases, ...",d,a,"[{'letter': 'a', 'logit': 25.5, 'probability':..."


## Data Cleaning

In [26]:
import ast
import numpy as np

# First, let's standardize the answer choices
valid_choices = ['a', 'b', 'c', 'd']

# Function to convert string representation of logits to numpy array
def extract_logits(logits_str):
    '''
    Parse a stringified list of per-letter logit records into a fixed-order array.

    Parameters
    ----------
    logits_str : str
        Python-literal text for a list of dicts, each carrying at least a
        'letter' and a 'logit' key.

    Returns
    -------
    numpy.ndarray
        Array of shape (4,) holding the logits for ['a', 'b', 'c', 'd'] in that
        order. Letters are matched case-insensitively; letters outside
        `valid_choices` are ignored and a choice with no record gets 0.0.
        If the input cannot be parsed or a record is malformed, an all-zero
        array is returned (best-effort behaviour).
    '''
    try:
        # Convert string to list of dicts (literal_eval only accepts literals, never runs code)
        logits_list = ast.literal_eval(logits_str)

        logits_dict = {}
        for item in logits_list:
            letter = item['letter'].lower()
            if letter not in valid_choices:
                continue
            logit = item['logit']
            # 'C' and 'c' collapse onto the same choice. Keep the largest logit so the
            # result does not depend on record order and stays consistent with a
            # prediction that was taken from the top-scoring record.
            if letter not in logits_dict or logit > logits_dict[letter]:
                logits_dict[letter] = logit

        # Create array in consistent order
        return np.array([logits_dict.get(choice, 0.0) for choice in valid_choices])
    except Exception:
        # Deliberately broad for malformed rows, but unlike a bare `except:` this lets
        # KeyboardInterrupt / SystemExit propagate so a long .apply() can be stopped.
        return np.zeros(len(valid_choices))

# Process the dataframe
def process_dataset(df):
    '''
    Return a cleaned version of `df` without modifying `df` itself.

    Steps:
      1. Lowercase `predicted_answer`.
      2. Keep only rows whose lowercased prediction is in `valid_choices`
         (missing predictions are dropped as well).
      3. Add a `logits` column by applying `extract_logits` to `sorted_logits`.
      4. Drop the original `sorted_logits` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `predicted_answer` and `sorted_logits` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the surviving rows (original index labels retained),
        `predicted_answer` lowercased, `logits` appended and `sorted_logits` removed.
    '''
    # Normalise case on the input column; `df` is only read, never written to
    normalized = df['predicted_answer'].str.lower()
    keep = normalized.isin(valid_choices)

    # Take an explicit copy of the selected rows. Assigning columns onto a
    # boolean-filtered frame directly is chained assignment and can raise
    # SettingWithCopyWarning.
    processed_df = df.loc[keep].copy()
    processed_df['predicted_answer'] = normalized.loc[keep]

    # Convert logits to numpy arrays
    processed_df['logits'] = processed_df['sorted_logits'].apply(extract_logits)

    # Drop the original sorted_logits column
    return processed_df.drop(columns='sorted_logits')

# Run the cleaning pipeline over the raw model outputs
cleaned_output = process_dataset(df=output_file)

# Preview the first five cleaned rows
cleaned_output.head(n=5)

Unnamed: 0,questionID,question_text,question_image,answer_choices,context_image,context_image_caption,text_context,label,predicted_answer,logits
0,DQ_000001,How many actions are depicted in the diagram?,question_images/erosion_6843.png,"a. 6, b. 4, c. 8, d. 7",teaching_images/erosion_6859.png,The diagram represents the coastal Erosion of ...,Most fossils are preserved by one of five proc...,d,b,"[25.75, 26.125, 21.125, 26.0]"
1,DQ_000001,How many actions are depicted in the diagram?,question_images/erosion_6843.png,"a. 6, b. 4, c. 8, d. 7",textbook_images/erosion_and_deposition_by_flow...,FIGURE 10.5 How a Waterfall Forms and Moves. W...,Most fossils are preserved by one of five proc...,d,d,"[26.0, 23.875, 21.0, 26.25]"
2,DQ_000001,How many actions are depicted in the diagram?,question_images/erosion_6843.png,"a. 6, b. 4, c. 8, d. 7",teaching_images/cycle_rock_6723.png,The diagram shows types of rocks and rock form...,Most fossils are preserved by one of five proc...,d,a,"[27.25, 25.0, 22.125, 26.0]"
3,DQ_000001,How many actions are depicted in the diagram?,question_images/erosion_6843.png,"a. 6, b. 4, c. 8, d. 7",teaching_images/erosion_6859.png,The diagram represents the coastal Erosion of ...,1. What is the traditional definition of gravi...,d,d,"[21.875, 23.125, 19.625, 24.0]"
4,DQ_000001,How many actions are depicted in the diagram?,question_images/erosion_6843.png,"a. 6, b. 4, c. 8, d. 7",textbook_images/erosion_and_deposition_by_flow...,FIGURE 10.5 How a Waterfall Forms and Moves. W...,1. What is the traditional definition of gravi...,d,a,"[21.125, 18.25, 16.375, 19.125]"


In [27]:
# Sanity-check the cleaned frame: remaining answer labels and one parsed logits vector
prediction_counts = cleaned_output['predicted_answer'].value_counts()
first_logits = cleaned_output['logits'].iloc[0]

print("Unique predictions after cleaning:")
print(prediction_counts)
print("\nSample logits array:")
print(first_logits)

Unique predictions after cleaning:
predicted_answer
a    15972
b    15577
c    10049
d     3385
Name: count, dtype: int64

Sample logits array:
[25.75  26.125 21.125 26.   ]


## Upload Dataset to Hugging Face Hub

In [34]:
from datasets import Dataset

In [33]:
# Log in to the Hugging Face Hub from the notebook before pushing the dataset.
# NOTE(review): the recorded output (a username plus an orgs line) looks like `whoami`
# output rather than a login prompt; confirm which command produced it.
!huggingface-cli login
# !huggingface-cli whoami

Sakonii
[1morgs: [0m talentless-us,discord-community


In [35]:
# Build the Hub dataset from the cleaned frame and push it to a private repo.
# Row filtering in process_dataset leaves a non-contiguous pandas index, which
# from_pandas would otherwise store as an extra `__index_level_0__` column;
# preserve_index=False keeps only the real data columns.
dataset = Dataset.from_pandas(cleaned_output, preserve_index=False)
dataset.push_to_hub("Sakonii/multimodal-logits-dataset", private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/45 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Sakonii/multimodal-logits-dataset/commit/04e78829a9dc352d2e401bebd2d2d68e28d482da', commit_message='Upload dataset', commit_description='', oid='04e78829a9dc352d2e401bebd2d2d68e28d482da', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Sakonii/multimodal-logits-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Sakonii/multimodal-logits-dataset'), pr_revision=None, pr_num=None)