In [1]:
import pandas as pd
import json

In [2]:
# Read the question data from tqa_v2_test.json (resolved against the current working directory).
# The encoding is set explicitly so decoding does not depend on the platform's locale default.
with open("tqa_v2_test.json", "r", encoding="utf-8") as json_file:
    json_data = json.load(json_file)

In [16]:
def _choice_text(answer_choices, option):
    """Return the processed text of one answer option, or None when the option is absent.

    Args:
        answer_choices: The "answerChoices" mapping of a question (option letter -> details).
        option: The option letter to look up, e.g. "a".

    Returns:
        The option's "processedText" value, or None if the question has no such option.
    """
    choice = answer_choices.get(option)
    return None if choice is None else choice["processedText"]


def _multiple_choice_record(lesson_name, question_details):
    """Build the row fields shared by diagram and non-diagram multiple-choice questions.

    Args:
        lesson_name: Name of the lesson the question belongs to.
        question_details: The question's entry from the json data.

    Returns:
        A dict with the lesson name, question text, four answer-choice slots
        (None for options the question does not define) and the text of the correct answer.

    Raises:
        KeyError: If the correct answer names an option that is not among the answer choices.
    """
    answer_choices = question_details["answerChoices"]
    correct_option = question_details["correctAnswer"]["processedText"]
    return {
        "lesson_name": lesson_name,
        "question_name": question_details["beingAsked"]["processedText"],
        "answer_choice_1": _choice_text(answer_choices, "a"),
        "answer_choice_2": _choice_text(answer_choices, "b"),
        "answer_choice_3": _choice_text(answer_choices, "c"),
        "answer_choice_4": _choice_text(answer_choices, "d"),
        # Kept strict on purpose: a correct answer pointing at a missing option is a data error.
        "correct_answer": answer_choices[correct_option]["processedText"],
    }


diagram_questions = []
non_diagram_mcq_questions = []
non_diagram_true_false_questions = []

# Extract all diagram and non-diagram questions from the json file
for lesson_data in json_data:
    lesson_name = lesson_data["lessonName"]
    diagram_question_data = lesson_data["questions"]["diagramQuestions"]
    non_diagram_question_data = lesson_data["questions"]["nonDiagramQuestions"]

    # Questions with an accompanying diagram.
    # The emptiness guard is kept from the original; presumably an empty section may not be
    # a dict (and so would lack .values()) -- TODO confirm against the dataset.
    if diagram_question_data:
        for question_details in diagram_question_data.values():
            image_path = question_details["imagePath"]
            record = _multiple_choice_record(lesson_name, question_details)
            record["image_path"] = "./" + image_path
            record["image_has_labels_to_guess"] = (
                "Yes" if image_path.startswith("abc_question_images") else "No"
            )
            diagram_questions.append(record)

    # Questions without an accompanying diagram
    if non_diagram_question_data:
        for question_details in non_diagram_question_data.values():
            question_sub_type = question_details["questionSubType"]

            # Regular MCQ questions; options missing from the data are stored as None
            if question_sub_type == "Multiple Choice":
                non_diagram_mcq_questions.append(
                    _multiple_choice_record(lesson_name, question_details)
                )
            # True/False questions
            elif question_sub_type == "True or False":
                # Some questions contain runs of underscore (_) characters; every underscore
                # is removed, not only the leading ones.
                question = question_details["beingAsked"]["processedText"].replace("_", "")

                # Option "a" is reported as "true" and anything else as "false";
                # assumes the dataset always lists "true" as option a -- TODO confirm.
                correct_option = question_details["correctAnswer"]["processedText"]
                correct_answer = "true" if correct_option == "a" else "false"

                non_diagram_true_false_questions.append({
                    "lesson_name": lesson_name,
                    "question_name": question,
                    "answer_choice_1": "true",
                    "answer_choice_2": "false",
                    "correct_answer": correct_answer
                })
            # Any other question sub-type is intentionally left out of the exported data.

In [17]:
from transformers import pipeline

# Prefix that replaces the leading "." of an image path once its caption has been generated
DATASET_IMAGE_ROOT = "../Dataset/test"

# Generate captions for all diagram questions
pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")

for question_data in diagram_questions:
    # Rows that already carry a caption have had their image_path rewritten below.
    # Skipping them keeps this cell safe to re-run: otherwise the rewritten path would be
    # passed to the model and the dataset prefix would be prepended a second time.
    if "caption" in question_data:
        continue

    image_path = question_data["image_path"]
    question_data["caption"] = pipe(image_path)[0]["generated_text"]
    # Swap the leading "." for the dataset prefix in the stored path
    question_data["image_path"] = DATASET_IMAGE_ROOT + image_path[1:]

In [18]:
# Persist each question collection to its own CSV file, in the order listed
for csv_name, question_records in (
    ("DiagramQuestionsData.csv", diagram_questions),
    ("NonDiagram_MCQ_QuestionsData.csv", non_diagram_mcq_questions),
    ("NonDiagram_True_False_QuestionsData.csv", non_diagram_true_false_questions),
):
    question_frame = pd.DataFrame(question_records)
    question_frame.to_csv(csv_name)