In [None]:
import csv
import json

def generate_json_from_csv(csv_file):
    """Convert an OCR question-answering CSV into SQuAD-style records.

    Each CSV row becomes one record holding a context (the EasyOCR and
    Tesseract texts joined by a single space) and exactly one question
    asking for the value of the row's entity.

    Args:
        csv_file: Path to a UTF-8 CSV file whose header row contains the
            columns ``easyocr_text``, ``tesseract_text``, ``entity_name``,
            ``entity_value``, ``ans_starting index`` and ``is_impossible``.
            ``ans_starting index`` holds ``_``-separated character offsets.

    Returns:
        A list of dicts shaped ``{"context": str, "qas": [qa]}`` where
        ``qa`` has the keys ``id`` (1-based row number zero-padded to five
        digits), ``is_impossible``, ``question`` and ``answers``. Each
        answer is ``{"text": entity_value, "answer_start": int}``;
        ``answers`` is empty when the row is flagged impossible.

    Raises:
        KeyError: If a required column is missing from the CSV header.
    """
    train_data = []

    with open(csv_file, newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)

        for idx, row in enumerate(reader, start=1):
            # Read every column up front so a missing column fails loudly
            # on the first row, regardless of that row's flag value.
            context = row['easyocr_text'] + ' ' + row['tesseract_text']
            question = f"What is the value of the {row['entity_name']}?"
            entity_value = row['entity_value']
            raw_starts = row['ans_starting index']
            # Strip before comparing so a padded flag such as "True " is
            # not silently treated as an answerable question.
            is_impossible = row['is_impossible'].strip().lower() == 'true'

            answers = []
            if not is_impossible:
                for token in raw_starts.split('_'):
                    token = token.strip()
                    # isdecimal() matches exactly what int() can parse;
                    # isdigit() also accepts characters like superscript
                    # digits, on which int() raises ValueError.
                    if token.isdecimal():
                        answers.append({
                            "text": entity_value,
                            "answer_start": int(token),
                        })

            qas = {
                "id": f"{idx:05}",
                "is_impossible": is_impossible,
                "question": question,
                "answers": answers,
            }

            train_data.append({
                "context": context,
                "qas": [qas],
            })

    return train_data

# Convert the training CSV into the in-memory record list.
csv_file = "./dataset/final_train.csv.csv"  # Replace with your CSV file path
json_data = generate_json_from_csv(csv_file)

# Persist the records as pretty-printed UTF-8 JSON; ensure_ascii=False keeps
# non-ASCII characters as-is instead of \u escapes.
with open("train.json", mode="w", encoding="utf-8") as json_file:
    json.dump(
        json_data,
        json_file,
        indent=4,
        ensure_ascii=False,
    )

print("JSON data created successfully!")
