# In [None]:
import json
# Input locations: the private test questions and each model's saved results.
private_test_path = 'dataset/alqac25_private_test_task2.json'
vistral_answers_path = 'dataset/task2/vistral_answers.json'
qwen_tf_answers_path = 'dataset/task2/qwen_true_false_results.json'
qwen_mc_answers_path = 'dataset/task2/qwen_multiple_choice_results.json'

# Parse every input file as UTF-8 JSON.
with open(private_test_path, encoding='utf-8') as f:
    private_test = json.loads(f.read())

with open(vistral_answers_path, encoding='utf-8') as f:
    vistral_answers = json.loads(f.read())

with open(qwen_tf_answers_path, encoding='utf-8') as f:
    qwen_tf_answers = json.loads(f.read())

with open(qwen_mc_answers_path, encoding='utf-8') as f:
    qwen_mc_answers = json.loads(f.read())

# Index each result list by question id so records can be looked up directly.
# If an id occurs more than once, the last record wins.
vistral_mapping = {entry['question_id']: entry for entry in vistral_answers}
tf_mapping = {entry['question_id']: entry for entry in qwen_tf_answers}
mc_mapping = {entry['question_id']: entry for entry in qwen_mc_answers}
    
def calculate_confidence(answer, answer_type):
    """Return the score margin used as confidence in a Qwen prediction.

    Args:
        answer: Result record to score. For ``"tf"`` it must provide
            ``answer['scores']`` with ``'Đúng'`` and ``'Sai'`` entries; for
            ``"mc"`` it must provide ``answer['choice_scores']``, a mapping
            from each choice to its score.
        answer_type: ``"tf"`` for true/false records or ``"mc"`` for
            multiple-choice records.

    Returns:
        For ``"tf"``, the absolute difference between the two scores.
        For ``"mc"``, the gap between the highest and second-highest choice
        scores. ``None`` when no margin can be computed, i.e. fewer than two
        choice scores are present or ``answer_type`` is not recognised.

    Raises:
        KeyError: If the record lacks the score fields required for its type.
    """
    if answer_type == "tf":
        scores = answer['scores']
        return abs(scores['Đúng'] - scores['Sai'])

    if answer_type == "mc":
        ranked = sorted(answer['choice_scores'].values(), reverse=True)
        # A margin needs a runner-up; with 0 or 1 scores there is none, so
        # report "unknown" instead of failing with an IndexError.
        if len(ranked) < 2:
            return None
        best, runner_up = ranked[:2]
        return best - runner_up

    # Unrecognised answer type: no confidence can be derived.
    return None

def merge_answers(tf_threshold=0.5, mc_threshold=0.2):
    """Combine Vistral and Qwen predictions into one answer per question.

    Walks the module-level ``private_test`` list and picks an answer for each
    question:

    * ``"Tự luận"`` (essay): always Vistral's answer.
    * ``"Đúng/Sai"`` (true/false) and ``"Trắc nghiệm"`` (multiple choice):
      the shared answer when both models agree; otherwise Qwen's answer when
      its confidence margin reaches the threshold for that type, else
      Vistral's answer.
    * Any other question type: Vistral's answer, so that no question is
      emitted with a ``None`` answer or left out of the statistics.

    Args:
        tf_threshold: Minimum confidence margin for preferring Qwen on a
            true/false question when the models disagree.
        mc_threshold: Minimum confidence margin for preferring Qwen on a
            multiple-choice question when the models disagree.

    Returns:
        A tuple ``(merged_answers, stats)``. ``merged_answers`` is a list of
        ``{"question_id": ..., "answer": ...}`` dicts in ``private_test``
        order. ``stats`` counts total questions, agreements (``"same"``) and
        how often each model's answer was chosen.

    Raises:
        KeyError: If a question id is missing from ``vistral_mapping`` or
            from the Qwen mapping that matches its question type.
    """
    # Question type -> (Qwen results by id, confidence kind, threshold).
    qwen_sources = {
        "Đúng/Sai": (tf_mapping, "tf", tf_threshold),
        "Trắc nghiệm": (mc_mapping, "mc", mc_threshold),
    }

    merged_answers = []
    stats = {
        "same": 0,
        "chosen_from_vistral": 0,
        "chosen_from_qwen": 0,
        "total_questions": 0
    }

    for item in private_test:
        question_id = item['question_id']
        question_type = item['question_type']
        stats["total_questions"] += 1

        vistral_answer = vistral_mapping[question_id]

        if question_type == "Tự luận":
            # Essay questions have no Qwen prediction to compare against.
            final_answer = vistral_answer['answer']
            stats["chosen_from_vistral"] += 1
        else:
            confidence = 0.0
            source = qwen_sources.get(question_type)
            if source is None:
                # Unknown question type: there is no matching Qwen result
                # set, so fall back to Vistral rather than emit None.
                final_answer = vistral_answer['answer']
                stats["chosen_from_vistral"] += 1
            else:
                qwen_mapping, confidence_kind, threshold = source
                qwen_answer = qwen_mapping[question_id]
                if vistral_answer['answer'] == qwen_answer['predicted_answer']:
                    final_answer = vistral_answer['answer']
                    stats["same"] += 1
                else:
                    confidence = calculate_confidence(qwen_answer, confidence_kind)
                    # A None confidence means no margin could be computed;
                    # treat it as "not confident" for both question types.
                    if confidence is not None and confidence >= threshold:
                        final_answer = qwen_answer['predicted_answer']
                        stats["chosen_from_qwen"] += 1
                    else:
                        final_answer = vistral_answer['answer']
                        stats["chosen_from_vistral"] += 1
            # Trace every non-essay decision for manual inspection.
            print(f"{question_id=}, {question_type=}, {final_answer=}, {confidence=}")

        merged_answers.append({
            "question_id": question_id,
            "answer": final_answer
        })

    return merged_answers, stats

# Run the merge with a 0.5 confidence threshold for both Qwen question types,
# then write the merged answers to entry03.json as indented UTF-8 JSON.
merged_answers, stats = merge_answers(mc_threshold=0.5, tf_threshold=0.5)
with open('dataset/task2/entry03.json', mode='w', encoding='utf-8') as f:
    f.write(json.dumps(merged_answers, ensure_ascii=False, indent=4))