### multiple subdomain 결과 재정렬

In [None]:
# import json
# domain = '경제'
# with open(f'evaluation/eval_data/multiple_subdomain_results/{domain}_subdomain_classified.json', 'r', encoding='utf-8') as f:
    # data = json.load(f)

#  new_data = []
# for d in data:
#     if d['qna_subdomain_reason'] in d['qna_reason']:
#         d['qna_reason'] = d['qna_reason'].replace(d['qna_subdomain_reason'], '').strip()
#         # print(d)
#     new_d = {
#         'file_id': d['file_id'],
#         'title': d['title'],
#         'chapter': d['chapter'],
#         'tag': d['tag'],
#         'domain': d['domain'],
#         'subdomain': d['subdomain'],
#         'domain_reason': d['domain_reason'],
#         'subdomain_reason': d['subdomain_reason'],
#         'question': d['question'],
#         'options': d['options'],
#         'answer': d['qna_answer'],
#         'explanation': d['explanation']
#     }
#     new_data.append(new_d)
    
# with open(f'evaluation/eval_data/subdomain_results_old/{domain}_subdomain_classified.json', 'w', encoding='utf-8') as f:
#     json.dump(new_data, f, ensure_ascii=False, indent=4)

## 모의고사 만들기 파이프라인

### mock 데이터에서 모의고사 문제 추출

In [3]:
import os, json, random
import logging
from datetime import datetime

# Root locations of the evaluation data used by this cell.
BASE_DIR = '/Users/jinym/Desktop/Desktop_AICenter✨/SFAIcenter/evaluation/eval_data'
PROCESSED_DIR = os.path.join(BASE_DIR, '2_subdomain')
EXAM_DIR = os.path.join(BASE_DIR, '4_multiple_exam')

# 1-based set number -> name of the output sub-directory for that set.
set_names = {
    1: '1st',
    2: '2nd',
    3: '3rd',
    4: '4th',
    5: '5th'
}

# Logging setup: the log file's parent directory is created on demand.
log_file = '/Users/jinym/Desktop/Desktop_AICenter✨/SFAIcenter/logs/mock_exam_extraction.log'
os.makedirs(os.path.dirname(log_file), exist_ok=True)

logger = logging.getLogger('mock_exam_extraction')
logger.setLevel(logging.INFO)

# Re-running the cell must not stack handlers (every record would be emitted
# several times). Handlers are closed as they are removed so the previously
# opened log file handle is released instead of leaking.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# File handler (appends to the log file).
file_handler = logging.FileHandler(log_file, encoding='utf-8', mode='a')
file_handler.setLevel(logging.INFO)

# Console handler.
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)

# Shared record format.
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
file_handler.setFormatter(formatter)
console_handler.setFormatter(formatter)

logger.addHandler(file_handler)
logger.addHandler(console_handler)

# Do not forward records to the parent logger (prevents duplicated output).
logger.propagate = False

# Load every question from multiple_subdomain_classified_ALL.json.
ALL_DATA_FILE = os.path.join(PROCESSED_DIR, 'multiple_subdomain_classified_ALL.json')

logger.info(f"데이터 파일 로딩 시작: {ALL_DATA_FILE}")
with open(ALL_DATA_FILE, 'r', encoding='utf-8') as data_fp:
    all_data = json.load(data_fp)
logger.info(f"데이터 로딩 완료: 총 {len(all_data)}개 문제")

# Exam statistics; read further below as stats[exam][domain] entries.
stats_path = os.path.join(BASE_DIR, 'exam_statistics.json')
with open(stats_path, 'r', encoding='utf-8') as stats_fp:
    stats = json.load(stats_fp)

# Questions already placed in an exam set, identified by (file_id, tag) tuples.
used_questions = set()

# Build the mock-exam sets for every exam subject listed in the statistics.
# (The original note names four subjects: 금융일반, 금융심화, 금융실무1, 금융실무2.)
# NOTE(review): questions are not filtered against `used_questions` while
# sampling, so if one domain were listed under two exam subjects the same
# question could be drawn twice -- confirm domains are disjoint across exams.
num_sets = len(set_names)

for exam_name, exam_stats in stats.items():
    logger.info(f"{'='*50}")
    logger.info(f"과목: {exam_name}")

    # One question list per output set.
    exam_data_sets = [[] for _ in range(num_sets)]
    total_exam_questions = 0

    for domain, domain_stats in exam_stats.items():
        logger.info(f"{'-'*50}")
        logger.info(f"도메인: {domain}")

        total_exam_questions += domain_stats['exam_questions']

        # Questions that belong to this domain.
        domain_data = [d for d in all_data if d['domain'] == domain]

        # Draw `needed_count` questions per set for every subdomain.
        for subdomain, needed_count in domain_stats['exam_subdomain_distribution'].items():
            subdomain_data = [d for d in domain_data if d['subdomain'] == subdomain]
            random.shuffle(subdomain_data)

            total_available = len(subdomain_data)
            total_needed = needed_count * num_sets

            # The list is already shuffled, so consecutive slices form disjoint
            # random samples. When there is not enough data the trailing slices
            # simply come out short or empty, which is what the former
            # ValueError fallback produced.
            samples = [
                subdomain_data[i * needed_count:(i + 1) * needed_count]
                for i in range(num_sets)
            ]

            if total_available >= total_needed:
                logger.info(f"  - {subdomain}: {needed_count} x {num_sets}세트 (총 {total_available}개 중 {total_needed}개 사용)")
            else:
                logger.warning(f"  - (ERROR) {subdomain}: {total_available}/{total_needed} (데이터 부족: {total_needed - total_available}개 필요)")

            # Add each sample to its set and remember which questions were used.
            for set_index, sample in enumerate(samples):
                exam_data_sets[set_index].extend(sample)
                for item in sample:
                    used_questions.add((item.get('file_id', ''), item.get('tag', '')))

    # Write one JSON file per set for this exam subject.
    for set_index, set_questions in enumerate(exam_data_sets):
        set_name = set_names[set_index + 1]
        percentage_total = (len(set_questions) / total_exam_questions * 100) if total_exam_questions > 0 else 0
        logger.info(f"  ====> {set_name}세트: {len(set_questions)}/{total_exam_questions} ({percentage_total:.2f}%)")

        # Create the output directory for this set if needed.
        set_dir = os.path.join(EXAM_DIR, set_name)
        os.makedirs(set_dir, exist_ok=True)
        output_file = os.path.join(set_dir, f'{exam_name}_exam.json')

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(set_questions, f, ensure_ascii=False, indent=4)

        logger.info(f"  ====> 저장 완료: {output_file}")

# Keep every question whose (file_id, tag) identifier was never put into a set.
logger.info(f"\n{'='*50}")
logger.info("사용되지 않은 나머지 문제 필터링 시작...")
remaining_data = [
    item
    for item in all_data
    if (item.get('file_id', ''), item.get('tag', '')) not in used_questions
]

logger.info(f"사용되지 않은 나머지 문제: {len(remaining_data)}개")

# Store the leftover questions next to the classified source data.
os.makedirs(PROCESSED_DIR, exist_ok=True)
remaining_file = os.path.join(PROCESSED_DIR, 'multiple_remaining.json')
with open(remaining_file, 'w', encoding='utf-8') as out_fp:
    json.dump(remaining_data, out_fp, ensure_ascii=False, indent=4)

logger.info(f"나머지 문제 저장 완료: {remaining_file}")
logger.info(f"전체: {len(all_data)}개, 사용: {len(used_questions)}개, 남음: {len(remaining_data)}개")
logger.info("모든 작업 완료!")

2025-11-08 16:38:42 - INFO - 데이터 파일 로딩 시작: /Users/jinym/Desktop/Desktop_AICenter✨/SFAIcenter/evaluation/eval_data/2_subdomain/multiple_subdomain_classified_ALL.json
2025-11-08 16:38:42 - INFO - 데이터 로딩 완료: 총 54387개 문제
2025-11-08 16:38:42 - INFO - 과목: 금융일반
2025-11-08 16:38:42 - INFO - --------------------------------------------------
2025-11-08 16:38:42 - INFO - 도메인: 경제
2025-11-08 16:38:43 - INFO -   - 미시경제학: 32 x 5세트 (총 4086개 중 160개 사용)
2025-11-08 16:38:43 - INFO -   - 경제정책 및 시사경제: 30 x 5세트 (총 506개 중 150개 사용)
2025-11-08 16:38:43 - INFO -   - 거시경제학: 32 x 5세트 (총 3896개 중 160개 사용)
2025-11-08 16:38:43 - INFO -   - 국제경제학: 31 x 5세트 (총 2330개 중 155개 사용)
2025-11-08 16:38:43 - INFO - --------------------------------------------------
2025-11-08 16:38:43 - INFO - 도메인: 경영
2025-11-08 16:38:43 - INFO -   - 경영컨설팅 및 기술평가: 31 x 5세트 (총 1500개 중 155개 사용)
2025-11-08 16:38:43 - INFO -   - 경영학원론 및 조직관리: 31 x 5세트 (총 828개 중 155개 사용)
2025-11-08 16:38:43 - INFO -   - 재무관리 및 기업가치평가: 32 x 5세트 (총 1588개 중 160개 사용)
20

### multiple options 오류들
- find_multiple_choice_invalid_options.py

## short/essay인 객관식 조정하기

In [None]:
# import os, json

# with open('/Users/jinym/Desktop/Desktop_AICenter✨/SFAIcenter/evaluation/eval_data/2_subdomain/essay_subdomain_classified_ALL.json', 'r', encoding='utf-8') as f:
#     essay = json.load(f)

# with open('/Users/jinym/Desktop/Desktop_AICenter✨/SFAIcenter/evaluation/eval_data/2_subdomain/short_subdomain_classified_ALL.json', 'r', encoding='utf-8') as f:
#     short = json.load(f)

In [None]:
# to_multiple = []

# # for e in essay:
# #     if isinstance(e['options'], list):
# #         to_multiple.append(e)
# # print(len(to_multiple))

# new_short = []
# for s in short:
#     if isinstance(s['options'], list):
#         to_multiple.append(s)
#     else:
#         new_short.append(s)
# print(len(to_multiple), len(short), len(new_short))
# # to_multiple

In [None]:
# with open('/Users/jinym/Desktop/Desktop_AICenter✨/SFAIcenter/evaluation/eval_data/2_subdomain/short_subdomain_classified_ALL.json', 'w', encoding='utf-8') as f:
#     json.dump(new_short, f, ensure_ascii=False, indent=4)

# with open('/Users/jinym/Desktop/Desktop_AICenter✨/SFAIcenter/evaluation/eval_data/2_subdomain/multiple_shortans_classified_ALL.json', 'w', encoding='utf-8') as f:
#     json.dump(to_multiple, f, ensure_ascii=False, indent=4)

### 원본에 옮기기

In [None]:
import os, time
from tqdm import tqdm

# Directory tree that holds the original source JSON documents.
FINAL_DIR = '/Users/jinym/Desktop/Desktop_AICenter✨/SFAIcenter/data/FINAL'


def _find_source_file(root, file_id):
    """Return the path of the source document for *file_id* under *root*.

    A file named ``<file_id>_v2.json`` anywhere in the tree wins over
    ``<file_id>.json``. Returns ``None`` when neither exists. This replaces
    the former shell ``find`` call, which interpolated the id into a command
    string and yielded an unusable multi-line path on multiple matches.
    """
    v2_name = f'{file_id}_v2.json'
    base_name = f'{file_id}.json'
    fallback = None
    for dirpath, _dirnames, filenames in os.walk(root):
        if v2_name in filenames:
            return os.path.join(dirpath, v2_name)
        if fallback is None and base_name in filenames:
            fallback = os.path.join(dirpath, base_name)
    return fallback


file_list = []

for t in tqdm(to_multiple):
    file_id = t.get('file_id')
    file_list.append(file_id)
    tag = t.get('tag')
    # NOTE(review): purpose of this delay is not visible here; kept as-is.
    time.sleep(0.5)

    file_path = _find_source_file(FINAL_DIR, file_id)
    if file_path is None:
        raise FileNotFoundError(f"no source file found for file_id {file_id!r} under {FINAL_DIR}")

    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # The second '_'-separated part of the tag is compared with the page value.
    page = tag.split('_')[1]

    for d in data['contents']:
        if d.get('page') != page:
            continue
        # A page without 'add_info' has nothing to update.
        for info in d.get('add_info') or []:
            if info.get('tag') != tag:
                continue
            description = info['description']
            # Copy over only the fields that differ from the corrected item.
            for key in ('question', 'answer', 'options'):
                if description.get(key) != t.get(key):
                    description[key] = t.get(key)

    # Always write to the "_v2" file; only the trailing suffix is rewritten so
    # a '.json' occurring elsewhere in the path is left alone.
    if file_path.endswith('_v2.json'):
        output_path = file_path
    else:
        output_path = file_path[:-len('.json')] + '_v2.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

In [None]:
# sorted(list(set(file_list)))

## multiple 실패한거

In [None]:
# import os, json

# file_name = '/Users/jinym/Desktop/Desktop_AICenter✨/SFAIcenter/evaluation/eval_data/2_subdomain/multiple-re_subdomain_classified_ALL.json'


# with open(file_name, 'r', encoding='utf-8') as f:
#     multiple = json.load(f)

# file_failed_question = '/Users/jinym/Desktop/Desktop_AICenter✨/SFAIcenter/evaluation/eval_data/2_subdomain/multiple-fail_response.json'
# with open(file_failed_question, 'r', encoding='utf-8') as f:
#     multiple_fail = json.load(f)

In [None]:
# for q in multiple:
#     # print(q)
#     # if q['domain'] == '분류실패':
#     if True:
#         for m in multiple_fail:
#             if (m['file_id'] == q['file_id']) and (m['tag'] == q['tag']):
#                 q['domain'] = m['domain']
#                 q['subdomain'] = m['subdomain']
#                 q['classification_reason'] = m['classification_reason']
#                 q['is_calculation'] = m['is_calculation']
#                 # break
#     # break

In [None]:
# with open(file_name, 'w', encoding='utf-8') as f:
#     json.dump(multiple, f, ensure_ascii=False, indent=4)

In [None]:
# import json

# file_fail_response = '/Users/jinym/Desktop/Desktop_AICenter✨/SFAIcenter/evaluation/eval_data/2_subdomain/multiple-fail_fail_response.json'

# with open(file_fail_response, 'r', encoding='utf-8') as f:
#     multiple_fail_again = json.load(f)

# # response 파싱 다시 해보기
# for q in multiple:
#     # if q['domain'] == '':
#     if True:
#         for m in multiple_fail_again:
#             file_id = m['qna_id'].split('_')[0]
#             tag = m['qna_id'].replace(file_id+'_', '')
#             if (file_id == q['file_id']) and (tag == q['tag']):
#                 q['domain'] = m['domain']
#                 q['subdomain'] = m['subdomain']
#                 q['classification_reason'] = m['reason']
#                 q['is_calculation'] = m['is_calculation']
#                 # print(q)
#                 # break

In [None]:
# # 다시 돌릴거 리스트 만들기
# re_run_list = []
# for m in multiple:
#     if (m['domain'] == '') or (m['domain'] == '분류실패'):
#         re_run_list.append(m)

# with open('/Users/jinym/Desktop/Desktop_AICenter✨/SFAIcenter/evaluation/eval_data/2_subdomain/multiple-re_re_run.json', 'w', encoding='utf-8') as f:
#     json.dump(re_run_list, f, ensure_ascii=False, indent=4)

## 통계

In [1]:
import os, json
from tools.evaluation import qna_subdomain_classifier

# Combined file of subdomain-classified multiple-choice questions.
file_name = '/Users/jinym/Desktop/Desktop_AICenter✨/SFAIcenter/evaluation/eval_data/2_subdomain/multiple_subdomain_classified_ALL.json'
with open(file_name, 'r', encoding='utf-8') as src:
    multiple = json.load(src)

# Pass the loaded questions to the project classifier's save_statistics().
classifier = qna_subdomain_classifier.QnASubdomainClassifier()
classifier.save_statistics(multiple)

2025-11-08 16:31:52,384 - INFO - 출력 디렉토리: /Users/jinym/Desktop/Desktop_AICenter✨/SFAIcenter/evaluation/eval_data/multiple_with_subdomain
2025-11-08 16:31:52,397 - INFO - 통계 정보 저장 완료: /Users/jinym/Desktop/Desktop_AICenter✨/SFAIcenter/evaluation/eval_data/multiple_with_subdomain/classification_statistics.json


In [None]:
## 처음 subdomain 돌리고 잘못 부여된 subdomain 조정
# for m in multiple:
#     if m['subdomain'].count('-') >= 1:
#         # print(m)
#         # break
#         m['subdomain'] = m['subdomain'].split('-')[0].strip()
#         # print(m)
#         # break
#     elif m['subdomain'].count('.') >= 1:
#         m['subdomain'] = m['subdomain'].split('.')[1].strip()
        

# classifier.save_statistics(multiple)
#     # print("없는데?", m['subdomain'])

# with open(file_name, 'w', encoding='utf-8') as f:
#     json.dump(multiple, f, ensure_ascii=False, indent=4)


In [None]:
# with open(file_name, 'r', encoding='utf-8') as f:
#     multiple = json.load(f)

# for m in multiple:
#     m['domain'] = ""
#     m['subdomain'] = ""
#     m['classification_reason'] = ""
#     m['is_calculation'] = False

# with open('/Users/jinym/Desktop/Desktop_AICenter✨/SFAIcenter/evaluation/eval_data/2_subdomain/multiple.json', 'w', encoding='utf-8') as f:
#     json.dump(multiple, f, ensure_ascii=False, indent=4)


# 객관식 문제 변형

In [3]:
import re
import os, json

EXAM_DIR = '/Users/jinym/Desktop/Desktop_AICenter✨/SFAIcenter/evaluation/eval_data/4_multiple_exam'


def load_exam_set(exam_dir, set_name):
    """Return the questions of one exam set, concatenated over its JSON files.

    Args:
        exam_dir: Directory that contains one sub-directory per exam set.
        set_name: Name of the set sub-directory (e.g. ``'1st'``).

    Returns:
        A list with the contents of every ``*.json`` file in the set directory,
        read in sorted file-name order. An empty list when the set directory
        does not exist.
    """
    set_dir = os.path.join(exam_dir, set_name)
    if not os.path.isdir(set_dir):
        return []

    questions = []
    # Sorted so the concatenation order does not depend on os.listdir().
    for entry in sorted(os.listdir(set_dir)):
        # Skip hidden and non-JSON entries (e.g. '.DS_Store'), which would
        # otherwise make json.load fail.
        if entry.startswith('.') or not entry.endswith('.json'):
            continue
        with open(os.path.join(set_dir, entry), 'r', encoding='utf-8') as f:
            questions += json.load(f)
    return questions


first_exam = load_exam_set(EXAM_DIR, '1st')
second_exam = load_exam_set(EXAM_DIR, '2nd')
third_exam = load_exam_set(EXAM_DIR, '3rd')

In [None]:
len(first_exam), len(second_exam), len(third_exam)

In [None]:
multiple = first_exam + second_exam + third_exam

# Phrases whose mere presence marks a "select every correct statement" question.
_ABCD_PHRASES = (
    '보기',
    '옳은 것을 모두 고른 것은?',
    '옳은 것을 모두 고르면?',
    '옳은 것을 모두 고른 것은',
    '모두 고른 것은?',
    '모두 고르면?',
    '모두 고른 것은',
    '모두 묶인 것은?',
)

# Statement-label patterns; they only count together with '모두 고른' / '모두 묶인'.
# The second pattern requires the 가/나/다... label to follow a line start,
# newline or space and to be followed by '.' plus a non-whitespace character,
# so a sentence-final '다.' is not taken for a label.
_ABCD_LABEL_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'[ㄱㄴㄷㄹ][\.]',
        r'(?:^|\n| )[가나다라마바사아자차카타파하]\.(?!\s|$)',
        r'[㉠㉡㉢㉣㉤]',
        r'[ⓐⓑⓒⓓⓔ]',
    )
)

# Phrases that mark a "select the incorrect option" question.
_WRONG_PHRASES = (
    '않은', '못한', '없는', '거리가 먼', '아닌', '아니하는 것', '않는',
    "않게", '잘못된', '틀린', '다른', '무관한', '가장 먼', '어려운',
)


def _is_pick_abcd(question):
    """Return True when the text matches the 'select all correct statements' rules."""
    if any(phrase in question for phrase in _ABCD_PHRASES):
        return True
    if '옳은 것을 모두' in question and '?' in question:
        return True
    if '모두 고른' in question or '모두 묶인' in question:
        return any(pattern.search(question) for pattern in _ABCD_LABEL_PATTERNS)
    return False


def _is_pick_wrong(question):
    """Return True when the text contains any 'incorrect option' marker phrase."""
    return any(phrase in question for phrase in _WRONG_PHRASES)


pick_right = []
pick_wrong = []
pick_abcd = []

# Each question lands in exactly one bucket: the "select all" check runs first,
# then the "incorrect option" check; everything else counts as "select the
# correct option".
for m in multiple:
    question = m['question']
    if _is_pick_abcd(question):
        bucket = pick_abcd
    elif _is_pick_wrong(question):
        bucket = pick_wrong
    else:
        bucket = pick_right
    bucket.append(m)

# Report the bucket sizes and verify that they add up to the total.
print(f"전체: {len(multiple)}")
print(f"옳은 것은? (단일 선택): {len(pick_right)}")
print(f"옳지 않은 것은? (단일 선택): {len(pick_wrong)}")
print(f"옳은 것을 모두 고른 것은? (다중 선택): {len(pick_abcd)}")
print(f"갯수 일치 여부: {len(pick_right) + len(pick_wrong) + len(pick_abcd) == len(multiple)}")

## 옳지 않은 것

In [None]:
# System prompt (Korean) for "select the incorrect option" questions: it tells
# the model to turn each one into a "select the correct option" question by
# changing the answer option with minimal word edits, and shows the expected
# output as a JSON list of objects with the keys question_id, question,
# options, answer and explanation.
system_prompt = """
당신은 15년 경력의 문제 출제 경험이 뛰어난 전문가입니다.
다음 문제는 옳지 않은 것을 고르는 문제이고, 옳은 것을 고르는 문제로 변형해야 합니다.
주어진 답과 해설을 보고 답을 옳은 선택지로 변형하세요.
이때 변형은 정답인 선택지의 단어를 바꿔 최소화하는 것입니다. (ex. 높은 -> 낮은, 없다 -> 있다)

### 출력형식
[ {
"question_id": "문제번호",
"question": "문제",
"options": "선택지",
"answer": "답",
"explanation": "해설"
}, 
{
"question_id": "문제번호",
"question": "문제",
"options": "선택지",
"answer": "답",
"explanation": "해설"
}
]
"""

In [None]:
import random
# random.seed(42)
# Draw up to 20 "select the incorrect option" questions. The sample size is
# capped so a short pick_wrong list no longer makes random.sample raise.
sample_size = min(20, len(pick_wrong))

# Records that are later written to JSON.
to_json = [
    {
        "question_id": p['file_id'] + '_' + p['tag'],
        "question": p['question'],
        "options": p['options'],
        "answer": p['answer'],
        "explanation": p['explanation']
    }
    for p in random.sample(pick_wrong, sample_size)
]

# One text section per sampled question, each closed by a separator line;
# joined once instead of growing the string inside a loop.
user_prompt = ''.join(
    f"문제번호: {q['question_id']}\n"
    f"문제: {q['question']}\n"
    f"선택지: {q['options']}\n"
    f"답: {q['answer']}\n"
    f"해설: {q['explanation']}\n"
    "=====================\n"
    for q in to_json
)

In [None]:
# Write the sampled question records (to_json) to multiple_wrong.json.
with open('multiple_wrong.json', 'w', encoding='utf-8') as out_fp:
    json.dump(to_json, out_fp, ensure_ascii=False, indent=4)

In [None]:
from tools.QueryModels import query_openrouter
from tools.evaluation import qna_subdomain_classifier

classifier = qna_subdomain_classifier.QnASubdomainClassifier()

# Send the same system/user prompt pair to each model and store each parsed
# response in a file named after the model.
for model_path in ['anthropic/claude-sonnet-4.5', 'openai/o3', 'google/gemini-2.5-pro']:
    path_parts = model_path.split('/')
    model_name = path_parts[-1]
    company = path_parts[0]

    response = classifier.parse_api_response(
        query_openrouter(system_prompt, user_prompt, model_name=model_path)
    )

    output_name = f'multiple_wrong_{model_name}.json'
    with open(output_name, 'w', encoding='utf-8') as out_fp:
        json.dump(response, out_fp, ensure_ascii=False, indent=4)


### 이제 전부 옳은 것임! -> 전부 옳지 않은 것으로 바꾸기

In [1]:
# System prompt (Korean) for questions whose options are all correct: it tells
# the model to rewrite every option into an incorrect statement using the
# converse/inverse/contrapositive and minimal word edits, and shows the
# expected output as a JSON list of objects with the keys question_id,
# question, options, answer and explanation.
system_prompt = """
당신은 15년 경력의 문제 출제 경험이 뛰어난 전문가입니다.
다음 문제는 선택지가 모두 옳은 것으로, 모두 옳지 않은 선지로 변형해야 합니다.
주어진 선택지들과 해설을 보고, 문장의 역/이/대우를 활용하여 옳지 않은 선택지로 변형하세요.
이때 변형은 선택지의 단어를 바꿔 할루시네이션을 최소화해야 합니다. (ex. 높은 -> 낮은, 없다 -> 있다)

### 출력형식
[ {
"question_id": "문제번호",
"question": "문제",
"options": "선택지",
"answer": "답",
"explanation": "해설"
}, 
{
"question_id": "문제번호",
"question": "문제",
"options": "선택지",
"answer": "답",
"explanation": "해설"
}
]
"""

In [16]:
import json

from tools.QueryModels import query_openrouter
from tools.evaluation import qna_subdomain_classifier

classifier = qna_subdomain_classifier.QnASubdomainClassifier()

# For each model: read the questions it produced in the previous step, build one
# user prompt from them, query the model again and store the parsed response.
for model_path in ['google/gemini-2.5-pro']: # 'openai/gpt-5','anthropic/claude-sonnet-4.5', 'openai/o3', 
    model_name = model_path.split('/')[-1]

    # NOTE(review): assumes the stored file holds a list of dicts with the keys
    # used below -- confirm against what parse_api_response returns.
    with open(f'multiple_wrong_{model_name}.json', 'r', encoding='utf-8') as f:
        multiple_wrong = json.load(f)

    # One text section per question, each closed by a separator line. Built
    # from explicit lines so no source-code indentation leaks into the prompt
    # (the former triple-quoted template prefixed lines with four spaces).
    user_prompt = ''.join(
        f"문제번호: {p['question_id']}\n"
        f"문제: {p['question']}\n"
        f"선택지: {p['options']}\n"
        f"답: {p['answer']}\n"
        f"해설: {p['explanation']}\n"
        "=====================\n"
        for p in multiple_wrong
    )

    response = query_openrouter(system_prompt, user_prompt, model_name=model_path)
    response = classifier.parse_api_response(response)

    with open(f'multiple_wrong2right2wrong_{model_name}.json', 'w', encoding='utf-8') as f:
        json.dump(response, f, ensure_ascii=False, indent=4)

2025-11-06 18:12:51,410 - INFO - 출력 디렉토리: /Users/jinym/Desktop/Desktop_AICenter✨/SFAIcenter/evaluation/eval_data/multiple_with_subdomain


/Users/jinym/Desktop/Desktop_AICenter✨/SFAIcenter/tools
[DEBUG] Config file path: /Users/jinym/Desktop/Desktop_AICenter✨/SFAIcenter/llm_config.ini


2025-11-06 18:12:55,010 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


## 옳은 것

In [None]:
# Export the first ten "select the correct option" questions in the same
# record layout that is used for the prompts.
new = [
    {
        "question_id": p['file_id'] + '_' + p['tag'],
        "question": p['question'],
        "options": p['options'],
        "answer": p['answer'],
        "explanation": p['explanation']
    }
    for p in pick_right[:10]
]
with open('multiple_right.json', 'w', encoding='utf-8') as out_fp:
    json.dump(new, out_fp, ensure_ascii=False, indent=4)

In [6]:
# 도메인별 통계 계산
from collections import defaultdict

# Per-domain counters for the first exam set.
calculation = defaultdict(int)  # number of calculation questions
total_by_domain = defaultdict(int)  # number of all questions

for question in first_exam:
    # Questions without a 'domain' key are counted under '미분류'.
    domain = question.get('domain', '미분류')
    total_by_domain[domain] += 1
    if question.get('is_calculation') == True:
        calculation[domain] += 1

# Print the table.
print("=" * 60)
print("도메인별 통계")
print("=" * 60)
print(f"{'도메인':<20} {'전체':<10} {'계산문제':<10} {'비율':<10}")
print("-" * 60)

for domain in sorted(total_by_domain):
    total = total_by_domain[domain]
    calc_count = calculation[domain]
    ratio = (calc_count / total * 100) if total > 0 else 0
    print(f"{domain:<20} {total:<10} {calc_count:<10} {ratio:.2f}%")

# Totals row; guarded like the per-domain ratio so an empty first_exam prints
# 0.00% instead of raising ZeroDivisionError.
grand_total = sum(total_by_domain.values())
grand_calc = sum(calculation.values())
grand_ratio = (grand_calc / grand_total * 100) if grand_total > 0 else 0

print("-" * 60)
print(f"{'합계':<20} {grand_total:<10} {grand_calc:<10} {grand_ratio:.2f}%")
print("=" * 60)

도메인별 통계
도메인                  전체         계산문제       비율        
------------------------------------------------------------
경영                   125        15         12.00%
경제                   125        29         23.20%
내부통제                 63         1          1.59%
노무                   83         1          1.20%
리스크관리                62         14         22.58%
보상처리                 125        6          4.80%
보험계약                 125        6          4.80%
세무                   83         17         20.48%
영업                   62         0          0.00%
자산운용                 63         7          11.11%
회계                   84         36         42.86%
------------------------------------------------------------
합계                   1000       132        13.20%
