### MAP 점수 비교

In [1]:
import json

# 두 파일에서 topk 항목을 불러옴
def load_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path to a UTF-8 text file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a stray empty line at the end of the file) carry no
        # record; skipping them avoids a JSONDecodeError from json.loads.
        return [json.loads(line) for line in f if line.strip()]

# Average Precision (AP) 계산
def calculate_ap(true_topk, predicted_topk):
    """Compute the average precision of a ranked prediction list.

    For every rank at which the prediction is one of the reference items, the
    precision at that rank (hits so far / rank) is accumulated; the sum is then
    divided by the number of reference items.

    Args:
        true_topk: Reference items.
        predicted_topk: Predicted items in rank order (best first).

    Returns:
        The average precision as a float; 0.0 when nothing matches (which
        also covers an empty ``true_topk`` or ``predicted_topk``).
    """
    score = 0.0
    hits = 0
    for rank, pred in enumerate(predicted_topk, start=1):
        # A prediction repeated at a later rank must not count as a new hit,
        # otherwise the score can exceed 1.0. Membership is tested on the
        # list prefix so items do not need to be hashable.
        if pred in true_topk and pred not in predicted_topk[:rank - 1]:
            hits += 1
            score += hits / rank
    if hits == 0:
        return 0.0
    return score / len(true_topk)

def calculate_map(ground_truth_file, prediction_file, prediction_key_name="topk", max_k=3):
    """Compare two JSONL result files and print MAP at k = 1..max_k.

    Records of the two files are paired by their 'eval_id'. For each pair and
    each k, the reference 'topk' list and the prediction list are both cut to
    their first k entries and scored with ``calculate_ap``. Reference records
    without a matching prediction record are skipped.

    Args:
        ground_truth_file: Path of the JSONL file used as the reference.
        prediction_file: Path of the JSONL file being scored.
        prediction_key_name: Key of the ranked list inside prediction records.
        max_k: Largest cutoff to evaluate.

    Returns:
        A dict mapping each k to the list of per-record AP scores. The
        averaged summary line is printed, not returned.

    Raises:
        KeyError: If a record in either file has no 'eval_id'.
    """
    ground_truth_data = load_jsonl(ground_truth_file)
    prediction_data = load_jsonl(prediction_file)

    # Index predictions once instead of scanning the whole list per reference
    # record. setdefault keeps the first record of a duplicated eval_id, which
    # is the one a front-to-back linear scan would have found.
    predictions_by_id = {}
    for item in prediction_data:
        predictions_by_id.setdefault(item['eval_id'], item)

    map_scores_by_k = {k: [] for k in range(1, max_k + 1)}

    for gt_item in ground_truth_data:
        pred_item = predictions_by_id.get(gt_item['eval_id'])
        if pred_item is None:
            continue

        # Both lookups are independent of k; a missing or null list is
        # treated as empty.
        gt_topk = gt_item.get('topk') or []
        pred_all = pred_item.get(prediction_key_name) or []

        for k in range(1, max_k + 1):
            pred_topk = pred_all[:k]
            if not gt_topk:
                # Empty reference: an empty prediction is a perfect match,
                # any prediction at all is a complete miss.
                map_scores_by_k[k].append(0.0 if pred_topk else 1.0)
            else:
                map_scores_by_k[k].append(calculate_ap(gt_topk[:k], pred_topk))

    # Average the per-record scores for each k.
    map_scores = []
    for k in range(1, max_k + 1):
        scores = map_scores_by_k[k]
        map_score = sum(scores) / len(scores) if scores else 0.0
        map_scores.append(f"top-{k}: {map_score:.4f}")

    result = " | ".join(map_scores)
    print(f"{prediction_file} " + result + f"({prediction_key_name})")

    return map_scores_by_k

##### 특정 파일 비교

In [4]:
# Submission used as the reference; calculate_map parses it line by line as
# JSON even though the file name ends in .csv.
ground_truth_file = './submissions/0.9273-0.9303.csv'
print(f"ground_truth_file : {ground_truth_file}")

# calculate_map prints the averaged summary line and returns the per-k lists
# of AP scores, which are kept here in map_score.
map_score = calculate_map(ground_truth_file, './submissions/0.8689-0.8712.csv')

ground_truth_file : ./submissions/0.9273-0.9303.csv
./submissions/0.8689-0.8712.csv top-1: 0.7727 | top-2: 0.6852 | top-3: 0.6159(topk)


##### 폴더 내 모든 파일 비교

In [3]:
import os

def process_files_in_folder(folder_path):
    """Score every ``.csv`` file in a folder against the folder's reference file.

    File names are sorted in descending order and the first one is used as
    the reference (presumably the highest-scoring submission, given
    score-prefixed file names -- confirm the naming scheme). Every file,
    including the reference itself, is then passed to ``calculate_map``.

    Args:
        folder_path: Directory containing the files; a trailing slash is
            optional.

    Returns:
        A dict mapping each file name to the value returned by
        ``calculate_map``; empty when the folder has no ``.csv`` files.
    """
    prediction_files = sorted(
        (name for name in os.listdir(folder_path) if name.endswith(".csv")),
        reverse=True,
    )

    # Nothing to compare: avoid indexing into an empty list below.
    if not prediction_files:
        return {}

    # os.path.join works with or without a trailing separator on folder_path.
    gt = os.path.join(folder_path, prediction_files[0])

    map_results = {}
    for filename in prediction_files:
        file_path = os.path.join(folder_path, filename)
        map_results[filename] = calculate_map(gt, file_path)

    return map_results

# Folder holding the files to compare
folder_path = './submissions/'

# Run calculate_map for every file in the folder
process_files_in_folder(folder_path)

./submissions/0.9273-0.9303.csv top-1: 1.0000 | top-2: 1.0000 | top-3: 1.0000(topk)
./submissions/0.9227-0.9258.csv top-1: 0.9273 | top-2: 0.8932 | top-3: 0.8505(topk)
./submissions/0.9167-0.9182.csv top-1: 0.8818 | top-2: 0.7932 | top-3: 0.7706(topk)
./submissions/0.9144-0.9182.csv top-1: 0.8636 | top-2: 0.8011 | top-3: 0.7790(topk)
./submissions/0.9091-0.9136.csv top-1: 0.9273 | top-2: 0.8636 | top-3: 0.8285(topk)
./submissions/0.9068-0.9121.csv top-1: 0.8818 | top-2: 0.7841 | top-3: 0.7770(topk)
./submissions/0.9038_0.9091.csv top-1: 0.8818 | top-2: 0.7841 | top-3: 0.7755(topk)
./submissions/0.9038-0.9076.csv top-1: 0.8818 | top-2: 0.7830 | top-3: 0.7696(topk)
./submissions/0.9023-0.9091.csv top-1: 0.8864 | top-2: 0.7898 | top-3: 0.7755(topk)
./submissions/0.9023-0.9061.csv top-1: 0.8818 | top-2: 0.7807 | top-3: 0.7672(topk)
./submissions/0.9008-0.9045.csv top-1: 0.8818 | top-2: 0.7795 | top-3: 0.7672(topk)
./submissions/0.9000-0.9091.csv top-1: 0.8727 | top-2: 0.8614 | top-3: 0.827

{'0.9273-0.9303.csv': {1: [1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,


### eval.jsonl에 다른 파일의 standalone_query 붙이기

In [1]:
import json

# File paths
a_file = './data/eval.jsonl'  # first file: records with an eval_id field
b_file = "./output/1022-0000_topk3.csv"  # second file: records with a query field (parsed below as JSON Lines)
output_file = './data/eval_gemma_phrase.jsonl'  # file the merged result is written to

# Build an eval_id -> standalone_query mapping from b_file
b_mapping = {}
with open(b_file, 'r', encoding='utf-8') as b_infile:
    for line in b_infile:
        b_doc = json.loads(line)
        # eval_id is the key, standalone_query the value
        # (a later record with the same eval_id overwrites an earlier one)
        b_mapping[b_doc['eval_id']] = b_doc['standalone_query']

# Walk a_file and attach the standalone_query that matches each eval_id
with open(a_file, 'r', encoding='utf-8') as a_infile, open(output_file, 'w', encoding='utf-8') as outfile:
    for line in a_infile:
        a_doc = json.loads(line)
        eval_id = a_doc['eval_id']
        
        # Look up the standalone_query for this eval_id in the b_file mapping
        standalone_query = b_mapping.get(eval_id, None)  # None when the eval_id is absent
        
        # Build the output record; only these three fields are carried over
        new_doc = {
            'eval_id': eval_id,
            'standalone_query': standalone_query,
            'msg': a_doc['msg']  # keep the original msg field
        }
        
        # Write as one JSON document per line (JSONL)
        outfile.write(json.dumps(new_doc, ensure_ascii=False) + '\n')

### 다른 파일의 topk 붙이기

In [17]:
import json

# File paths
a_file = './data/valid_230_rerank_query.jsonl'  # first file: records with an eval_id field
b_file = './output_valid/submission_validset_LJM.csv'  # second file: records with a topk field (parsed below as JSON Lines)
output_file = './data/valid_230_rerank_with_topk.jsonl'  # file the merged result is written to

# Build an eval_id -> topk mapping from b_file
b_mapping = {}
with open(b_file, 'r', encoding='utf-8') as b_infile:
    for line in b_infile:
        b_doc = json.loads(line)
        # eval_id is the key, topk the value
        # (a later record with the same eval_id overwrites an earlier one)
        b_mapping[b_doc['eval_id']] = b_doc['topk']

# Walk a_file and attach the topk that matches each eval_id
with open(a_file, 'r', encoding='utf-8') as a_infile, open(output_file, 'w', encoding='utf-8') as outfile:
    for line in a_infile:
        a_doc = json.loads(line)
        eval_id = a_doc['eval_id']
        
        # Look up the topk for this eval_id in the b_file mapping
        topk = b_mapping.get(eval_id, None)  # None when the eval_id is absent
        
        # Build the output record; only these four fields are carried over
        new_doc = {
            'eval_id': eval_id,
            'standalone_query': a_doc['standalone_query'],
            'topk': topk,  # attached topk field
            'msg': a_doc['msg']  # keep the original msg field
        }
        
        # Write as one JSON document per line (JSONL)
        outfile.write(json.dumps(new_doc, ensure_ascii=False) + '\n')

### 하드보팅
1. submission 파일 생성
2. 1번째 topk가 같은 아이템 목록 파일 생성
3. 1번째 topk가 다른 아이템 목록 파일 생성

In [None]:
import json
import os
from collections import defaultdict, Counter
from typing import List, Union

from datetime import datetime
import time
from zoneinfo import ZoneInfo

def _kst_timestamp() -> str:
    """Return the current Korea time formatted as ``MMDD-HHMM`` for file names."""
    # Function-scope imports: the file-level imports only provide
    # ``datetime`` and ``ZoneInfo``.
    from datetime import timedelta, timezone
    from zoneinfo import ZoneInfoNotFoundError

    try:
        tz = ZoneInfo("Asia/Seoul")
    except ZoneInfoNotFoundError:
        # No time zone database on this system; use a fixed UTC+9 offset
        # instead of failing before any work is done.
        tz = timezone(timedelta(hours=9))
    return datetime.now(tz=tz).strftime("%m%d-%H%M")


def _write_jsonl(records, file_name):
    """Write ``records`` to ``file_name`` as UTF-8 JSON Lines and report the count."""
    # ensure_ascii=False emits raw non-ASCII text, so the encoding must be
    # explicit rather than the platform default.
    with open(file_name, 'w', encoding='utf-8') as f:
        for record in records:
            json.dump(record, f, ensure_ascii=False)
            f.write('\n')
    print(f"Wrote {len(records)} items to {file_name}")


def process_jsonl_files(input_paths: Union[List[str], str], output_dir: str = "./voting"):
    """Hard-vote the 'topk' lists of several JSONL result files.

    Records are grouped by 'eval_id' across all input files. Only eval_ids
    that occur in more than one record are processed; for each of them:

    * it is listed as "same" when every record has a non-empty 'topk' whose
      first entry equals that of the first record, otherwise as "diff";
    * a voted 'topk' is built from the three entries that occur most often
      across all records' 'topk' lists (ties keep first-seen order).

    Three files prefixed with the current Korea-time stamp are written to
    ``output_dir``: ``*_same_1st_items.jsonl``, ``*_diff_1st_items.jsonl``
    and ``*_voted_items.csv`` (JSON Lines content despite the extension).

    NOTE(review): eval_ids found in only one record are left out of all three
    outputs -- confirm this is intended for the voted file.

    Args:
        input_paths: Either a list of file paths, or a single directory path
            whose ``.jsonl`` files are processed in sorted name order.
        output_dir: Directory for the output files; created when missing.

    Returns:
        None. Progress and a short preview are printed.

    Raises:
        ValueError: If ``input_paths`` is a string that is not a directory.
    """
    current_time = _kst_timestamp()

    if isinstance(input_paths, str):
        if not os.path.isdir(input_paths):
            raise ValueError("If a single path is provided, it must be a directory.")
        print(f"Processing directory: {input_paths}")
        # Sorted so that file order (which decides vote ties and the
        # representative 'topk') does not depend on os.listdir ordering.
        input_files = [
            os.path.join(input_paths, name)
            for name in sorted(os.listdir(input_paths))
            if name.endswith('.jsonl')
        ]
    else:
        input_files = list(input_paths)

    print(f"Found {len(input_files)} JSONL files to process.")

    if not input_files:
        print("No JSONL files found to process.")
        return

    eval_id_data = defaultdict(list)

    for file_path in input_files:
        print(f"Processing file: {file_path}")
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue  # blank lines carry no record
                try:
                    item = json.loads(line)
                    eval_id = item['eval_id']
                except json.JSONDecodeError:
                    print(f"Error decoding JSON in file {file_path}. Skipping this line.")
                    continue
                except KeyError:
                    print(f"Missing 'eval_id' in a line in file {file_path}. Skipping this line.")
                    continue

                eval_id_data[eval_id].append({
                    'file': os.path.basename(file_path),
                    # A missing or null 'topk' is treated as an empty list so
                    # the vote flattening below never iterates over None.
                    'topk': item.get('topk') or [],
                    'standalone_query': item.get('standalone_query', ''),
                })

    same_items = []
    diff_items = []
    voted_items = []

    for eval_id, items in eval_id_data.items():
        if len(items) <= 1:
            continue

        first_topk = items[0]['topk'][0] if items[0]['topk'] else None

        # Use the first non-empty standalone_query as the representative one.
        standalone_query = next(
            (entry['standalone_query'] for entry in items if entry['standalone_query']),
            '',
        )

        if all(entry['topk'] and entry['topk'][0] == first_topk for entry in items):
            same_items.append({
                'eval_id': eval_id,
                'representative_query': standalone_query,
                'topk': items[0]['topk'],
                'files': [entry['file'] for entry in items]
            })
        else:
            diff_items.append({
                'eval_id': eval_id,
                'representative_query': standalone_query,
                'diff': items,
            })

        # Hard voting for top 3: every occurrence in any list is one vote.
        all_topk = [doc_id for entry in items for doc_id in entry['topk']]
        vote_count = Counter(all_topk)
        top_3 = [doc_id for doc_id, _ in vote_count.most_common(3)]

        voted_items.append({
            'eval_id': eval_id,
            'representative_query': standalone_query,
            'topk': top_3,
            'original_items': items
        })

    os.makedirs(output_dir, exist_ok=True)

    _write_jsonl(same_items, os.path.join(output_dir, f"{current_time}_same_1st_items.jsonl"))
    _write_jsonl(diff_items, os.path.join(output_dir, f"{current_time}_diff_1st_items.jsonl"))
    _write_jsonl(voted_items, os.path.join(output_dir, f"{current_time}_voted_items.csv"))

    print(f"\nProcessing complete. {len(same_items)} items with same top-k value and {len(diff_items)} items with different top-k values.")
    print(f"{len(voted_items)} items processed for hard voting.")
    print("\nFirst few voted items:")
    for item in voted_items[:3]:  # Print only first 3 items for brevity
        print(json.dumps(item, indent=2, ensure_ascii=False))

input_files = ['./submissions/0.9038_0.9091.csv', './submissions/0.8962-0.8970.csv', './submissions/0.8689-0.8712.csv', './submissions/0.8485_0.8515.csv']
process_jsonl_files(input_files)

# foler_name = './output_high'
# process_jsonl_files(foler_name)