## Selecting images and annotation.

Target sample sizes: 50 / 100 / 50

Classes: U (ascending) / D (descending) / P (passing)

In [10]:
import os
import random
import cv2
import pandas as pd
import shutil
import xlsxwriter
from tqdm import tqdm

# ================= CONFIGURATION =================
# Root of the dataset; the "images" and "labels" sub-folders are derived from it.
DATASET_DIR = "/media/holidayj/Documents/Data/Platform/Euljiro/Euljiro_inner_20201128_f1038_t1519/Euljiro_/all_for_FleissKappa"
IMAGE_DIR = os.path.join(DATASET_DIR, "images")  # source images
LABEL_DIR = os.path.join(DATASET_DIR, "labels")  # YOLO .txt label files
# Folder that receives the rendered survey images.
OUTPUT_DIR = "./survey_material"
# Workbook handed to the annotators (relative path, so it lands in the working directory).
EXCEL_FILENAME = "Annotator_Task_Form.xlsx"

# TARGET DISTRIBUTION (50 / 100 / 50)
# Maps YOLO class id -> number of objects to draw for the survey.
TARGET_COUNTS = {
    0: 50,   # Ascending
    1: 100,  # Descending (Main focus)
    2: 50    # Passing
}
# =================================================

def parse_yolo_line(line, img_width, img_height):
    """Decode one YOLO label line into a class id and a pixel-space box.

    The line is split on whitespace; the first five fields are read as
    ``class x_center y_center width height``, where the four box fields are
    multiplied by the image size to obtain pixels.

    Returns:
        (class_id, x1, y1, x2, y2) with the corner coordinates truncated to
        int.

    Raises:
        IndexError: if the line has fewer than five fields.
        ValueError: if a field is not numeric.
    """
    tokens = line.strip().split()

    # Fields are converted in file order so a malformed line fails on the
    # first offending field.
    label = int(tokens[0])
    cx = float(tokens[1]) * img_width
    cy = float(tokens[2]) * img_height
    half_w = float(tokens[3]) * img_width / 2
    half_h = float(tokens[4]) * img_height / 2

    left, top = int(cx - half_w), int(cy - half_h)
    right, bottom = int(cx + half_w), int(cy + half_h)
    return label, left, top, right, bottom

def main():
    """Build the annotator survey: sample labelled objects, render them, write the forms.

    Steps:
      1. Recreate OUTPUT_DIR from scratch (existing content is deleted).
      2. Scan IMAGE_DIR for .jpg/.png files, read the label file with the same
         stem from LABEL_DIR and collect every object whose class id is a key
         of TARGET_COUNTS. Images without a label file, unreadable images and
         malformed label lines are skipped.
      3. Randomly draw up to TARGET_COUNTS[cls] objects per class and shuffle
         the result so the classes are interleaved. No random seed is set in
         this function, so each run draws a different sample.
      4. For every drawn object, save a copy of its image with the object's
         box outlined as OUTPUT_DIR/survey_NNN.jpg.
      5. Write EXCEL_FILENAME (answer form with a drop-down in column B) and
         "Answer_Key_DO_NOT_SEND.xlsx" (Image_ID / GT_Label).
    """
    if os.path.exists(OUTPUT_DIR):
        shutil.rmtree(OUTPUT_DIR)
    os.makedirs(OUTPUT_DIR)

    # --- 1. Collect All Objects ---
    print("Scanning dataset...")
    all_objects = []
    skipped_lines = 0

    # Match the extension case-insensitively so "IMG.JPG" is not overlooked.
    image_files = [f for f in os.listdir(IMAGE_DIR) if f.lower().endswith(('.jpg', '.png'))]

    for img_file in tqdm(image_files):
        # Swap only the extension; str.replace would also rewrite a ".jpg" or
        # ".png" that happens to appear in the middle of the file name.
        stem = os.path.splitext(img_file)[0]
        txt_path = os.path.join(LABEL_DIR, stem + '.txt')
        img_path = os.path.join(IMAGE_DIR, img_file)

        if not os.path.exists(txt_path):
            continue

        img = cv2.imread(img_path)
        if img is None:
            continue
        h, w = img.shape[:2]

        with open(txt_path, 'r') as f:
            for line in f:
                # Blank lines (e.g. a trailing newline) carry no annotation.
                if not line.strip():
                    continue
                try:
                    cls, x1, y1, x2, y2 = parse_yolo_line(line, w, h)
                except (ValueError, IndexError):
                    # One malformed line must not abort the whole scan.
                    skipped_lines += 1
                    continue
                # Only collect the classes listed in TARGET_COUNTS
                if cls in TARGET_COUNTS:
                    all_objects.append({
                        'img_name': img_file,
                        'img_path': img_path,
                        'class_id': cls,
                        'box': (x1, y1, x2, y2)
                    })

    if skipped_lines:
        print(f"Warning: Skipped {skipped_lines} malformed label lines.")

    # --- 2. Stratified Sampling (per TARGET_COUNTS) ---
    print(f"Total objects found: {len(all_objects)}")

    # Group the objects by class id
    pools = {cls: [] for cls in TARGET_COUNTS}
    for obj in all_objects:
        pools[obj['class_id']].append(obj)

    short_names = {0: 'Asc', 1: 'Desc', 2: 'Pass'}
    pool_report = ", ".join(
        f"{short_names.get(cls, f'Class {cls}')}: {len(pool)}" for cls, pool in pools.items()
    )
    print(f"Pool size -> {pool_report}")

    selected_objects = []
    for cls, target in TARGET_COUNTS.items():
        pool = pools[cls]
        if len(pool) < target:
            # The survey still gets built, but the shortfall is made visible.
            print(f"Warning: Class {cls} has only {len(pool)} objects (target {target}).")
        selected_objects.extend(random.sample(pool, min(len(pool), target)))

    # Shuffle the final list so annotators don't see all "Ascending" first
    random.shuffle(selected_objects)

    print(f"Final Survey Size: {len(selected_objects)} images")

    # --- 3. Generate Images & Answer Key ---
    print("Generating images and Excel sheet...")
    ground_truth_data = []

    for i, obj in enumerate(tqdm(selected_objects)):
        img = cv2.imread(obj['img_path'])
        x1, y1, x2, y2 = obj['box']

        # Outline the object; (255, 0, 0) is blue in OpenCV's BGR order
        cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 2)

        out_name = f"survey_{i:03d}.jpg"
        cv2.imwrite(os.path.join(OUTPUT_DIR, out_name), img)

        ground_truth_data.append({
            'Image_ID': out_name,
            'GT_Label': obj['class_id']
        })

    # --- 4. Create Excel with Dropdown ---
    workbook = xlsxwriter.Workbook(EXCEL_FILENAME)
    worksheet = workbook.add_worksheet("AnnotationTask")

    header_fmt = workbook.add_format({'bold': True, 'bg_color': '#D3D3D3', 'border': 1})

    worksheet.write('A1', 'Image File Name', header_fmt)
    worksheet.write('B1', 'Select Action (Click Cell)', header_fmt)
    worksheet.write('C1', 'Notes (Optional)', header_fmt)

    worksheet.set_column('A:A', 20)
    worksheet.set_column('B:B', 25)
    worksheet.set_column('C:C', 30)

    for row_idx, item in enumerate(ground_truth_data):
        worksheet.write(row_idx + 1, 0, item['Image_ID'])

    if ground_truth_data:
        # One validation rule over B2..B(n+1) instead of one rule per cell.
        worksheet.data_validation(1, 1, len(ground_truth_data), 1, {
            'validate': 'list',
            'source': ['0: Ascending', '1: Descending', '2: Passing']
        })

    workbook.close()

    # Save Key
    pd.DataFrame(ground_truth_data).to_excel("Answer_Key_DO_NOT_SEND.xlsx", index=False)

    print("\n[SUCCESS]")
    print(f"Generated {len(selected_objects)} images.")
    print(f"Ascending: {sum(1 for x in selected_objects if x['class_id']==0)}")
    print(f"Descending: {sum(1 for x in selected_objects if x['class_id']==1)}")
    print(f"Passing: {sum(1 for x in selected_objects if x['class_id']==2)}")

if __name__ == "__main__":
    main()

Scanning dataset...


100%|██████████| 6378/6378 [00:08<00:00, 767.58it/s]


Total objects found: 12925
Pool size -> Asc: 4380, Desc: 4236, Pass: 4309
Final Survey Size: 200 images
Generating images and Excel sheet...


100%|██████████| 200/200 [00:00<00:00, 249.27it/s]



[SUCCESS]
Generated 200 images.
Ascending: 50
Descending: 100
Passing: 50


## Analyze the Results

In [None]:
import pandas as pd
import numpy as np
import os
import glob
from statsmodels.stats.inter_rater import fleiss_kappa

# ================= CONFIGURATION =================
# Folder where you saved the returned Excel files from annotators.
# The .xlsx files found in this folder are read as one annotator each.
ANNOTATOR_FILES_DIR = "./returned_files" 

# The Master Key file you generated earlier (must contain an 'Image_ID' column)
GROUND_TRUTH_FILE = "Answer_Key_DO_NOT_SEND.xlsx" 
# =================================================

def clean_label(value):
    """Convert an annotator's cell value to an integer class id.

    Accepts drop-down text such as '0: Ascending' (only the part before the
    first ':' is parsed) as well as bare numbers such as 0, 2.0 or '1'.

    Returns:
        The class id as int, or -1 when the value is missing (NaN/None) or
        cannot be parsed as a number.
    """
    if pd.isna(value):
        return -1  # Missing value

    # Everything before the first ':' is the numeric label; without a ':'
    # this is simply the whole (stripped) text.
    prefix = str(value).strip().split(':', 1)[0]
    try:
        return int(float(prefix))
    except (ValueError, OverflowError):
        # ValueError: non-numeric text or 'nan'; OverflowError: 'inf'.
        return -1

def main():
    """Compute Fleiss' kappa across the returned annotator workbooks.

    Reads GROUND_TRUTH_FILE for the list of Image_IDs, then the .xlsx files
    in ANNOTATOR_FILES_DIR. From each annotator file it takes the first
    column whose header contains "Image" (the image id) and the first whose
    header contains "Select Action" (the answer, normalised by clean_label).
    Images that lack a valid answer (0, 1 or 2) from any annotator are
    dropped. The remaining votes are counted into an (images x 3) table,
    passed to fleiss_kappa, and the score and its interpretation are printed.

    Problems (fewer than two annotator files, a file without the expected
    columns, no fully answered image) are reported with a printed message
    and the function returns early.
    """
    # 1. Load Ground Truth (only its Image_ID column is used, for row alignment)
    gt_df = pd.read_excel(GROUND_TRUTH_FILE)
    print(f"Loaded Ground Truth: {len(gt_df)} samples")

    # 2. Load Annotator Files
    # Names starting with "~$" are the lock files Excel creates next to an
    # open workbook; they are not readable workbooks, so skip them.
    annotator_files = sorted(
        p for p in glob.glob(os.path.join(ANNOTATOR_FILES_DIR, "*.xlsx"))
        if not os.path.basename(p).startswith("~$")
    )
    print(f"Found {len(annotator_files)} annotator files.")

    if len(annotator_files) < 2:
        print("Error: Need at least 2 annotator files to calculate Kappa.")
        return

    # 3. Build the Agreement Matrix
    # We need a DataFrame where columns are Annotators, rows are Images
    df_merged = gt_df[['Image_ID']].copy()
    annotator_cols = []

    for file_path in annotator_files:
        name = os.path.splitext(os.path.basename(file_path))[0]
        temp_df = pd.read_excel(file_path)

        # Locate the columns by header text rather than by position
        answer_cols = [c for c in temp_df.columns if "Select Action" in str(c)]
        id_cols = [c for c in temp_df.columns if "Image" in str(c)]
        if not answer_cols or not id_cols:
            print(f"Error: '{file_path}' lacks an 'Image...' or 'Select Action' column.")
            return

        # Normalise the id column to 'Image_ID' so the merge key is identical
        # on both sides; otherwise a file whose id column is already called
        # 'Image_ID' would lose that key in a post-merge column drop.
        answers = pd.DataFrame({
            'Image_ID': temp_df[id_cols[0]],
            name: temp_df[answer_cols[0]].apply(clean_label),
        })
        # A repeated id would multiply rows in the merge; keep the first answer.
        answers = answers.drop_duplicates(subset='Image_ID', keep='first')

        df_merged = df_merged.merge(answers, on='Image_ID', how='left')
        annotator_cols.append(name)

    # 4. Convert to Fleiss' Kappa Format
    # Matrix: Rows = Subjects (Images), Cols = Categories (0, 1, 2)
    # Value = Count of how many raters assigned that category
    print(f"Annotators included: {annotator_cols}")

    N_categories = 3

    # Keep only images where every annotator gave a vote in 0..N_categories-1.
    # NaN (image absent from a file) and -1 (unparseable) both fail this test.
    votes = df_merged[annotator_cols]
    valid_rows = (votes.ge(0) & votes.lt(N_categories)).all(axis=1)
    if not valid_rows.all():
        dropped = int((~valid_rows).sum())
        print(f"Warning: Dropping {dropped} images due to missing or invalid answers.")
        df_merged = df_merged[valid_rows]

    if df_merged.empty:
        print("Error: No image has a valid answer from every annotator.")
        return

    # Count votes for each category; shape (N_images, N_categories).
    # Rows are addressed by position, not by DataFrame index label: after
    # the filtering above the labels are no longer contiguous.
    vote_values = df_merged[annotator_cols].to_numpy().astype(int)
    fleiss_matrix = np.zeros((len(vote_values), N_categories))
    for row_pos, row_votes in enumerate(vote_values):
        for vote in row_votes:
            fleiss_matrix[row_pos, vote] += 1

    # 5. Calculate Kappa
    kappa = fleiss_kappa(fleiss_matrix)

    print("-" * 30)
    print(f"FLEISS' KAPPA SCORE: {kappa:.4f}")
    print("-" * 30)

    # Interpretation: contiguous bands, so every score gets a message
    if np.isnan(kappa):
        print("Interpretation: Undefined (kappa could not be computed)")
    elif kappa < 0:
        print("Interpretation: Poor agreement (Less than chance)")
    elif kappa <= 0.20:
        print("Interpretation: Slight agreement")
    elif kappa <= 0.40:
        print("Interpretation: Fair agreement")
    elif kappa <= 0.60:
        print("Interpretation: Moderate agreement")
    elif kappa <= 0.80:
        print("Interpretation: Substantial agreement")
    else:
        print("Interpretation: Almost perfect agreement")

if __name__ == "__main__":
    main()

## Response Text: "We conducted a blind annotation study with 5 independent raters. To ensure consistency, raters were provided with the standard annotation guidelines used in the original dataset creation (e.g., specifying that 'Descending' requires visible leg motion, otherwise classified as 'Passing' to reduce ambiguity). The resulting Fleiss' Kappa of 0.XX confirms that our strict definitions are reproducible..."

우리는 이미 처음 annotation을 수행할 때 투표로 결정을 했었다.
이번에 IAA