In [None]:
import json
import pandas as pd

In [None]:
# Location of the raw benchmark JSONL file that the loading cell below opens.
file_path = '/content/drive/MyDrive/woke-odds/clamber_benchmark.jsonl'

In [None]:
def _decode_record(raw):
    """Decode one JSONL line, parsing a second time when the first pass yields a string."""
    decoded = json.loads(raw)
    # First pass may only strip an outer layer of quoting; the second pass
    # turns the inner string into the actual JSON value.
    return json.loads(decoded) if isinstance(decoded, str) else decoded


data = []
with open(file_path, 'r', encoding='utf-8') as f:
    for line_num, line in enumerate(f, 1):
        line = line.strip()
        if not line:
            # Blank lines carry no record.
            continue
        try:
            data.append(_decode_record(line))
        except json.JSONDecodeError as e:
            # Report the malformed line and move on to the next one.
            print(f"Line {line_num} - Error: {e}")
            print(f"Content preview: {line[:100]}...")

# Convert the collected records into a DataFrame
df = pd.DataFrame(data)

In [None]:
# Report the number of loaded rows and the list of column names.
print(f"총 데이터 개수: {len(df)}")
print(f"\n컬럼 목록:\n{df.columns.tolist()}")

총 데이터 개수: 3202

컬럼 목록:
['question', 'context', 'clarifying_question', 'require_clarification', 'category', 'subclass', 'predict_ambiguous', 'predict_is_ambiguous_response', 'predict_clarifying_question']


In [None]:
# Show the first rows of the raw frame using IPython's rich display.
from IPython.display import display
display(df.head())

Unnamed: 0,question,context,clarifying_question,require_clarification,category,subclass,predict_ambiguous,predict_is_ambiguous_response,predict_clarifying_question
0,Give me a list of good coffee shops?,,What do you personally consider important in a...,1,MC,whom,0,"{""Output"": ""False"", ""Confidence"": 4}",Clarifying question: \n- Could you please spec...
1,Give me some Mother's Day gift ideas.,,"What are your mother's interests, hobbies, or ...",1,MC,whom,0,"{""Output"": ""False"", ""Confidence"": 4}",Clarifying question: \nCould you please specif...
2,Help me come up with 3 ideas for a new busines...,,"What are your areas of interest or expertise, ...",1,MC,what,0,"{""Output"": ""False"", ""Confidence"": 4}",Clarifying question: \nWhat specific industry ...
3,Write the first paragraph of a blog post descr...,,What tone or perspective should I use for the ...,1,MC,whom,1,"{""Output"": ""True"", ""Confidence"": 4}",Clarifying question: \n- Which aspect or topic...
4,Give me some tips on how to train for a marathon.,,Can you provide your current fitness level and...,1,MC,whom,0,"{""Output"": ""False"", ""Confidence"": 4}",Clarifying question: \n- Are you looking for t...


In [None]:
# Inspect the `context` column
context_lengths = df['context'].str.len()
print((context_lengths > 0).sum())  # rows holding a non-empty string

stripped_context = df['context'].str.strip()
print((stripped_context == '').sum())  # rows that are empty once whitespace is stripped

0
3202


In [None]:
# Remove the `context` column from the working frame.
df = df.drop(columns=['context'])

In [None]:
# Remove the three `predict_*` columns from the working frame.
df = df.drop(
    ['predict_ambiguous', 'predict_is_ambiguous_response', 'predict_clarifying_question'],
    axis=1,
)

In [None]:
# Show the first rows again to confirm which columns remain.
display(df.head())

Unnamed: 0,question,clarifying_question,require_clarification,category,subclass
0,Give me a list of good coffee shops?,What do you personally consider important in a...,1,MC,whom
1,Give me some Mother's Day gift ideas.,"What are your mother's interests, hobbies, or ...",1,MC,whom
2,Help me come up with 3 ideas for a new busines...,"What are your areas of interest or expertise, ...",1,MC,what
3,Write the first paragraph of a blog post descr...,What tone or perspective should I use for the ...,1,MC,whom
4,Give me some tips on how to train for a marathon.,Can you provide your current fitness level and...,1,MC,whom


In [None]:
# Row counts per `require_clarification` value (1 = question needs clarification)
print(df['require_clarification'].value_counts())

require_clarification
1    1601
0    1601
Name: count, dtype: int64


In [None]:
# Absolute counts and percentage share for `category`, then for `subclass`
for column in ('category', 'subclass'):
    print(df[column].value_counts())
    print("비율(%):")
    print(df[column].value_counts(normalize=True) * 100)
    print()

# Size of every (category, subclass) combination, largest first
pair_sizes = df.groupby(['category', 'subclass']).size()
print(pair_sizes.sort_values(ascending=False))

category
MC    1602
FD     800
LA     800
Name: count, dtype: int64
비율(%):
category
MC    50.031230
FD    24.984385
LA    24.984385
Name: proportion, dtype: float64

subclass
none            801
polysemy        400
NK              400
ICL             400
co-reference    400
what            201
whom            200
where           200
when            200
Name: count, dtype: int64
비율(%):
subclass
none            25.015615
polysemy        12.492192
NK              12.492192
ICL             12.492192
co-reference    12.492192
what             6.277327
whom             6.246096
where            6.246096
when             6.246096
Name: proportion, dtype: float64

category  subclass    
MC        none            801
FD        ICL             400
          NK              400
LA        co-reference    400
          polysemy        400
MC        what            201
          when            200
          where           200
          whom            200
dtype: int64


In [None]:
# Category / subclass breakdown of the questions that need clarification (label 1)
clarification_required = df.loc[df['require_clarification'] == 1]

for column in ('category', 'subclass'):
    print(clarification_required[column].value_counts())
    print()

required_pair_sizes = clarification_required.groupby(['category', 'subclass']).size()
print(required_pair_sizes.sort_values(ascending=False))

category
MC    801
FD    400
LA    400
Name: count, dtype: int64

subclass
what            201
whom            200
when            200
where           200
NK              200
ICL             200
co-reference    200
polysemy        200
Name: count, dtype: int64

category  subclass    
MC        what            201
FD        ICL             200
LA        co-reference    200
FD        NK              200
LA        polysemy        200
MC        when            200
          where           200
          whom            200
dtype: int64


In [None]:
# Category / subclass breakdown of the questions that do not need clarification (label 0)
clarification_not_required = df.loc[df['require_clarification'] == 0]

for column in ('category', 'subclass'):
    print(clarification_not_required[column].value_counts())
    print()

not_required_pair_sizes = clarification_not_required.groupby(['category', 'subclass']).size()
print(not_required_pair_sizes.sort_values(ascending=False))

category
MC    801
FD    400
LA    400
Name: count, dtype: int64

subclass
none            801
NK              200
ICL             200
co-reference    200
polysemy        200
Name: count, dtype: int64

category  subclass    
MC        none            801
FD        ICL             200
          NK              200
LA        co-reference    200
          polysemy        200
dtype: int64


In [None]:
# Look at one example of a non-ambiguous question: the first row labelled 0
sample = df.loc[df['require_clarification'] == 0].iloc[0]
print(sample, end='\n\n')  # row contents followed by one blank line
print(f"Question: {sample['question']}\n")

question                 Is Mozambique a geographic distribution of Man...
clarifying_question                                                       
require_clarification                                                    0
category                                                                FD
subclass                                                                NK
Name: 178, dtype: object

Question: Is Mozambique a geographic distribution of Mantodea?



In [None]:
# Work on a copy so the frame with the original labels stays available.
df_processed = df.copy()

# Category renaming (FD -> EM, MC -> AO, LA unchanged)
category_mapping = {
    'FD': 'EM',  # Epistemic Misalignment
    'MC': 'AO',  # Aleatoric Output
    'LA': 'LA',  # Linguistic Ambiguity (kept as is)
}

# Subclass renaming
subclass_mapping = {
    'whom': 'WHOM',
    'what': 'WHAT',
    'when': 'WHEN',
    'where': 'WHERE',
    'NK': 'UNF',
    'ICL': 'CONT',
    'co-reference': 'SEM',
    'polysemy': 'LEX',
}

# Rename the values so they match the abbreviations above.
df_processed['category'] = df_processed['category'].replace(category_mapping)
df_processed['subclass'] = df_processed['subclass'].replace(subclass_mapping)

# Questions that need no clarification (label 0) get 'NONE' for both labels.
is_clear = df_processed['require_clarification'] == 0
df_processed.loc[is_clear, ['category', 'subclass']] = 'NONE'

In [None]:
# Plain-text preview of the first five remapped rows.
print(df_processed.head(5))

                                            question  \
0               Give me a list of good coffee shops?   
1              Give me some Mother's Day gift ideas.   
2  Help me come up with 3 ideas for a new busines...   
3  Write the first paragraph of a blog post descr...   
4  Give me some tips on how to train for a marathon.   

                                 clarifying_question  require_clarification  \
0  What do you personally consider important in a...                      1   
1  What are your mother's interests, hobbies, or ...                      1   
2  What are your areas of interest or expertise, ...                      1   
3  What tone or perspective should I use for the ...                      1   
4  Can you provide your current fitness level and...                      1   

  category subclass  
0       AO     WHOM  
1       AO     WHOM  
2       AO     WHAT  
3       AO     WHOM  
4       AO     WHOM  


In [None]:
# First ten label-0 rows, restricted to the question and its two remapped labels.
clear_rows = df_processed['require_clarification'] == 0
display(df_processed.loc[clear_rows, ['question', 'category', 'subclass']].head(10))

Unnamed: 0,question,category,subclass
178,Is Mozambique a geographic distribution of Man...,NONE,NONE
186,What's the latitude range where Diomedeidae is...,NONE,NONE
193,Does Ophisaurus live in subtropical habitat?,NONE,NONE
196,Does Pseudophanella have hemimetabolous as its...,NONE,NONE
208,What kind of visual system does Baratha have?,NONE,NONE
211,In which geological period did Dysdera first a...,NONE,NONE
219,Does Mollisia poaeoides have Poa nemoralis as ...,NONE,NONE
222,What does the term 'standard length' refer to ...,NONE,NONE
223,Does Navia mima produce oxygen?,NONE,NONE
235,What is the average life span of Bothrops asper?,NONE,NONE


In [None]:
# Check the (category, subclass) combinations after remapping, largest group first
print(df_processed.groupby(['category', 'subclass']).size().sort_values(ascending=False))

category  subclass
NONE      NONE        1601
AO        WHAT         201
          WHEN         200
          WHOM         200
          WHERE        200
EM        CONT         200
          UNF          200
LA        LEX          200
          SEM          200
dtype: int64


In [None]:
# Just in case: keep a variant of the dataset without the NONE overwrite.
# Only the category / subclass renaming is applied here.
df_original_kept = df.copy()
for column, mapping in (('category', category_mapping), ('subclass', subclass_mapping)):
    df_original_kept[column] = df_original_kept[column].replace(mapping)

# Check the (category, subclass) combinations
kept_pair_sizes = df_original_kept.groupby(['category', 'subclass']).size()
print(kept_pair_sizes.sort_values(ascending=False))

category  subclass
AO        none        801
LA        SEM         400
          LEX         400
EM        UNF         400
          CONT        400
AO        WHAT        201
          WHOM        200
          WHEN        200
          WHERE       200
dtype: int64


In [None]:
# Save the processed dataset in Parquet format (row index not written).
# The target is the path that the later `pd.read_parquet` cell loads from
# ('/content/drive/MyDrive/woke-odds/...'); writing under 'Colab Notebooks/'
# would leave that cell reading a missing or stale file on a fresh run.
df_processed.to_parquet('/content/drive/MyDrive/woke-odds/df_processed.parquet', index=False)

#### 모델 입력 형식에 맞게 데이터셋 변환

In [None]:
!pip install transformers datasets scikit-learn -q

In [None]:
import pandas as pd
import json
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from datasets import Dataset
import numpy as np

In [None]:
# Load the dataset from Parquet, then report its row count and preview it.
# NOTE(review): this must be the same file the earlier `to_parquet` cell wrote —
# confirm the two paths agree.
parquet_path = '/content/drive/MyDrive/woke-odds/df_processed.parquet'
df_loaded = pd.read_parquet(parquet_path)

print(f"로드된 데이터 개수: {len(df_loaded)}")
display(df_loaded.head())

로드된 데이터 개수: 3202


Unnamed: 0,question,clarifying_question,require_clarification,category,subclass
0,Give me a list of good coffee shops?,What do you personally consider important in a...,1,AO,WHOM
1,Give me some Mother's Day gift ideas.,"What are your mother's interests, hobbies, or ...",1,AO,WHOM
2,Help me come up with 3 ideas for a new busines...,"What are your areas of interest or expertise, ...",1,AO,WHAT
3,Write the first paragraph of a blog post descr...,What tone or perspective should I use for the ...,1,AO,WHOM
4,Give me some tips on how to train for a marathon.,Can you provide your current fitness level and...,1,AO,WHOM


In [None]:
# Load the tokenizer for the model named below via AutoTokenizer.
# NOTE(review): `tokenizer` is not referenced by any other cell visible in this
# notebook — confirm it is still needed.
# NOTE(review): trust_remote_code=True presumably permits code shipped with the
# model repository to run while loading — confirm it is required for this model.
model_name = "microsoft/Phi-4-mini-reasoning"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/15.5M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/249 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/587 [00:00<?, ?B/s]

In [None]:
# System message placed at the start of every chat record built below.
# It asks for a single `category|subclass` answer and lists the label codes
# that df_processed uses (EM/LA/AO/NONE and their subclasses).
system_prompt = """You are an AI system that determines if the question requires clarification and classifies the ambiguity.

Task:
1. Determine if the question requires clarification: clear(no clarification needed) or ambiguous(clarification needed)
2. Classify the ambiguity:
   - If question is clear, set category=NONE and subclass=NONE
   - If question is ambiguous, classify category and subclass

Output format: category|subclass

Categories:
- EM (Epistemic Misalignment): Questions with unfamiliar entities or self-contradictions
- LA (Linguistic Ambiguity): Questions with lexical or semantic ambiguity
- AO (Aleatoric Output): Questions with missing contextual information causing confusion
- NONE: Clear questions that don't require clarification

Subclasses:
For EM:
- UNF (UNFAMILIAR): Query contains unfamiliar entities or facts
- CONT (CONTRADICTION): Query contains self-contradictions

For LA:
- LEX (LEXICAL): Query contains terms with multiple meanings
- SEM (SEMANTIC): Query lacks context leading to multiple interpretations

For AO:
- WHOM: Query output contains confusion due to missing personal elements
- WHEN: Query output contains confusion due to missing temporal elements
- WHERE: Query output contains confusion due to missing spatial elements
- WHAT: Query output contains confusion due to missing task-specific elements
"""

In [None]:
# Build one chat-style record per row: system prompt, the question as the user
# turn, and "category|subclass" as the assistant turn.
dataset = []

row_fields = zip(df_loaded['question'], df_loaded['category'], df_loaded['subclass'])
for question, category, subclass in row_fields:
    data = {
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": question},
            {"role": "assistant", "content": f"{category}|{subclass}"},
        ]
    }
    dataset.append(data)

print(f"변환 완료: {len(dataset)}개")

변환 완료: 3202개


In [None]:
# Inspect a sample: pretty-print the first record without escaping non-ASCII text
print(json.dumps(dataset[0], indent=2, ensure_ascii=False))

{
  "messages": [
    {
      "role": "system",
      "content": "You are an AI system that determines if the question requires clarification and classifies the ambiguity.\n\nTask:\n1. Determine if the question requires clarification: clear(no clarification needed) or ambiguous(clarification needed)\n2. Classify the ambiguity:\n   - If question is clear, set category=NONE and subclass=NONE\n   - If question is ambiguous, classify category and subclass\n\nOutput format: category|subclass\n\nCategories:\n- EM (Epistemic Misalignment): Questions with unfamiliar entities or self-contradictions\n- LA (Linguistic Ambiguity): Questions with lexical or semantic ambiguity\n- AO (Aleatoric Output): Questions with missing contextual information causing confusion\n- NONE: Clear questions that don't require clarification\n\nSubclasses:\nFor EM:\n- UNF (UNFAMILIAR): Query contains unfamiliar entities or facts\n- CONT (CONTRADICTION): Query contains self-contradictions\n\nFor LA:\n- LEX (LEXICAL): Qu

#### Train/Validation/Test Split

In [None]:
from sklearn.model_selection import train_test_split


def _subclass_label(record):
    """Return the subclass part of a record's 'category|subclass' assistant reply."""
    return record['messages'][2]['content'].split('|')[1].strip()


# 1. Train / temp split (80:20), stratified on subclass to keep class proportions
subclasses = [_subclass_label(record) for record in dataset]
train_data, temp_data = train_test_split(
    dataset,
    test_size=0.2,
    random_state=42,
    stratify=subclasses,
)

# 2. Split temp into valid / test (50:50, i.e. 10% : 10% of the whole)
temp_subclasses = [_subclass_label(record) for record in temp_data]
valid_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=42,
    stratify=temp_subclasses,
)

print(f"Train: {len(train_data)}개 (80%)")
print(f"Valid: {len(valid_data)}개 (10%)")
print(f"Test: {len(test_data)}개 (10%)")

Train: 2561개 (80%)
Valid: 320개 (10%)
Test: 321개 (10%)


In [None]:
# Set the working directory; the relative file names used by the next cell
# resolve against it. The new directory is printed as a check.
import os
os.chdir('/content/drive/MyDrive/woke-odds')
print(os.getcwd())

/content/drive/MyDrive/woke-odds


In [None]:
# Write each split to its own JSONL file (one JSON object per line) in the
# current working directory.
for split_file, split_records in (
    ('ambiguity_train_1110.jsonl', train_data),
    ('ambiguity_valid_1110.jsonl', valid_data),
    ('ambiguity_test_1110.jsonl', test_data),
):
    with open(split_file, 'w', encoding='utf-8') as f:
        f.writelines(
            json.dumps(record, ensure_ascii=False) + '\n'
            for record in split_records
        )

In [None]:
# Echo the first training record as the cell output.
train_data[0]

{'messages': [{'role': 'system',
   'content': "You are an AI system that determines if the question requires clarification and classifies the ambiguity.\n\nTask:\n1. Determine if the question requires clarification: clear(no clarification needed) or ambiguous(clarification needed)\n2. Classify the ambiguity:\n   - If question is clear, set category=NONE and subclass=NONE\n   - If question is ambiguous, classify category and subclass\n\nOutput format: category|subclass\n\nCategories:\n- EM (Epistemic Misalignment): Questions with unfamiliar entities or self-contradictions\n- LA (Linguistic Ambiguity): Questions with lexical or semantic ambiguity\n- AO (Aleatoric Output): Questions with missing contextual information causing confusion\n- NONE: Clear questions that don't require clarification\n\nSubclasses:\nFor EM:\n- UNF (UNFAMILIAR): Query contains unfamiliar entities or facts\n- CONT (CONTRADICTION): Query contains self-contradictions\n\nFor LA:\n- LEX (LEXICAL): Query contains terms

#### 데이터셋 저장

In [None]:
# Save the final splits in JSONL format, one JSON object per line.
# NOTE(review): train_dataset / val_dataset / test_dataset are not defined in
# any cell visible in this notebook (the split cell creates train_data /
# valid_data / test_data) — confirm where they come from before a fresh run.

# Train split
with open('/content/drive/MyDrive/Colab Notebooks/woke-odds/final_CLAMBER_train.jsonl', 'w', encoding='utf-8') as f:
    f.writelines(json.dumps(item, ensure_ascii=False) + '\n' for item in train_dataset)
print("✅ train 저장 완료")

# Validation split
with open('/content/drive/MyDrive/Colab Notebooks/woke-odds/final_CLAMBER_valid.jsonl', 'w', encoding='utf-8') as f:
    f.writelines(json.dumps(item, ensure_ascii=False) + '\n' for item in val_dataset)
print("✅ valid 저장 완료")

# Test split
with open('/content/drive/MyDrive/Colab Notebooks/woke-odds/final_CLAMBER_test.jsonl', 'w', encoding='utf-8') as f:
    f.writelines(json.dumps(item, ensure_ascii=False) + '\n' for item in test_dataset)
print("✅ test 저장 완료")

✅ train 저장 완료
✅ valid 저장 완료
✅ test 저장 완료


In [None]:
# Check the JSONL file: parse its first line and print a few fields.
# NOTE(review): the keys read here (question, require_clarification, category,
# subclass, text) differ from the `messages` records built in the visible cells —
# presumably the saved records come from cells not shown; confirm.
print("📄 JSONL 파일 첫 번째 데이터:")
with open('/content/drive/MyDrive/Colab Notebooks/woke-odds/final_CLAMBER_train.jsonl', 'r', encoding='utf-8') as f:
    raw_first = f.readline()

first_line = json.loads(raw_first)
print(f"Question: {first_line['question']}")
print(f"Label: {first_line['require_clarification']}|{first_line['category']}|{first_line['subclass']}")
print(f"Text preview: {first_line['text'][:200]}...")


📄 JSONL 파일 첫 번째 데이터:
Question: Is Polymixis rufocincta multicellular?
Label: 0|NONE|NONE
Text preview: <|system|>Your name is Phi, an AI math expert developed by Microsoft. You are an ambiguity detection and classification system.
Task:
1. Determine if the question requires clarification:
   - 0: Clear...
