In [20]:
### 0. 필수 모듈 설치

!pip install -q openai==0.28
!pip install -q google-generativeai

In [21]:
### 1. 모듈 & 데이터 파일 불러오기

import os
import re
import time
from getpass import getpass

import google.generativeai as genai
import openai
import pandas as pd
from tqdm import tqdm

# Load the stimulus table and normalise the sense column names.
#
# Encoding strategy: try UTF-8 (with or without BOM) first, then fall back to
# cp949. cp949 is a superset of euc-kr, so euc-kr is not a useful fallback:
# a file that cannot be decoded as cp949 cannot be decoded as euc-kr either.
CSV_PATH = "sense_applicability_sentences.csv"
try:
    df = pd.read_csv(CSV_PATH, header=0, encoding='utf-8-sig')
except UnicodeDecodeError:
    df = pd.read_csv(CSV_PATH, header=0, encoding='cp949')

# Reassign instead of renaming in place so that re-running the cell always
# starts from the freshly loaded frame.
df = df.rename(columns={
    "Sense A": "sense_a",
    "Sense B": "sense_b",
    "Sense C": "sense_c",
})

# Fail fast if a column read by the experiment loop is missing, rather than
# discovering it after API calls have already been made.
REQUIRED_COLUMNS = ["Context", "Target", "sense_a", "sense_b", "sense_c"]
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise KeyError(f"Missing expected columns in {CSV_PATH}: {missing_columns}")

In [22]:
### 2. 모델 & 실험 프롬프트 세팅


# API credentials.
# Keys are read from the environment so that they are never stored in the
# notebook; when a variable is not set, the key is requested interactively
# (getpass does not echo the input).
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY") or getpass("Google API key: "))


# Prompt construction
# Instruction text sent with every rating request. The {context}, {target} and
# {sense} placeholders are filled in by make_prompt(). The leading blank lines
# and the four-space indentation are part of the text the model receives.
_PROMPT_TEMPLATE = """

    Purpose of the Study
    This study investigates how you judge the applicability of different possible meanings (senses) of polysemous verbs like *break* and *freeze* when used in context. You will help us understand whether people make a clear-cut choice or see the possible senses as more graded and overlapping.

    What You Will Do
    You will read short English sentences containing either *break* or *freeze*. For each sentence, you will see three possible sense descriptions for the target verb. Your task is to rate how well each sense fits the verb as it is used in that sentence.

    Rating Scale
    Please rate each sentence-meaning pair on a 1 to 5 scale, where:
    1 = Not at all applicable: The sense does not fit the verb in this sentence at all.
    2 = Slightly applicable
    3 = Moderately applicable
    4 = Very applicable
    5 = Perfectly applicable: The sense fully matches the meaning of the verb in this sentence.

    Instructions
    • Please read each sentence carefully and focus on how the verb ('break' or 'freeze') is used in context.
    • Give your honest judgment for each of the senses.
    • Do not compare your ratings across sentences — focus on one sentence at a time.

    Context: {context}
    Sentence: {target}
    Meaning: {sense}

    Please respond with a number from 1 to 5 without any extra explanation.
    Response: """


def make_prompt(context, target, sense):
    """Build the rating prompt for one (context, sentence, sense) triple.

    Args:
        context: Text placed after "Context:".
        target: Text placed after "Sentence:".
        sense: Sense description placed after "Meaning:".

    Returns:
        The full instruction text with the three values inserted, ending with
        "Response: " so that the model's reply is the rating itself.
    """
    return _PROMPT_TEMPLATE.format(context=context, target=target, sense=sense)


# Model query functions
def query_gpt(prompt, model="gpt-4o", temperature=0):
    """Send one user message to an OpenAI chat model and return the reply text.

    Uses the `openai.ChatCompletion` interface (the notebook installs
    openai==0.28).

    Args:
        prompt: Text sent as the content of a single "user" message.
        model: Name of the chat model to query. Defaults to "gpt-4o".
        temperature: Sampling temperature passed to the API. Defaults to 0.

    Returns:
        The message content of the first choice in the response, unmodified.

    Raises:
        Whatever `openai.ChatCompletion.create` raises; nothing is caught here.
    """
    response = openai.ChatCompletion.create(
        model=model,
        temperature=temperature,
        messages=[{"role": "user", "content": prompt}],
    )
    return response['choices'][0]['message']['content']

def query_gemini(prompt, model_name='gemini-1.5-flash', temperature=None):
    """Send a prompt to a Gemini model and return the reply text.

    Args:
        prompt: Text passed to `generate_content`.
        model_name: Name of the Gemini model. Defaults to 'gemini-1.5-flash'.
        temperature: Optional sampling temperature. When None (the default) no
            generation config is sent and the library/model default applies.

    Returns:
        The `.text` attribute of the response.

    Raises:
        Whatever the client library raises, including from accessing
        `response.text`; nothing is caught here.
    """
    # NOTE(review): query_gpt pins temperature to 0 while this function leaves
    # it at the service default unless `temperature` is passed. Confirm whether
    # the two models are meant to be sampled under the same setting.
    model = genai.GenerativeModel(model_name)
    if temperature is None:
        response = model.generate_content(prompt)
    else:
        response = model.generate_content(
            prompt,
            generation_config={"temperature": temperature},
        )
    return response.text

In [None]:
### 3. Run the evaluation & save the results

n_run = 10  # number of repeated ratings per (sentence, sense, model)

# Model label -> query function. The label is also the prefix of the result columns.
MODEL_QUERIES = {'gpt': query_gpt, 'gemini': query_gemini}
SENSE_LABELS = ['a', 'b', 'c']
ERROR_SCORE = -1  # stored when the API call fails or the reply cannot be parsed

# Matches the scale denominator in replies such as "4/5" or "4 out of 5".
_DENOMINATOR_RE = re.compile(r"(?:/|\bout of)\s*5\b", re.IGNORECASE)
# Matches integer or decimal number tokens.
_NUMBER_RE = re.compile(r"\d+(?:\.\d+)?")


def parse_score(response):
    """Extract the 1-5 rating from a model reply.

    The reply must contain exactly one number once a trailing scale
    denominator ("/5", "out of 5") is removed, and that number must be an
    integer from 1 to 5. Digits are never concatenated across the reply, so
    "4/5" parses as 4, while "0.5" or "between 3 and 4" are rejected instead
    of being turned into a score.

    Args:
        response: Raw text returned by the model.

    Returns:
        The rating as an int in the range 1..5.

    Raises:
        ValueError: If the reply has no number, more than one number, a
            decimal number, or a number outside 1..5.
    """
    numbers = _NUMBER_RE.findall(_DENOMINATOR_RE.sub("", response))
    if len(numbers) != 1:
        raise ValueError(f"Expected exactly one number in response: {response!r}")
    if not numbers[0].isdigit():
        raise ValueError(f"Score is not an integer: {numbers[0]}")
    score = int(numbers[0])
    if score < 1 or score > 5:
        raise ValueError("Score out of range")
    return score


# One {column name: score} dict per input row, in the same order as df.
score_records = []

for idx, row in tqdm(df.iterrows(), total=len(df)):
    senses = [row['sense_a'], row['sense_b'], row['sense_c']]
    # The prompt depends only on the row and the sense, so build it once and
    # reuse it for every model and every run.
    prompts = [make_prompt(row['Context'], row['Target'], sense) for sense in senses]
    record = {}

    for model, query in MODEL_QUERIES.items():
        scores = [[] for _ in senses]  # scores[i] holds the n_run ratings of sense i

        for run_idx in range(n_run):
            for i, prompt in enumerate(prompts):
                try:
                    score = parse_score(query(prompt))
                except Exception as e:
                    # Best effort: log the failure and keep going so that one bad
                    # call does not abort the whole experiment.
                    print(f"Error at row {idx}, model {model}, sense {i}, run {run_idx+1}: {e}")
                    score = ERROR_SCORE
                scores[i].append(score)

        # Columns are ordered by sense first and run second.
        for label, sense_scores in zip(SENSE_LABELS, scores):
            for run_idx, score in enumerate(sense_scores, start=1):
                record[f"{model}_score_{label}_{run_idx}"] = score

    score_records.append(record)

# Build all score columns in one step: this keeps them as integers (cell-by-cell
# creation of new columns yields float columns) and avoids growing the frame
# one column at a time.
score_df = pd.DataFrame(score_records, index=df.index)

# Drop score columns left over from an earlier run of this cell so that
# re-running it replaces them instead of duplicating them.
df = pd.concat([df.drop(columns=score_df.columns, errors='ignore'), score_df], axis=1)

# Save the results
df.to_csv("sense_applicability_results.csv", index=False, encoding='utf-8-sig')
print("✅ 저장 완료")

  0%|          | 0/34 [00:00<?, ?it/s]