In [None]:
# 1. 필요한 라이브러리 설치
!pip install openai==0.28 transformers

In [None]:
# 2. 라이브러리 불러오기
import openai
import random
import json
import time
import random
from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline
from sentence_transformers import SentenceTransformer, util
from google.colab import files

In [None]:
# 3. Configure the OpenAI API key
# Prefer the OPENAI_API_KEY environment variable so the secret does not have
# to be saved inside the notebook; the literal is only a fallback placeholder.
import os

openai.api_key = os.environ.get("OPENAI_API_KEY") or "***"

In [None]:
# 4. Upload the JSON data
uploaded = files.upload()

# files.upload() returns a {filename: bytes} mapping. Read the bytes that were
# just uploaded instead of re-opening a hard-coded "qa.json": the uploaded
# file may carry a different name (Colab renames re-uploads, e.g.
# "qa (1).json"), in which case the fixed path would load stale data or fail.
if "qa.json" in uploaded:
    uploaded_name = "qa.json"
else:
    uploaded_name = next(iter(uploaded), None)

if uploaded_name is not None:
    data = json.loads(uploaded[uploaded_name].decode("utf-8"))
else:
    # Nothing was uploaded (dialog cancelled): fall back to a qa.json that is
    # already present in the working directory, as the original cell did.
    with open("qa.json", "r", encoding="utf-8") as f:
        data = json.load(f)

In [None]:
# 5. GPT translation helper
def gpt_translate(text, source_lang="Korean", target_lang="English",
                  max_retries=3, retry_delay=2.0):
    """Translate *text* with the OpenAI chat-completion API.

    Args:
        text: Text to translate.
        source_lang: Name of the source language, inserted into the prompt.
        target_lang: Name of the target language, inserted into the prompt.
        max_retries: Total number of attempts made when the API fails with a
            transient error (rate limit, connection, timeout, unavailable).
            Values below 1 are treated as 1.
        retry_delay: Seconds to wait before the second attempt; the wait
            doubles after every further failed attempt.

    Returns:
        The stripped translation, or ``None`` when *text* is empty/blank or
        the request ultimately fails. Callers treat ``None`` as "fall back to
        the untranslated text".
    """
    if not text or not text.strip():
        # Nothing to translate; avoid paying for a request that can only
        # produce filler text.
        return None

    prompt = f"Translate the following text from {source_lang} to {target_lang}:\n\n{text}"

    # Errors worth retrying; anything else (bad key, invalid request, ...)
    # will not succeed on a second attempt.
    transient_errors = (
        openai.error.RateLimitError,
        openai.error.APIConnectionError,
        openai.error.ServiceUnavailableError,
        openai.error.Timeout,
    )

    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are a helpful translation assistant."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3,
                max_tokens=512
            )
            return response['choices'][0]['message']['content'].strip()
        except transient_errors as e:
            if attempt == attempts:
                print(f"[ERROR] GPT translation failed: {e}")
                return None
            wait_seconds = retry_delay * 2 ** (attempt - 1)
            print(f"[WARN] GPT translation attempt {attempt}/{attempts} failed: {e}; "
                  f"retrying in {wait_seconds:.1f}s")
            time.sleep(wait_seconds)
        except Exception as e:
            # Best-effort by design: report and let the caller fall back.
            print(f"[ERROR] GPT translation failed: {e}")
            return None
    return None

In [None]:
# 6. Load the T5 paraphrase model
# The tokenizer and the model weights are both loaded from the same
# pretrained checkpoint name.
model_name = "ramsrigouthamg/t5_paraphraser"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
# Wrap model + tokenizer in a text2text-generation pipeline; this is the
# callable that generate_paraphrases() invokes.
paraphraser = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

def generate_paraphrases(text, num_return_sequences=3):
    """Sample paraphrases of *text* from the module-level ``paraphraser`` pipeline.

    Args:
        text: Sentence to paraphrase; it is sent with a ``paraphrase: `` prefix.
        num_return_sequences: Number of sampled outputs requested.

    Returns:
        A list with the ``generated_text`` field of every pipeline output.
    """
    # Sampling configuration forwarded unchanged to the pipeline call.
    sampling_options = dict(
        max_length=100,
        num_return_sequences=num_return_sequences,
        do_sample=True,
        top_k=100,
        top_p=0.92,
        temperature=0.8,
    )
    generations = paraphraser(f"paraphrase: {text}", **sampling_options)

    paraphrases = []
    for generation in generations:
        paraphrases.append(generation['generated_text'])
    return paraphrases

In [None]:
# 7. Synonym dictionary and replacement function
import re

# Maps a Korean term (or multi-word phrase) to its interchangeable wordings.
synonym_dict = {
    "공지": ["안내", "알림", "제공"],
    "결과는": ["성적은", "점수는", "채점 결과는"],
    "예선대회": ["예선"],
    "본선대회": ["본선", "최종 라운드"],
    # The key used to be " 접수" (leading space), which could never match.
    "접수": ["신청", "등록", "참가 등록"],
    "접수확인": ["신청 여부 확인", "등록 확인", "참가 확인"],
    "문제보기": ["문제 열람", "문제 확인", "문제 페이지 확인"],
    "답안": ["해답", "풀이", "제출한 내용"],
    "답안 제출": ["제출", "응답 작성", "답변 등록"],
    "팀장": ["대표자", "대표", "리더"],
    "팀원": ["참가자", "멤버"],
    "수정제출": ["다시 제출", "재제출", "답변 수정"],
    "참가 자격": ["출전 조건", "참여 가능자", "응시 가능 여부"],
    "대회 일정": ["대회 날짜", "일정 계획"],
    "참가 부문": ["출전 카테고리", "응시 구분"],
    "팀 구성": ["팀 만들기", "팀 편성", "팀 결성"],
    "사이트": ["홈페이지", "웹사이트"],
    "문제": ["질문", "지문", "문항"],
    "사무국으로": ["운영진에게", "운영 사무실에", "대회 운영팀에"],
    "문의": ["질문", "상담", "연락"]
}

def synonym_replace(korean_text):
    """Randomly swap dictionary terms in *korean_text* for one of their synonyms.

    Whitespace is first collapsed to single spaces. The text is then scanned
    once, left to right, for any key of ``synonym_dict``; longer keys win over
    shorter ones that they contain (e.g. "문제보기" over "문제"). Every match is
    replaced with probability 0.5 by a randomly chosen synonym. Matching on
    the whole text (rather than on whitespace-split words) is what lets
    multi-word keys such as "참가 자격" take effect, and a single pass means
    inserted synonyms are never themselves re-replaced.

    Args:
        korean_text: Text to augment; ``None`` or an empty string is allowed.

    Returns:
        The augmented text, or ``""`` when *korean_text* is empty.
    """
    if not korean_text:
        return ""

    normalized = ' '.join(korean_text.split())

    # Longest keys first so the regex alternation prefers the most specific term.
    keys = sorted(
        (key for key, synonyms in synonym_dict.items() if key and synonyms),
        key=len,
        reverse=True,
    )
    if not keys:
        return normalized
    pattern = re.compile("|".join(re.escape(key) for key in keys))

    def _maybe_swap(match):
        # Coin flip per occurrence keeps roughly half of the terms untouched.
        if random.random() < 0.5:
            return random.choice(synonym_dict[match.group(0)])
        return match.group(0)

    return pattern.sub(_maybe_swap, normalized)

In [None]:
# 8. Style conversion
def _phrase_rewriter(old_phrase, new_phrase):
    """Build a one-argument callable that replaces *old_phrase* with *new_phrase*."""
    def rewrite(q):
        return q.replace(old_phrase, new_phrase)
    return rewrite

# One rewriter per (phrase to find, phrase to insert) pair, in fixed order.
style_templates = [
    _phrase_rewriter(old_phrase, new_phrase)
    for old_phrase, new_phrase in (
        ("어떻게 되나요", "어떻게 돼요"),
        ("언제 공지되나요", "언제 나와요"),
        ("가능한가요", "할 수 있나요"),
        ("확인하고 싶습니다", "어떻게 확인해요?"),
        ("문의하시기 바랍니다", "어디에 물어보면 돼요?"),
        ("답안을 제출할 수 없습니다", "답안을 못 올리는데 왜 그런가요?"),
        ("문제보기가 되지 않습니다", "문제가 안 떠요"),
        ("참가 자격이 궁금합니다", "누가 참가할 수 있어요?"),
        ("접수는 불가합니다", "접수 안 되나요?"),
    )
]

def style_transfer(question: str, templates=style_templates, apply_prob: float = 0.6) -> str:
    """Optionally rewrite *question* with up to three randomly chosen templates.

    Args:
        question: Sentence to restyle.
        templates: Sequence of one-argument callables mapping a string to a string.
        apply_prob: The question is returned untouched whenever a uniform
            random draw exceeds this value.

    Returns:
        The question after the sampled templates were applied in sampled
        order, or the unchanged question when the draw said to skip.
    """
    skip_rewrite = random.random() > apply_prob
    if skip_rewrite:
        return question

    # Sample without replacement; never ask for more templates than exist.
    sample_size = min(len(templates), 3)
    chosen = random.sample(templates, k=sample_size)

    rewritten = question
    for rewrite in chosen:
        rewritten = rewrite(rewritten)
    return rewritten

In [None]:
# 9. Meaning-preservation filter
# Sentence-embedding model shared by the similarity helpers in this notebook
# (filter_meaning_preserved, deduplicate_by_similarity, is_similar_to_any,
# enforce_min_questions).
embed_model = SentenceTransformer('snunlp/KR-SBERT-V40K-klueNLI-augSTS')

def filter_meaning_preserved(original, candidates, threshold):
    """Keep the candidates whose embedding stays close to that of *original*.

    Args:
        original: Reference sentence.
        candidates: Iterable of candidate sentences.
        threshold: Minimum cosine similarity (inclusive) a candidate needs.

    Returns:
        The candidates with similarity >= *threshold*, in their input order.
        An empty input yields ``[]`` without invoking the embedding model.
    """
    candidates = list(candidates)
    if not candidates:
        return []

    original_emb = embed_model.encode(original, convert_to_tensor=True)
    # Encode all candidates in one batch instead of one model call each.
    candidate_embs = embed_model.encode(candidates, convert_to_tensor=True)
    # Row 0 holds the similarity of the single original against every candidate.
    similarities = util.pytorch_cos_sim(original_emb, candidate_embs)[0].tolist()

    return [c for c, sim in zip(candidates, similarities) if sim >= threshold]

In [None]:
# 10. Remove near-duplicate sentences
def deduplicate_by_similarity(sentences, threshold):
    """Greedily drop sentences that are too similar to an earlier kept one.

    Sentences are visited in order; one is kept only if its cosine similarity
    to every previously kept sentence is below *threshold*.

    Args:
        sentences: Iterable of sentences.
        threshold: Similarity (inclusive) at or above which a sentence counts
            as a duplicate of a kept one.

    Returns:
        The kept sentences in their original order. An empty input yields
        ``[]`` without invoking the embedding model.
    """
    sentences = list(sentences)
    if not sentences:
        return []

    embeddings = embed_model.encode(sentences, convert_to_tensor=True)

    # Track positions rather than texts so each kept sentence's embedding row
    # is addressed directly instead of being searched for with list.index().
    kept_indices = []
    for i in range(len(sentences)):
        is_duplicate = any(
            util.pytorch_cos_sim(embeddings[i], embeddings[j]).item() >= threshold
            for j in kept_indices
        )
        if not is_duplicate:
            kept_indices.append(i)

    return [sentences[i] for i in kept_indices]

In [None]:
# 11. Check whether a sentence is similar to any already-kept sentence
def is_similar_to_any(candidate, existing_list, threshold):
    """Report whether *candidate* reaches *threshold* similarity with any existing text.

    Args:
        candidate: Sentence to test.
        existing_list: Iterable of sentences to compare against.
        threshold: Cosine similarity (inclusive) that counts as "similar".

    Returns:
        ``True`` as soon as one text in *existing_list* is similar enough,
        otherwise ``False``.
    """
    candidate_embed = embed_model.encode(candidate, convert_to_tensor=True)

    def _similarity_to(text):
        text_embed = embed_model.encode(text, convert_to_tensor=True)
        return util.pytorch_cos_sim(candidate_embed, text_embed).item()

    # any() stops at the first hit, so later texts are not encoded needlessly.
    return any(_similarity_to(text) >= threshold for text in existing_list)

In [None]:
# 12. Meaning preservation + de-duplication
def enforce_min_questions(original, candidates, min_count):
    """Filter candidates for quality while trying to keep at least *min_count*.

    Candidates are first restricted to those with similarity >= 0.7 to
    *original*, then near-duplicates (similarity >= 0.93) are removed. If
    fewer than *min_count* sentences survive, the near-duplicates that were
    dropped are added back, closest to *original* first, until *min_count* is
    reached or the pool is exhausted.

    The backfill pool is every meaning-preserving sentence that is not already
    kept. It must not be filtered again by similarity to the kept set: each of
    those sentences was dropped precisely for being similar to a kept one, so
    such a filter always leaves the pool empty.

    Args:
        original: Reference question.
        candidates: Candidate questions (may include *original* itself).
        min_count: Desired minimum number of returned questions.

    Returns:
        A list of questions; it can be shorter than *min_count* when too few
        candidates preserve the meaning. Empty input yields ``[]``.
    """
    if not candidates:
        return []

    preserved = filter_meaning_preserved(original, candidates, threshold=0.7)
    if not preserved:
        return []

    deduped = deduplicate_by_similarity(preserved, threshold=0.93)
    if len(deduped) >= min_count:
        return deduped

    # Collect dropped sentences once each; exact repeats add nothing.
    seen = set(deduped)
    backfill = []
    for p in preserved:
        if p not in seen:
            seen.add(p)
            backfill.append(p)
    if not backfill:
        return deduped

    # Encode the reference once instead of once per sort key.
    original_emb = embed_model.encode(original, convert_to_tensor=True)
    backfill.sort(
        key=lambda x: -util.pytorch_cos_sim(
            original_emb,
            embed_model.encode(x, convert_to_tensor=True)
        ).item()
    )

    deduped.extend(backfill[:min_count - len(deduped)])
    return deduped

In [None]:
# 13. Full back-translation augmentation pipeline
def full_back_translate_with_paraphrase_style(korean_text, num_augments):
    """Produce augmented variants of *korean_text* via back-translation.

    Steps: Korean -> English translation, English paraphrasing, translation of
    each paraphrase back to Korean, then synonym replacement and style
    transfer on each result.

    Args:
        korean_text: Source question.
        num_augments: Number of paraphrases requested from the paraphraser.

    Returns:
        One augmented sentence per paraphrase, or ``[korean_text]`` when the
        initial translation to English fails.
    """
    english = gpt_translate(korean_text, source_lang='Korean', target_lang='English')
    if not english:
        return [korean_text]

    def _to_augmented_korean(paraphrase):
        back_translated = gpt_translate(paraphrase, source_lang='English', target_lang='Korean')
        # A failed back-translation falls back to the source sentence.
        base_text = back_translated if back_translated else korean_text
        return style_transfer(synonym_replace(base_text))

    english_variants = generate_paraphrases(english, num_return_sequences=num_augments)
    return [_to_augmented_korean(variant) for variant in english_variants]

In [None]:
# 14. Run the augmentation over every Q/A pair
augmented_data = []
for item in data:
    question, answer = item['Question'], item['Answer']

    # The original question joins the candidate pool together with its
    # augmented variants before filtering.
    augmented_questions = full_back_translate_with_paraphrase_style(question, num_augments=8)
    all_questions = [question, *augmented_questions]
    final_questions = enforce_min_questions(question, all_questions, 5)

    # Every surviving question is paired with the unchanged answer.
    augmented_data.extend(
        {"Question": aug_q, "Answer": answer} for aug_q in final_questions
    )

In [None]:
# 15. Save the result as JSON and download it
output_filename = "augmented_qa.json"

# ensure_ascii=False keeps the Korean text readable in the output file.
with open(output_filename, mode="w", encoding="utf-8") as out_file:
    json.dump(augmented_data, out_file, indent=4, ensure_ascii=False)

files.download(output_filename)