In [None]:
!pip install textattack==0.3.5

In [None]:
!pip install transformers torch sentencepiece

In [None]:
# 3. JSON 데이터 업로드
from google.colab import files
import json

# files.upload() returns a mapping of {saved filename: file bytes}.
uploaded = files.upload()

# Parse the bytes that were just uploaded instead of assuming the file landed
# on disk as "open_problem.json": a differently named upload (or one that was
# renamed because a file with the same name already existed) would otherwise
# fail or silently re-read a stale copy.
if uploaded:
    if "open_problem.json" in uploaded:
        uploaded_name = "open_problem.json"
    else:
        uploaded_name = next(iter(uploaded))
    raw_data = json.loads(uploaded[uploaded_name].decode("utf-8"))
else:
    # Nothing was uploaded (e.g. the dialog was cancelled): fall back to a
    # file already present in the working directory.
    with open("open_problem.json", "r", encoding="utf-8") as f:
        raw_data = json.load(f)

In [None]:
# 라이브러리 import
import json
import os
import random
import time

import requests
from google.colab import files
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# 1. Papago API credentials
# SECURITY: credentials should not live in source control. Set the
# PAPAGO_CLIENT_ID / PAPAGO_CLIENT_SECRET environment variables to override
# the values below. The literals are kept only so existing runs keep working;
# they should be rotated and removed.
client_id = os.environ.get("PAPAGO_CLIENT_ID", '64yuz72rjr')
client_secret = os.environ.get(
    "PAPAGO_CLIENT_SECRET", 'mpxNeA4kw4gIEe2TgfRDfnWSzeWBJ1Sz0FfLkUDW'
)

def papago_translate(text, source_lang, target_lang, timeout=10):
    """Translate *text* through the Papago NMT HTTP endpoint.

    Args:
        text: The string to translate.
        source_lang: Source language code sent as the ``source`` form field.
        target_lang: Target language code sent as the ``target`` form field.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The translated string, or ``None`` when there is nothing to translate
        or the request fails for any reason (a message is printed on failure).
    """
    # Empty input cannot be translated; skip the API call entirely.
    if not text or not text.strip():
        return None

    url = "https://naveropenapi.apigw.ntruss.com/nmt/v1/translation"
    headers = {
        "X-NCP-APIGW-API-KEY-ID": client_id,
        "X-NCP-APIGW-API-KEY": client_secret
    }
    data = {
        "source": source_lang,
        "target": target_lang,
        "text": text
    }

    try:
        # Without a timeout a stalled connection would block the whole run.
        response = requests.post(url, headers=headers, data=data, timeout=timeout)
    except requests.RequestException as exc:
        print(f"번역 오류 발생: {exc}")
        return None

    if response.status_code != 200:
        print(f"번역 오류 발생: {response.status_code}, 응답: {response.text}")
        return None

    try:
        return response.json()['message']['result']['translatedText']
    except (ValueError, KeyError, TypeError) as exc:
        # A 200 response whose body is not the expected JSON shape.
        print(f"번역 오류 발생: {exc}, 응답: {response.text}")
        return None

# 2. Load the HuggingFace T5 paraphraser
model_name = "ramsrigouthamg/t5_paraphraser"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Wrap the model and tokenizer in a text2text-generation pipeline so callers
# can pass plain strings.
paraphraser = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokenizer,
)

def generate_paraphrases(text, num_return_sequences=3):
    """Generate sampled paraphrases of *text* with the module-level pipeline.

    Args:
        text: The sentence to paraphrase.
        num_return_sequences: How many paraphrases to sample.

    Returns:
        A list of generated strings. The list is empty when *text* is
        ``None``/blank or when fewer than one sequence is requested.
    """
    # A failed upstream translation yields None; formatting it into the prompt
    # would paraphrase the literal word "None", so bail out instead.
    if not text or not text.strip() or num_return_sequences < 1:
        return []

    input_text = f"paraphrase: {text} </s>"
    outputs = paraphraser(
        input_text,
        max_length=100,
        num_return_sequences=num_return_sequences,
        do_sample=True,
        top_k=100,
        top_p=0.92,
        temperature=0.8
    )
    return [output['generated_text'] for output in outputs]

# 3. Synonym replacement table
# Maps a Korean word to the candidate replacements used for augmentation.
# Insertion order matters: lookups walk the keys in this order.
synonym_dict = {
    # General vocabulary
    "국가": ["나라", "국민국"],
    "수도": ["중심 도시", "행정 중심지"],
    "발명": ["개발", "고안"],
    "역사": ["과거", "연혁"],
    "과학": ["자연 과학", "이론 체계"],
    "환경": ["생태계", "자연환경"],
    "경제": ["재정", "산업"],
    # Arithmetic / mathematics
    "합": ["덧셈 결과", "총합"],
    "곱": ["곱셈 결과", "곱한 값"],
    "차이": ["뺄셈 결과", "차분"],
    "비율": ["비", "비율 값"],
    "각도": ["회전량", "각도값"],
    "수열": ["수의 나열", "순서열"],
    "방정식": ["수식", "연립식"],
    # Logic / reasoning
    "참": ["진리", "사실"],
    "거짓": ["허위", "오류"],
    "전제": ["가정", "조건"],
    "결론": ["결과", "최종 판단"],
    "증명": ["논증", "입증"],
    "명제": ["주장", "문장"],
    "추론": ["유추", "논리적 도출"],
}

# Endings that may follow a dictionary word inside one whitespace-separated
# token (common particles, plural and copula forms). The empty string covers
# the bare word.
_SYNONYM_SUFFIXES = frozenset({
    "", "은", "는", "이", "가", "을", "를", "의", "에", "에서", "에게",
    "와", "과", "도", "만", "로", "으로", "들", "들은", "들이", "들을", "들의",
    "이다", "입니다", "인",
})

# Punctuation ignored at either edge of a token when matching.
_EDGE_PUNCTUATION = ".,!?;:\"'()[]{}<>~…·"


def _replace_token(word, table, replace_prob):
    """Return *word* with its leading stem swapped for a synonym, or unchanged."""
    core = word.strip(_EDGE_PUNCTUATION)
    if not core:
        return word
    lead_len = len(word) - len(word.lstrip(_EDGE_PUNCTUATION))
    lead = word[:lead_len]
    trail = word[lead_len + len(core):]

    for key, candidates in table.items():
        if not key or not candidates:
            continue
        suffix = core[len(key):]
        # Only treat the token as "key + particle"; a plain substring test
        # would rewrite unrelated words such as the verb ending in "합니다".
        if not core.startswith(key) or suffix not in _SYNONYM_SUFFIXES:
            continue
        # A failed coin flip moves on to the next matching key.
        if random.random() < replace_prob:
            return lead + random.choice(candidates) + suffix + trail
    return word


def synonym_replace(korean_text, synonyms=None, replace_prob=0.5):
    """Randomly replace known words in *korean_text* with synonyms.

    The text is split on whitespace. A token is replaced only when it consists
    of a table key optionally followed by a recognised particle (surrounding
    punctuation is preserved).

    Known limitations: matching is heuristic, not morphological, so a word
    that happens to look like "key + particle" can still be rewritten, and the
    particle is not re-inflected to agree with the synonym.

    Args:
        korean_text: The sentence to augment.
        synonyms: Mapping of word -> list of replacements. Defaults to the
            module-level ``synonym_dict``.
        replace_prob: Probability of applying a replacement to each match.

    Returns:
        The augmented sentence with tokens joined by single spaces. Falsy
        input (``None`` or ``""``) is returned unchanged.
    """
    if not korean_text:
        return korean_text

    table = synonym_dict if synonyms is None else synonyms
    return ' '.join(
        _replace_token(word, table, replace_prob)
        for word in korean_text.split()
    )

# 4. Style transfer
def style_transfer(text, apply_prob=0.6):
    """Randomly rewrite formal Korean sentence endings as casual ones.

    With probability ``1 - apply_prob`` the text is returned untouched.
    Otherwise each formal ending found in the text is replaced (all
    occurrences at once) by one randomly chosen casual alternative.

    NOTE(review): some alternatives change tense as well as register
    (e.g. "했습니다." may become "해요."); confirm this is intended.
    """
    skip = random.random() > apply_prob
    if skip:
        return text

    # Checked in this order; the period-terminated form comes before its
    # bare counterpart.
    ending_pairs = (
        ("했습니다.", ("해요.", "했어요.")),
        ("했습니다", ("해요", "했어요")),
        ("한다.", ("해.", "했어.")),
        ("합니다.", ("해요.", "했어요.")),
        ("합니다", ("해요", "했어요")),
        ("있습니다.", ("있어요.", "있어.")),
        ("있습니다", ("있어요", "있어")),
    )

    styled = text
    for formal_ending, casual_options in ending_pairs:
        if formal_ending not in styled:
            continue
        styled = styled.replace(formal_ending, random.choice(casual_options))
    return styled

# 5. End-to-end augmentation pipeline
def full_back_translate_with_paraphrase_style(korean_text, num_augments=5):
    """Augment one Korean sentence via back-translation and rewriting.

    Steps: translate ko -> en, sample ``num_augments`` English paraphrases,
    translate each back en -> ko, then apply synonym replacement and style
    transfer.

    Args:
        korean_text: The source Korean sentence.
        num_augments: Number of paraphrases requested from the paraphraser.

    Returns:
        A list of augmented Korean sentences. It may hold fewer than
        ``num_augments`` items (or be empty) when a translation step fails.
    """
    if not korean_text:
        return []

    en_text = papago_translate(korean_text, source_lang='ko', target_lang='en')
    time.sleep(0.2)  # pause after every API call
    # papago_translate returns None on failure; there is nothing to paraphrase.
    if not en_text:
        return []

    paraphrases = generate_paraphrases(en_text, num_return_sequences=num_augments)
    time.sleep(0.2)

    results = []
    for p in paraphrases:
        ko_text = papago_translate(p, source_lang='en', target_lang='ko')
        time.sleep(0.2)

        # Skip paraphrases whose back-translation failed rather than passing
        # None into the string-processing steps.
        if not ko_text:
            continue

        ko_text_synonym = synonym_replace(ko_text)
        final_text = style_transfer(ko_text_synonym)

        results.append(final_text)

    return results

# 7. Run the augmentation
augmented_data = []

for item in raw_data:
    question, answer = item['Question'], item['Answer']

    # Produce several augmented variants of the question; each one is paired
    # with the original answer.
    augmented_questions = full_back_translate_with_paraphrase_style(
        question,
        num_augments=5,
    )
    augmented_data.extend(
        [{"Question": aug_q, "Answer": answer} for aug_q in augmented_questions]
    )

# 8. Save the augmented data
output_filename = "augmented_open_problem.json"

# ensure_ascii=False keeps the Korean text human-readable in the output file.
with open(output_filename, mode="w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(augmented_data, ensure_ascii=False, indent=4))

# 9. Download the result straight from Colab
files.download(output_filename)