In [None]:
from google.colab import drive
import json
import re

In [None]:
# Mount Google Drive into the Colab filesystem; later cells read from and
# write to paths under this mount point.
mount_point = '/content/drive'
drive.mount(mount_point)

In [None]:
# Shell escape: list the top level of MyDrive (a quick check that the mount
# is visible and where the dataset folder lives).
!ls /content/drive/MyDrive/

In [None]:
# Input files: talksets-train-1.json through talksets-train-5.json, all in
# the same training folder on the mounted Drive.
training_dir = "/content/drive/MyDrive/Training-20241214T060301Z-001/Training"
file_paths = [
    f"{training_dir}/talksets-train-{shard}.json"
    for shard in range(1, 6)
]

In [None]:
# Text preprocessing: strip chat filler characters and filter out unusable sentences
def preprocess_text(text):
    """Clean one sentence, or return None when it should be dropped.

    Runs of the Korean chat jamo ㅋ, ㅎ, ㅠ and ㅜ are deleted and leading or
    trailing whitespace is stripped.

    Args:
        text: Raw sentence text.

    Returns:
        The cleaned string, or None when the input is not a string, contains
        '#' or '@', or is empty once cleaned. Callers treat None as "skip".
    """
    # A missing/non-string value (e.g. JSON null) cannot be cleaned; drop it
    # instead of letting re.sub raise TypeError and abort the whole run.
    if not isinstance(text, str):
        return None

    # Sentences containing '#' or '@' are rejected outright.
    # NOTE(review): presumably these mark masked names / mentions - confirm.
    if re.search(r"[#@]", text):
        return None

    cleaned = re.sub(r"[ㅋㅎㅠㅜ]+", "", text).strip()

    # A sentence made only of filler (e.g. "ㅋㅋㅋ") cleans to "", which would
    # otherwise be emitted as an empty training example.
    return cleaned or None

In [None]:
# Inspect the structure of the first JSON file.
# Only a summary and a truncated sample are printed: dumping the whole parsed
# file would flood the cell output and bloat the saved notebook.
with open(file_paths[0], "r", encoding="utf-8") as file:
    data = json.load(file)

preview_limit = 2000  # maximum number of characters of the sample to show

print(f"top-level type: {type(data).__name__}")
if isinstance(data, list):
    print(f"length: {len(data)}")
    sample = data[0] if data else None
elif isinstance(data, dict):
    print(f"keys: {list(data)}")
    # Show the value under the first key as a representative sample.
    sample = next(iter(data.values()), None)
else:
    sample = data

preview = json.dumps(sample, ensure_ascii=False, indent=2)
print(preview[:preview_limit])
if len(preview) > preview_limit:
    print("... (truncated)")

In [None]:
# Load every JSON file and merge the contents into one list
all_sentences = []

for file_path in file_paths:
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    if isinstance(data, list):
        # Top level is already a list of records.
        all_sentences.extend(data)
    elif isinstance(data, dict) and "sentences" in data:
        # NOTE(review): the transform step only keeps items that themselves
        # contain a "sentences" key; confirm the entries of this inner list
        # have that shape, otherwise they will all be skipped downstream.
        all_sentences.extend(data["sentences"])
    else:
        # Report unrecognised files instead of skipping them silently, so a
        # low total below can be traced back to the file that caused it.
        print(f"경고: 예상하지 못한 구조라서 건너뜁니다: {file_path} ({type(data).__name__})")

print(f"총 로드된 데이터 개수: {len(all_sentences)}")

In [None]:
# Clean every sentence and keep only the fields needed downstream
required_fields = ("text", "is_immoral", "intensity")
processed_data = []

for record in all_sentences:
    # Records without a "sentences" list contribute nothing.
    if "sentences" in record:
        for entry in record["sentences"]:
            # Skip sentences that lack any of the required fields.
            if any(field not in entry for field in required_fields):
                continue

            text = preprocess_text(entry["text"])

            # preprocess_text returns None for sentences that must be dropped.
            if text is not None:
                processed_data.append({
                    "text": text,
                    "is_immoral": entry["is_immoral"],
                    "intensity": entry["intensity"],
                })

print(f"전처리된 데이터 개수: {len(processed_data)}")

In [None]:
import random

# Dedicated, seeded generator so that re-running the notebook produces the
# same subset (and therefore the same saved file).
SAMPLING_SEED = 42
rng = random.Random(SAMPLING_SEED)

# Split the data into intensity bands.
# The first band is half-open ([1.0, 2.0)) so values between 1.9 and 2.0 are
# not lost in a gap between bands; this also matches the label printed below.
# Items with intensity below 1.0 fall in no band and are excluded.
intensity_1 = [item for item in processed_data if 1.0 <= item["intensity"] < 2.0]
intensity_2 = [item for item in processed_data if 2.0 <= item["intensity"] <= 2.9]
intensity_3 = [item for item in processed_data if item["intensity"] > 2.9]

# Reduce the dataset to 50,000 items in total
target_size = 50000

# Target count per band
size_1 = int(target_size * 0.4)  # 40%
size_2 = int(target_size * 0.3)  # 30%
size_3 = target_size - size_1 - size_2  # remainder (30%)

# Sample from each band; a band smaller than its target is taken whole, in
# which case the final total is below target_size.
sampled_1 = rng.sample(intensity_1, min(size_1, len(intensity_1)))
sampled_2 = rng.sample(intensity_2, min(size_2, len(intensity_2)))
sampled_3 = rng.sample(intensity_3, min(size_3, len(intensity_3)))

filtered_data = sampled_1 + sampled_2 + sampled_3

# Shuffle so the bands are interleaved
rng.shuffle(filtered_data)

print(f"intensity 1.0 ~ <2.0: {len(sampled_1)}개")
print(f"intensity 2.0 ~ 2.9: {len(sampled_2)}개")
print(f"intensity >2.9: {len(sampled_3)}개")
print(f"최종 데이터 개수: {len(filtered_data)}")

In [None]:
# Print the first processed record as a quick sanity check
if not processed_data:
    print("processed_data가 비어 있습니다. 데이터를 확인하세요!")
else:
    first_record = processed_data[0]
    print("첫 번째 항목:")
    print(json.dumps(first_record, ensure_ascii=False, indent=4))

In [None]:
# Destination on the mounted Drive for the sampled dataset
output_path = "/content/drive/MyDrive/processed_training_50000_data.json"

# Write the data as indented JSON; ensure_ascii=False keeps the Korean text
# readable in the file instead of \u escapes.
with open(output_path, mode="w", encoding="utf-8") as out_file:
    json.dump(filtered_data, out_file, indent=4, ensure_ascii=False)

print(f"전처리 및 병합된 데이터가 저장되었습니다: {output_path}")