In [None]:
from google.colab import drive
import json
import re

In [None]:
# Mount Google Drive at /content/drive so the paths used below resolve.
drive.mount('/content/drive')

In [None]:
# IPython shell escape: list the contents of the Training data directory on Drive.
!ls /content/drive/MyDrive/immoral_intensity_data/Training/

In [None]:
# Directory that holds the three training shards (talksets-train-1..3.json).
_TRAINING_DIR = "/content/drive/MyDrive/immoral_intensity_data/Training"

# Absolute paths of every training shard, in shard order.
file_paths = [
    f"{_TRAINING_DIR}/talksets-train-{shard}.json" for shard in (1, 2, 3)
]

In [None]:
# Text preprocessing: strip chat filler jamo and reject unusable sentences.
# Patterns are compiled once because the function is called per sentence.
_FILLER_JAMO = re.compile(r"[ㅋㅎㅠㅜ]+")
_HASH_OR_AT = re.compile(r"[#@]")


def preprocess_text(text):
    """Clean one sentence, or return None when it should be discarded.

    Runs of the jamo ㅋ, ㅎ, ㅠ and ㅜ are deleted and surrounding whitespace
    is stripped.

    Args:
        text: The raw sentence text.

    Returns:
        The cleaned string, or None when the sentence must be skipped:
        the input is not a string, it contains ``#`` or ``@``, or nothing
        is left after cleaning.
    """
    # A non-string value (e.g. JSON null) cannot be cleaned; skip it instead
    # of letting re raise TypeError and abort the whole run.
    if not isinstance(text, str):
        return None

    # Removing filler jamo never adds or removes '#'/'@', so the rejection
    # check can run on the raw text.
    if _HASH_OR_AT.search(text):
        return None

    cleaned = _FILLER_JAMO.sub("", text).strip()

    # Filler-only or whitespace-only input would otherwise be returned as ""
    # and kept by callers as an empty sample.
    return cleaned or None

In [None]:
# Inspect the structure of the first JSON file by printing its parsed content.
first_path = file_paths[0]
with open(first_path, mode="r", encoding="utf-8") as file:
    data = json.load(file)
print(data)


In [None]:
# Load every JSON file and merge the records into one list.
def _records_in(payload):
    """Return the records held by one parsed JSON payload.

    A list payload is returned as-is; a dict payload contributes its
    "sentences" value; any other shape contributes nothing.
    """
    if isinstance(payload, list):
        return payload
    if isinstance(payload, dict) and "sentences" in payload:
        return payload["sentences"]
    return []


all_sentences = []

for file_path in file_paths:
    with open(file_path, mode="r", encoding="utf-8") as file:
        data = json.load(file)
    all_sentences.extend(_records_in(data))

print(f"총 로드된 데이터 개수: {len(all_sentences)}")

In [None]:
# Print the first element of all_sentences.
print(all_sentences[0])

In [None]:
# Preprocess each sentence and convert it to the reduced output shape.
# A sentence is used only if it carries every one of these keys.
_REQUIRED_KEYS = ("text", "is_immoral", "intensity", "types")

processed_data = []

for item in all_sentences:
    # Records without a "sentences" entry have nothing to process.
    if "sentences" not in item:
        continue

    for sentence in item["sentences"]:
        if any(key not in sentence for key in _REQUIRED_KEYS):
            continue

        cleaned_text = preprocess_text(sentence["text"])
        # preprocess_text signals "discard this sentence" with None.
        if cleaned_text is None:
            continue

        # Keep only the fields that are needed; a falsy "types" becomes [].
        processed_data.append(
            {
                "text": cleaned_text,
                "is_immoral": sentence["is_immoral"],
                "intensity": sentence["intensity"],
                "types": sentence["types"] or [],
            }
        )

print(f"전처리된 데이터 개수: {len(processed_data)}")

In [None]:
# Print the first processed item, or a notice when nothing was produced.
if not processed_data:
    print("processed_data가 비어 있습니다. 데이터를 확인하세요!")
else:
    print("첫 번째 항목:")
    print(json.dumps(processed_data[0], indent=4, ensure_ascii=False))

In [None]:
# Save the converted data as pretty-printed UTF-8 JSON.
import os

output_path = "/content/drive/MyDrive/processed_data_immoral/Training/processed_training_data.json"

# open(..., "w") does not create missing parent directories; create them
# first so the results are not lost to FileNotFoundError at the final step.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, "w", encoding="utf-8") as file:
    # ensure_ascii=False keeps non-ASCII text readable instead of \u escapes.
    json.dump(processed_data, file, ensure_ascii=False, indent=4)

print(f"전처리 및 병합된 데이터가 저장되었습니다: {output_path}")