# Data preparation

데이터 전처리 코드

In [None]:
import re
import json
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the tab-separated corpus of toxic sentences and their neutral rewrites.
df = pd.read_csv('data/paradetox/paradetox.tsv', sep='\t')

# Remove exact duplicate rows.
df = df.drop_duplicates()
# Drop rows without a toxic sentence. read_csv parses empty fields as NaN by
# default, and `NaN != ''` evaluates to True, so the string comparison alone
# never removes them; the notna() check is what actually drops missing rows.
# .copy() yields an independent frame so later column assignments do not
# operate on a filtered view of the original.
df = df[df['toxic'].notna() & (df['toxic'] != '')].copy()

# function for text cleaning
def clean_text(text):
    """Strip special characters and collapse whitespace in a sentence.

    Word characters, whitespace and the punctuation ``. , ! ? -`` are kept;
    every other character is removed. Runs of whitespace are then collapsed
    to a single space and the result is trimmed at both ends.

    Args:
        text: The sentence to clean. Values that are not ``str`` (for
            example the NaN pandas uses for missing cells, or ``None``)
            are returned unchanged.

    Returns:
        The cleaned string, or ``text`` itself when it is not a ``str``.
    """
    # Explicit type check instead of a bare ``except:``: only the expected
    # "not a string" case is passed through, and unrelated errors (or a
    # KeyboardInterrupt) are no longer silently swallowed.
    if not isinstance(text, str):
        return text

    # Remove special characters (basic punctuation is kept).
    text = re.sub(r'[^\w\s.,!?-]', '', text)
    # Collapse repeated whitespace into a single space.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

# Clean the toxic sentence and each neutral rewrite column.
for column in ('toxic', 'neutral1', 'neutral2', 'neutral3'):
    df[column] = df[column].apply(clean_text)

# Filter out sentences that are too short or too long (cleaned length must be
# strictly between 5 and 500 characters). Rows whose 'toxic' value is missing
# have a NaN length, fail both comparisons and are dropped as well.
toxic_length = df['toxic'].str.len()
# .copy() gives an independent frame, so adding the 'references' column below
# does not trigger pandas' SettingWithCopyWarning.
df = df[(toxic_length > 5) & (toxic_length < 500)].copy()

# Gather the non-missing values of every column except 'toxic' into one list
# per row. Selecting by name (rather than by position) keeps this correct
# even if 'toxic' is not the first column of the file.
reference_columns = [name for name in df.columns if name != 'toxic']
df['references'] = df[reference_columns].apply(lambda r: r.dropna().tolist(), axis=1)

# 80% train, then the remaining 20% is halved into validation and test.
train, valid_test = train_test_split(df, test_size=0.2, random_state=426)
valid, test = train_test_split(valid_test, test_size=0.5, random_state=426)

print(len(train), len(valid), len(test))


def _to_json_records(frame):
    """Return ``frame`` as a list of row dicts with NaN replaced by None.

    json.dump would otherwise write missing cells as the bare token ``NaN``,
    which is not valid JSON; None is serialized as ``null`` instead.
    """
    return [
        {
            key: None if isinstance(value, float) and value != value else value
            for key, value in record.items()
        }
        for record in frame.to_dict(orient='records')
    ]


train_dict = _to_json_records(train)
valid_dict = _to_json_records(valid)
test_dict = _to_json_records(test)

# ensure_ascii=False writes non-ASCII characters verbatim, so the file
# encoding is fixed to UTF-8 instead of depending on the platform default.
for path, records in (
    ('train.json', train_dict),
    ('valid.json', valid_dict),
    ('test.json', test_dict),
):
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(records, f, indent=4, ensure_ascii=False)