<a href="https://colab.research.google.com/github/jaejunchoe/HAIDS-Lab/blob/main/Upload_preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so files stored on Drive can be
# accessed through the local filesystem of this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.tokenize import WordPunctTokenizer

# File paths for the Google Colab environment (all under the mounted Drive).
# Gzipped JSON-lines review file passed to process_dataset as `json_path`.
dataset_path = '/content/drive/MyDrive/IDS/amaxon reviews 2023/dataset/전처리완료_Subscription_Boxes_Reviews.json.gz'
# Stop-word list, read line by line inside process_dataset.
stopwords_path = '/content/drive/MyDrive/IDS/amaxon reviews 2023/dataset/stopwords.txt'
# Punctuation list, read line by line inside process_dataset.
punctuations_path = '/content/drive/MyDrive/IDS/amaxon reviews 2023/dataset/punctuations.txt'
# Output directory that receives train.csv / valid.csv / test.csv.
save_dir = '/content/drive/MyDrive/IDS/amaxon reviews 2023/Transnet/T2'

def process_dataset(json_path, select_cols, train_rate, csv_path,
                    stopwords_file=None, punctuations_file=None):
    """Clean a JSON-lines review file and save train/valid/test CSV splits.

    Steps: load the file, keep the four selected columns, drop rows whose
    review is not a non-empty string, normalise the review text (lowercase,
    punctuation replaced by spaces, stop words removed), drop reviews that
    end up empty, map user and item identifiers to integer group numbers,
    split the rows and write ``train.csv``, ``valid.csv`` and ``test.csv``
    (no header, no index) into ``csv_path``.

    Args:
        json_path: Path to a JSON-lines file; read as gzip when the path
            ends with ``gz``.
        select_cols: List of four source column names, in the order
            user, item, review text, rating.
        train_rate: Fraction of rows placed in the training split. The
            remainder is divided equally between validation and test.
        csv_path: Directory for the CSV files; created if missing.
        stopwords_file: Optional path to a one-word-per-line stop-word
            file. Defaults to the module-level ``stopwords_path``.
        punctuations_file: Optional path to a one-entry-per-line
            punctuation file. Defaults to the module-level
            ``punctuations_path``.

    Returns:
        Tuple ``(train, valid, test)`` of DataFrames with columns
        ``userID``, ``itemID``, ``review``, ``rating``.
    """
    print('#### Read the json file...')
    if json_path.endswith('gz'):
        df = pd.read_json(json_path, lines=True, compression='gzip')
    else:
        df = pd.read_json(json_path, lines=True)

    # Work on an independent copy so the column assignments below never
    # target a view of the raw frame.
    df = df[select_cols].copy()
    df.columns = ['userID', 'itemID', 'review', 'rating']  # unify the names

    # Fall back to the module-level paths when no explicit file is given.
    if stopwords_file is None:
        stopwords_file = stopwords_path
    if punctuations_file is None:
        punctuations_file = punctuations_path

    with open(stopwords_file, encoding='utf-8') as f:
        # Tokens never contain whitespace, so surrounding blanks on a line
        # could only prevent a match; strip them and ignore blank lines.
        stop_words = {line.strip() for line in f.read().splitlines()}
        stop_words.discard('')
    with open(punctuations_file, encoding='utf-8') as f:
        # A blank line would yield '' and str.replace('', ' ') inserts a
        # space between every character, so empty entries are skipped.
        # Longest-first ordering makes overlapping entries deterministic.
        punctuations = sorted(
            {line for line in f.read().splitlines() if line},
            key=lambda p: (-len(p), p),
        )

    # Build the tokenizer once instead of once per review.
    tokenizer = WordPunctTokenizer()

    def clean_review(review):
        """Lowercase, strip punctuation and stop words, re-join with spaces."""
        review = review.lower()
        for p in punctuations:
            review = review.replace(p, ' ')
        tokens = tokenizer.tokenize(review)
        return ' '.join(word for word in tokens if word not in stop_words)

    # Keep only rows whose review is a non-empty string.
    has_text = df['review'].map(
        lambda x: isinstance(x, str) and len(x) > 0
    ).astype(bool)
    df = df[has_text].copy()
    df['review'] = df['review'].apply(clean_review)

    # A review made only of punctuation/stop words is now ''; pandas' default
    # CSV reader would load it back as a missing value, so drop it here.
    df = df[(df['review'] != '').astype(bool)].copy()

    # Map user/item to numbers only after filtering, so the numbering has
    # no gaps left by removed rows.
    df['userID'] = df.groupby('userID').ngroup()
    df['itemID'] = df.groupby('itemID').ngroup()

    # Hold out (1 - train_rate) of the rows, then halve that into valid/test.
    train, valid = train_test_split(df, test_size=1 - train_rate, random_state=3)
    valid, test = train_test_split(valid, test_size=0.5, random_state=4)

    # Create the output directory and write the splits.
    os.makedirs(csv_path, exist_ok=True)
    train.to_csv(os.path.join(csv_path, 'train.csv'), index=False, header=False)
    valid.to_csv(os.path.join(csv_path, 'valid.csv'), index=False, header=False)
    test.to_csv(os.path.join(csv_path, 'test.csv'), index=False, header=False)

    print(f'#### Split and saved dataset as csv: train {len(train)}, valid {len(valid)}, test {len(test)}')
    print(f'#### Total: {len(df)} reviews, {df["userID"].nunique()} users, {df["itemID"].nunique()} items.')
    return train, valid, test


# Run directly when executed as a script / Colab cell.
if __name__ == '__main__':
    # Source columns, in the order: user, item, review text, rating.
    select_cols = ['user_id', 'parent_asin', 'text', 'rating']
    # Fraction of the data used for training.
    train_rate = 0.8

    # Process the dataset and write the CSV splits to save_dir.
    process_dataset(
        json_path=dataset_path,
        select_cols=select_cols,
        train_rate=train_rate,
        csv_path=save_dir,
    )


#### Read the json file...
#### Split and saved dataset as csv: train 12795, valid 1599, test 1600
#### Total: 15994 reviews, 15225 users, 641 items.
