In [None]:
# Mount Google Drive at /content/drive so the notebook can read and write
# the project folders stored there (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import glob
import os

In [None]:
!pip install kiwipiepy

## 전처리: 클렌징, 토큰화, 불용어 제거, 어간 추출 및 기본형 복원 (감정 분석용)

In [None]:
import pandas as pd
import glob
import os
from kiwipiepy import Kiwi
import re
from tqdm import tqdm
from kiwipiepy import Kiwi
# Shared Kiwi analyzer instance; later cells register user words on it and
# use it to tokenize the review text.
kiwi = Kiwi()

# Folder containing the input review CSV files, and the folder where the
# processed copies (with token columns added) are written.
INPUT_DIR = '/content/drive/My Drive/project/리뷰DB'
OUTPUT_DIR = '/content/drive/My Drive/project/리뷰DB_tokenized'

# Name of the CSV column that holds the review text to tokenize.
REVIEW_COLUMN = 'review_text'

In [None]:
import re

# Hyphen-like characters (Unicode hyphens/dashes, the minus sign and ASCII "-")
# that are collapsed to a plain "-" before phrase matching.
HYPHENS = r"[\u2010\u2011\u2012\u2013\u2014\u2212\-]"

# (pattern, replacement) pairs that join compound terms written with optional
# whitespace and/or a single hyphen between their parts.
# Order matters: the generic "브랜딩" join must run before the longer compounds
# that contain it, so that e.g. "퍼스널 브 랜딩" is first repaired to
# "퍼스널 브랜딩" and then joined to "퍼스널브랜딩".
PROTECT_PHRASES = [
    (r"자기\s*[-]?\s*계발", "자기계발"),
    (r"브\s*[-]?\s*랜딩", "브랜딩"),
    (r"퍼스널\s*[-]?\s*브랜딩", "퍼스널브랜딩"),
    (r"리\s*[-]?\s*브랜딩", "리브랜딩"),
]

def normalize_phrases(text: str) -> str:
    """Normalize hyphen variants and join the protected compound phrases.

    Args:
        text: Raw review text. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as missing.

    Returns:
        The text with every hyphen-like character replaced by "-" and every
        phrase in ``PROTECT_PHRASES`` rewritten to its joined form. Missing
        input yields an empty string instead of the literal "None"/"nan".
    """
    # `text != text` is only true for NaN; this avoids a pandas/numpy dependency here.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = re.sub(HYPHENS, "-", str(text))
    for pat, repl in PROTECT_PHRASES:
        text = re.sub(pat, repl, text)
    return text

In [None]:
# Words that should not be split by the tokenizer: register each one in the
# Kiwi user dictionary as a common noun (NNG).
_USER_WORDS = (
    "자기 계발", "자기계발", "퍼스널브랜딩", "리브랜딩",
    "브랜딩", "강추", "비추", "최애", "흔한 남매", "흔한남매",
)
for user_word in _USER_WORDS:
    kiwi.add_user_word(user_word, "NNG")

In [None]:
# 불용어(Stopwords) 리스트 정의
stopwords = {
    '책','작가','저자','구매','주문', '배송', '도서', '리뷰','서평','발송',
    'yes24','교보문고' '내용','포장','이야기',
    '읽다','읽히','그것','무엇',
    '그', '이', '저', '것', '수', '등', '들','때', '거', '해서',
    '하지만', '그리고', '그래서',
    '이다', '보다','보는',
    '그냥','사실','개인적으로','솔직히',
    '왠만해선','정말로'
}

In [None]:
# POS tags kept as content tokens: common/proper nouns (NNG, NNP), verbs (VV),
# adjectives (VA), roots (XR), general adverbs (MAG) and foreign-alphabet
# words (SL).
_CONTENT_TAGS = frozenset({'NNG', 'NNP', 'VV', 'VA', 'XR', 'MAG', 'SL'})

def extract_quad_tokens(text):
    """Tokenize one review and return four parallel token lists.

    The text is normalized with ``normalize_phrases``, stripped of every
    character that is not a Hangul syllable, ASCII letter, digit or
    whitespace, and then analyzed with the shared ``kiwi`` instance.

    Args:
        text: Raw review text.

    Returns:
        A 4-tuple ``(tokens_surface, tokens_lemma, sentiment_tokens,
        sentiment_lemmas)``:

        * ``tokens_surface`` / ``tokens_lemma``: surface forms and lemmas of
          every token whose POS tag is in ``_CONTENT_TAGS``.
        * ``sentiment_tokens`` / ``sentiment_lemmas``: the same tokens minus
          those whose surface form or lemma is in ``stopwords``.

        Four empty lists are returned when nothing is left after cleaning or
        when the tokenizer raises.
    """
    text = normalize_phrases(text)

    # Removed characters are deleted, not replaced by a space, so words that
    # were separated only by punctuation become adjacent.
    text_cleaned = re.sub(r'[^가-힣A-Za-z0-9\s]', '', text)
    if not text_cleaned.strip():
        return [], [], [], []

    try:
        res = kiwi.tokenize(text_cleaned)
    except Exception:
        # Best-effort: a review the tokenizer cannot handle yields no tokens.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupts working.
        return [], [], [], []

    tokens_surface, tokens_lemma = [], []
    sentiment_tokens, sentiment_lemmas = [], []

    for t in res:
        if t.tag not in _CONTENT_TAGS:
            continue

        tokens_surface.append(t.form)
        tokens_lemma.append(t.lemma)

        # Stopwords stay in the general lists but are excluded from the
        # sentiment lists.
        if t.lemma in stopwords or t.form in stopwords:
            continue

        sentiment_tokens.append(t.form)
        sentiment_lemmas.append(t.lemma)

    return tokens_surface, tokens_lemma, sentiment_tokens, sentiment_lemmas

In [None]:
def process_review_db(input_dir, output_dir, target_col):
    """Tokenize the review column of every CSV in a folder and save the results.

    Each ``*.csv`` file directly inside ``input_dir`` is read, the text in
    ``target_col`` is passed through ``extract_quad_tokens``, and the file is
    written under the same name to ``output_dir`` with four extra columns:
    ``tokens``, ``tokens_lemma``, ``sentiment_tokens`` and
    ``sentiment_lemmas``. A file without ``target_col`` is copied unchanged
    (a notice is printed). A file that raises during processing is reported
    and skipped; the remaining files are still processed.

    Note: the list-valued columns are stored in the CSV as text, so they
    must be parsed back into lists when the file is reloaded.

    Args:
        input_dir: Folder containing the source CSV files.
        output_dir: Folder for the processed CSV files; created if missing.
        target_col: Name of the column holding the review text.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    print("[Start] 리뷰 DB 전처리 및 토큰화를 시작합니다...\n")

    if os.path.exists(output_dir):
        print(f"결과 폴더 확인: {output_dir}")
    else:
        # exist_ok guards against the folder appearing between check and creation.
        os.makedirs(output_dir, exist_ok=True)

    # Sorted so the processing order is the same on every run.
    csv_files = sorted(glob.glob(os.path.join(input_dir, '*.csv')))

    if not csv_files:
        print(f" '{input_dir}' 경로에 CSV 파일이 없습니다.")
        return

    print(f"총 {len(csv_files)}개의 파일을 처리합니다.")

    # Output column names, in the order extract_quad_tokens returns its lists.
    output_cols = ('tokens', 'tokens_lemma', 'sentiment_tokens', 'sentiment_lemmas')
    failed_files = []

    for file_path in tqdm(csv_files, desc="Processing Files"):
        filename = os.path.basename(file_path)
        save_path = os.path.join(output_dir, filename)

        try:
            df = pd.read_csv(file_path)

            if target_col in df.columns:
                # Missing reviews become empty strings, which tokenize to empty lists.
                df[target_col] = df[target_col].fillna('')

                results = df[target_col].apply(extract_quad_tokens)

                for idx, col in enumerate(output_cols):
                    df[col] = [quad[idx] for quad in results]
            else:
                # Make the pass-through visible so a wrong column name is noticed.
                print(f"'{filename}': '{target_col}' 컬럼이 없어 토큰화 없이 그대로 저장합니다.")

            df.to_csv(save_path, index=False, encoding='utf-8-sig')

        except Exception as e:
            failed_files.append(filename)
            print(f"Error processing {filename}: {e}")

    # Only claim success when no file failed.
    if failed_files:
        print(f"\n작업이 끝났지만 {len(failed_files)}개 파일 처리에 실패했습니다: {failed_files}")
    else:
        print("\n모든 작업이 성공적으로 끝났습니다!")

# Run the pipeline with the input/output folders and review column configured above.
if __name__ == "__main__":
    process_review_db(INPUT_DIR, OUTPUT_DIR, REVIEW_COLUMN)