In [2]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader

from tqdm import tqdm  # 진행률 표시용
tqdm.pandas()

## 1. 데이터 로딩

In [3]:
# Read NY_fake.csv and keep only the review text and the `fake` label column.
ny_fake = pd.read_csv("NY_fake.csv").loc[:, ["review_text", "fake"]]

In [4]:
ny_fake.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12419 entries, 0 to 12418
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   review_text  12419 non-null  object
 1   fake         12419 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 194.2+ KB


In [5]:
# Read NY_true.csv and keep only the review text and the `fake` label column.
ny_true = pd.read_csv("NY_true.csv").loc[:, ["review_text", "fake"]]

In [6]:
ny_true.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110433 entries, 0 to 110432
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   review_text  110433 non-null  object
 1   fake         110433 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.7+ MB


In [6]:
# Read LV_fake.csv and keep only the review text and the `fake` label column.
lv_fake = pd.read_csv("LV_fake.csv").loc[:, ["review_text", "fake"]]

In [7]:
lv_fake.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30545 entries, 0 to 30544
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   review_text  30545 non-null  object
 1   fake         30545 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 477.4+ KB


In [8]:
# Read LV_true.csv and keep only the review text and the `fake` label column.
lv_true = pd.read_csv("LV_true.csv").loc[:, ["review_text", "fake"]]

In [9]:
lv_true.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299550 entries, 0 to 299549
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   review_text  299550 non-null  object
 1   fake         299550 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 4.6+ MB


## 2. 데이터 결합

In [10]:
# Stack the four review tables into one frame with a fresh 0..n-1 index.
review_frames = [ny_fake, ny_true, lv_fake, lv_true]
df = pd.concat(review_frames, ignore_index=True)
print(df.shape)
df.head()

(452947, 2)


Unnamed: 0,review_text,fake
0,Ordered delivery and they canceled my order wi...,1
1,Shrimp Anago tendon was nothing special. No eg...,1
2,Great restaurant to dine at. I highly recommen...,1
3,Great place for Asian fusions. Love their food...,1
4,"Bensimon in the east village serves tapas, don...",1


## 3. 기본 전처리

In [14]:
import re

def basic_clean(text):
    """Lightly normalise a raw review string.

    Removes HTML-like tags, replaces runs of non-letter characters that are
    enclosed by word boundaries (stand-alone numbers, punctuation between
    words, ...) with a single space, collapses whitespace and trims both ends.

    Args:
        text: Raw review text. Non-string values (e.g. ``None`` or the float
            NaN pandas uses for a missing cell) are treated as empty.

    Returns:
        The cleaned string, without leading or trailing whitespace.
    """
    # Missing values arrive as None / float NaN when applied over a pandas
    # column; return an empty string instead of failing on `.strip()`.
    if not isinstance(text, str):
        return ''

    # 1. Trim surrounding whitespace.
    text = text.strip()

    # 2. Remove HTML tags.
    text = re.sub(r'<.*?>', '', text)

    # 3. Replace non-letter runs bounded by word boundaries with a space
    #    (drops stand-alone digits and punctuation between words).
    text = re.sub(r'\b[^a-zA-Z]+\b', ' ', text)

    # 4. Collapse any run of whitespace into a single space.
    text = re.sub(r'\s+', ' ', text)

    # 5. Trim again: step 3 can leave a space at either end when the text
    #    started or ended with a replaced run (e.g. "123 hello" -> " hello").
    return text.strip()


In [17]:
# Run basic_clean over every raw review; progress_apply displays a tqdm progress bar.
df['cleaned_text'] = df['review_text'].progress_apply(basic_clean)

100%|██████████| 452947/452947 [00:22<00:00, 20285.23it/s]


In [18]:
# Keep only reviews with more than 5 whitespace-separated words, then renumber the index.
word_counts = df['cleaned_text'].str.split().str.len()
df = df.loc[word_counts > 5].reset_index(drop=True)

In [19]:
df["fake"].value_counts()

fake
0    409917
1     41488
Name: count, dtype: int64

## 4. 데이터 샘플링

In [20]:
# Draw 40,000 rows from each class, using a fixed seed so the draw is repeatable.
class0_rows = df.loc[df['fake'] == 0]
class1_rows = df.loc[df['fake'] == 1]
df_0 = class0_rows.sample(n=40000, random_state=42)
df_1 = class1_rows.sample(n=40000, random_state=42)

# Stack both samples, shuffle all rows, then renumber the index.
df_balanced = (
    pd.concat([df_0, df_1])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [21]:
df_balanced["fake"].value_counts()

fake
1    40000
0    40000
Name: count, dtype: int64

## 5. 임베딩 추출

In [22]:
# Choose the compute device: CUDA when available, otherwise CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"[INFO] Using device: {device}")

# Load the pretrained BERT tokenizer and encoder, then move the encoder to the device.
print("[INFO] Loading BERT model and tokenizer...")
bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
bert_model = AutoModel.from_pretrained('bert-base-uncased')
bert_model = bert_model.to(device)

# Same for RoBERTa.
print("[INFO] Loading RoBERTa model and tokenizer...")
roberta_tokenizer = AutoTokenizer.from_pretrained('roberta-base')
roberta_model = AutoModel.from_pretrained('roberta-base')
roberta_model = roberta_model.to(device)

# Batched extraction of first-token ([CLS]) embeddings
def extract_cls_batch(texts, tokenizer, model, batch_size=16):
    """Return the first-token hidden state of `model` for every text.

    Texts are tokenized batch by batch (padded to the longest item in the
    batch, truncated to 512 tokens), run through `model` without gradient
    tracking, and the hidden state at sequence position 0 is collected.

    Args:
        texts: Iterable of strings (list, pandas Series, ...). It is
            materialised in iteration order, so the result is positionally
            aligned with the input regardless of any pandas index labels.
        tokenizer: Callable applied to a list of strings with
            ``return_tensors='pt'``; must return a mapping of tensors.
        model: Module whose output exposes ``last_hidden_state``.
        batch_size: Number of texts per forward pass.

    Returns:
        A list with one embedding (a list of floats) per input text, in input order.
    """
    model.eval()

    # Send inputs to the device the model actually lives on rather than
    # relying on a notebook-level `device` global that may be missing or stale.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    # DataLoader fetches items as dataset[0], dataset[1], ...; on a pandas
    # Series that is a *label* lookup, which raises or silently reorders rows
    # unless the index is a fresh RangeIndex. A plain list avoids that.
    text_list = list(texts)

    embeddings = []
    dataloader = DataLoader(text_list, batch_size=batch_size)

    print(f"[INFO] Starting CLS embedding extraction (batch_size={batch_size})...")

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Extracting [CLS] tokens"):
            encodings = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=512)
            encodings = {k: v.to(model_device) for k, v in encodings.items()}
            outputs = model(**encodings)
            # Position 0 of the sequence axis is the first ([CLS] / <s>) token.
            cls_batch = outputs.last_hidden_state[:, 0, :].cpu().tolist()
            embeddings.extend(cls_batch)

    print("[INFO] Embedding extraction complete.")
    return embeddings

[INFO] Using device: cuda
[INFO] Loading BERT model and tokenizer...
[INFO] Loading RoBERTa model and tokenizer...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Extract [CLS] embeddings with each encoder and store them as new columns.
for emb_column, emb_tokenizer, emb_model in (
    ('bert_cls', bert_tokenizer, bert_model),
    ('roberta_cls', roberta_tokenizer, roberta_model),
):
    df_balanced[emb_column] = extract_cls_batch(
        df_balanced['cleaned_text'], emb_tokenizer, emb_model, batch_size=32
    )

[INFO] Starting CLS embedding extraction (batch_size=32)...


Extracting [CLS] tokens: 100%|██████████| 2500/2500 [01:07<00:00, 37.26it/s]


[INFO] Embedding extraction complete.
[INFO] Starting CLS embedding extraction (batch_size=32)...


Extracting [CLS] tokens: 100%|██████████| 2500/2500 [01:04<00:00, 38.56it/s]

[INFO] Embedding extraction complete.





In [None]:
df_balanced.head()

In [None]:
# Write the balanced frame (text, label and both embedding columns) to emb_cls.json.
df_balanced.to_json("emb_cls.json")

In [11]:
# Read the frame stored in emb_cls.json back into a new DataFrame.
df_clean = pd.read_json("emb_cls.json")

In [9]:
import re
from bs4 import BeautifulSoup
import contractions
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


import collections
import collections.abc
# Compatibility shim: re-expose `Callable` on the top-level `collections`
# module, where it no longer exists as of Python 3.10 (it lives in
# `collections.abc`).
# NOTE(review): presumably needed by an older dependency that still looks up
# `collections.Callable` — confirm which one, and drop this once it is upgraded.
collections.Callable = collections.abc.Callable

In [5]:
pip install contractions

Defaulting to user installation because normal site-packages is not writeable
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64

In [6]:
pip install nltk

Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: nltk
Successfully installed nltk-3.9.1
Note: you may need to restart the kernel to use updated packages.


In [28]:
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading the corpus only once."""
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


# Custom preprocessing function
def preprocess_text(text):
    """Normalise a raw review into a space-joined string of stemmed tokens.

    Pipeline: strip HTML, expand contractions, replace every non-letter with
    a space, collapse whitespace, lowercase, tokenize, drop English stop
    words, and Porter-stem what remains.

    Args:
        text: Raw review text (may contain HTML markup).

    Returns:
        The stemmed tokens joined by single spaces, or ``''`` when no
        remaining token is longer than two characters (callers can use the
        empty string to drop such rows).
    """
    # 1. Remove HTML tags.
    text_no_html = BeautifulSoup(text, "html.parser").get_text()

    # 2. Expand contractions.
    expanded_text = contractions.fix(text_no_html)

    # 3. Replace every non-letter character with a space.
    text_no_specials = re.sub(r'[^a-zA-Z]', ' ', expanded_text)

    # 4. Collapse whitespace and trim the ends.
    text_clean = re.sub(r'\s+', ' ', text_no_specials).strip()

    # 5. Lowercase.
    text_lower = text_clean.lower()

    # 6. Tokenize and drop stop words. The stop-word list is fetched once and
    #    held as a set; calling stopwords.words() inside the comprehension
    #    would re-read the corpus for every single token.
    tokens = word_tokenize(text_lower)
    stop_set = _english_stopwords()
    filtered_tokens = [word for word in tokens if word not in stop_set]

    # 6.5. If no token is longer than 2 characters, mark the text for removal
    #      by returning an empty string.
    if not any(len(word) > 2 for word in filtered_tokens):
        return ''

    # 7. Stem each remaining token.
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

    # 8. Return the tokens joined by single spaces.
    return ' '.join(stemmed_tokens)

In [None]:

# Download the NLTK resources (only needed on first run): 'punkt_tab' is the
# tokenizer data used by word_tokenize, 'stopwords' provides the English stop-word list.
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')

In [29]:
df_clean

Unnamed: 0,review_text,fake,cleaned_text,bert_cls,roberta_cls
0,Awesome place to eat! Great atmosphere and fri...,1,Awesome place to eat Great atmosphere and frie...,"[0.0792026445, -0.0128625417, 0.3162086904, 0....","[-0.0814717263, 0.0890230536, -0.0438717902, -..."
1,"Great lunch date place! Food is amazing, not a...",1,Great lunch date place Food is amazing not a l...,"[-0.0047885552, 0.108762987, 0.0148257306, -0....","[-0.05344918, 0.0875567347, -0.0044471626, -0...."
2,Best takeout Thai in the area and at a good pr...,1,Best takeout Thai in the area and at a good pr...,"[-0.1299686283, 0.1433178782, -0.0055814832, -...","[-0.030115934100000002, 0.0927834213, -0.01960..."
3,we been here few times for birthdays and coupl...,1,we been here few times for birthdays and coupl...,"[-0.0104819108, -0.207980454, 0.4680296183, -0...","[-0.0079873865, 0.019187642300000002, -0.02611..."
4,Excellent food and service. Yummy Delicious!!!...,1,Excellent food and service Yummy Delicious Flo...,"[-0.3778436184, -0.2384559363, -0.0925718844, ...","[-0.0456091613, 0.0996065363, -0.0168792922, -..."
...,...,...,...,...,...
79995,Huge portions and decent taste. Wait time was ...,0,Huge portions and decent taste Wait time was t...,"[-0.4036887288, -0.34380817410000003, -0.05337...","[-0.0847916529, 0.0360725187, -0.0341092423, -..."
79996,The smell pulls you in and the drinks are fant...,1,The smell pulls you in and the drinks are fant...,"[-0.063299641, 0.1593655795, 0.1619270891, -0....","[-0.0443048216, 0.0366968066, -0.0658430830000..."
79997,One of the best Halal thai restaurants in NYC ...,1,One of the best Halal thai restaurants in NYC ...,"[-0.02582689, -0.111468941, -0.109392680200000...","[-0.0276376922, 0.0901447907, -0.0091079036, -..."
79998,I've been here a handful of times and each tim...,0,I ve been here a handful of times and each tim...,"[-0.0458812565, -0.36073127390000004, 0.279482...","[-0.0723976791, 0.0933143795, -0.0459695086, -..."


In [None]:
# Apply preprocess_text to every raw review.
# NOTE(review): this overwrites the `cleaned_text` column loaded from emb_cls.json, which is the
# text the stored bert_cls / roberta_cls embeddings were computed from — confirm that is intended.
df_clean['cleaned_text'] = df_clean['review_text'].progress_apply(preprocess_text)

In [25]:
df_clean

Unnamed: 0,cleaned_text,fake
0,this product is amazing loved it,0
1,worst experience ever do not buy,1
2,excellent quality and fast shipping,0
3,fake item very disappointed,1
4,not bad could be better,0
5,absolutely terrible would not recommend,1
