In [1]:
import random
import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()

from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.callbacks import EarlyStopping # 검증 손실 개선되지 않으면 학습 자동 멈춤, 과적함 방지.
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error # 결과 평가
from sklearn.linear_model import LogisticRegression

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def seed_everything(seed: int=42):
  random.seed(seed)
  np.random.seed(seed)
  os.environ['PYTHONASHSEED'] = str(seed)
my_seed = 42
seed_everything(my_seed)

In [3]:
df_train = pd.read_pickle('df_train.pkl')
df_test = pd.read_pickle('df_test.pkl')

In [4]:
df_train.head()

Unnamed: 0,review_text,fake,clean_text,bert_emb,roberta_emb
48773,Best noodles we've ever had! Was told they had...,0,best noodle ever tell best noodle vegas defian...,"[-0.3053801655769348, -0.14964118599891663, -0...","[-0.06568992882966995, 0.0539405457675457, -0...."
52419,We had some amazing nachos and other food for ...,0,amaze nacho food lunch filet mignon nachos wor...,"[-0.1540858894586563, -0.14516526460647583, 0....","[-0.023452945053577423, 0.0322229228913784, -0..."
29881,Very cute place. Food was good. Got the churro...,1,cute place food good get churro dog two hot bo...,"[-0.07121089845895767, -0.03658287599682808, 0...","[-0.03706225007772446, 0.07646695524454117, -0..."
21786,Always have fresh food and is delicious. Very ...,1,always fresh food delicious good customer serv...,"[-0.18598921597003937, -0.02015671506524086, 0...","[-0.05256029963493347, 0.0839853510260582, -0...."
28594,Thanks for letting me about the free s'more sp...,1,thanks let free special appreciate jessica ser...,"[0.09011292457580566, 0.16610845923423767, 0.0...","[-0.04973651468753815, 0.07160010188817978, -0..."


In [5]:
vectorizer = TfidfVectorizer(max_features=768)

In [6]:
# 훈련 데이터 리뷰 텍스트와 라벨 추출
x_train = df_train['clean_text']
y_train = np.array(df_train['fake'])

# 테스트 데이터 리뷰 텍스트와 라벨 추출
x_test = df_test['clean_text']
y_test = np.array(df_test['fake'])

In [None]:
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.fit_transform(x_test)

In [8]:
tokenized_texts = [sentence.split() for sentence in df_train['clean_text']]

word_counts = Counter()
for sentence in tokenized_texts:
    word_counts.update(sentence)

print(len(word_counts))

39484


In [9]:
model_lr = LogisticRegression(max_iter=1000,random_state=42)

In [10]:
model_lr.fit(x_train, y_train)

In [11]:
# 테스트 데이터 예측 및 이진 분류 임계값 적용
y_pred_binary = (model_lr.predict(x_test) >= 0.5).astype(int)

# 모델 성능 평가 지표 계산 및 출력
metrics = {
    "Accuracy": accuracy_score(y_test, y_pred_binary),
    "Precision": precision_score(y_test, y_pred_binary),
    "Recall": recall_score(y_test, y_pred_binary),
    "F1 Score": f1_score(y_test, y_pred_binary),
}

# 성능 지표 출력
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")


Accuracy: 0.6778
Precision: 0.7132
Recall: 0.5947
F1 Score: 0.6486
