In [6]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import layers, models, callbacks


## Word2Vec

In [7]:
# === 1. Text preprocessing ===
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def preprocess_text(text, stop_words=None, tokenizer=None):
    """Lowercase, clean, tokenize and stopword-filter one piece of text.

    Args:
        text: The raw string. ``None`` and float NaN (missing values) are
            treated as empty text.
        stop_words: Optional iterable of tokens to drop. Defaults to NLTK's
            English stopword list.
        tokenizer: Optional callable mapping a string to a list of tokens.
            Defaults to ``nltk.tokenize.word_tokenize``.

    Returns:
        A list of lowercase tokens with the stopwords removed.
    """
    # Missing reviews would otherwise crash on ``.lower()``; NaN is the only
    # float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []

    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)  # keep only letters and whitespace

    tokenize = word_tokenize if tokenizer is None else tokenizer
    # Resolve the stopwords once per call (cached across calls) as a set, so
    # the filter below is a constant-time lookup per token.
    if stop_words is None:
        stop_set = _english_stopwords()
    else:
        stop_set = frozenset(stop_words)

    return [t for t in tokenize(text) if t not in stop_set]

In [None]:
# === 2. Load the data and tokenize each review ===
df = pd.read_json("emb_cls.json")
# Store the cleaned token list of every review in a new "tokens" column.
df["tokens"] = df["review_text"].map(preprocess_text)

In [None]:
# Preview the first five rows of the DataFrame.
df.head(n=5)

In [None]:
# === 3. Train Word2Vec ===
# Fit 100-dimensional embeddings on the tokenized reviews; tokens seen fewer
# than min_count times are left out of the vocabulary.
w2v_model = Word2Vec(
    sentences=df["tokens"],
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)
word_vectors = w2v_model.wv  # the trained token -> vector lookup
vocab = word_vectors.key_to_index  # token -> row index in the vector table

In [None]:
# === 4. Tokenizer and integer sequences ===
max_len = 100  # every review is padded or truncated to this many tokens

tokenizer = Tokenizer()
tokenizer.fit_on_texts(df["tokens"])
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(df["tokens"])

# "pre" is the Keras default for both options; it is spelled out so it is clear
# that zeros go at the front and long reviews keep their last max_len tokens.
X_seq = pad_sequences(
    sequences,
    maxlen=max_len,
    padding="pre",
    truncating="pre",
)
y = df["fake"].values

In [None]:
# === 5. Build the embedding matrix ===
# Take the width from the trained vectors instead of repeating the literal 100,
# so the matrix cannot drift out of sync with Word2Vec's vector_size.
embedding_dim = word_vectors.vector_size
# Keras word indices start at 1, so row 0 is left for the padding value.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

# Copy the Word2Vec vector of every tokenizer word the model knows; words it
# does not know keep an all-zero row.
for word, i in word_index.items():
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]

In [None]:
# Sanity check: prints (number of rows, embedding width).
print(np.shape(embedding_matrix))

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import layers, models, callbacks

from tensorflow.keras.models import Model
from keras.layers import Input, Embedding, SimpleRNN, Dense, Dropout

from tensorflow.keras.optimizers import Adam # 학습률 조절
from tensorflow.keras.callbacks import EarlyStopping

from tensorflow.keras.callbacks import EarlyStopping # 검증 손실 개선되지 않으면 학습 자동 멈춤, 과적함 방지.
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error # 결과 평가
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# === 6. RNN model ===
def build_rnn_model(rnn_units=128):
    """Build and compile a SimpleRNN binary classifier on frozen embeddings.

    Reads the notebook-level ``max_len`` and ``embedding_matrix`` defined in
    earlier cells.

    Args:
        rnn_units: Number of units in the SimpleRNN layer. Defaults to 128.

    Returns:
        The compiled Keras model (Adam optimizer, binary cross-entropy loss,
        accuracy metric). Its summary is printed as a side effect.
    """
    # Size the layer from the matrix itself so the two can never disagree.
    vocab_size, vector_dim = embedding_matrix.shape

    input_layer = layers.Input(shape=(max_len,))
    # Load the pre-trained vectors through a Constant initializer and keep
    # them frozen. The deprecated `input_length` argument is dropped because
    # the Input layer already fixes the sequence length.
    embedding_layer = layers.Embedding(
        input_dim=vocab_size,
        output_dim=vector_dim,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False,
    )(input_layer)

    # Use the tensorflow.keras SimpleRNN, like every other layer here, rather
    # than the one imported from the standalone `keras` package.
    x = layers.SimpleRNN(
        rnn_units, return_sequences=False, name='Basic_RNN_1'
    )(embedding_layer)
    output = layers.Dense(1, activation='sigmoid')(x)

    model = models.Model(inputs=input_layer, outputs=output)
    model.summary()
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

### 모델 학습

In [None]:
def train_model(X, y, epochs=1, batch_size=128, patience=5, test_size=0.2,
                threshold=0.5, seed=42):
    """Train the RNN on a stratified split and print validation metrics.

    Args:
        X: Padded integer sequences, one row per sample.
        y: Binary labels aligned with ``X``.
        epochs: Maximum number of training epochs. Defaults to 1; note that
            early stopping can only take effect when ``epochs`` is larger
            than ``patience``.
        batch_size: Mini-batch size passed to ``fit``.
        patience: Epochs without ``val_loss`` improvement before stopping.
        test_size: Fraction of the data held out for validation.
        threshold: Probability at or above which a sample is predicted as 1.
        seed: Seed for the split and for TensorFlow's global random state;
            ``None`` leaves both unseeded. Seeding makes runs more repeatable
            but does not by itself guarantee bit-identical results.

    Returns:
        The trained Keras model.
    """
    if seed is not None:
        tf.random.set_seed(seed)

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=seed)

    model = build_rnn_model()
    early_stop = callbacks.EarlyStopping(
        monitor='val_loss', patience=patience, restore_best_weights=True)

    model.fit(X_train, y_train,
              validation_data=(X_val, y_val),
              epochs=epochs,
              batch_size=batch_size,
              callbacks=[early_stop])

    # predict() returns one column per output unit; flatten it so the
    # predictions have the same 1-D shape as y_val.
    y_pred_prob = model.predict(X_val).ravel()
    y_pred = (y_pred_prob >= threshold).astype(int)

    print(f"Accuracy : {accuracy_score(y_val, y_pred):.4f}")
    print(f"Precision: {precision_score(y_val, y_pred):.4f}")
    print(f"Recall   : {recall_score(y_val, y_pred):.4f}")
    print(f"F1 Score : {f1_score(y_val, y_pred):.4f}")

    return model

In [None]:
# Train on the padded sequences and keep the fitted model for later cells.
model = train_model(X=X_seq, y=y)

In [None]:
# from tensorflow.keras.utils import plot_model
# plot_model(model, show_shapes=True)