In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import warnings
warnings.filterwarnings('ignore')

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
mail = pd.read_csv('new_spam.csv', index_col = 0)

mail.dropna(axis=0, inplace=True)

mail = mail.replace('spam', 1)
mail = mail.replace('ham', 0)
mail['label'].unique()

array([1, 0])

In [3]:
mail.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19964 entries, 1 to 20100
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      19964 non-null  object
 1   label     19964 non-null  int64 
 2   new_text  19964 non-null  object
dtypes: int64(1), object(2)
memory usage: 623.9+ KB


In [4]:
X_data = mail['text']
y_data = mail['label']

In [5]:
vocab_size = 7000
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_data) # 토큰화
sequences = tokenizer.texts_to_sequences(X_data) # 토큰에 인덱스 추가
X_data = sequences

In [6]:
max_len = 110
X_data = pad_sequences(X_data, maxlen = max_len)
X_data.shape

(19964, 110)

In [7]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=2022)

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, LSTM, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# 모델 구축
# 레이어들을 쌓을 모델을 생성
model = Sequential()
model.add(Embedding(vocab_size, 32))
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

In [9]:
es = EarlyStopping(monitor='val_loss', mode='auto', verbose=1, patience=3)

In [10]:
from time import time
t1 = time()
history = model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=10, batch_size=16, callbacks = [es])
t2 = time()

print(t2-t1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 00005: early stopping
35.098921060562134


# Model Eval

In [11]:
test_text = pd.read_csv("spam_test_text.csv", encoding = 'utf-8')
test_label = pd.read_csv("spam_test_label.csv", encoding = 'utf-8')

In [12]:
test = tokenizer.texts_to_sequences(test_text['text'])
test = pad_sequences(test, maxlen = max_len)
test.shape

(9896, 110)

In [13]:
pred = model.predict(test)

In [14]:
labels = []
for i in pred:
    if i > 0.5:
        labels.append('spam')
    else:
        labels.append('ham')

In [15]:
from sklearn.metrics import *

print(classification_report(labels, test_label['label'], digits = 6))

              precision    recall  f1-score   support

         ham   0.939682  0.738586  0.827086      4556
        spam   0.811401  0.959551  0.879279      5340

    accuracy                       0.857821      9896
   macro avg   0.875542  0.849069  0.853183      9896
weighted avg   0.870460  0.857821  0.855250      9896

