### SimpleRNN을 이용한 SMS Spam 분류
- 캐글 데이터 : https://www.kaggle.com/uciml/sms-spam-collection-dataset

In [21]:
import pandas as pd
# Raw GitHub location of the spam.csv file that the rest of the notebook loads.
url = 'https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/10.%20RNN%20Text%20Classification/dataset/spam.csv'

In [22]:
# Read the CSV straight from the URL, decoding it as latin1
# (presumably because the file is not valid UTF-8 — TODO confirm).
df = pd.read_csv(url, encoding='latin1')
# Preview the first three rows.
df.head(3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,


- 데이터 전처리

In [23]:
# Keep only the label column (v1) and the message text column (v2).
df = df.loc[:, ['v1', 'v2']]

In [24]:
# Check for missing values: count them per column, then total across the frame.
missing_per_column = df.isna().sum()
missing_per_column.sum()

0

In [25]:
# Check for duplicates: compare the frame's shape with the number of distinct messages.
(df.shape, df['v2'].nunique())

((5572, 2), 5169)

In [26]:
# Drop rows whose message text repeats an earlier row (the first occurrence is kept).
df = df.drop_duplicates(subset=['v2'])
df.shape

(5169, 2)

In [27]:
# Encode the labels: 'ham' -> 0, 'spam' -> 1.
# Series.replace only produced an integer column through implicit object
# downcasting, which pandas 2.2 deprecates; without it the column stays object
# dtype and cannot be fed to Keras as a label array. Mapping explicitly and
# casting to int64 avoids that dependency.
label_to_id = {'ham': 0, 'spam': 1}
# Values that are not in the mapping pass through unchanged, so re-running the
# cell on already-encoded labels is a no-op; an unexpected string label makes
# the int64 cast fail loudly instead of slipping through.
df['v1'] = df['v1'].map(lambda label: label_to_id.get(label, label)).astype('int64')
df.head(3)

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...


In [28]:
# Inputs are the v2 column (message text), targets the v1 column (labels), both as NumPy arrays.
x = df['v2'].values
y = df['v1'].values

- 텍스트 전처리

In [29]:
import re

# Lower-case each message, then replace every character that is not a-z or 0-9 with a space.
non_alnum = re.compile('[^a-z0-9]')
X_data = [non_alnum.sub(' ', line.lower()) for line in x]
X_data[:3]

['go until jurong point  crazy   available only in bugis n great world la e buffet    cine there got amore wat   ',
 'ok lar    joking wif u oni   ',
 'free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005  text fa to 87121 to receive entry question std txt rate t c s apply 08452810075over18 s']

In [30]:
# Build the vocabulary and check its size
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# The tokenizer is fitted on the whole cleaned corpus (this runs before the train/test split).
t = Tokenizer()
t.fit_on_texts(X_data)
# +1 because Keras word indices start at 1, leaving index 0 free for padding.
vocab_size = len(t.word_index) + 1
vocab_size

8659

In [31]:
# Convert every cleaned message into a list of integer word indices.
sequences = t.texts_to_sequences(X_data)
# Show the encoded form of the first message as a sanity check.
print(sequences[0])


[50, 426, 3927, 764, 694, 653, 70, 8, 1174, 96, 127, 413, 1175, 145, 2639, 1176, 63, 60, 3928, 129]


In [32]:
# Length (in tokens) of the longest encoded message; used as the padding target.
max_len = max(map(len, sequences))
max_len

190

In [33]:
# Zero-pad every sequence so that all of them have length max_len.
sequences = pad_sequences(sequences, maxlen=max_len)

In [34]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing, stratified on the labels, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    sequences,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2023,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4135, 190), (1034, 190), (4135,), (1034,))

#### 모델 정의/설정/학습/평가

In [35]:
import numpy as np
import tensorflow as tf
# Seed both NumPy's and TensorFlow's global random generators with the same value
# before the model is built and trained.
seed = 2023
np.random.seed(seed)
tf.random.set_seed(seed)

In [36]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [37]:
# Stack: Embedding (32-dim vectors) -> SimpleRNN (32 units) -> single sigmoid output.
model = Sequential()
# The labels live in y_train rather than in the sequences, so max_len is used as-is (no -1).
model.add(Embedding(vocab_size, 32, input_length=max_len))
model.add(SimpleRNN(32))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 190, 32)           277088    
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 32)                2080      
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 279201 (1.07 MB)
Trainable params: 279201 (1.07 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [38]:
# Pass compile() arguments by keyword. The third positional parameter is
# `metrics` in older Keras but `loss_weights` in Keras 3, so a positional
# ['accuracy'] would be misread as loss weights on newer versions.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model_path = 'best-spam-rnn.h5'
# Save a checkpoint only when the validation loss improves on the best seen so far.
mc = ModelCheckpoint(model_path, monitor='val_loss', save_best_only=True)
# Stop training once the validation loss has failed to improve for 10 epochs in a row.
es = EarlyStopping(monitor='val_loss', patience=10)

In [39]:
# Train silently for at most 100 epochs, validating on 20% of the training data;
# the checkpoint and early-stopping callbacks run during training.
hist = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=100,
    validation_split=0.2,
    callbacks=[mc, es],
    verbose=0,
)

  saving_api.save_model(


In [40]:
# Reload the model saved at model_path by the checkpoint callback and score it
# on the held-out test set; evaluate() returns [loss, accuracy] for this compile setup.
best_model = load_model(model_path)
best_model.evaluate(X_test, y_test)



[0.08269337564706802, 0.978723406791687]