In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf



In [2]:
df = pd.read_csv('spam.csv', encoding='latin1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
df.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [4]:
df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df['v1'].value_counts() # stratify should be applied

v1
ham     4825
spam     747
Name: count, dtype: int64

In [6]:
X = df['v2'].values
y = df['v1'].values

In [7]:
X

array(['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
       'Ok lar... Joking wif u oni...',
       "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
       ..., 'Pity, * was in mood for that. So...any other suggestions?',
       "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free",
       'Rofl. Its true to its name'], dtype=object)

In [8]:
y

array(['ham', 'ham', 'spam', ..., 'ham', 'ham', 'ham'], dtype=object)

In [36]:
encoder = LabelEncoder()
y = encoder.fit_transform(y)
y

array([0, 0, 1, ..., 0, 0, 0])

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3900,), (1672,), (3900,), (1672,))

In [55]:
# max_words = 1000

# tokenizer = Tokenizer(num_words=max_words)
tokenizer = Tokenizer() # has function to remove 
tokenizer.fit_on_texts(X_train) 

In [56]:
print(tokenizer.word_index, "\n")
print(tokenizer.word_counts, "\n")
print(tokenizer.word_docs, "\n")
print(tokenizer.document_count, "\n")




3900 



In [None]:
sequences = tokenizer.texts_to_sequences(X_train)
print(X_train, "\n")
print(sequences, "\n")

['Goal! Arsenal 4 (Henry, 7 v Liverpool 2 Henry scores with a simple shot from 6 yards from a pass by Bergkamp to give Arsenal a 2 goal margin after 78 mins.'
 "I dont. Can you send it to me. Plus how's mode."
 "Aah bless! How's your arm?" ...
 "Sure, if I get an acknowledgement from you that it's astoundingly tactless and generally faggy to demand a blood oath fo"
 'Nothing but we jus tot u would ask cos u ba gua... But we went mt faber yest... Yest jus went out already mah so today not going out... Jus call lor...'
 'Bull. Your plan was to go floating off to IKEA with me without a care in the world. So i have to live with your mess another day.'] 

[[1827, 2320, 43, 2321, 293, 231, 1828, 21, 2321, 2322, 35, 4, 669, 2323, 48, 222, 3507, 48, 4, 792, 74, 3508, 2, 134, 2320, 4, 21, 1827, 3509, 166, 3510, 263], [1, 94, 27, 3, 68, 14, 2, 10, 443, 421, 934], [3511, 1829, 421, 12, 2324], [112, 95, 1, 37, 63, 142], [328, 50], [71, 617, 2, 79, 75, 2, 384, 7, 149, 35, 5, 670, 2325], [26, 3, 65,

In [58]:
max_len = max([len(sequence) for sequence in sequences])
print(max_len)
sequences_matrix = sequence.pad_sequences(sequences, maxlen=max_len)

162


In [59]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

7410

In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len),
    tf.keras.layers.LSTM(32), # return_sequences 각 층에서 결과값을 출력할지(True, 최종 3차원), 최종 결과값만 출력할지(False, 최종 1차원)
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.summary()

In [61]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [62]:
history = model.fit(sequences_matrix, y_train, batch_size=64, epochs=10, validation_split=0.2) # validation_split means X_train will be separated to valid.

Epoch 1/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.8347 - loss: 0.4638 - val_accuracy: 0.9590 - val_loss: 0.1434
Epoch 2/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.9732 - loss: 0.1172 - val_accuracy: 0.9885 - val_loss: 0.0639
Epoch 3/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.9897 - loss: 0.0546 - val_accuracy: 0.9859 - val_loss: 0.0504
Epoch 4/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.9951 - loss: 0.0285 - val_accuracy: 0.9885 - val_loss: 0.0459
Epoch 5/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.9968 - loss: 0.0170 - val_accuracy: 0.9923 - val_loss: 0.0395
Epoch 6/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.9995 - loss: 0.0072 - val_accuracy: 0.9897 - val_loss: 0.0414
Epoch 7/10
[1m49/49[0m [32m━━━━

In [64]:
import numpy as np
from sklearn.metrics import classification_report, accuracy_score

# 이진분류 예측 및 평가
sequences_test = tokenizer.texts_to_sequences(X_test)
padded_sequences_test = sequence.pad_sequences(sequences_test)
y_pred = model.predict(padded_sequences_test)

# y_pred와 y_test의 shape 확인
print(f"y_pred shape: {y_pred.shape}")
print(f"y_test shape: {y_test.shape}")

# y_pred 처리 (sigmoid 출력이므로 0.5 threshold 사용)
y_pred_classes = (y_pred > 0.5).astype(int).flatten()

# y_test는 이미 binary 형태이므로 그대로 사용
y_test_classes = y_test  # 이미 [0, 1, 0, 1, ...] 형태

# shape 확인
print(f"y_pred_classes shape: {y_pred_classes.shape}")
print(f"y_test_classes shape: {y_test_classes.shape}")

# 평가
accuracy = accuracy_score(y_test_classes, y_pred_classes)
report = classification_report(y_test_classes, y_pred_classes, target_names=['ham', 'spam'])

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", report)

[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
y_pred shape: (1672, 1)
y_test shape: (1672,)
y_pred_classes shape: (1672,)
y_test_classes shape: (1672,)
Accuracy: 0.9880
Classification Report:
               precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1448
        spam       0.98      0.93      0.95       224

    accuracy                           0.99      1672
   macro avg       0.98      0.96      0.97      1672
weighted avg       0.99      0.99      0.99      1672

