In [23]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Embedding, LSTM

In [28]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

# Spam classification: decide whether each message is spam (1) or ham (0).
# utf-8 or cp949 usually works, but this particular file has to be read as latin1.
spam_data = pd.read_csv("spam.csv", encoding="latin1")

# Keep only the first two columns (label, message text) and rename them.
# Selecting by position is explicit; relying on dropna(axis=1) would also drop a
# wanted column if it ever contained a single missing value.
spam_data = spam_data.iloc[:, :2].copy()
spam_data.columns = ["label", "mail"]

# Encode the labels: ham -> 0, spam -> 1.
# Any other value (including an empty string) becomes NaN and is dropped below.
spam_data['label'] = spam_data['label'].map({'ham': 0, 'spam': 1})

# Replace every non-word character with a space.
# regex=True is spelled out because pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently skip this cleaning step.
spam_data['mail'] = spam_data['mail'].str.replace(r"[^\w]", " ", regex=True)

# Messages that are empty or whitespace-only after cleaning contain no tokens;
# mark them as missing so they are removed together with other incomplete rows.
spam_data['mail'] = spam_data['mail'].replace(r"^\s*$", np.nan, regex=True)

# Drop every row with a missing label or message, then make the labels integers
# again (map() yields floats whenever it produced a NaN).
spam_data = spam_data.dropna(how='any').astype({'label': int})

print("# preprocessing done")

# Split messages and labels into train / test parts (20% test).
# shuffle=False: rows are not shuffled before splitting.
mail_train, mail_test, y_train, y_test = train_test_split(
    spam_data['mail'],
    spam_data['label'],
    test_size=0.2,
    shuffle=False,
)


print("# split done")

# Words removed from every message before integer encoding.
# NOTE(review): the membership test below is case-sensitive, so "A" / "An" are
# not filtered out by this list.
stopwords = ['a', 'an']

# Tokenisation: split each training message on whitespace and drop stopwords,
# producing one list of tokens per message.
X_train = [
    [term for term in message.split() if term not in stopwords]
    for message in mail_train
]

# Same procedure for the test messages.
X_test = [
    [term for term in message.split() if term not in stopwords]
    for message in mail_test
]

print("# tokenization done")

# preprocessing done
# split done
# tokenization done


In [29]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Build the word -> integer index from the training tokens only.
# num_words=7792 is the vocabulary size reported by len(tokenizer.word_index)
# in the cells below. Per the Keras documentation only the num_words - 1 most
# frequent words are kept when encoding, so indices stay below 7792.
tokenizer = Tokenizer(num_words=7792)
tokenizer.fit_on_texts(X_train)

# Convert both splits to integer sequences using the index fitted above.
X_train, X_test = [tokenizer.texts_to_sequences(tokens) for tokens in (X_train, X_test)]

print("# int_encoding done")

# int_encoding done


In [30]:
# Number of distinct words the tokenizer indexed while fitting.
vocabulary = tokenizer.word_index
print(len(vocabulary))

7792


In [31]:

# The vocabulary size printed here is where the 7792 passed to Tokenizer came from.
print(len(tokenizer.word_index))

# Count the words that occur exactly once in the fitted texts.
low_count = sum(1 for occurrences in tokenizer.word_counts.values() if occurrences == 1)

print(low_count)

7792
4030


In [34]:
# Length of the longest encoded training sequence (0 when X_train is empty).
max_length = max((len(sequence) for sequence in X_train), default=0)

print(max_length)

50


In [36]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad / truncate every sequence to the length of the longest training sequence.
# max_length is computed from X_train in the previous cell (50 on this dataset);
# deriving max_len from it keeps the two values from drifting apart.
max_len = max_length
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

In [37]:
# Size the embedding table from the fitted tokenizer instead of a hard-coded 7792.
# Word indices start at 1 (0 is the padding value), so len(word_index) + 1 rows
# cover every index the tokenizer can emit.
vocab_size = len(tokenizer.word_index) + 1

# Embedding -> LSTM -> single sigmoid unit giving the spam probability.
model = Sequential()
model.add(Embedding(vocab_size, 32))
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))

In [38]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is tracked as 'acc'.
model.compile(
    optimizer='rmsprop',
    loss="binary_crossentropy",
    metrics=['acc'],
)
# NOTE(review): the test split is also used as validation data here.
model.fit(
    X_train,
    y_train,
    epochs=3,
    validation_data=(X_test, y_test),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f1ca38246a0>

In [41]:
import re

sentence = input()
# Apply the same cleaning as the training data: non-word characters become
# spaces, otherwise tokens such as "free!" never match the fitted vocabulary.
cleaned_stc = re.sub(r"[^\w]", " ", sentence)
# Tokenise on whitespace and drop the same stopwords used for training.
token_stc = [word for word in cleaned_stc.split() if word not in stopwords]
# Integer-encode with the fitted tokenizer (unknown words are skipped).
encode_stc = tokenizer.texts_to_sequences([token_stc])
# Pad to the same length the model was trained with.
pad_stc = pad_sequences(encode_stc, maxlen=max_len)

# Sigmoid output: score close to 1 means spam, close to 0 means ham.
score = model.predict(pad_stc)
print(score)

free 
[[0.07125384]]
