# 워드 임베딩을 사용한 스펨 탐지

* scratch로부터 word embedding을 학습시키는 것과 GloVe와 같은 pre-trained embedding을 사용하는 것과 이를 fine-tuning하는 것을 상호-비교해 본다

In [7]:
import os
import sys
import tensorflow as tf
from tensorflow.keras import datasets, layers, models, optimizers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib
from tensorflow import feature_column as fc
import tensorflow_datasets as tfds
plt.rcParams["font.family"] = 'NanumBarunGothic'
TENSORBOARD_BINARY = '/home/hoondori/anaconda3/envs/ai/bin/tensorboard'
os.environ['TENSORBOARD_BINARY'] =  TENSORBOARD_BINARY
%load_ext tensorboard

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # 텐서플로가 첫 번째 GPU만 사용하도록 제한
    # 프로그램 시작시에 메모리 증가가 설정되어야만 합니다
    try:
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        tf.config.experimental.set_memory_growth(gpus[0], True)
    except RuntimeError as e:
        # 프로그램 시작시에 접근 가능한 장치가 설정되어야만 합니다
        print(e)

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [8]:
DATA_DIR = "data"
EMBEDDING_NUMPY_FILE = os.path.join(DATA_DIR, "E.npy")
DATASET_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
EMBEDDING_MODEL = "glove-wiki-gigaword-300"
EMBEDDING_DIM = 300
NUM_CLASSES = 2
BATCH_SIZE = 128


# 데이터 획득

In [9]:
def download_and_read(url):
    local_file = url.split('/')[-1]
    p = tf.keras.utils.get_file(local_file, url, 
        extract=True, cache_dir="/tmp")
    labels, texts = [], []
    local_file = os.path.join("/tmp", "datasets", "SMSSpamCollection")
    with open(local_file, "r") as fin:
        for line in fin:
            label, text = line.strip().split('\t')
            labels.append(1 if label == "spam" else 0)
            texts.append(text)
    return texts, labels

# read data
texts, labels = download_and_read("https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip")
len(texts)

5574

# 전처리/데이터셋화

In [11]:
def createTextSequences(texts, labels):
    
    # tokenizer fitting on texts
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=None) # 지정 없으면 전수 단어 사용하여 integer 표현으로 변경
    tokenizer.fit_on_texts(texts)
    
    # text sequence 생성
    text_sequences = tokenizer.texts_to_sequences(texts)
    text_sequences = tf.keras.preprocessing.sequence.pad_sequences(text_sequences)
    num_records = len(text_sequences)
    max_seqlen = len(text_sequences[0])
    
    # 어휘 
    word2idx = tokenizer.word_index
    idx2word = {v:k for k,v in word2idx.items()}
    word2idx["PAD"] = 0
    idx2word[0] = "PAD"
    vocab_size = len(tokenizer.word_index)

    print("{:d} sentences, max length: {:d}, voca: {:d}".format(num_records, max_seqlen, vocab_size))
      
    return text_sequences, word2idx, idx2word, vocab_size, max_seqlen, tokenizer

text_sequences, word2idx, idx2word, vocab_size, max_seqlen, tokenizer = createTextSequences(texts, labels)
print(f'text={texts[0]}')
print(f'text sequence={text_sequences[0][-10:]}...')

5574 sentences, max length: 189, voca: 9010
text=Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
text sequence=[ 123  351 1328  148 2996 1329   67   58 4436  144]...


In [12]:
def createTextSequences(text_sequences, labels, num_classes, batch_size):
    cat_labels = tf.keras.utils.to_categorical(labels, num_classes=num_classes)
    
    dataset = tf.data.Dataset.from_tensor_slices((text_sequences, cat_labels))
    dataset = dataset.shuffle(10000)
    
    num_records = len(text_sequences)
    test_size = num_records // 4   # 25%
    val_size = (num_records - test_size) // 10 
    
    test_dataset = dataset.take(test_size)
    val_dataset = dataset.skip(test_size).take(val_size)
    train_dataset = dataset.skip(test_size+val_size)
    
    test_dataset = test_dataset.batch(batch_size, drop_remainder=True)
    val_dataset = val_dataset.batch(batch_size, drop_remainder=True)
    train_dataset = train_dataset.batch(batch_size, drop_remainder=True)
    
    return train_dataset, val_dataset, test_dataset

BATCH_SIZE = 128
train_dataset, val_dataset, test_dataset = createTextSequences(text_sequences, labels, num_classes=2, batch_size=BATCH_SIZE)
train_dataset

<BatchDataset shapes: ((128, 189), (128, 2)), types: (tf.int32, tf.float32)>

In [13]:
# gensim에서 제공하는 300차원 GloVe 임베딩을 사용해서 변환
#   sequence of encoded integers => sequence of embeded vector

def build_embedding_matrix(text_sequences, word2idx, embedding_file):
    
    # E : dict of word_index to embedding vector
    
    # 기존 것이 있으면 로딩해서 사용
    if os.path.exists(embedding_file):
        E = np.load(embedding_file)
    else:
        # 기존 것이 없으므로 gensim에서 다운받아 구축
        word_vectors = api.load("glove-wiki-gigaword-300")
        
        # container 준비
        E = np.zeros((len(word2idx), 300))
        
        for word, idx in word2idx.items():
            try:
                E[idx] = word_vectors.word_vec(word)  # 없으면 어떻게?
            except:
                # 없는 경우...
                pass
        
        # 향후 사용을 위해 저장 
        np.save(embedding_file, E)
        
    return E

EMBED_NUMPY_FILE="E.npy"

E = build_embedding_matrix(text_sequences, word2idx, EMBED_NUMPY_FILE)
print(f'Embedding matrix: ', E.shape)

Embedding matrix:  (9010, 300)


# 분류 모델 정의

In [23]:
class SpamClassifierModel(tf.keras.Model):
    
    def __init__(self, run_mode, embedding_weights, vocab_sz, input_length, embed_sz, 
                 num_filters=256, kernel_size=3, num_classes=2, **kwargs):
        
        super(SpamClassifierModel, self).__init__(**kwargs)
        
        if(run_mode == 'scratch'):
            self.embedding = tf.keras.layers.Embedding(
                input_dim = vocab_sz,
                output_dim = embed_sz,
                input_length = input_length,
                trainable = True)  # pass to layer parent
        
        elif(run_mode == 'vectorizer'):
            self.embedding = tf.keras.layers.Embedding(
                input_dim = vocab_sz,
                output_dim = embed_sz,
                input_length = input_length,
                weights = [embedding_weights],  # pass to layer parent
                trainable = False) # pass to layer parent

        else:  # fine-tune
            self.embedding = tf.keras.layers.Embedding(
                input_dim = vocab_sz,
                output_dim = embed_sz,
                input_length = input_length,
                weights = [embedding_weights],  # pass to layer parent
                trainable = True) # pass to layer parent
            
        self.conv = tf.keras.layers.Conv1D(filters=num_filters, kernel_size=3, activation='relu')
        
        self.dropout = tf.keras.layers.SpatialDropout1D(0.2)
        
        self.pool = tf.keras.layers.GlobalMaxPooling1D()
        
        self.dense = tf.keras.layers.Dense(num_classes, activation='softmax')
        
    def call(self, x):
        
        # embedding  => conv => dropout => pool => dense
        
        x = self.embedding(x)
        x = self.conv(x)
        x = self.dropout(x)
        x = self.pool(x)
        x = self.dense(x)
        
        return x
        
RUN_MODE = "vectorizer"
EMBED_SIZE = 300
model_vectorizer = SpamClassifierModel(run_mode=RUN_MODE, embedding_weights=E, vocab_sz=vocab_size, input_length=max_seqlen, embed_sz=EMBED_SIZE)
model_vectorizer.build(input_shape=(None, max_seqlen))
model_vectorizer.summary()

Model: "spam_classifier_model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      multiple                  2703000   
_________________________________________________________________
conv1d_6 (Conv1D)            multiple                  230656    
_________________________________________________________________
spatial_dropout1d_6 (Spatial multiple                  0         
_________________________________________________________________
global_max_pooling1d_6 (Glob multiple                  0         
_________________________________________________________________
dense_6 (Dense)              multiple                  514       
Total params: 2,934,170
Trainable params: 231,170
Non-trainable params: 2,703,000
_________________________________________________________________


In [24]:
# compile and train
model_vectorizer.compile(optimizer="adam", loss="categorical_crossentropy",
    metrics=["accuracy"])

# train model
NUM_EPOCHS = 1
CLASS_WEIGHTS = {0:1, 1:8 }

model_vectorizer.fit(train_dataset, epochs=NUM_EPOCHS, 
    validation_data=val_dataset,
    class_weight=CLASS_WEIGHTS)



<tensorflow.python.keras.callbacks.History at 0x7f18c8558278>

In [17]:
# evaluate against test set
from sklearn.metrics import accuracy_score, confusion_matrix
labels, predictions = [], []
for Xtest, Ytest in test_dataset:
    Ytest_ = model_vectorizer.predict_on_batch(Xtest)
    ytest = np.argmax(Ytest, axis=1)
    ytest_ = np.argmax(Ytest_, axis=1)
    labels.extend(ytest.tolist())
    predictions.extend(ytest.tolist())

print("test accuracy: {:.3f}".format(accuracy_score(labels, predictions)))
print("confusion matrix")
print(confusion_matrix(labels, predictions))

test accuracy: 1.000
confusion matrix
[[1118    0]
 [   0  162]]


# 다른 모드들의 결과 : from scratch

In [22]:
RUN_MODE = "scratch"

model_scratch = SpamClassifierModel(run_mode=RUN_MODE, embedding_weights=None,vocab_sz=vocab_size, input_length=max_seqlen, embed_sz=EMBED_SIZE)
model_scratch.build(input_shape=(None, max_seqlen))

# compile and train
model_scratch.compile(optimizer="adam", loss="categorical_crossentropy",
    metrics=["accuracy"])

# train model
NUM_EPOCHS = 1
CLASS_WEIGHTS = {0:1, 1:8 }

model_scratch.fit(train_dataset, epochs=NUM_EPOCHS, 
    validation_data=val_dataset,
    class_weight=CLASS_WEIGHTS)

# evaluate against test set
from sklearn.metrics import accuracy_score, confusion_matrix
labels, predictions = [], []
for Xtest, Ytest in test_dataset:
    Ytest_ = model_scratch.predict_on_batch(Xtest)
    ytest = np.argmax(Ytest, axis=1)
    ytest_ = np.argmax(Ytest_, axis=1)
    labels.extend(ytest.tolist())
    predictions.extend(ytest.tolist())

print("test accuracy: {:.3f}".format(accuracy_score(labels, predictions)))
print("confusion matrix")
print(confusion_matrix(labels, predictions))

test accuracy: 1.000
confusion matrix
[[1102    0]
 [   0  178]]


# 다른 모드들의 결과 : Fine-tue

In [25]:
RUN_MODE = "finetue"

model_scratch = SpamClassifierModel(run_mode=RUN_MODE, embedding_weights=E,vocab_sz=vocab_size, input_length=max_seqlen, embed_sz=EMBED_SIZE)
model_scratch.build(input_shape=(None, max_seqlen))

# compile and train
model_scratch.compile(optimizer="adam", loss="categorical_crossentropy",
    metrics=["accuracy"])

# train model
NUM_EPOCHS = 1
CLASS_WEIGHTS = {0:1, 1:8 }

model_scratch.fit(train_dataset, epochs=NUM_EPOCHS, 
    validation_data=val_dataset,
    class_weight=CLASS_WEIGHTS)

# evaluate against test set
from sklearn.metrics import accuracy_score, confusion_matrix
labels, predictions = [], []
for Xtest, Ytest in test_dataset:
    Ytest_ = model_scratch.predict_on_batch(Xtest)
    ytest = np.argmax(Ytest, axis=1)
    ytest_ = np.argmax(Ytest_, axis=1)
    labels.extend(ytest.tolist())
    predictions.extend(ytest.tolist())

print("test accuracy: {:.3f}".format(accuracy_score(labels, predictions)))
print("confusion matrix")
print(confusion_matrix(labels, predictions))

test accuracy: 1.000
confusion matrix
[[1107    0]
 [   0  173]]
