In [17]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader

from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Flatten
from keras.layers import Embedding

import random
import os

from sklearn.metrics import f1_score,accuracy_score, confusion_matrix

from sklearn.model_selection import train_test_split

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [18]:
params = {
    'EPOCHS':3,
    'LEARNING_RATE':2e-5,
    'BATCH_SIZE':32,
    'SEED':45,
    'DATA_PATH': '../../make_data/preprocessed_data/good_bad_df.csv',
    'SAVE_PATH':'../Models/Glove_sentence.pt',
    'max_length':512
}

In [19]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(params['SEED']) # Seed 고정

In [20]:
data = pd.read_csv(params['DATA_PATH'],index_col=[0])

data.loc[data['ad_label']==1,'ad_label']=0
data.loc[data['ad_label']==2,'ad_label']=1

train_data, test_data = train_test_split(data,test_size=0.2,random_state=params['SEED'],shuffle=True)
valid_data, test_data = train_test_split(test_data,test_size=0.5,random_state=params['SEED'],shuffle=True)

train_data = train_data.sample(frac=1).reset_index(drop=True)
valid_data = valid_data.sample(frac=1).reset_index(drop=True)
test_data = test_data.sample(frac=1).reset_index(drop=True)

train_data.reset_index(drop=True,inplace=True)
valid_data.reset_index(drop=True,inplace=True)
test_data.reset_index(drop=True,inplace=True)

In [21]:
print(len(train_data))
print(len(train_data.drop_duplicates(['summary'])))

print(len(valid_data))
print(len(valid_data.drop_duplicates(['summary'])))

print(len(test_data))
print(len(test_data.drop_duplicates(['summary'])))

print(train_data['ad_label'].value_counts()/len(train_data))
print(valid_data['ad_label'].value_counts()/len(valid_data))
print(test_data['ad_label'].value_counts()/len(test_data))

6867
6867
858
858
859
859
0    0.696665
1    0.303335
Name: ad_label, dtype: float64
0    0.715618
1    0.284382
Name: ad_label, dtype: float64
0    0.704307
1    0.295693
Name: ad_label, dtype: float64


In [22]:
t = Tokenizer()
t.fit_on_texts(train_data['summary'])
vocab_size = len(t.word_index) + 1

In [23]:
train_docs = t.texts_to_sequences(train_data['summary'])
valid_docs = t.texts_to_sequences(valid_data['summary'])
test_docs = t.texts_to_sequences(test_data['summary'])

In [24]:
# pad documents to a max length of 4 words
max_length = 530
train_docs = pad_sequences(train_docs, maxlen=max_length, padding='post')
valid_docs = pad_sequences(valid_docs, maxlen=max_length, padding='post')
test_docs = pad_sequences(test_docs, maxlen=max_length, padding='post')

In [25]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('glove.txt')
for line in f:
	values = line.split()
	word = values[0]
	coefs = asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 358043 word vectors.


In [26]:
embedding_matrix = zeros((vocab_size, 100))
for word, i in t.word_index.items():
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector

In [27]:
import tensorflow as tf
from sklearn.metrics import f1_score

import tensorflow as tf
from sklearn.metrics import f1_score

class F1Score(tf.keras.metrics.Metric):
    def __init__(self, name='f1_score', **kwargs):
        super(F1Score, self).__init__(name=name, **kwargs)
        self.true_positives = self.add_weight(name='tp', initializer='zeros')
        self.false_positives = self.add_weight(name='fp', initializer='zeros')
        self.false_negatives = self.add_weight(name='fn', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        y_true = tf.cast(y_true, tf.float32)
        y_pred = tf.cast(tf.round(y_pred), tf.float32)

        tp = tf.reduce_sum(y_true * y_pred)
        fp = tf.reduce_sum((1 - y_true) * y_pred)
        fn = tf.reduce_sum(y_true * (1 - y_pred))

        self.true_positives.assign_add(tp)
        self.false_positives.assign_add(fp)
        self.false_negatives.assign_add(fn)

    def result(self):
        precision = self.true_positives / (self.true_positives + self.false_positives + tf.keras.backend.epsilon())
        recall = self.true_positives / (self.true_positives + self.false_negatives + tf.keras.backend.epsilon())
        f1 = 2 * precision * recall / (precision + recall + tf.keras.backend.epsilon())
        return f1

    def reset_states(self):
        # 변수를 초기화합니다.
        self.true_positives.assign(0.0)
        self.false_positives.assign(0.0)
        self.false_negatives.assign(0.0)

In [28]:
# define model
model = Sequential()
e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=530, trainable=False)
model.add(e)
model.add(Flatten())
model.add(Dense(10000, input_dim=53000))
model.add(Activation('relu'))
model.add(Dropout(0.7))
model.add(Dense(512, input_dim=10000))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(128, input_dim=512))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy',F1Score()])
# summarize the model
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 530, 100)          2157500   
                                                                 
 flatten_1 (Flatten)         (None, 53000)             0         
                                                                 
 dense_4 (Dense)             (None, 10000)             530010000 
                                                                 
 activation_3 (Activation)   (None, 10000)             0         
                                                                 
 dropout_3 (Dropout)         (None, 10000)             0         
                                                                 
 dense_5 (Dense)             (None, 512)               5120512   
                                                                 
 activation_4 (Activation)   (None, 512)              

In [29]:
# fit the model
model.fit(train_docs, train_data['ad_label'], epochs=params['EPOCHS'], verbose=2)


Epoch 1/3


215/215 - 6s - loss: 0.4695 - accuracy: 0.8003 - f1_score: 0.6429 - 6s/epoch - 29ms/step
Epoch 2/3


  m.reset_state()


215/215 - 6s - loss: 0.2554 - accuracy: 0.9008 - f1_score: 0.8336 - 6s/epoch - 27ms/step
Epoch 3/3
215/215 - 6s - loss: 0.1643 - accuracy: 0.9401 - f1_score: 0.9003 - 6s/epoch - 27ms/step


<keras.callbacks.History at 0x7f1bad028908>

In [30]:
loss, accuracy, f1 = model.evaluate(train_docs, train_data['ad_label'])
print('Accuracy: %f' % (accuracy),'F1_Score: %f' % (f1))

Accuracy: 0.971166 F1_Score: 0.954124


In [31]:
loss, accuracy, f1 = model.evaluate(valid_docs, valid_data['ad_label'], verbose=1)
print('Accuracy: %f' % (accuracy),'F1_Score: %f' % (f1))

 1/27 [>.............................] - ETA: 0s - loss: 0.0960 - accuracy: 1.0000 - f1_score: 1.0000

Accuracy: 0.913753 F1_Score: 0.861940


In [32]:
loss, accuracy, f1 = model.evaluate(test_docs, test_data['ad_label'], verbose=0)
print('Accuracy: %f' % (accuracy),'F1_Score: %f' % (f1))

Accuracy: 0.919674 F1_Score: 0.874317
