In [1]:
import datetime
import numpy as np
import pandas as pd
import pickle

import tensorflow as tf
import tensorflow_hub as hub
from pymongo import MongoClient
from sklearn.metrics.pairwise import cosine_similarity

from keras.layers import Dense, Input, Conv2D, MaxPool2D, Reshape, Flatten
from keras.layers import LSTM, Dropout, Concatenate, BatchNormalization, LeakyReLU
from keras.models import Sequential, Model
from keras.regularizers import l2 as L2
from keras import optimizers, backend as K
from keras.backend.tensorflow_backend import set_session
from keras.callbacks import ModelCheckpoint, Callback

tf.logging.set_verbosity(0)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
start_date = '2018-04-01'
valid_date = '2018-05-01'

In [3]:
def load_obj(name):
    with open('obj/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

def save_obj(obj, name):
    with open('obj/' + name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


In [4]:
def labeling_data(ratio):
    print('labeling data')
    df = load_obj('total_data')
    date_list = pd.unique(df['date'])

    df['label'] = np.zeros(len(df), dtype=np.int32)
    top_score = []

    for day in date_list:
        df_by_date = df[df['date'] == day]
        topk = int(ratio * len(df_by_date))
        top_score.extend(df_by_date.score.nlargest(topk).index)

    df['label'][top_score] = 1 

    save_obj(df, 'total_data')
    del df

#labeling_data(0.1)

In [5]:
total_data = load_obj('total_data')
print('total data:', len(total_data))

total data: 1370234


In [6]:
if start_date == '0000-00-00':
    train_df = total_data[total_data.date.apply(str)<'2018-05-01']
else:
    train_df = total_data[(total_data.date.apply(str)<valid_date) \
            & (total_data.date.apply(str)>=start_date)]

train_df = train_df.sample(frac=1).reset_index(drop=True)
valid_df = total_data[total_data.date.apply(str)>=valid_date]

print('train data:', len(train_df))
print('test data:', len(valid_df))

del total_data

train data: 21767
test data: 21862


In [7]:
def fcmodel(fc_dim=128, beta=1e-5, em_dim=512):
    # size : (bs, 512)
    model = Sequential([
        Dense(fc_dim, kernel_regularizer=L2(beta)),
        BatchNormalization(),
        LeakyReLU(0.3),
        Dense(1, activation='sigmoid', kernel_regularizer=L2(beta)),
    ])
    input_a = Input(shape=(em_dim, ))
    output = model(input_a)
    return Model(inputs = input_a, outputs = output)


In [8]:
fraction = 0.5
K.clear_session()
sess = tf.Session(config=tf.ConfigProto(
        #gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=fraction),
        gpu_options=tf.GPUOptions(allow_growth=True),
        log_device_placement=True))


In [9]:
#g = tf.Graph()
#with g.as_default():
embed = hub.Module('https://tfhub.dev/google/universal-sentence-encoder/1')

def embedding(sentence_list):
    with tf.device("/cpu:0"):
        return embed(sentence_list)

In [10]:
model = fcmodel()
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())
set_session(sess)

In [11]:
def evaluate_by_top_score(df, pred, ratio_pred=0.1):
    date_list = pd.unique(date)

    df['pred_label'] = np.zeros(len(df), dtype=np.int32)
    top_pred = []

    for day in date_list:
        df_by_date = df[df['date'] == day]
        topk_pred = int(ratio_pred * len(df_by_date))
        top_pred.extend(df_by_date.pred.nlargest(topk_pred).index)

    df['pred_label'][top_pred] = 1 
    cross = pd.crosstab(df['label'], df['pred_label'], rownames=["Actual"], \
            colnames=["Predicted"])
    recall = cross[1][1] / (cross[1][1] + cross[0][1])
    print(cross)

    return recall 


In [12]:
def generator(df, batch_size):
    df_0 = df[df['label']==0]      
    df_1 = df[df['label']==1]            
    while True:                    
        for i in range(len(df)//batch_size): 
            batch_df_0 = df_0.sample(n=batch_size//2, replace=True)
            batch_df_1 = df_1.sample(n=batch_size - batch_size//2, replace=True)
            batch_df = pd.concat([batch_df_0, batch_df_1])
            x_gen = sess.run(embedding(list(batch_df['title'])))
            y_gen = np.array(list(batch_df['label'])).reshape([-1,1])
            
            yield x_gen, y_gen
            

class RecallEvaluation(Callback):
    def __init__(self, validation_df, interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.x = sess.run(embedding(list(validation_df['title'])))
        self.y = list(validation_df['label'])
        self.date = validation_df['date']
        self.recall_max = 0.0
        self.precision_max = 0.0                               
        
    def on_epoch_end(self, epoch, logs={}):                          
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.x)                
            y_pred = np.concatenate(y_pred)
            recall, precision = self.evaluate(self.y, y_pred)        
            recall_top1 = self.evaluate_by_top_score(self.date, self.y, y_pred)
            print('top 10%% precision: %.6f' %recall_top1)

            # save max 
            if recall+precision > self.recall_max + self.precision_max:
            #if recall > self.recall_max and precision > self.precision_max:
                self.recall_max = recall 
                self.precision_max = precision

            print("Evaluation - epoch: %d - recall: [%.6f] / precision: [%.6f]"\
                    %(epoch, recall, precision))
            print('max recall: %.6f / precision: %.6f' %(self.recall_max, self.precision_max))
            print()

    @staticmethod
    def evaluate(label, pred):
        assert len(label) == len(pred)
        df = pd.DataFrame({'label':label, 'pred':pred})

        df['pred'] = np.array(pred)
        df['pred_label'] = np.zeros(len(df), dtype=np.int32)
        df['pred_label'][df['pred'] > 0.5] = 1

        cross = pd.crosstab(df['label'], df['pred_label'], rownames=["Actual"], \
                colnames=["Predicted"])
        recall = cross[1][1] / (cross[1][1] + cross[0][1])
        precision = cross[1][1] / (cross[1][1] + cross[1][0])
        #print(cross)

        return recall, precision

    @staticmethod
    def evaluate_by_top_score(date, label, pred, ratio_score=0.1, ratio_pred=0.1):
        df = pd.DataFrame({'date':date, 'score_label':label, 'pred':pred})
        date_list = pd.unique(date)

        df['pred_label'] = np.zeros(len(df), dtype=np.int32)

        top_pred = []

        for day in date_list:
            df_by_date = df[df['date'] == day]
            #topk_score = int(ratio_score * len(df_by_date))
            topk_pred = int(ratio_pred * len(df_by_date))
            #top_score.extend(df_by_date.score.nlargest(topk_score).index)
            top_pred.extend(df_by_date.pred.nlargest(topk_pred).index)

        #df['score_label'][top_score] = 1
        df['pred_label'][top_pred] = 1
        cross = pd.crosstab(df['score_label'], df['pred_label'], rownames=["Actual"], \
                colnames=["Predicted"])
        recall = cross[1][1] / (cross[1][1] + cross[0][1])
        print(cross)

        return recall


In [13]:
ne = 100
num_steps = 100
bs = 100
learning_rate = 1e-4

df_0 = train_df[train_df['label']==0]
df_1 = train_df[train_df['label']==1]

#define model
model = fcmodel()
adam = optimizers.Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-8, decay=0.0)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
model.summary()
rval = RecallEvaluation(valid_df, 1)

#training
print('Training Start!!!')
model.fit_generator(generator(train_df, bs), \
        steps_per_epoch=num_steps, nb_epoch=ne, callbacks = [rval])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 512)               0         
_________________________________________________________________
sequential_2 (Sequential)    (None, 1)                 66305     
Total params: 66,305
Trainable params: 66,049
Non-trainable params: 256
_________________________________________________________________
Training Start!!!




Epoch 1/100


RuntimeError: Module must be applied in the graph it was instantiated for.