In [1]:
import datetime
import numpy as np
import pandas as pd
import pickle

import tensorflow as tf
import tensorflow_hub as hub
from pymongo import MongoClient
from sklearn.metrics.pairwise import cosine_similarity

tf.logging.set_verbosity(0)

  from ._conv import register_converters as _register_converters


In [2]:
start_date = '2018-04-01'
valid_date = '2018-05-01'

In [3]:
def load_obj(name):
    with open('obj/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

def save_obj(obj, name):
    with open('obj/' + name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


In [4]:
def labeling_data(ratio):
    print('labeling data')
    df = load_obj('total_data')
    date_list = pd.unique(df['date'])

    df['label'] = np.zeros(len(df), dtype=np.int32)
    top_score = []

    for day in date_list:
        df_by_date = df[df['date'] == day]
        topk = int(ratio * len(df_by_date))
        top_score.extend(df_by_date.score.nlargest(topk).index)

    df['label'][top_score] = 1 

    save_obj(df, 'total_data')
    del df

#labeling_data(0.1)

In [5]:
total_data = load_obj('total_data')
print('total data:', len(total_data))

total data: 1370234


In [6]:
if start_date == '0000-00-00':
    train_df = total_data[total_data.date.apply(str)<'2018-05-01']
else:
    train_df = total_data[(total_data.date.apply(str)<valid_date) \
            & (total_data.date.apply(str)>=start_date)]

train_df = train_df.sample(frac=1).reset_index(drop=True)
valid_df = total_data[total_data.date.apply(str)>=valid_date]

print('train data:', len(train_df))
print('test data:', len(valid_df))

del total_data

train data: 21767
test data: 21862


In [7]:
def model(batch_x, batch_y, is_training=True):
    # config
    em_dim = 512
    fc_dim = 128
    lr = 1e-4
    beta = 1e-5
    
    # size : (bs, 512)
    print('x shape:', batch_x.shape)
    
    #with tf.variable_scope('CNN', reuse= not is_first):
    W1 = tf.get_variable('weight-1', [em_dim, fc_dim], dtype=tf.float32)
    b1 = tf.get_variable('bias-1', [fc_dim], dtype=tf.float32)
    W2 = tf.get_variable('weight-2', [fc_dim, 1], dtype=tf.float32)
    b2 = tf.get_variable('bias-2', [1], dtype=tf.float32)
    print(tf.get_variable_scope())

    h = tf.matmul(batch_x, W1) + b1
    y = tf.matmul(h, W2) + b2

    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=batch_y, logits=y))
    regularizer = tf.nn.l2_loss(W1) + tf.nn.l2_loss(W2)
    regul_loss = beta * regularizer
    loss = loss + regul_loss

    #with tf.variable_scope('CNN', reuse=tf.AUTO_REUSE):
    train_op = tf.train.AdamOptimizer(lr).minimize(loss)
    
    prediction = y #tf.argmax(y,1)
    correct_prediction = tf.equal(tf.argmax(y,1),tf.argmax(batch_y,1))
    accur = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    
    if is_training:
        return loss, accur, train_op, prediction
    else:
        return loss, accur, prediction

In [8]:
embed = hub.Module('https://tfhub.dev/google/universal-sentence-encoder/1')

def embedding(sentence_list):
    with tf.device("/cpu:0"):
        return embed(sentence_list)

In [9]:
fraction = 0.5
sess = tf.Session(config=tf.ConfigProto(
        #gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=fraction),
        gpu_options=tf.GPUOptions(allow_growth=True),
        log_device_placement=True))
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

In [10]:
def evaluate_by_top_score(df, pred, ratio_pred=0.1):
    date_list = pd.unique(date)

    df['pred_label'] = np.zeros(len(df), dtype=np.int32)
    top_pred = []

    for day in date_list:
        df_by_date = df[df['date'] == day]
        topk_pred = int(ratio_pred * len(df_by_date))
        top_pred.extend(df_by_date.pred.nlargest(topk_pred).index)

    df['pred_label'][top_pred] = 1 
    cross = pd.crosstab(df['label'], df['pred_label'], rownames=["Actual"], \
            colnames=["Predicted"])
    recall = cross[1][1] / (cross[1][1] + cross[0][1])
    print(cross)

    return recall 


In [11]:
ne = 100
num_steps = 100
bs = 100

df_0 = train_df[train_df['label']==0]
df_1 = train_df[train_df['label']==1]

#training
with tf.variable_scope('CNN'):
    for epoch in range(ne):
        print(epoch)
        avg_accur = 0.0
        for step in range(num_steps):
            print('    ', step)
            
            batch_df_0 = df_0.sample(n=bs//2, replace=True)
            batch_df_1 = df_1.sample(n=bs - bs//2, replace=True)
            batch_df = pd.concat([batch_df_0, batch_df_1])
            x = embedding(list(batch_df['title']))
            #x = tf.reshape(x, [bs, 512])
            y = np.array(list(batch_df['label'])).reshape([-1, 1])
            result = model(x, y, True)
            if epoch==0 and step==0:
                print('Global Initializer')
                sess.run(tf.global_variables_initializer())
            loss, accur, _, pred = sess.run(result)
            avg_accur += accur
            
            if epoch==0 and step==0:
                print('Use True')
                tf.get_variable_scope().reuse_variables()
            
        avg_accur /= num_steps
        print('Training accuracy:', avg_accur)

        x = embedding(list(valid_df['title']))
        #y = valid_df['label']
        y = np.array(list(valid_df['label'])).reshape([-1, 1])
        loss, accur, pred = sess.run(model(x, y, False))
        evaluate_by_top_score(valid_df, pred)
    

0
     0
x shape: (?, 512)
<tensorflow.python.ops.variable_scope.VariableScope object at 0x7fe875099438>
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.

Global Initializer
Use True
     1
x shape: (?, 512)
<tensorflow.python.ops.variable_scope.VariableScope object at 0x7fe875099438>


ValueError: Variable CNN/CNN/weight-1/Adam_2/ does not exist, or was not created with tf.get_variable(). Did you mean to set reuse=tf.AUTO_REUSE in VarScope?