In [4]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 100

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import sys, os, gc, types
import time
from subprocess import check_output
import tensorflow as tf

from datareader_rnn import DataReader
from tf_basemodel import TFBaseModel
from tf_utils import shape

In [6]:
# Candidate data directories, one per machine this notebook has been run on.
root_paths = [
    "/Users/jiayou/Dropbox/JuanCode/Kaggle/Wikipedia/data2/", # Mac
    "/Users/jiayou/Dropbox/Documents/JuanCode/Kaggle/Wikipedia/data2/", # 1080
    '/Users/junxie/Dropbox/JuanCode/Insight/project/data_mini/', # pro
    '/mnt/WD Black/Dropbox/JuanCode/Insight/Project/data_mini/', # paperspace
]
# Pick the first candidate that exists on this machine; None when none do.
root = next(
    (candidate for candidate in root_paths if os.path.exists(candidate)),
    None,
)
print(root)

/Users/junxie/Dropbox/JuanCode/Insight/project/data_mini/


In [None]:
class WikiRNN(TFBaseModel):
    """Stacked-LSTM model that predicts a series one day at a time.

    At every step the network sees the value carried over from the previous
    step together with one-hot date and page features, and emits one number.
    While ``step < given_days`` the value carried to the next step is the
    observed one from ``data``; afterwards the network's own output is fed
    back in.
    """

    def __init__(self, state_size, keep_prob=1, **kwargs):
        """Store the architecture settings and initialise the base model.

        Args:
            state_size: sequence with one LSTM size per stacked layer.
            keep_prob: output keep probability passed to each DropoutWrapper.
            **kwargs: forwarded unchanged to ``TFBaseModel.__init__``.
        """
        # Set before calling the base constructor — presumably it builds the
        # graph (and therefore calls calculate_loss) during __init__.
        self.state_size = state_size
        self.keep_prob = keep_prob
        # Name the class explicitly: super(type(self), self) resolves to the
        # *runtime* type, so any subclass of WikiRNN would call this very
        # __init__ again and recurse without end.
        super(WikiRNN, self).__init__(**kwargs)

    def calculate_loss(self):
        """Build the graph and return the scalar training loss.

        Creates the input placeholders as attributes, unrolls the LSTM stack
        with ``tf.while_loop`` for ``days`` steps, stores the per-step outputs
        in ``self.preds`` (indexed ``[batch, day]``) and the last RNN state in
        ``self.final_state``.

        Returns:
            Mean absolute error between outputs and ``data``, averaged over
            the steps with ``step >= no_loss_days``.
        """
        # Series values, indexed as data[batch, day] below.
        self.data = tf.placeholder(tf.float32, name='data')
        # Steps for which the observed value (not the prediction) is fed forward.
        self.given_days = tf.placeholder(tf.int32, name='given_days')
        # Steps at the start of the sequence that do not contribute to the loss.
        self.no_loss_days = tf.placeholder(tf.int32, name='no_loss_days')
        # Total number of steps to unroll.
        self.days = tf.placeholder(tf.int32, name='days')
        batch_size = tf.shape(self.data)[0]

        # Features
        # Date features: one entry per day, shared by every series in the batch.
        self.dayofweek = tf.placeholder(tf.int32, [None])
        self.isweekday = tf.placeholder(tf.int32, [None])
        self.month = tf.placeholder(tf.int32, [None])

        # Page features: one entry per series in the batch.
        self.domain = tf.placeholder(tf.int32, [None])
        self.agent = tf.placeholder(tf.int32, [None])
        self.access = tf.placeholder(tf.int32, [None])

        dayofweek_oh = tf.one_hot(self.dayofweek, 7)
        isweekday_oh = tf.one_hot(self.isweekday, 2)
        month = tf.one_hot(self.month, 13)

        domain = tf.one_hot(self.domain, 9)
        agent = tf.one_hot(self.agent, 2)
        access = tf.one_hot(self.access, 3)

        date_features = tf.concat(
            [
                dayofweek_oh,
                isweekday_oh,
                month,
            ],
            axis=1,
        )
        # [days, 22] -> [batch_size, days, 22]
        date_features = tf.tile(tf.expand_dims(date_features, 0), [batch_size, 1, 1])

        page_features = tf.concat(
            [
                domain,
                agent,
                access,
            ],
            axis=1,
        )
        # [batch_size, 14] -> [batch_size, days, 14]
        page_features = tf.tile(tf.expand_dims(page_features, 1), [1, self.days, 1])

        # [batch_size, days, 36]
        features = tf.concat([date_features, page_features], axis=2)

        cells = []
        for i in range(len(self.state_size)):
            c = tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.LSTMCell(
                    self.state_size[i],
                ),
                output_keep_prob=self.keep_prob,
            )
            # Every layer but the first is wrapped so that its input is added
            # to its output.
            if i != 0:
                c = tf.nn.rnn_cell.ResidualWrapper(c)
            cells.append(c)
        cell = tf.nn.rnn_cell.MultiRNNCell(cells)

        # ([batch_size, state_size])
        state = cell.zero_state(tf.shape(self.data)[0], dtype=tf.float32)
        # [batch_size, 1]
        last_output = tf.zeros([tf.shape(self.data)[0], 1], dtype=tf.float32)

        loss = tf.constant(0, dtype=tf.float32)
        step = tf.constant(0, dtype=tf.int32)
        output_ta = tf.TensorArray(size=self.days, dtype=tf.float32)

        def cond(last_output, state, loss, step, output_ta):
            """Keep looping until every day has been processed."""
            return step < self.days

        def body(last_output, state, loss, step, output_ta):
            """Run one time step and return the updated loop variables."""
            inp = tf.concat(
                [
                    last_output,
                    features[:, step, :],
                ],
                axis=1
            )

            output, state = cell(inp, state)
            output = tf.layers.dense(
                output,
                1,
                name='dense-top'
            )
            # Stored as [1, batch_size] so concat() yields [days, batch_size].
            output_ta = output_ta.write(step, tf.transpose(output))

            # Value handed to the next step: the observation while it is
            # "given", the model's own output afterwards.
            last_output = tf.cond(
                step < self.given_days,
                lambda: tf.expand_dims(self.data[:,step], 1),
                lambda: output,
            )
            # tf.cond loses the static shape that while_loop needs to stay invariant.
            last_output.set_shape([None, 1])

            true = tf.expand_dims(self.data[:,step], 1)
            # L1 loss, accumulated only once the no-loss prefix is over.
            # Alternative (SMAPE-style) term that was tried:
            #   tf.reduce_mean(2 * tf.abs(true - output) / tf.maximum(1e-6, true + output))
            loss = tf.cond(
                step >= self.no_loss_days,
                lambda: loss + tf.reduce_mean(tf.abs(true - output)),
                lambda: loss
            )
            loss.set_shape([])

            return (last_output, state, loss, step + 1, output_ta)

        _, self.final_state, loss, _, output_ta = tf.while_loop(
            cond=cond,
            body=body,
            loop_vars=(last_output, state, loss, step, output_ta)
        )

        # [days, batch_size] -> [batch_size, days]
        self.preds = tf.transpose(output_ta.concat())
        self.prediction_tensors = {
            'preds': self.preds
        }

        # Average the summed per-step losses over the steps that contributed.
        loss = loss / tf.cast(self.days - self.no_loss_days, tf.float32)
        return loss

    def predict(self, batch_size=1000, num_batches=None):
        """Run the model over the reader's test batches.

        Args:
            batch_size: batch size requested from ``reader.test_batch_generator``.
            num_batches: if not None, stop after this many batches; None
                (the default) consumes the whole generator.

        Returns:
            ``(preds, states)``: ``preds`` is all batch predictions, passed
            through ``reader.inverse_transform`` and concatenated along
            axis 0; ``states`` is a list holding the final RNN state of each
            batch.

        Raises:
            ValueError: from ``np.concatenate`` when no batch was processed.
        """
        preds = []
        states = []
        test_generator = self.reader.test_batch_generator(batch_size)
        for i, test_batch_df in enumerate(test_generator):
            # Honour the batch limit (the argument used to be ignored).
            if num_batches is not None and i >= num_batches:
                break

            # Feed every batch entry whose key names an attribute of the model.
            test_feed_dict = {
                getattr(self, placeholder_name, None): data
                for placeholder_name, data in test_batch_df.items() if hasattr(self, placeholder_name)
            }

            batch_preds, batch_states = self.session.run(
                fetches=[self.preds, self.final_state],
                feed_dict=test_feed_dict
            )

            # Undo the per-series scaling using the statistics shipped with the batch.
            sc_std = test_batch_df['sc_std']
            sc_mean = test_batch_df['sc_mean']
            batch_preds = self.reader.inverse_transform(batch_preds, sc_std=sc_std, sc_mean=sc_mean)

            preds.append(batch_preds)
            states.append(batch_states)
            print('batch {} processed'.format(i))

        return (np.concatenate(preds, axis=0), states)


In [None]:
def get_nn(reader):
    """Create the `nn_v1` WikiRNN model reading its batches from `reader`."""
    # 2000 steps make an epoch for all data (200 steps for mini data)
    options = dict(
        name='nn_v1',
        reader=reader,
        work_dir='./tf-data',
        optimizer='adam',
        learning_rate=.001,
        batch_size=128,
        num_validation_batches=1,
        num_training_steps=1000000,
        early_stopping_steps=2000,
        num_restarts=3,
        warm_start_init_step=0,
        regularization_constant=0.0,
        enable_parameter_averaging=False,
        min_steps_to_checkpoint=2000,
        loss_averaging_window=2000,
        log_interval=100,
    )
    # Architecture: two stacked LSTM layers of 300 units, no dropout.
    architecture = dict(
        state_size=[300, 300],
        keep_prob=1,
    )
    options.update(architecture)
    return WikiRNN(**options)

# Reader for training. It is built from the resolved data directory `root`,
# exactly like the other DataReader(...) cells in this notebook: the previous
# arguments (`data`, `fpage`, `fdate`) are not defined anywhere in the
# notebook, so this cell raised NameError on a fresh kernel.
reader = DataReader(
    root,
    min_train_days=100,
    max_train_days=700,
    train_predict_days=72,
    train_loss_days=72,
    val_days=72,

    predict_days=64,
    predict_warmup_days=803,
    seed=923
)

nn = get_nn(reader)

In [None]:
def smape(true, pred, sc_std, sc_mean, reader):
    """Element-wise SMAPE (in percent) between `true` and `pred`.

    Both arrays are first mapped back through `reader.inverse_transform`
    with the given `sc_std` / `sc_mean`; predictions are then clipped at zero
    and rounded. The denominator is floored at 1e-10 to avoid dividing by
    zero. Returns the scores flattened to one dimension.
    """
    scaling = dict(sc_std=sc_std, sc_mean=sc_mean)
    actual = reader.inverse_transform(true, **scaling)
    forecast = reader.inverse_transform(pred, **scaling)
    forecast = np.round(np.maximum(0, forecast))
    denominator = np.maximum(1e-10, actual + forecast)
    scores = np.abs(actual - forecast) * 200 / denominator
    return scores.reshape(-1)
    

# Train the model and, each time `fit` yields, score it on one validation batch.
val_gen = reader.val_batch_generator(2000)
smapes = []

# `fit` hands control back during training — presumably every
# `yield_interval` steps; confirm against TFBaseModel.fit.
for step in nn.fit(yield_interval=20):
    val_batch_df = next(val_gen)
    # Feed every batch entry whose key names an attribute of the model.
    feed_dict = {
        getattr(nn, placeholder_name, None): data
        for placeholder_name, data in val_batch_df.items() if hasattr(nn, placeholder_name)
    }

    loss_l1, preds = nn.session.run(
        fetches=[nn.loss, nn.preds],
        feed_dict=feed_dict
    )
    # SMAPE is measured on the last `train_loss_days` days only.
    loss_smape = smape(
        val_batch_df['data'][:, -reader.train_loss_days:],
        preds[:, -reader.train_loss_days:],
        sc_std=val_batch_df['sc_std'],
        sc_mean=val_batch_df['sc_mean'],
        reader=reader
    ).mean()
    smapes.append(loss_smape)
    plt.figure(figsize=(20, 5))
    plt.title('smape')
    plt.plot(smapes, '*-')
    plt.show()

    print('val: smape = {}, l1 = {}'.format(loss_smape, loss_l1))

    # Plot one randomly chosen series (raise the range to plot more).
    for i in range(1):
        # Draw the index from the rows actually present in this batch; the
        # previous hard-coded upper bound of 1000 raised IndexError on
        # smaller batches and never sampled rows beyond the first 1000.
        idx = np.random.randint(0, preds.shape[0])
        plt.figure(figsize=(20, 5))
        true = val_batch_df['data'][idx, :]
        plt.plot(true, 'g--')
        pred = preds[idx, :]
        plt.plot(pred, 'k.')
        plt.show()

In [None]:
# Re-create `reader` from the data directory in `root`, replacing any earlier
# instance. The keyword values match the training configuration used earlier
# in the notebook; their exact meaning is defined by DataReader (not visible
# here).
reader = DataReader(
    root,
    min_train_days=100, 
    max_train_days=700, 
    train_predict_days=72, 
    train_loss_days=72,
    val_days=72,
    
    predict_days=64,
    predict_warmup_days=803,
    seed=923
)

In [None]:
# Build a fresh WikiRNN bound to the current `reader`.
nn = get_nn(reader)

In [None]:
# Load previously saved weights into the model — presumably the latest
# checkpoint under the model's work_dir; confirm against TFBaseModel.restore.
nn.restore()

In [None]:
# Run inference over the reader's test batches: `preds` holds the
# inverse-transformed predictions, `states` the final RNN state of each batch.
preds, states = nn.predict()

In [None]:
# Save the rounded predictions for the last 62 columns (days) to
# <root>/pred.<model name>.npy.
np.save(os.path.join(root, 'pred.{}.npy'.format(nn.name)), np.round(preds[:,-62:]))

In [None]:
# Second prediction pass, run only to collect the final RNN states used as
# page features further down. The reader is rebuilt with 62-day windows, no
# predict days and a warm-up of 793 - 62 days.
# NOTE(review): the meaning of 793 and 62 is not visible in this notebook —
# presumably the series length and the forecast horizon; confirm.
reader = DataReader(
    root, 
    min_train_days=100, 
    max_train_days=700, 
    train_predict_days=62, 
    train_loss_days=62,
    val_days=62,
    
    predict_days=0,
    predict_warmup_days=793 - 62,
    seed=923
)

# New model instance bound to the new reader.
nn = get_nn(reader)

# Load saved weights, then run inference; `states` holds one final RNN state
# per test batch.
nn.restore()
preds, states = nn.predict()

In [None]:
# For every batch, join the cell state (`.c`) of each LSTM layer side by side
# (axis 1); then stack the batches on top of each other (axis 0).
concat_list = [
    np.concatenate([layer_state.c for layer_state in batch_states], axis=1)
    for batch_states in states
]
page_ft = np.concatenate(concat_list, axis=0)
page_ft.shape

In [None]:
# Save the concatenated LSTM cell states to <root>/states.<model name>.npy.
np.save(os.path.join(root, 'states.{}.npy'.format(nn.name)), page_ft)