In [6]:
import tensorflow as tf
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
    """Draw the training and validation curves of one metric and show the figure.

    Args:
        history: object exposing a ``history`` mapping from metric name to a
            per-epoch sequence of values (the object returned by ``model.fit``).
        metric: name of the training metric; the validation series is looked
            up under ``'val_' + metric``.
    """
    val_metric = 'val_'+metric
    per_epoch = history.history

    plt.plot(per_epoch[metric])
    plt.plot(per_epoch[val_metric], '')

    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, val_metric])
    plt.show()

import pandas as pd
import sys
sys.path.append('../')

from config import *
from sklearn.model_selection import train_test_split

import numpy as np

In [7]:
import random

In [8]:
from tensorflow.python.keras.preprocessing import sequence

In [9]:
# Per-user creative_id click tables for the train and test sets.
tr_clk_list = pd.read_pickle(TRAIN_DIR + CLK_PATH_DICT['creative_id'])

# One row per user_id, keeping the first `age` value recorded for that user.
tr_user = (
    pd.read_pickle(TRAIN_DIR + USER_PATH)
    .groupby(['user_id'])
    .agg({'age': 'first'})
    .reset_index()
)

ts_clk_list = pd.read_pickle(TEST_DIR + CLK_PATH_DICT['creative_id'])

In [10]:
# The click table and the user table must list the same users in the same row
# order, because the age column is attached by position just below.
assert tr_clk_list['user_id'].values.tolist() == tr_user['user_id'].values.tolist()

# Attach `age` positionally (via the raw values) rather than with
# pd.concat(axis=1): concat aligns rows on index labels, so a non-default index
# on the pickled click table would silently misalign or NaN-fill the labels
# even though the positional check above passed.
df = tr_clk_list.assign(age=tr_user['age'].values)

# Free the source tables; only `df` is used from here on.
del tr_clk_list, tr_user

In [11]:
### split train validation dataset
####################################
# Fixed seed so the 80/20 split -- and every metric computed on it -- is the
# same on each Restart & Run All instead of changing from run to run.
SPLIT_SEED = 42
train, val = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)

In [None]:
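### Disabled experiment: shuffle each click list in place (random.shuffle mutates its argument and returns None).
### NOTE: these lines read a 'clk_list' column, while the active cells read 'creative_id' -- confirm the column name before re-enabling.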
# train['clk_list'].map(lambda x: random.shuffle(x))
# val['clk_list'].map(lambda x: random.shuffle(x))
# ts_clk_list['clk_list'].map(lambda x: random.shuffle(x))

In [13]:
# Inputs are the per-user creative_id sequences; labels are `age` shifted down
# by one (presumably ages are stored 1-based, giving 0-based class ids -- confirm).
tr_x = train['creative_id']
tr_y = train['age'] - 1

vl_x = val['creative_id']
vl_y = val['age'] - 1

# The test set has inputs only.
ts_x = ts_clk_list['creative_id']

In [14]:
# Length of every click sequence in the labelled data and in the test data.
labelled_lengths = df['creative_id'].map(len)
test_lengths = ts_clk_list['creative_id'].map(len)

# 99.5th percentile of labelled lengths and 99th percentile of test lengths,
# truncated to integers, as a guide for choosing the padded sequence length.
sentence_size = int(labelled_lengths.quantile(0.995))
tmp = int(test_lengths.quantile(0.99))
print('choose sentences max len: %d, test sentences max len: %d' % (sentence_size, tmp))

choose sentences max len: 192, test sentences max len: 157


In [15]:
### Hyperparameter
############################
# Fixed length every click sequence is padded or truncated to (passed as
# `maxlen` when padding the train/val/test inputs).
SENTENCE_SIZE = 200
# Number of examples per batch in the tf.data pipelines.
BATCH_SIZE = 1024

In [16]:
print("Pad sequences (samples x time)")


def _pad_click_lists(click_lists):
    """Pad or truncate every click list to exactly SENTENCE_SIZE ids.

    Sequences shorter than SENTENCE_SIZE are right-padded with 0 and longer
    ones keep only their first SENTENCE_SIZE ids.

    Args:
        click_lists: iterable of integer id sequences of varying length.

    Returns:
        int64 array of shape (number of sequences, SENTENCE_SIZE).
    """
    # Use the public tf.keras entry point instead of the private
    # tensorflow.python.keras module path.
    return tf.keras.preprocessing.sequence.pad_sequences(
        click_lists,
        maxlen=SENTENCE_SIZE,
        padding='post',
        truncating='post',
        dtype='int64',
        value=0,
    )


# Same padding for all three splits so they share one input shape.
tr_x = _pad_click_lists(tr_x)
vl_x = _pad_click_lists(vl_x)
ts_x = _pad_click_lists(ts_x)

print("x_train shape:", tr_x.shape)
print("x_val shape:", vl_x.shape)
print("x_test shape:", ts_x.shape)

Pad sequences (samples x time)
x_train shape: (720000, 200)
x_val shape: (180000, 200)
x_test shape: (1000000, 200)


In [17]:
### Get tf dataset
def get_train_ds(x, y):
    """Build a shuffled, batched dataset of (input, label) pairs.

    Args:
        x: array-like of inputs, sliced along its first axis.
        y: array-like of labels, one per row of ``x``.

    Returns:
        A ``tf.data.Dataset`` that shuffles with a buffer as large as ``x``
        and then groups the pairs into batches of BATCH_SIZE.
    """
    n_examples = len(x)
    pairs = tf.data.Dataset.from_tensor_slices((x, y))
    return pairs.shuffle(buffer_size=n_examples).batch(BATCH_SIZE)

def get_test_ds(x):
    """Build an unshuffled, batched dataset of inputs only.

    Args:
        x: array-like of inputs, sliced along its first axis.

    Returns:
        A ``tf.data.Dataset`` yielding batches of BATCH_SIZE rows in their
        original order.
    """
    rows = tf.data.Dataset.from_tensor_slices(x)
    return rows.batch(BATCH_SIZE)

# Build the tf.data pipelines; labels are handed over as plain arrays via `.values`.
tr_ds = get_train_ds(tr_x, tr_y.values)
# NOTE(review): the validation split also goes through get_train_ds, so it is
# shuffled as well -- confirm that is intended.
vl_ds = get_train_ds(vl_x, vl_y.values)
# The test split has no labels, so it uses the inputs-only pipeline.
ts_ds = get_test_ds(ts_x)

In [18]:
### Get vocab size
##############################################
tr_ad_df = pd.read_pickle(TRAIN_DIR+AD_PATH)
ts_ad_df = pd.read_pickle(TEST_DIR+AD_PATH)

# Only the creative_id column is needed, so concatenate just that column and
# take a vectorised max instead of concatenating both full frames and building
# a Python list of every unique id.
all_creative_ids = pd.concat([tr_ad_df['creative_id'], ts_ad_df['creative_id']], axis=0)

# +1 because id 0 is reserved for padding, so the embedding table needs
# max_id + 1 rows. int() keeps vocab_size a plain Python integer.
vocab_size = int(all_creative_ids.max()) + 1

print('vocab size: %d' % vocab_size)

vocab size: 4445721


In [23]:
# Sequence classifier over the padded click lists: embedding -> BiLSTM -> dense head.
model = tf.keras.Sequential([
    # The inputs are right-padded with id 0, so mask that id: without
    # mask_zero=True the LSTM treats the padding steps as real clicks (the
    # backward direction would even start on them).
    # NOTE(review): assumes no real creative_id equals 0 -- confirm.
    tf.keras.layers.Embedding(vocab_size, 64, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.1)),
    # 10 raw logits, one per class (no softmax here; the loss must be
    # configured with from_logits=True).
    tf.keras.layers.Dense(10)
])

In [24]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 64)          284526144 
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               66048     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 10)                650       
Total params: 284,601,098
Trainable params: 284,601,098
Non-trainable params: 0
_________________________________________________________________


In [20]:
# The model's last layer emits raw logits, hence from_logits=True; labels are
# integer class ids, hence the sparse variant of the loss.
sparse_ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
adam = tf.keras.optimizers.Adam(1e-4)

model.compile(
    loss=sparse_ce,
    optimizer=adam,
    metrics=['accuracy'],
)

In [None]:
# Train for 10 passes over tr_ds. The returned object exposes per-epoch metrics
# under `history.history[...]`, the shape plot_graphs reads.
# validation_steps=30 limits each end-of-epoch validation pass to 30 batches of
# vl_ds instead of the whole validation set.
history = model.fit(tr_ds, epochs=10,
                    validation_data=vl_ds, 
                    validation_steps=30)

In [None]:
# Score the model on the complete validation dataset. These are validation
# metrics (the test data in this notebook carries no labels), so the printed
# labels say "Validation" rather than "Test".
val_loss, val_acc = model.evaluate(vl_ds)

print('Validation Loss: {}'.format(val_loss))
print('Validation Accuracy: {}'.format(val_acc))