In [93]:
import tensorflow as tf
import pandas as pd
import sys
sys.path.append('../')
from config import *
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.preprocessing import sequence
import numpy as np
from tqdm import tqdm
from IPython.display import clear_output

In [94]:
import warnings
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation and chained-assignment warnings
# raised by later cells — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [95]:
import os
# Expose only GPU 0 to this process.
# NOTE(review): tensorflow is imported in an earlier cell; presumably CUDA
# devices are initialised lazily so this still takes effect — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [96]:
# Enable eager execution: the training loop further down iterates the
# tf.data datasets directly in Python.
tf.enable_eager_execution()
# Last expression of the cell, so its boolean result is displayed as output.
tf.test.is_gpu_available()

True

In [None]:
### Occupy the GPU
# Builds a trivial add op and runs it through a Session, presumably so that
# TensorFlow claims the GPU up front.
# NOTE(review): eager execution is enabled in an earlier cell and this cell has
# no execution count; Session.run may not accept eager tensors — confirm the
# cell still works or remove it.
a = tf.constant(1)
res = tf.add(a, a)
sess = tf.Session()
sess.run(res)

In [97]:
### Hyperparameter
###################
# Checkpoint prefix used by model.save_weights (and the commented-out
# model.load_weights).
# NOTE(review): absolute, user-specific path — consider deriving it from config.
SAVE_PATH = '/home/huangzc/competition/tencent/model_ckpt/simple_nn/model.ckpt'
# Output width of the Embedding layer. The layer is initialised from a matrix
# whose width is wv_matrix.shape[1], so the two must agree.
EMB_SIZE = 50
# Number of examples per batch in the train / validation / test datasets.
BATCH_SIZE = 10240
# Number of passes over the training dataset in the training loop.
EPOCHS = 100

In [None]:
# Load the user log and the ad-info table for each split. The directory and
# file-name constants are not defined in this notebook (presumably they come
# from `config`).
tr_log = pd.read_pickle(TRAIN_DIR + USER_LOG_PATH)
tr_ad_df = pd.read_pickle(TRAIN_DIR + AD_INFO_PATH)
ts_log = pd.read_pickle(TEST_DIR + USER_LOG_PATH)
ts_ad_df = pd.read_pickle(TEST_DIR + AD_INFO_PATH)

# The two tables of a split are glued together column-wise below, so they must
# list exactly the same creative_id sequence.
for log_df, ad_df in ((tr_log, tr_ad_df), (ts_log, ts_ad_df)):
    assert log_df['creative_id'].values.tolist() == ad_df['creative_id'].values.tolist()

# Column-wise concat (aligned on the index), then keep only the columns the
# model needs; the test split carries no age / gender labels.
tr_ad_id_log = pd.concat([tr_log, tr_ad_df], axis=1)[['user_id', 'age', 'gender', 'ad_id']]
ts_ad_id_log = pd.concat([ts_log, ts_ad_df], axis=1)[['user_id', 'ad_id']]

In [141]:
# Load the ad_id -> index vocabulary and the matching embedding matrix.
#
# SECURITY NOTE(review): eval() executes whatever code the file contains. It is
# kept because the on-disk format is not visible from this notebook; if the
# file holds a plain dict literal, switch to ast.literal_eval (or json.load if
# it is real JSON).
# The context manager guarantees the handle is closed even if read/eval raises.
with open(TRAIN_DIR + 'gensim_ad_id_dict.js', 'r') as vocab_file:
    vocab_dict = eval(vocab_file.read())

wv_matrix = np.load(TRAIN_DIR + 'gensim_ad_id.npy')
vocab = vocab_dict.keys()
vocab_size = len(vocab)

# The model later appends one padding row to wv_matrix and loads the result
# into an Embedding with vocab_size + 1 rows, so the row counts must agree.
assert wv_matrix.shape[0] == vocab_size, (
    'embedding matrix has %d rows but the vocabulary has %d entries'
    % (wv_matrix.shape[0], vocab_size)
)

In [99]:
# Cast ad ids to str before the membership test against the vocabulary
# (presumably the vocabulary keys are strings — verify against the dict file).
for log_df in (tr_ad_id_log, ts_ad_id_log):
    log_df['ad_id'] = log_df['ad_id'].astype(str)

# Keep only the rows whose ad id has an entry in the vocabulary.
tr_in_vocab = tr_ad_id_log['ad_id'].isin(vocab)
ts_in_vocab = ts_ad_id_log['ad_id'].isin(vocab)
tr_ad_id_log = tr_ad_id_log[tr_in_vocab]
ts_ad_id_log = ts_ad_id_log[ts_in_vocab]

In [100]:
# Translate every ad id into its vocabulary index. A plain dict lookup is used,
# so an id missing from vocab_dict raises KeyError.
for log_df in (tr_ad_id_log, ts_ad_id_log):
    log_df['ad_id_index'] = log_df['ad_id'].map(vocab_dict.__getitem__)

In [101]:
def get_clk_list(df):
    """Return the elements of ``df.values`` as a Python list.

    Despite the parameter name, the notebook passes a pandas Series (one
    group's ``ad_id_index`` column). The elements are whatever iterating the
    underlying array yields; they are not converted to built-in Python types.
    """
    clicks = df.values
    return [click for click in clicks]

In [102]:
# Collapse the click logs to one row per user: keep the first age / gender
# value of the group and gather the user's ad indices into a list.
train_agg = {'age': 'first', 'gender': 'first', 'ad_id_index': get_clk_list}
test_agg = {'ad_id_index': get_clk_list}

grid_df = tr_ad_id_log.groupby(['user_id']).agg(train_agg).reset_index()
grid_df_ts = ts_ad_id_log.groupby(['user_id']).agg(test_agg).reset_index()

In [103]:
# Label column to predict and the model input column(s).
TARGET = 'age'
feature_columns = ['ad_id_index']

# Shift the labels down by one (presumably from 1-based to 0-based class ids
# for the sparse categorical loss — verify the raw label range).
# NOTE: not idempotent — re-running this cell shifts the labels again.
grid_df[TARGET] = grid_df[TARGET].sub(1)

In [104]:
### Split into train and valid dataset
# Fixed seed so the train / validation partition, and every metric computed on
# it, is reproducible across kernel restarts.
SPLIT_SEED = 42

train = grid_df[feature_columns+[TARGET]]
test = grid_df_ts[feature_columns]

# Hold out 20% of the labelled users for validation.
train, val = train_test_split(train, test_size=0.2, random_state=SPLIT_SEED)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

### x, y dataframe
# The test split has no label column, so only features are kept for it.
x_train, y_train = train[feature_columns], train[TARGET]
x_val, y_val = val[feature_columns], val[TARGET]
x_test = test

720000 train examples
180000 validation examples
1000000 test examples


In [105]:
# Padded sequence length: the 90th percentile of the per-user click-list
# lengths in the labelled data, truncated to an int.
click_counts = grid_df[feature_columns[0]].map(len)
sentence_size = int(click_counts.quantile(0.9))
print('choose sentences max len: %d' % sentence_size)

choose sentences max len: 65


In [106]:
print("Pad sequences (samples x time)")


def _pad_click_lists(click_lists):
    """Pad or truncate every click list to ``sentence_size`` entries.

    Padding and truncation both happen at the end of the sequence. The pad
    value is ``vocab_size``, i.e. the extra last row of the embedding table,
    and the result is an int64 array.
    """
    return sequence.pad_sequences(click_lists,
                                  maxlen=sentence_size,
                                  padding='post',
                                  truncating='post',
                                  dtype='int64',
                                  value=float(vocab_size)
                                  )


### feature_columns[0] means click list
# Read from the split frames (train / val / test) rather than from
# x_train / x_val / x_test: those names are rebound to arrays right here, so
# indexing them by column name would fail when the cell is run a second time.
x_train = _pad_click_lists(train[feature_columns[0]])
x_val = _pad_click_lists(val[feature_columns[0]])
x_test = _pad_click_lists(test[feature_columns[0]])
print("x_train shape:", x_train.shape)
print("x_val shape:", x_val.shape)
print("x_test shape:", x_test.shape)

Pad sequences (samples x time)
x_train shape: (720000, 65)
x_val shape: (180000, 65)
x_test shape: (1000000, 65)


In [107]:
### Get tf dataset
def get_train_ds(x, y, shuffle=True):
    """Build a batched ``tf.data.Dataset`` of ``(features, labels)`` pairs.

    Args:
        x: feature array, sliced along its first axis.
        y: label array with the same first-axis length as ``x``.
        shuffle: when True (the default, matching the previous behaviour) the
            examples are shuffled with a buffer covering the whole dataset.
            Pass False for evaluation data, where shuffling only costs time
            and memory and makes the batch composition vary between runs.

    Returns:
        A dataset yielding batches of at most ``BATCH_SIZE`` examples.
    """
    dataset = tf.data.Dataset.from_tensor_slices((x, y))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(x))
    return dataset.batch(BATCH_SIZE)

def get_test_ds(x, ):
    """Build a batched, unshuffled dataset of features only.

    Each element is a 1-tuple ``(features,)`` because the tensors are sliced
    from a 1-tuple.
    """
    dataset = tf.data.Dataset.from_tensor_slices((x, )).batch(BATCH_SIZE)
    return dataset

train_ds = get_train_ds(x_train, y_train.values)
# Validation data is only evaluated, so it is batched without shuffling.
valid_ds = get_train_ds(x_val, y_val.values, shuffle=False)
test_ds = get_test_ds(x_test)

In [142]:
# Append one extra row to the embedding matrix for the padding index (the
# sequences are padded with vocab_size, which is one past the last row of
# wv_matrix when that matrix has vocab_size rows). The row is drawn uniformly
# from [0, 1) and is not seeded.
pad_row = np.random.rand(1, wv_matrix.shape[1])
weights = np.vstack([wv_matrix, pad_row])

In [143]:
### Construct Model
#################################
class MyModel(tf.keras.Model):
    """Average-of-embeddings classifier over a padded click sequence.

    Pipeline: embedding lookup -> mean over the sequence axis -> 16-unit
    leaky-ReLU layer -> 10-way softmax. No masking is configured, so padded
    positions take part in the average.
    """

    def __init__(self):
        super(MyModel, self).__init__()
        # vocab_size + 1 rows: the extra row serves the padding index. Initial
        # values come from the module-level `weights` matrix.
        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocab_size+1,
            output_dim=EMB_SIZE,
            weights=[weights],
        )
        self.pool1D = tf.keras.layers.GlobalAveragePooling1D()
        self.dense1 = tf.keras.layers.Dense(units=16, activation=tf.nn.leaky_relu)
        self.dense2 = tf.keras.layers.Dense(units=10, activation='softmax')

    def call(self, inputs):
        """Return the softmax output of the final dense layer for ``inputs``."""
        hidden = inputs
        for layer in (self.embedding, self.pool1D, self.dense1, self.dense2):
            hidden = layer(hidden)
        return hidden

model = MyModel()

In [136]:
# Restore the weights
# model.load_weights(SAVE_PATH)

In [144]:
### Loss & Metric
# Sparse categorical cross-entropy with default arguments; labels are integer
# class ids and the model already outputs softmax probabilities.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()

# Adam with its default hyperparameters.
optimizer = tf.keras.optimizers.Adam()

# Running aggregates over the training batches; the training loop resets them
# at the start of every epoch.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

# Same aggregates for the validation batches.
valid_loss = tf.keras.metrics.Mean(name='valid_loss')
valid_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='valid_accuracy')

In [145]:
@tf.function
def train_step(features, labels):
    """Run one optimisation step on a batch and update the training metrics.

    Performs a forward pass under a gradient tape, applies the gradients of the
    loss to the model's trainable variables, then feeds the batch loss and the
    predictions into ``train_loss`` / ``train_accuracy``.
    """
    with tf.GradientTape() as grad_tape:
        probs = model(features)
        batch_loss = loss_object(labels, probs)

    # Read the variable list only after the forward pass has run.
    train_vars = model.trainable_variables
    grads = grad_tape.gradient(batch_loss, train_vars)
    optimizer.apply_gradients(zip(grads, train_vars))

    train_loss(batch_loss)
    train_accuracy(labels, probs)

In [146]:
@tf.function
def valid_step(features, labels):
    """Evaluate one batch and update the validation metrics.

    Runs a forward pass only, then feeds the batch loss and the predictions
    into ``valid_loss`` / ``valid_accuracy``.
    """
    probs = model(features)
    valid_loss(loss_object(labels, probs))
    valid_accuracy(labels, probs)

In [None]:
for epoch in range(EPOCHS):
    # Reset the evaluation metrics at the start of every epoch
    for metric in (train_loss, train_accuracy, valid_loss, valid_accuracy):
        metric.reset_states()

    CNT = 0
    for CNT, (features, labels) in enumerate(train_ds, start=1):
        train_step(features, labels)
        # train_loss is a running mean over the batches seen so far in this
        # epoch, not the loss of the current batch alone.
        print('batch: %d, batch loss: %.4f' % (CNT, train_loss.result()))

    for val_features, val_labels in valid_ds:
        valid_step(val_features, val_labels)

    # End-of-epoch summary; accuracies are reported as percentages.
    template = 'Epoch {}, Loss: {:.2f}, Accuracy: {:.2f}%, Valid Loss: {:.2f}, Valid Accuracy: {:.2f}%'
    epoch_summary = template.format(
        epoch+1,
        train_loss.result(),
        train_accuracy.result()*100,
        valid_loss.result(),
        valid_accuracy.result()*100,
    )
    print (epoch_summary)

batch: 1, batch loss: 2.4333
batch: 2, batch loss: 2.4251
batch: 3, batch loss: 2.4207


win:10, 39.78 <br>
win: 2, 39.38 <br>
win:15, 40.18 <br>
win:20, 

In [None]:
# Save the model weights to the checkpoint prefix SAVE_PATH.
model.save_weights(SAVE_PATH)