In [1]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.preprocessing import sequence
import numpy as np
from tqdm import tqdm
from IPython.display import clear_output

tf.enable_eager_execution()
tf.test.is_gpu_available()

import sys
sys.path.append('../')

from config import *

In [2]:
# import os
# os.environ["CUDA_VISIBLE_DEVICES"]="-1" 

In [44]:
### Hyperparameter
###################
SAVE_PATH = '/home/huangzc/competition/tencent/model_ckpt/simple_nn/model_all.ckpt'
EMB_SIZE = 50
BATCH_SIZE = 1024
EPOCHS = 100
TARGET = 'age'
COLS_NAME = ['creative_id', 'advertiser_id', 'ad_id', 'product_id', 'product_category', 'industry']

In [4]:
### Read data
### TARGET DF
tr_user_df = pd.read_pickle(TRAIN_DIR+USER_PATH)
tr_user_df = tr_user_df.groupby(['user_id']).agg({'age': 'first', 'gender': 'first'}).reset_index()

In [5]:
tr_ad_df = pd.read_pickle(TRAIN_DIR+AD_PATH)
ts_ad_df = pd.read_pickle(TEST_DIR+AD_PATH)

In [6]:
train_df = []
test_df = []
for col in tqdm(COLS_NAME):
    train_df.append(pd.read_pickle(TRAIN_DIR+CLK_PATH_DICT[col]))
    test_df.append(pd.read_pickle(TRAIN_DIR+CLK_PATH_DICT[col]))
    
train_df.append(tr_user_df)

100%|██████████| 6/6 [04:03<00:00, 40.64s/it]


In [7]:
def get_grid_df(creative_df, advertiser_df, ad_df, product_id_df, product_cat_df, industry_df, user_df=None):
    if user_df is  None:
        user_df = creative_df[['user_id']]
        user_df[TARGET] = np.nan
    assert user_df['user_id'].values.tolist() == creative_df['user_id'].values.tolist() \
    == ad_df['user_id'].values.tolist() == product_id_df['user_id'].values.tolist() \
    == product_cat_df['user_id'].values.tolist() == industry_df['user_id'].values.tolist() \
    == advertiser_df['user_id'].values.tolist()

    del advertiser_df['user_id'], ad_df['user_id'], product_id_df['user_id'], product_cat_df['user_id'], industry_df['user_id']
    
    grid_df = pd.concat([creative_df, advertiser_df, 
                         ad_df, product_id_df, 
                         product_cat_df, industry_df,
                         user_df[[TARGET]]], axis=1)

    return grid_df

In [8]:
grid_df = get_grid_df(*train_df)
grid_df_test = get_grid_df(*test_df)
grid_df[TARGET] = grid_df[TARGET] - 1

In [9]:
train, val = train_test_split(grid_df, test_size=0.2, random_state=2020)
test = grid_df_test

In [10]:
def pad_feature():  
    x_train = []
    x_val = []
    x_test = []
    
    sentence_size = max(int(grid_df[COLS_NAME[0]].map(lambda x: len(x)).quantile(0.99)), 
                        int(grid_df_test[COLS_NAME[0]].map(lambda x: len(x)).quantile(0.99)))
    print('choose sentences max len: %d' % (sentence_size))
    print("Pad sequences (samples x time)")      
    for col in tqdm(COLS_NAME):
        x_train.append(sequence.pad_sequences(train[col],
                                             maxlen=sentence_size, 
                                             padding='post', 
                                             truncating='post',
                                             dtype='int64',
                                             value=0
                                             ))
        x_val.append(sequence.pad_sequences(val[col],
                                             maxlen=sentence_size, 
                                             padding='post', 
                                             truncating='post',
                                             dtype='int64',
                                             value=0
                                             ))
        x_test.append(sequence.pad_sequences(test[col], 
                                            maxlen=sentence_size, 
                                            padding='post',
                                            truncating='post',
                                            dtype='int64',
                                            value=0
                                           ))
    print('feature count: train->%d, valid->%d, test->%d' %(len(x_train), len(x_val), len(x_test)))
    return x_train, x_val, x_test

In [11]:
x_train, x_val, x_test = pad_feature()

  0%|          | 0/6 [00:00<?, ?it/s]

choose sentences max len: 156
Pad sequences (samples x time)


100%|██████████| 6/6 [02:08<00:00, 21.45s/it]

feature count: train->6, valid->6, test->6





In [45]:
### Get tf dataset
def get_train_ds(x, y): 
    dataset = tf.data.Dataset.from_tensor_slices((x, y))
    dataset = dataset.shuffle(buffer_size=len(x))
    dataset = dataset.batch(BATCH_SIZE)
#     dataset = dataset.repeat(EPOCHS)
    return dataset

def get_test_ds(x, ): 
    dataset = tf.data.Dataset.from_tensor_slices(x).batch(BATCH_SIZE)
    return dataset

train_ds = get_train_ds(tuple(x_train), train[TARGET].values)
valid_ds = get_train_ds(tuple(x_val), val[TARGET].values)
test_ds = get_test_ds(tuple(x_test))

In [13]:
vocab_sizes = []
temp = pd.concat([tr_ad_df, ts_ad_df], axis=0)
for col in COLS_NAME:
    print(col)
    vocab_sizes.append(max(temp[col].unique().tolist()) + 1) ### padding 0 need add 1

creative_id
advertiser_id
ad_id
product_id
product_category
industry


In [14]:
# weights = np.load('/home/huangzc/competition/tencent/data/train_preliminary/gensim_dict.npy')

In [None]:
### Construct Model
#################################
class MyModel(tf.keras.Model):
    def __init__(self):
        super(MyModel, self).__init__()
#         self.embedding = tf.keras.layers.Embedding(vocab_size, EMB_SIZE, weights=[weights])
        self.embeddings = []
        for s in vocab_sizes:
            self.embeddings.append(tf.keras.layers.Embedding(s, EMB_SIZE))
        self.pool1D = tf.keras.layers.GlobalAveragePooling1D()
        self.concat = tf.keras.layers.Concatenate()
        self.dense1 = tf.keras.layers.Dense(128, activation='relu')
        self.dense2 = tf.keras.layers.Dense(32, activation='relu')
        self.dense3 = tf.keras.layers.Dense(10, activation='softmax')

    def call(self, inputs):
        embs = []
        for emb, inp in zip(self.embeddings, inputs):
            x = emb(inp)            
            x = self.pool1D(x)
            embs.append(x)
        x = self.concat(embs)
        x = self.dense1(x)
        x = self.dense2(x)
        return self.dense3(x)

model = MyModel()

In [59]:
# # Restore the weights
model.load_weights(SAVE_PATH)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f07e25529e8>

In [17]:
### Loss & Metric
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()

optimizer = tf.keras.optimizers.Adam()

train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

valid_loss = tf.keras.metrics.Mean(name='valid_loss')
valid_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='valid_accuracy')

In [60]:
@tf.function
def train_step(features, labels):
    with tf.GradientTape() as tape:
        predictions = model(features)
        loss = loss_object(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss(loss)
    train_accuracy(labels, predictions)

In [61]:
@tf.function
def valid_step(features, labels):
    predictions = model(features)
    v_loss = loss_object(labels, predictions)

    valid_loss(v_loss)
    valid_accuracy(labels, predictions)

In [62]:
for epoch in range(EPOCHS):
  # 在下一个epoch开始时，重置评估指标
    train_loss.reset_states()
    train_accuracy.reset_states()
    valid_loss.reset_states()
    valid_accuracy.reset_states()

    CNT = 0
    for features, labels in train_ds:
        train_step(features, labels)
        CNT += 1
        print('batch: {:d}, batch train loss: {:.2f}, train acc: {:.2f}%'.format(CNT, train_loss.result(), train_accuracy.result()*100))  

        if CNT % 50 == 0: 
            for val_features, val_labels in valid_ds:
                valid_step(val_features, val_labels)
            print('##################################################')
            print('batch: {:d}, batch valid loss: {:.2f}, valid acc: {:.2f}%'.format(CNT, valid_loss.result(), valid_accuracy.result()*100)) 
            print('##################################################')
            
    template = 'Epoch {}, Loss: {:.2f}, Accuracy: {:.2f}%, Valid Loss: {:.2f}, Valid Accuracy: {:.2f}%'
    print (template.format(epoch+1,
                         train_loss.result(),
                         train_accuracy.result()*100,
                         valid_loss.result(),
                         valid_accuracy.result()*100))

batch: 1, batch train loss: 1.28, train acc: 50.78%
batch: 2, batch train loss: 1.68, train acc: 39.50%
batch: 3, batch train loss: 1.55, train acc: 42.97%
batch: 4, batch train loss: 1.61, train acc: 40.01%
batch: 5, batch train loss: 1.62, train acc: 38.61%
batch: 6, batch train loss: 1.59, train acc: 38.88%
batch: 7, batch train loss: 1.56, train acc: 38.95%
batch: 8, batch train loss: 1.56, train acc: 38.73%
batch: 9, batch train loss: 1.56, train acc: 38.14%
batch: 10, batch train loss: 1.55, train acc: 38.00%
batch: 11, batch train loss: 1.52, train acc: 38.99%
batch: 12, batch train loss: 1.51, train acc: 39.15%
batch: 13, batch train loss: 1.51, train acc: 39.09%
batch: 14, batch train loss: 1.51, train acc: 39.36%
batch: 15, batch train loss: 1.50, train acc: 39.79%
batch: 16, batch train loss: 1.49, train acc: 40.30%
batch: 17, batch train loss: 1.48, train acc: 40.52%
batch: 18, batch train loss: 1.47, train acc: 40.56%
batch: 19, batch train loss: 1.46, train acc: 40.70%
ba

##################################################
batch: 150, batch valid loss: 1.46, valid acc: 40.39%
batch: 151, batch train loss: 1.43, train acc: 41.94%
batch: 152, batch train loss: 1.43, train acc: 41.93%
batch: 153, batch train loss: 1.43, train acc: 41.94%
batch: 154, batch train loss: 1.43, train acc: 41.94%
batch: 155, batch train loss: 1.43, train acc: 41.94%
batch: 156, batch train loss: 1.43, train acc: 41.93%
batch: 157, batch train loss: 1.43, train acc: 41.95%
batch: 158, batch train loss: 1.43, train acc: 41.97%
batch: 159, batch train loss: 1.43, train acc: 41.97%
batch: 160, batch train loss: 1.43, train acc: 41.96%
batch: 161, batch train loss: 1.43, train acc: 41.97%
batch: 162, batch train loss: 1.43, train acc: 41.98%
batch: 163, batch train loss: 1.43, train acc: 41.98%
batch: 164, batch train loss: 1.43, train acc: 41.99%
batch: 165, batch train loss: 1.43, train acc: 42.00%
batch: 166, batch train loss: 1.43, train acc: 42.00%
batch: 167, batch train loss: 1

batch: 297, batch train loss: 1.39, train acc: 43.22%
batch: 298, batch train loss: 1.39, train acc: 43.23%
batch: 299, batch train loss: 1.39, train acc: 43.25%
batch: 300, batch train loss: 1.39, train acc: 43.26%
##################################################
batch: 300, batch valid loss: 1.45, valid acc: 40.68%
batch: 301, batch train loss: 1.39, train acc: 43.27%
batch: 302, batch train loss: 1.39, train acc: 43.27%
batch: 303, batch train loss: 1.39, train acc: 43.27%
batch: 304, batch train loss: 1.39, train acc: 43.28%
batch: 305, batch train loss: 1.39, train acc: 43.29%
batch: 306, batch train loss: 1.39, train acc: 43.30%
batch: 307, batch train loss: 1.39, train acc: 43.30%
batch: 308, batch train loss: 1.39, train acc: 43.32%
batch: 309, batch train loss: 1.39, train acc: 43.32%
batch: 310, batch train loss: 1.39, train acc: 43.33%
batch: 311, batch train loss: 1.39, train acc: 43.34%
batch: 312, batch train loss: 1.39, train acc: 43.35%
batch: 313, batch train loss: 1

batch: 443, batch train loss: 1.36, train acc: 44.70%
batch: 444, batch train loss: 1.35, train acc: 44.71%
batch: 445, batch train loss: 1.35, train acc: 44.73%
batch: 446, batch train loss: 1.35, train acc: 44.74%
batch: 447, batch train loss: 1.35, train acc: 44.75%
batch: 448, batch train loss: 1.35, train acc: 44.77%
batch: 449, batch train loss: 1.35, train acc: 44.78%
batch: 450, batch train loss: 1.35, train acc: 44.78%
##################################################
batch: 450, batch valid loss: 1.45, valid acc: 40.73%
batch: 451, batch train loss: 1.35, train acc: 44.79%
batch: 452, batch train loss: 1.35, train acc: 44.80%
batch: 453, batch train loss: 1.35, train acc: 44.81%
batch: 454, batch train loss: 1.35, train acc: 44.82%
batch: 455, batch train loss: 1.35, train acc: 44.83%
batch: 456, batch train loss: 1.35, train acc: 44.84%
batch: 457, batch train loss: 1.35, train acc: 44.85%
batch: 458, batch train loss: 1.35, train acc: 44.86%
batch: 459, batch train loss: 1

batch: 589, batch train loss: 1.32, train acc: 46.14%
batch: 590, batch train loss: 1.32, train acc: 46.14%
batch: 591, batch train loss: 1.32, train acc: 46.15%
batch: 592, batch train loss: 1.32, train acc: 46.16%
batch: 593, batch train loss: 1.32, train acc: 46.17%


KeyboardInterrupt: 

In [63]:
# 保存权重
model.save_weights(SAVE_PATH)