In [1]:
import sys
import os
import gc
import logging
sys.path.append(r"..")
from utils import *
from model import *
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,recall_score
from scipy import sparse
from scipy.sparse import csr_matrix, coo_matrix

from collections import defaultdict
from collections import Counter

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.client import device_lib
#print(tf.__version__)
#print(tf.test.is_built_with_gpu_support)
#print(tf.test.is_gpu_available())
#print(device_lib.list_local_devices())
os.environ["CUDA_VISIBLE_DEVICES"] = '0' #use GPU with ID=0
gpus = tf.config.experimental.list_physical_devices('GPU')
#tf.config.experimental.set_memory_growth(gpus[0], True)
#对需要进行限制的GPU进行设置
# tf.config.experimental.set_virtual_device_configuration(gpus[0],
#                                                       [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4096)])
# gpus

DeepCTR version 0.8.0 detected. Your version is 0.7.5.
Use `pip install -U deepctr` to upgrade.Changelog: https://github.com/shenweichen/DeepCTR/releases/tag/v0.8.0


In [2]:
path_build = "../../data/tencent2020/build2/"
path_embed = "../../data/tencent2020/embed2/"
path_list = "../../data/tencent2020/feature_series/"
path_sub = "../../data/tencent2020/sub2/"

In [3]:
creative_id_em = load_pickle(f"{path_embed}/creative_id_w2v_matrix.pkl")
print('load successfully!!!')
#ad_id_em = load_pickle(f"{path_embed}/ad_id_w2v_matrix.pkl")
advertiser_id_em = load_pickle(f"{path_embed}/advertiser_id_w2v_matrix.pkl")
product_id_em = load_pickle(f"{path_embed}/product_id_w2v_matrix.pkl")
industry_em = load_pickle(f"{path_embed}/industry_w2v_matrix.pkl")
product_category_em = load_pickle(f"{path_embed}/product_category_w2v_matrix.pkl")
# click_times_em = load_pickle(f"{path_embed}/click_times_w2v_matrix2.pkl")
# time_em = load_pickle(f"{path_embed}/time_w2v_matrix2.pkl")

creative_id_times_em = load_pickle(f"{path_embed}/creative_id_times_w2v_matrix.pkl")
ad_id_times_em = load_pickle(f"{path_embed}/ad_id_times_w2v_matrix3.pkl")
product_id_times_em = load_pickle(f"{path_embed}/product_id_times_w2v_matrix.pkl")
advertiser_id_times_em = load_pickle(f"{path_embed}/advertiser_id_times_w2v_matrix.pkl")
product_category_times_em = load_pickle(f"{path_embed}/product_category_times_w2v_matrix.pkl")
industry_times_em = load_pickle(f"{path_embed}/industry_times_w2v_matrix.pkl")

# creative_id_t_em = load_pickle(f"{path_save}/creative_id_t_w2v_matrix2.pkl")
# # ad_id_t_em = load_pickle(f"{path_save}/ad_id_t_w2v_matrix2.pkl")
# product_id_t_em = load_pickle(f"{path_save}/product_id_t_w2v_matrix2.pkl")
# advertiser_id_t_em = load_pickle(f"{path_save}/advertiser_id_t_w2v_matrix2.pkl")
# product_category_t_em = load_pickle(f"{path_save}/product_category_t_w2v_matrix2.pkl")
# industry_t_em = load_pickle(f"{path_save}/industry_t_w2v_matrix2.pkl")

#time_clicktimes_em = load_pickle(f"{path_embed}/time_clicktimes_w2v_matrix2.pkl")
#time_creativeids_em = load_pickle(f"{path_embed}/time_creativeids_w2v_matrix2.pkl")

w2v_features = [
    {'name':'creative_id', 'size':256, 'windows':10, 'min_count':1, 'version':1, 'max_len':128, 'em':creative_id_em},
    #{'name':'ad_id', 'size':128, 'windows':5, 'min_count':1, 'version':1, 'max_len':128, 'em':ad_id_em},
    {'name':'advertiser_id', 'size':64, 'windows':10, 'min_count':1, 'version':1, 'max_len':128, 'em':advertiser_id_em},
    {'name':'product_id', 'size':64, 'windows':10, 'min_count':1, 'version':1, 'max_len':128, 'em':product_id_em},
    {'name':'industry', 'size':32, 'windows':10, 'min_count':1, 'version':1, 'max_len':128, 'em':industry_em},
    {'name':'product_category', 'size':16, 'windows':10, 'min_count':1, 'version':1, 'max_len':128, 'em':product_category_em},
#     {'name':'time', 'size':16, 'windows':5, 'min_count':1, 'version':1, 'max_len':128, 'em':time_em},
#     {'name':'click_times', 'size':8, 'windows':5, 'min_count':1, 'version':1, 'max_len':128, 'em':click_times_em},
    
    {'name':'creative_id_times', 'size':256, 'windows':10, 'min_count':1, 'version':1, 'max_len':128, 'em':creative_id_times_em},
    {'name': 'ad_id_times', 'size': 128, 'windows': 10, 'min_count': 1, 'version': 1, 'max_len':128, 'em':ad_id_times_em},
    {'name':'product_id_times', 'size':64, 'windows':10, 'min_count':1, 'version':1, 'max_len':128, 'em':product_id_times_em},
    {'name':'advertiser_id_times', 'size':64, 'windows':10, 'min_count':1, 'version':1, 'max_len':128, 'em':advertiser_id_times_em},
    {'name':'product_category_times', 'size':32, 'windows':10, 'min_count':1, 'version':1,'max_len':128, 'em':product_category_times_em},
    {'name':'industry_times', 'size':32, 'windows':10, 'min_count':1, 'version':1,'max_len':128, 'em':industry_times_em},   
    
    
    #{'name':'time_clicktimes', 'size':91, 'windows':10, 'min_count':1, 'version':2, 'vocab_size':5000,'max_len':128,'em':time_clicktimes_em},
    #{'name':'time_creativeids', 'size':91, 'windows':10, 'min_count':1, 'version':2,'vocab_size':5000, 'max_len':128,'em': time_creativeids_em},
    
]
dense_features = ['creative_id_len', 'ad_id_len', 'product_id_len', 'product_category_len', 'advertiser_len', 'industry_len','time_len',
                  'mean_clicktimes', 'max_clicktimes', 'min_clicktimes', 'mean_time', 'max_time', 'min_time']  
dense_features = []
base_features = ['creative_id', 'ad_id', 'product_id', 'product_category', 'advertiser_id', 'industry']
base_features = ['creative_id', 'ad_id']

for fea in base_features:
    for g in [1]:
        dense_features.append(f'mean_{fea}_gender_{g}')
        #dense_features.append(f'sum_{fea}_gender_{g}')
    for a in [1,2,3,4,5,6,7,8,9,10]:
        dense_features.append(f'mean_{fea}_age_{a}')
        #dense_features.append(f'sum_{fea}_age_{a}')
dense_features = []

#print(creative_id_em.shape,ad_id_em.shape,advertiser_id_em.shape,product_id_em.shape)
#print(industry_em.shape,product_category_em.shape,click_times_em.shape,time_em.shape)

load successfully!!!


FileNotFoundError: [Errno 2] No such file or directory: '../../data/tencent2020/embed2//creative_id_times_w2v_matrix.pkl'

In [4]:
def lstm_model(units,num_classes,w2v_features, dense_features):
    inputs_dict = dict()
    embed_layer_list = []
    for w2v_f in w2v_features:
        em_name = w2v_f['name']
        em_size = w2v_f['em'].shape[0]
        em_dim = w2v_f['size']
        em_m = w2v_f['em']
        max_len = w2v_f['max_len']

        inputs = keras.Input(shape=(max_len,), name=em_name)
        inputs_dict[em_name] = inputs

        embed_layer_list.append( keras.layers.Embedding(
                em_size, em_dim, input_length=max_len, trainable=False, weights=[em_m],mask_zero=True)(inputs))
    embed_output = keras.layers.concatenate(embed_layer_list, axis=-1)
    #embed_output = keras.layers.Conv1D(512, 5, padding='same', kernel_initializer='normal', activation='relu')(embed_output)
    
    lstm_output = keras.layers.Bidirectional(keras.layers.LSTM(units,return_sequences=True))(embed_output)
    
    #lstm_output = layers.GlobalMaxPooling1D()(lstm_output)
                                     
    
    lstm_output = layers.concatenate([layers.GlobalAveragePooling1D()(lstm_output),
                                      layers.GlobalMaxPooling1D()(lstm_output),
                                     ], axis=-1)
    #lstm_output = layers.BatchNormalization()(lstm_output)
    lstm_output = layers.Dropout(0.3)(lstm_output)
    
    fc = keras.layers.Dense(units, activation='relu')(lstm_output)
    #数值型特征
    numeric_list = []
    for den_f in dense_features:
        inputs = keras.Input(shape=(1,), name=den_f)
        inputs_dict[den_f] = inputs
        numeric_list.append(inputs)   
    if dense_features != []:
        numeric_output = keras.layers.concatenate(numeric_list, axis=-1)
        lstm_numeric_output = keras.layers.concatenate([fc,numeric_output], axis=-1)
    else:
        lstm_numeric_output = fc
    
    
    outputs = keras.layers.Dense(num_classes, activation='softmax')(lstm_numeric_output)
    
    
    model = keras.Model(inputs=inputs_dict, outputs=outputs)
    model.compile(optimizer = keras.optimizers.Adam(0.001),
              loss = keras.losses.sparse_categorical_crossentropy,
              metrics = ['accuracy'])
    return model

In [5]:
def input_fn(feature_dict, label=None, epochs=5, shuffle=True, batch_size=64, fit_key='train'):
    if fit_key == 'train':
        dataset = tf.data.Dataset.from_tensor_slices((feature_dict, label))
    else:
        dataset = tf.data.Dataset.from_tensor_slices((feature_dict))
    if shuffle:
        dataset = dataset.shuffle(100*batch_size)
    dataset = dataset.repeat(epochs).batch(batch_size)
    return dataset

In [6]:
users = pd.read_pickle(f"{path_build}/train_user.pkl")
users.sort_values(by=['user_id'], ascending=[True], inplace=True)
users = users.reset_index(drop=True)

fold_train = False
train_split = [0,2700000]
val_split = [2700000, 3000000]
test_split = [3000000]
train_feature_dict = dict()
val_feature_dict = dict()
test_feature_dict = dict()
if fold_train:
    kfolder = KFold(n_splits=5, shuffle=True, random_state=2020)
    kfold = kfolder.split(user_ids[0:900000])
    fold_index = 2
    FLOD_TRAIN = 1
    for train_index, vali_index in kfold:
        if fold_index <= FLOD_TRAIN:
            fold_index += 1
            continue
        print(train_index, vali_index)
        break
    
    for fea in w2v_features:
        name = fea['name']
        max_length = fea['max_len']
        print(name)
        user_ids = np.load(f"{path_list}{name}_list_int.npy", allow_pickle=True)        
        train_feature_dict[name] = keras.preprocessing.sequence.pad_sequences(
            user_ids[train_index],value = 0,padding = 'post',maxlen = max_length )
        val_feature_dict[name] = keras.preprocessing.sequence.pad_sequences(
            user_ids[vali_index],value = 0,padding = 'post',maxlen = max_length )
        test_feature_dict[name] = keras.preprocessing.sequence.pad_sequences(
            user_ids[test_split[0]:],value = 0,padding = 'post',maxlen = max_length )
#     for fea in dense_features:
#         print(fea)
#         train_feature_dict[fea] = user_ids[fea][train_index]
#         val_feature_dict[fea] = user_ids[fea][vali_index]
#         test_feature_dict[fea] = user_ids[fea][test_split[0]:]

    gender_train_label = np.array(users['gender'][train_index])
    gender_val_label = np.array(users['gender'][vali_index])

    age_train_label = np.array(users['age'][train_index])
    age_val_label = np.array(users['age'][vali_index])
    
else:
    for fea in w2v_features:
        name = fea['name']
        max_length = fea['max_len']
        print(name)
        user_ids = np.load(f"{path_list}{name}_list_int.npy", allow_pickle=True)
        train_feature_dict[name] = keras.preprocessing.sequence.pad_sequences(
            user_ids[train_split[0]:train_split[1]],value = 0,padding = 'post',maxlen = max_length )
        val_feature_dict[name] = keras.preprocessing.sequence.pad_sequences(
            user_ids[val_split[0]:val_split[1]],value = 0,padding = 'post',maxlen = max_length )
        test_feature_dict[name] = keras.preprocessing.sequence.pad_sequences(
            user_ids[test_split[0]:],value = 0,padding = 'post',maxlen = max_length )
        
#     for fea in dense_features:
#         print(fea)
#         train_feature_dict[fea] = user_ids[fea][train_split[0]:train_split[1]]
#         val_feature_dict[fea] = user_ids[fea][val_split[0]:val_split[1]]
#         test_feature_dict[fea] = user_ids[fea][test_split[0]:]

    gender_train_label = np.array(users['gender'][train_split[0]:train_split[1]])
    gender_val_label = np.array(users['gender'][val_split[0]:val_split[1]])

    age_train_label = np.array(users['age'][train_split[0]:train_split[1]])
    age_val_label = np.array(users['age'][val_split[0]:val_split[1]])

creative_id


DeepCTR version 0.8.0 detected. Your version is 0.7.5.
Use `pip install -U deepctr` to upgrade.Changelog: https://github.com/shenweichen/DeepCTR/releases/tag/v0.8.0


advertiser_id
product_id
industry
product_category


In [7]:
# num_classes = 2
# units = 128
# gender_model = lstm_model(units, num_classes, w2v_features, dense_features)
# #gender_model.summary()
# train_dataset = input_fn(train_feature_dict, gender_train_label-1, epochs=5, shuffle=True, batch_size=128)
# val_dataset = input_fn(val_feature_dict, gender_val_label-1, epochs=1, shuffle=False, batch_size=1024)
# gender_model.fit(train_dataset, validation_data=val_dataset)

In [8]:
# test_dataset = input_fn(test_feature_dict, epochs=1, shuffle=False, batch_size=1024, fit_key='predict')
# gender_prob = gender_model.predict(test_dataset)
# gender_val_prob = gender_model.predict(val_dataset)
# print(gender_prob.shape,gender_val_prob.shape)
# tune_weight = search_weight(gender_val_label-1, gender_val_prob, init_weight=[1.0]*2,class_num=2, step=0.001)

# gender_prob_tune = np.array(tune_weight)*gender_prob
# gender_pre = np.argmax(gender_prob_tune,axis=1) + 1
# np.save(f"{sub_path}/val_gender_prob.npy", gender_val_prob)
# np.save(f"{sub_path}/gender_prob.npy", gender_prob)

In [9]:
num_classes = 10
units = 128
age_model = lstm_model(units, num_classes, w2v_features, dense_features)
#age_model.summary()
train_dataset = input_fn(train_feature_dict, age_train_label-1, epochs=6, shuffle=True, batch_size=128)
val_dataset = input_fn(val_feature_dict, age_val_label-1, epochs=1, shuffle=False, batch_size=1024)
age_model.fit(train_dataset, validation_data=val_dataset)

Train for 126563 steps, validate for 293 steps


<tensorflow.python.keras.callbacks.History at 0x1fab7ac9c88>

In [10]:
test_dataset = input_fn(test_feature_dict, epochs=1, shuffle=False, batch_size=1024, fit_key='predict')
age_prob = age_model.predict(test_dataset)
age_val_prob = age_model.predict(val_dataset)
print(age_prob.shape,age_val_prob.shape)
age_tune_weight = search_weight(age_val_label-1, age_val_prob, init_weight=[1.0]*10,class_num=10, step=0.001)
print(age_tune_weight)

age_prob_tune = np.array(age_tune_weight)*age_prob
age_pre = np.argmax(age_prob_tune,axis=1) + 1


np.save(f"{sub_path}/val_age_prob.npy", age_val_prob)
np.save(f"{sub_path}/age_prob.npy", age_prob)

(1000000, 10) (300000, 10)
round:  1
0.48864
0.48866
0.48867666666666665
0.48869
0.4887033333333333
0.48873333333333335
0.48874
0.4887533333333333
0.4887566666666667
0.48877
0.48880666666666667
0.48884666666666665
0.48886666666666667
0.4888733333333333
0.48888
0.48890333333333336
0.4889233333333333
0.48893333333333333
0.48895333333333335
0.48898
0.489
0.48901
0.48903
0.48905
0.48906
0.48915333333333333
0.4891933333333333
0.4892566666666667
0.48941
0.48957
0.48965333333333333
0.48967
0.48967666666666665
0.48968666666666666
0.4898033333333333
0.48984333333333335
0.48985666666666666
0.48990666666666666
0.48995666666666665
0.49013666666666666
0.49026
0.4902766666666667
0.4903766666666667
0.4905733333333333
0.4907066666666667
0.49079666666666666
0.49089333333333335
0.49095
0.4909733333333333
round:  2
0.4909866666666667
0.49099
0.49100333333333335
0.49101333333333336
0.4910233333333333
0.4910466666666667
0.4911433333333333
0.4912066666666667
0.49122
0.4912433333333333
0.49136
0.491426666666

In [20]:
gender_prob = np.load(f"../../data/tencent2020/sub2/gender_prob.npy")
gender_val_prob = np.load(f"../../data/tencent2020/sub2/val_gender_prob.npy")

print(gender_prob.shape,gender_val_prob.shape)
tune_weight = search_weight(gender_val_label-1, gender_val_prob, init_weight=[1.0]*2,class_num=2, step=0.001)
gender_prob_tune = np.array(tune_weight)*gender_prob
gender_pre = np.argmax(gender_prob_tune,axis=1) + 1
                                                                              

print(gender_pre)
sub = pd.DataFrame()
sub['user_id'] = range(3000001,4000001)
sub['predicted_age'] = age_pre
sub['predicted_gender'] = gender_pre
print('ok！')
sub.to_csv(f"{sub_path}/submission.csv", index=False, encoding='utf-8')

(1000000, 2) (300000, 2)
round:  1
0.9474333333333333
0.9474533333333334
0.9474833333333333
0.94749
0.9475
0.9475166666666667
round:  2
[1 2 2 ... 1 1 1]
ok！
