In [1]:
import sys
import os
import logging
sys.path.append(r"..")
from utils import *
from model import *
import numpy as np
import pandas as pd
import deepctr
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,recall_score
from gensim import models
from scipy import sparse
from scipy.sparse import csr_matrix, coo_matrix

from collections import defaultdict
from collections import Counter

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.client import device_lib

print(tf.__version__)
print(tf.test.is_built_with_gpu_support)
print(tf.test.is_gpu_available())
print(device_lib.list_local_devices())
os.environ["CUDA_VISIBLE_DEVICES"] = '0' #use GPU with ID=0
#os.environ["CUDA_VISIBLE_DEVICES"]= '-1'
gpus = tf.config.experimental.list_physical_devices('GPU')
#tf.config.experimental.set_memory_growth(gpus[0], True)
#对需要进行限制的GPU进行设置
tf.config.experimental.set_virtual_device_configuration(gpus[0],
                                                      [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4096)])
gpus

2.1.1
<function is_built_with_gpu_support at 0x00000261B4133438>
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
True
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 853850584597353415
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 4930941747
locality {
  bus_id: 1
  links {
  }
}
incarnation: 16251682597201175003
physical_device_desc: "device: 0, name: GeForce GTX 1060 6GB, pci bus id: 0000:01:00.0, compute capability: 6.1"
]


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [None]:
user_ids = pd.read_pickle(f"{pickle_path}/user_ids_relencode.pickle")
user = pd.read_csv(train_preliminary_p + "user.csv", encoding='utf-8')
user_ids = user_ids.merge(user, how='left', on='user_id')
del user
print(user_ids.shape)
user_ids.head(2)

In [3]:
# 超参数
vocab_size = 3500000
max_length = 200
embedding_dim = 128
units = 128
num_classes = 2
batch_size = 256
epochs = 2

In [4]:
creative_id_v = models.KeyedVectors.load_word2vec_format(f"{path_save}/creative_id_w2v_128.bin", binary=True)
ad_id_v = models.KeyedVectors.load_word2vec_format(f"{path_save}/ad_id_w2v_128.bin", binary=True)
advertiser_id_v = models.KeyedVectors.load_word2vec_format(f"{path_save}/advertiser_id_w2v_64.bin", binary=True)
product_id_v = models.KeyedVectors.load_word2vec_format(f"{path_save}/product_id_w2v_64.bin", binary=True)

In [5]:
creative_id_em = np.zeros((vocab_size, creative_id_v.vector_size))
for w in creative_id_v.vocab:
    creative_id_em[int(w)] = creative_id_v[w]
ad_id_em = np.zeros((vocab_size, ad_id_v.vector_size))
for w in ad_id_v.vocab:
    ad_id_em[int(w)] = ad_id_v[w]    
    
creative_id_em.shape,ad_id_em.shape

((3500000, 128), (3500000, 128))

In [6]:
creative_id_train_seq = keras.preprocessing.sequence.pad_sequences(user_ids['creative_id'][:810000],value = 0,padding = 'post',maxlen = max_length )
creative_id_val_seq = keras.preprocessing.sequence.pad_sequences(user_ids['creative_id'][810000:900000],value = 0,padding = 'post',maxlen = max_length )

creative_id_test_seq = keras.preprocessing.sequence.pad_sequences(user_ids['creative_id'][900000:],value = 0,padding = 'post',maxlen = max_length )

ad_id_train_seq = keras.preprocessing.sequence.pad_sequences(user_ids['ad_id'][:810000],value = 0,padding = 'post',maxlen = max_length )
ad_id_val_seq = keras.preprocessing.sequence.pad_sequences(user_ids['ad_id'][810000:900000],value = 0,padding = 'post',maxlen = max_length )

ad_id_test_seq = keras.preprocessing.sequence.pad_sequences(user_ids['ad_id'][900000:],value = 0,padding = 'post',maxlen = max_length )




In [7]:
# 放入cpu中
with tf.device("/CPU:0"):
    creative_id_em = tf.constant(creative_id_em)
    ad_id_em = tf.constant(ad_id_em)

In [8]:
gender_train_label = np.array(user_ids['gender'][:810000])
gender_val_label = np.array(user_ids['gender'][810000:900000])

In [9]:
age_train_label = np.array(user_ids['age'][:810000])
age_val_label = np.array(user_ids['age'][810000:900000])

In [10]:
def input_fn(feature_dict, label=None, epochs=5, shuffle=True, batch_size=64, fit_key='train'):
    if fit_key == 'train':
        dataset = tf.data.Dataset.from_tensor_slices((feature_dict, label))
    else:
        dataset = tf.data.Dataset.from_tensor_slices((feature_dict))
    if shuffle:
        dataset = dataset.shuffle(100*batch_size)
    dataset = dataset.repeat(epochs).batch(batch_size)
    return dataset

In [11]:
class LSTModel(keras.Model):
    def __init__(self, units, num_classes, voc_size, emb_size, max_len):
        super(LSTModel, self).__init__()
        self.units = units
        self.embedding1 = keras.layers.Embedding(voc_size, emb_size, input_length=max_len, trainable=False, weights=[creative_id_em])
        self.embedding2 = keras.layers.Embedding(voc_size, emb_size, input_length=max_len, trainable=False, weights=[ad_id_em])
        
        self.lstm = keras.layers.Bidirectional(keras.layers.LSTM(self.units))
        self.dense1 = keras.layers.Dense(self.units, activation='relu')
        self.dense2 = keras.layers.Dense(num_classes, activation='softmax')

    def call(self, feature_dict, training=None, mask=None):
        x1 = self.embedding1(feature_dict['creative_id'])
        x2 = self.embedding2(feature_dict['ad_id'])
        e = tf.concat([x1,x2],-1)
        x = self.lstm(e)
        #x = self.embedding(x)
        #x = self.lstm(x)
        x = self.dense1(x)
        x = self.dense2(x)
        return x

In [12]:
model = LSTModel(units=units,
                 num_classes=num_classes,
                 voc_size=vocab_size, 
                 emb_size=embedding_dim, 
                 max_len=max_length)
model.compile(optimizer = keras.optimizers.Adam(0.001),
              loss = keras.losses.sparse_categorical_crossentropy,
              metrics = ['accuracy'])
train_feature_dict = {'creative_id':creative_id_train_seq,
                'ad_id':ad_id_train_seq}
val_feature_dict = {'creative_id':creative_id_val_seq,
                    'ad_id':ad_id_val_seq}
train_dataset = input_fn(train_feature_dict, gender_train_label-1, epochs=5, shuffle=True, batch_size=128)
val_dataset = input_fn(val_feature_dict, gender_val_label-1, epochs=1, shuffle=False, batch_size=128)
model.fit(train_dataset, validation_data=val_dataset)
#model.save(model_path+'gender', save_format='tf')
model.summary()

Train for 31641 steps, validate for 704 steps
Model: "lst_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  448000000 
_________________________________________________________________
embedding_1 (Embedding)      multiple                  448000000 
_________________________________________________________________
bidirectional (Bidirectional multiple                  394240    
_________________________________________________________________
dense (Dense)                multiple                  32896     
_________________________________________________________________
dense_1 (Dense)              multiple                  258       
Total params: 896,427,394
Trainable params: 427,394
Non-trainable params: 896,000,000
_________________________________________________________________


In [17]:
num_classes = 10
model = LSTModel(units=units,
                 num_classes=num_classes,
                 voc_size=vocab_size, 
                 emb_size=embedding_dim, 
                 max_len=max_length)
model.compile(optimizer = keras.optimizers.Adam(0.001),
              loss = keras.losses.sparse_categorical_crossentropy,
              metrics = ['accuracy'])
train_feature_dict = {'creative_id':creative_id_train_seq,
                'ad_id':ad_id_train_seq}
val_feature_dict = {'creative_id':creative_id_val_seq,
                    'ad_id':ad_id_val_seq}
train_dataset = input_fn(train_feature_dict, age_train_label-1, epochs=5, shuffle=True, batch_size=128)
val_dataset = input_fn(val_feature_dict, age_val_label-1, epochs=1, shuffle=False, batch_size=128)
model.fit(train_dataset, validation_data=val_dataset)
#model.save(model_path+'age', save_format='tf')
model.summary()

Train for 31641 steps, validate for 704 steps
Model: "lst_model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      multiple                  448000000 
_________________________________________________________________
embedding_3 (Embedding)      multiple                  448000000 
_________________________________________________________________
bidirectional_1 (Bidirection multiple                  394240    
_________________________________________________________________
dense_2 (Dense)              multiple                  32896     
_________________________________________________________________
dense_3 (Dense)              multiple                  1290      
Total params: 896,428,426
Trainable params: 428,426
Non-trainable params: 896,000,000
_________________________________________________________________


In [6]:
#gender_model = keras.models.load_model('gender')
#age_model = keras.models.load_model('age')


In [7]:
gender_pre = []
age_pre = []

num_index = [0, 200000, 400000, 600000, 800000, 1000000]
for i in range(5):
    pre_tmp = gender_model.predict(test[num_index[i]:num_index[i+1]],batch_size=8192)
    gender_pre.extend(pre_tmp.argmax(axis=1) + 1)
    print(i)
    if i == 4:
        break
#gender_pre = gender_model.predict(test[0:200000],batch_size=8192)
#gender_pre2 = gender_pre.argmax(axis=1) + 1
#type(gender_pre2), gender_pre2.shape, gender_pre2
gender_pre

0
1
2
3
4


[1,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 2,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 2,
 2,
 1,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 2,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 1,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 2,
 1,
 2,
 1,
 2,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 2,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 2,
 2,
 2,
 1,
 2,
 2,
 2,


In [13]:
gender_pre = []
num_index = [0, 100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000, 900000, 1000000]
for i in range(10):
    test_feature_dict = {'creative_id':creative_id_test_seq[num_index[i]:num_index[i+1]],
                    'ad_id':ad_id_test_seq[num_index[i]:num_index[i+1]]}
    test_dataset = input_fn(test_feature_dict, epochs=1, shuffle=False, batch_size=1024, fit_key='predict')
    pre_tmp = model.predict(test_dataset)
    gender_pre.extend(pre_tmp.argmax(axis=1) + 1)
    print(i)

# test_feature_dict = {'creative_id':creative_id_test_seq,
#                     'ad_id':ad_id_test_seq}
# test_dataset = input_fn(test_feature_dict, epochs=1, shuffle=False, batch_size=8192, fit_key='predict')
# age_pre = model.predict(test_feature_dict)
age_pre[0:10],age_pre.shape

0
1
2
3
4
5
6
7
8
9


NameError: name 'age_pre' is not defined

In [18]:
gender_pre[0:10],len(gender_pre)

([1, 2, 2, 1, 1, 1, 1, 1, 1, 2], 1000000)

In [19]:
age_pre = []
num_index = [0, 100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000, 900000, 1000000]
for i in range(10):
    test_feature_dict = {'creative_id':creative_id_test_seq[num_index[i]:num_index[i+1]],
                    'ad_id':ad_id_test_seq[num_index[i]:num_index[i+1]]}
    test_dataset = input_fn(test_feature_dict, epochs=1, shuffle=False, batch_size=1024, fit_key='predict')
    pre_tmp = model.predict(test_dataset)
    age_pre.extend(pre_tmp.argmax(axis=1) + 1)
    print(i)

# test_feature_dict = {'creative_id':creative_id_test_seq,
#                     'ad_id':ad_id_test_seq}
# test_dataset = input_fn(test_feature_dict, epochs=1, shuffle=False, batch_size=8192, fit_key='predict')
# age_pre = model.predict(test_feature_dict)
age_pre[0:10],len(age_pre)

0
1
2
3
4
5
6
7
8
9


AttributeError: 'list' object has no attribute 'shape'

In [16]:
sub = pd.DataFrame()
sub['user_id'] = range(3000001,4000001)
#sub['predicted_age'] = age_pre2
sub['predicted_gender'] = gender_pre
sub.to_csv(f"{sub_path}/submission_0529.csv", index=False, encoding='utf-8')

In [20]:
sub = pd.read_csv(f"{sub_path}/submission_0529.csv", encoding='utf-8')
sub['predicted_age'] = age_pre
sub.to_csv(f"{sub_path}/submission.csv", index=False, encoding='utf-8')

In [21]:
age_pre[0:10],len(age_pre)

([3, 7, 2, 2, 4, 4, 10, 3, 2, 10], 1000000)