In [1]:
import os
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from  collections import Counter
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers import Embedding
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import  OneHotEncoder

# One seed shared by every random number generator the notebook relies on.
SEED = 2019
np.random.seed(SEED)
random.seed(SEED)
# TensorFlow has its own global RNG; seeding only numpy/random leaves the
# model's weight initialisation unseeded, so runs would not be comparable.
tf.random.set_seed(SEED)

# Pandas display options: show only a few rows, but allow wide frames and
# long cell contents so the id sequences are readable when printed.
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 280)
pd.set_option('display.max_colwidth', 150)

# Locations of the raw dataset and of the preprocessed artefacts.
# NOTE(review): data_path is an absolute path; adjust it for your environment.
data_path = '/data/workspace/kimi/tencent_ads/2020/dataset'
preprocess_path = 'preprocess'


In [2]:
# Load the per-user ad_id sequences written by the preprocessing step.
seq_pickle_path = f'{preprocess_path}/ad_id_s64_total_seq.pkl'
seq_df = pd.read_pickle(seq_pickle_path)
print(seq_df)

# Keep only rows whose user_id is below 1,000,000.
# NOTE(review): presumably these are the labelled training users - confirm.
keep_mask = seq_df['user_id'] < 1000000
seq_df = seq_df.loc[keep_mask]

           user_id                                                                                                                                              ad_id_seq
0        3969503.0  [131508, 24135, 179398, 212955, 298663, 18413, 82996, 121567, 581344, 222576, 410358, 27028, 102071, 74685, 1215844, 241849, 273087, 1437928, 1187...
1           2267.0  [223979, 139563, 79026, 220199, 220126, 274249, 190996, 461148, 50437, 274189, 220199, 533051, 163703, 107932, 113724, 761306, 192657, 624676, 355...
2         512898.0  [150988, 133836, 150708, 246310, 250548, 306943, 320970, 204541, 87899, 107984, 147107, 140319, 569029, 598774, 290696, 683618, 690876, 170732, 53...
...            ...                                                                                                                                                    ...
1899997   742408.0                  [2619335, 3771589, 3332913, 3574432, 73976, 3114229, 3712996, 2495986, 3553252, 1900218, 1757244, 3770207, 2305828

In [3]:
# Read the user table (user.csv) of the preliminary training set.
label_csv_path = f'{data_path}/train_preliminary/user.csv'
label_df = pd.read_csv(label_csv_path)

In [4]:
# Left join on user_id: every sequence row is kept and gains the columns of label_df.
total_df = pd.merge(seq_df, label_df, how='left', on='user_id')

In [5]:
# Load the pretrained Word2Vec model; L is the suffix of the saved model file.
L = 64
emb_model = Word2Vec.load(f'model/ad_id_emb.model_{L}')
print(emb_model)

# Every token known to the model, in the iteration order of its vocab dict.
vocab_list = list(emb_model.wv.vocab)

# token -> integer id. Id 0 is reserved (mapped to " ") so that it can be
# used for zero padding; real tokens therefore start at 1.
word_index = {" ": 0}
# token -> embedding vector.
word_vector = {}

# One row per token plus the leading all-zero row for the padding id 0;
# one column per embedding dimension. float32 is used because Word2Vec
# vectors are float32, so a float64 matrix would only double the memory.
embedding_matrix = np.zeros(
    (len(vocab_list) + 1, emb_model.vector_size), dtype=np.float32
)

for token_id, word in enumerate(vocab_list, start=1):
    vector = emb_model.wv[word]  # look the vector up once and reuse it
    word_index[word] = token_id
    word_vector[word] = vector
    embedding_matrix[token_id] = vector

print(embedding_matrix.shape)

Word2Vec(vocab=3027360, size=64, alpha=0.025)
(3027361, 64)


In [6]:
# Translate every user's ad_id sequence into integer token ids via word_index.
# `result` collects [user_id, int_sequence] pairs; `hit` / `miss` count the
# rows that could / could not be fully translated.
result = []
hit = 0
miss = 0
for user_id, ad_id_seq in tqdm(total_df[['user_id', 'ad_id_seq']].values, total=len(total_df)):
    try:
        int_seq = [word_index[ad_id] for ad_id in ad_id_seq]
    except (KeyError, TypeError):
        # KeyError: an id that is not in the vocabulary.
        # TypeError: the sequence is missing / not iterable, or an id is unhashable.
        # Anything else (e.g. word_index not defined because an earlier cell
        # was skipped) is a real error and must surface instead of being
        # counted as a miss.
        miss += 1
        continue
    result.append([user_id, int_seq])
    hit += 1
print(f'hit:{hit}, miss:{miss}')

100%|██████████| 900000/900000 [00:13<00:00, 64527.47it/s] 

hit:900000, miss:0





In [7]:
# Turn the collected [user_id, int_sequence] pairs into a two-column frame.
int_seq_columns = ['user_id', 'ad_id_int_seq']
int_seq_df = pd.DataFrame(data=result, columns=int_seq_columns)
print(int_seq_df)

         user_id                                                                                                                                          ad_id_int_seq
0         2267.0  [963199, 160403, 15881, 756, 25382, 2139, 538883, 451419, 1762, 18246, 756, 818, 1660, 778, 5227, 93053, 27087, 156191, 27605, 33824, 9285, 131681...
1       512898.0  [3705, 17415, 1040366, 8680, 1613864, 33623, 342527, 1721, 9515, 109048, 42791, 11593, 327098, 235459, 71953, 129075, 158673, 133890, 1412, 7534, ...
2       524600.0  [3705, 27941, 399706, 1921899, 41307, 229487, 781793, 2982853, 294179, 200446, 1365907, 133279, 1670198, 316202, 394748, 252916, 147678, 787520, 1...
...          ...                                                                                                                                                    ...
899997  868400.0  [67812, 6596, 205148, 368418, 126497, 453053, 9884, 467138, 4554, 1507178, 2629819, 2629820, 499751, 458486, 11287, 32854, 1545551, 133384, 82

In [8]:
# Hold-out split on user_id: ids up to 720000 train, the rest validate.
train_df = int_seq_df[int_seq_df.user_id <= 720000]
valid_df = int_seq_df[int_seq_df.user_id > 720000]

# Attach the labels and shift `age` down by one.
# NOTE(review): presumably age is 1-based in user.csv, giving classes 0..9 - confirm.
train_df = train_df.merge(label_df, on='user_id', how='left')
train_df['age'] = train_df['age'] - 1

valid_df = valid_df.merge(label_df, on='user_id', how='left')
valid_df['age'] = valid_df['age'] - 1

# Features: object arrays holding one variable-length list of token ids per user.
# Targets: a single `age` column.
train_x = np.array(train_df[['ad_id_int_seq']].values[:, 0])
train_y = train_df[['age']].values

valid_x = np.array(valid_df[['ad_id_int_seq']].values[:, 0])
valid_y = valid_df[['age']].values

# Fit the one-hot encoder on the training labels only ...
before_one_hot = train_y.reshape([-1, 1])
enc = OneHotEncoder()
enc.fit(before_one_hot)

one_hoted_train_y = enc.transform(before_one_hot).toarray()
print(one_hoted_train_y.shape)

# ... and reuse that same encoder for validation. Fitting a second encoder
# on the validation labels could produce a different column <-> class mapping
# (or fewer columns) whenever a class is absent from the validation split.
before_one_hot = valid_y.reshape([-1, 1])
print(before_one_hot)

one_hoted_valid_y = enc.transform(before_one_hot).toarray()
print(one_hoted_valid_y.shape)

print(train_x)
print(len(train_x))
# Pad / truncate every sequence to a fixed length so it can be batched.
maxlen = 1000
train_x = keras.preprocessing.sequence.pad_sequences(train_x, maxlen=maxlen)
valid_x = keras.preprocessing.sequence.pad_sequences(valid_x, maxlen=maxlen)
print(train_x)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


(720000, 10)
[[2]
 [2]
 [1]
 ...
 [8]
 [3]
 [5]]
(180000, 10)
[list([963199, 160403, 15881, 756, 25382, 2139, 538883, 451419, 1762, 18246, 756, 818, 1660, 778, 5227, 93053, 27087, 156191, 27605, 33824, 9285, 1316811, 225255, 137354, 552160, 281245, 96585, 2201, 230862, 156191, 2960, 492332, 82325, 80036, 1721, 13393, 40049, 34958, 1976, 29917, 934457, 24271, 43293, 185382, 24875, 43293, 44587, 1597, 32644, 5227, 40064, 185386, 594320, 1371, 1721, 706, 200155, 27136, 96298, 377005, 432345, 726759, 1787, 286213, 30439, 2192, 17901, 25364, 93950, 2289, 242597, 115605, 1330945, 929, 561383, 38752, 8671, 2673052, 90385, 77576, 1024120, 42258, 67706, 9607, 349027, 925104, 192834])
 list([3705, 17415, 1040366, 8680, 1613864, 33623, 342527, 1721, 9515, 109048, 42791, 11593, 327098, 235459, 71953, 129075, 158673, 133890, 1412, 7534, 349649, 425, 1276905, 1082840, 3087, 1691, 454208, 2015, 354169, 16630, 201620, 1606882, 11593, 46545, 231389, 40048, 421264, 9902, 2929, 784, 2630, 7547, 48298, 30

In [9]:

# Embedding layer initialised from the pretrained matrix and kept frozen
# (trainable=False). The extra row accounts for the reserved id 0.
vocab_size = len(vocab_list) + 1
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=emb_model.vector_size,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

In [10]:

# Variable-length sequences of integer token ids.
inputs = keras.Input(shape=(None,), dtype="int32")

# Look up the embedding vector of every token id
# (emb_model.vector_size dimensions, 64 in the model summary).
embedded = embedding_layer(inputs)

# Two stacked bidirectional LSTMs: the first returns the full sequence,
# the second reduces it to a single vector per input.
sequence_features = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(embedded)
x = layers.Bidirectional(layers.LSTM(64))(sequence_features)

# 10-way softmax classifier trained against one-hot targets.
outputs = layers.Dense(10, activation="softmax")(x)
model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 64)          193751104 
_________________________________________________________________
bidirectional (Bidirectional (None, None, 128)         66048     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 10)                1290      
Total params: 193,917,258
Trainable params: 166,154
Non-trainable params: 193,751,104
_________________________________________________________________


In [None]:
# Show the feature shapes and the one-hot targets before training.
for preview in (train_x.shape, one_hoted_train_y, valid_x.shape, one_hoted_valid_y):
    print(preview)

# Train for 3 epochs, evaluating on the validation split after each epoch.
model.fit(
    x=train_x,
    y=one_hoted_train_y,
    validation_data=(valid_x, one_hoted_valid_y),
    epochs=3,
)


(720000, 1000)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(180000, 1000)
[[0. 0. 1. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Epoch 1/3
 3958/22500 [====>.........................] - ETA: 3:19:42 - loss: 1.5664 - accuracy: 0.3567