In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.layers import *
import tensorflow.keras.backend as K
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sbn
import os
from functools import cmp_to_key
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model
from deepctr.inputs import SparseFeat, DenseFeat, VarLenSparseFeat, get_feature_names
from deepctr.models import xDeepFM
import pickle
import time
%matplotlib inline

In [2]:
# Turn on memory growth for every GPU TensorFlow can see, so device memory
# is requested as needed rather than configured up front.
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [3]:
def load_dict_from_file(file, type='default', skiprow = 0):
    """Read vocabulary entries from one UTF-8, comma-separated text file.

    Double quotes are removed from every line and blank lines are ignored.

    Args:
        file: Path of the file to read.
        type: Row layout.
            'interet'   -- "name,count"; the name is kept when count > 10 and
                           the name is not empty.
            'loc'       -- "province,city,area"; yields three entries:
                           province, province_city and province_city_area.
            'publisher' -- "name,count"; the name is kept when count > 10.
            anything else -- the whole cleaned line is kept.
        skiprow: Number of leading lines (e.g. a header) to skip.

    Returns:
        list of str, in file order. Rows that cannot be parsed are printed
        and skipped.
    """
    data = list()
    with open(file, encoding='utf-8') as f:
        # The line number is tracked separately from the parsed per-row count.
        # Sharing one variable for both made a row with a small count reset
        # the line counter, so the following row was wrongly treated as a
        # header line and skipped.
        for line_no, line in enumerate(f, start=1):
            if line_no <= skiprow:
                continue
            line = line.replace('"', '').strip()
            if len(line) <= 0:
                continue
            try:
                if type == 'interet':
                    parts = line.split(",")
                    name = parts[0]
                    count = int(parts[1])
                    if count > 10 and len(name) > 0:
                        data.append(name)
                elif type == 'loc':
                    parts = line.split(",")
                    # Read all three fields first so a short row adds nothing.
                    province, city, area = parts[0], parts[1], parts[2]
                    data.append(province)
                    data.append(province + "_" + city)
                    data.append(province + "_" + city + "_" + area)
                elif type == 'publisher':
                    parts = line.split(",")
                    name = parts[0]
                    count = int(parts[1])
                    if count > 10:
                        data.append(name)
                else:
                    data.append(line)
            except (ValueError, IndexError):
                # Missing field or non-numeric count: report the row, move on.
                print(line)
    return data

In [4]:
def load_dict(dir, type='default', skiprow = 0):
    """Concatenate the vocabulary entries of every ``.csv`` file in ``dir``.

    Args:
        dir: Directory to scan (not recursive).
        type: Row layout, passed through to ``load_dict_from_file``.
        skiprow: Leading lines to skip in each file.

    Returns:
        list of str with the entries of all files, files taken in sorted
        name order.
    """
    data = list()
    # os.listdir returns names in arbitrary order; sorting keeps the
    # vocabulary order (and any index derived from it) stable between runs.
    for file in sorted(os.listdir(dir)):
        if not file.endswith(".csv"):
            continue
        data += load_dict_from_file(os.path.join(dir, file), type, skiprow)
    return data

In [5]:
# Cache files, relative to the working directory:
# lbe_file holds the fitted label encoders, data_map_file the key -> index maps.
lbe_file = 'lbe.pickle'
data_map_file = 'data_map.pickle'

In [6]:
def save_pickle_data(file, data):
    """Pickle ``data`` into ``file``, replacing any existing content.

    Args:
        file: Destination path.
        data: Any picklable object.
    """
    # The context manager closes the handle even when pickle.dump raises.
    with open(file, 'wb') as f:
        pickle.dump(data, f)


def load_pickle_data(file):
    """Load a pickled object from ``file``.

    Args:
        file: Path of the pickle file.

    Returns:
        The unpickled object, or None when the file cannot be opened or
        unpickled (best-effort: callers check for None).
    """
    # NOTE(review): pickle.load can execute arbitrary code; only use this on
    # cache files written by this notebook.
    try:
        with open(file, 'rb') as f1:
            return pickle.load(f1)
    except Exception:
        # Still best-effort, but no longer hides KeyboardInterrupt/SystemExit.
        return None

In [7]:
def gen_dict_map(vocad):
    """Assign each distinct entry of ``vocad`` a 0-based index.

    Indices follow first-seen order; repeated entries keep the index of
    their first occurrence.
    """
    index_by_key = {}
    for key in vocad:
        # setdefault only inserts when the key is new, so the next free
        # index is always the current size of the dict.
        index_by_key.setdefault(key, len(index_by_key))
    return index_by_key


In [8]:
# Vocabularies read from the dictionary files.
its = load_dict('/home/recsys/dataset/dict/interets', 'interet')
locs = load_dict('/home/recsys/dataset/dict/loc', 'loc')

publishers = load_dict('/home/recsys/dataset/dict/publisher', 'publisher', 1)
cates = load_dict_from_file('/home/recsys/dataset/dict/cate.csv', 'cate', 1)
channels = load_dict_from_file('/home/recsys/dataset/dict/channel.csv', 'channel', 1)
# Extra entries: 'other' is the publisher fallback bucket, '' the channel
# value used for missing data.
publishers.append('other')
channels.append('')

# Fixed vocabularies: levels '0'..'9' and recall channels '1'..'32'.
u_levels = list(map(str, range(0, 10)))
media_levels = list(map(str, range(0, 10)))
rschannles = list(map(str, range(1, 33)))

# Feature name -> vocabulary list.
vocabs = {
    'u_level': u_levels,
    't_channel': channels,
    'cp_category': cates,
    'cp_publisher': publishers,
    'cp_media_level': media_levels,
    'rschannles': rschannles,
}

Sky game
中国新闻网,绥芬河政府网站,1
Ansun Biopharma, Inc.,1


In [9]:
# Category name -> 0-based index, in the order names appear in `cates`.
cate_map = gen_dict_map(cates)

In [10]:
def gen_label_encode(vocab):
    """Return a LabelEncoder fitted on ``vocab``."""
    lbe = LabelEncoder()
    lbe.fit(vocab)
    return lbe


# Try the on-disk cache first. load_pickle_data returns None when a file
# cannot be read, so both results are checked before being trusted;
# otherwise a corrupt cache would leave lbes / the maps undefined.
# NOTE(review): the cache is not invalidated when the dictionary files
# change; delete the two pickle files to force a rebuild.
lbe_pickle = None
data_map_pickle = None
if os.path.exists(lbe_file) and os.path.exists(data_map_file):
    print('load data from cache')
    lbe_pickle = load_pickle_data(lbe_file)
    data_map_pickle = load_pickle_data(data_map_file)

if lbe_pickle and data_map_pickle:
    lbes = lbe_pickle
    rschannlemap = data_map_pickle['rschannlemap']
    itsmap = data_map_pickle['itsmap']
    locmap = data_map_pickle['locmap']
else:
    # No usable cache: fit one encoder per vocabulary and rebuild the maps.
    lbes = dict()
    for key in vocabs.keys():
        lbes[key] = gen_label_encode(vocabs[key])

    for key in vocabs.keys():
        print(len(vocabs[key]))

    # gen_dict_map and rschannles come from the earlier cells; they are no
    # longer redefined here.
    rschannlemap = gen_dict_map(rschannles)
    itsmap = gen_dict_map(its)
    locmap = gen_dict_map(locs)

    data_map = {}
    data_map['rschannlemap'] = rschannlemap
    data_map['itsmap'] = itsmap
    data_map['locmap'] = locmap
    save_pickle_data(lbe_file, lbes)
    save_pickle_data(data_map_file, data_map)

load data from cache


In [11]:
# Sanity check: number of vocabularies registered in `vocabs`.
len(vocabs)

6

In [12]:
# Root directory of the training data; later cells list and sort its entries.
# NOTE(review): this name shadows the builtin dir().
dir = '/home/recsys/dataset/train_csv_v1'

In [13]:
def list_sort_files(dir):
    """List the entries of ``dir`` in ascending numeric order.

    Each name is compared by the integer obtained after removing every
    '-' from it; a name that is not numeric after that raises ValueError.
    """
    def numeric_stamp(name):
        return int(name.replace('-', ''))

    return sorted(os.listdir(dir), key=numeric_stamp)

In [14]:
# Show the earliest entry of the training directory.
print(list_sort_files(dir)[0])

2020-05-15-00


In [15]:
def load_corpus(path):
    """Read the first ``.csv`` file found in ``path`` into a DataFrame.

    Files are examined in ``os.listdir`` order and only the first match is
    loaded. Returns None when the directory has no ``.csv`` file.
    """
    csv_name = next((name for name in os.listdir(path) if name.endswith('.csv')), None)
    if csv_name is None:
        return None
    final_file = path + "/" + csv_name
    print('load data from ', final_file)
    return pd.read_csv(final_file)

In [16]:
def parser_publisher(item):
    """Return ``item`` if it is listed in ``publishers``, otherwise 'other'."""
    return item if item in publishers else 'other'

In [17]:
def trans_data(item):
    """Fill missing values and normalise dtypes of a corpus DataFrame in place.

    Args:
        item: DataFrame holding the raw corpus columns referenced below.
            It is modified in place; nothing is returned.

    Effects:
        * text / tag columns: NaN -> ''.
        * weight / dactr columns: NaN -> '' and values converted to str.
        * u_level, cp_media_level: NaN -> 0, then int, then str.
        * cp_publisher: any value not in ``publishers`` -> 'other'.
    """
    text_columns = [
        't_channel', 't_location', 't_scene',
        'u_umi_interest', 'u_umi_cate',
        'u_uli_interest', 'u_uli_cate',
        'u_usi_interest', 'u_usi_cate',
        'cp_location', 'cp_location_weight',
        'cp_category', 'cp_interests',
        'rs_channel', 'rs_tag_interest', 'rs_tag_cate',
    ]
    for column in text_columns:
        item[column] = item[column].fillna('')

    weight_columns = [
        'u_umi_interest_weight', 'u_umi_cate_weight',
        'u_uli_interest_weight', 'u_uli_cate_weight',
        'u_usi_interest_weight', 'u_usi_cate_weight',
        'cp_category_weight', 'cp_interests_weight',
        'rs_tag_interest_dactr', 'rs_tag_cate_dactr',
    ]
    for column in weight_columns:
        # fillna has to run before astype(str): converting first turns NaN
        # into the literal string 'nan', which fillna can no longer see.
        item[column] = item[column].fillna('').astype('str')

    # Cast through int so a float column (NaN present) yields '3', not '3.0',
    # matching the '0'..'9' vocabularies the label encoders were fitted on.
    item['u_level'] = item['u_level'].fillna(0).astype('int').astype('str')
    item['cp_media_level'] = item['cp_media_level'].fillna(0).astype('int').astype('str')

    # Vectorised membership test; same result as checking every row against
    # the publishers list one by one, without a linear scan per row.
    known_publisher = item['cp_publisher'].isin(publishers)
    item['cp_publisher'] = item['cp_publisher'].where(known_publisher, 'other')


In [18]:
# Load one hour of data (train + test splits) into a single frame for
# interactive inspection.
file = '2020-05-21-18'
train_corpus = load_corpus('/home/recsys/dataset/train_csv_v1/' + file + '/train')
test_corpus = load_corpus('/home/recsys/dataset/train_csv_v1/' + file + '/test')
# NOTE(review): `cur` is assigned but not read in this cell.
cur = int(time.time())
choose_data =  pd.concat([train_corpus, test_corpus])

load data from  /home/recsys/dataset/train_csv_v1/2020-05-21-18/train/part-00000-f23867b5-57c8-40d8-98b1-fee2456c562a-c000.csv


  if (await self.run_code(code, result,  async_=asy)):


load data from  /home/recsys/dataset/train_csv_v1/2020-05-21-18/test/part-00000-d8afc62f-ef33-49eb-8086-d5af7504d48b-c000.csv


In [19]:
def split_weight(x):
    """Parse a comma-separated string of numbers into a list of floats."""
    return [float(token) for token in x.split(',')]

In [20]:
def gen_pad_seq(values, weights, key2index, max_len=None, index_offset=1):
    """Encode comma-separated key strings as padded integer sequences.

    Args:
        values: Iterable of strings such as "a,b,c". Non-string cells
            (e.g. NaN) are treated as having no keys.
        weights: Optional iterable of comma-separated weight strings, or None.
        key2index: dict mapping a key to its vocabulary index.
        max_len: Target sequence length; None lets pad_sequences use the
            longest row and leaves the weight rows unpadded.
        index_offset: Added to every looked-up index. The padding value is 0,
            so with a 0-based ``key2index`` the default of 1 keeps the first
            vocabulary entry distinct from padding. Pass 0 to reproduce the
            previous encoding (e.g. for weights trained with it).

    Returns:
        (index_list, weight_list): index_list is the array returned by
        pad_sequences (post-padded with 0); weight_list is a list of
        per-row weight lists, or None when ``weights`` is None.
    """
    def split(x):
        if not isinstance(x, str):
            return []
        # dict.fromkeys removes duplicates while keeping first-seen order.
        # A set would make the order - and therefore which keys survive
        # truncation - vary between interpreter runs.
        unique_keys = dict.fromkeys(key.strip() for key in x.split(','))
        return [key2index[key] + index_offset
                for key in unique_keys
                if len(key) > 0 and key in key2index]

    def split_weight(x):
        res = list()
        parts = x.split(',') if isinstance(x, str) else []
        for part in parts:
            try:
                weight = float(part)
            except ValueError:
                weight = 0
            if weight != weight:
                # NaN (the string 'nan' parses successfully) counts as 0.
                weight = 0
            res.append(weight)
        if max_len is not None:
            # Same truncation side as pad_sequences below, then zero-pad.
            if len(res) > max_len:
                res = res[len(res) - max_len:]
            res.extend([0] * (max_len - len(res)))
        # NOTE(review): weights are not filtered in step with keys that are
        # unknown or duplicated, so positions may not line up with index_list.
        return res

    index_list = [split(value) for value in values]
    weight_list = None
    if weights is not None:
        weight_list = [split_weight(weight) for weight in weights]
    index_list = pad_sequences(index_list, maxlen=max_len, padding='post', truncating='pre')
    return index_list, weight_list

###### 选择特征列 
uid, u_level, u_umi_interest, u_umi_interest_weight, u_umi_cate, u_umi_cate_weight, u_uli_interest, u_uli_interest_weight, u_uli_cate, u_uli_cate_weight, u_usi_interest, u_usi_interest_weight, u_usi_cate, u_usi_cate_weight

t_channel, t_location

item_id, cp_publisher, cp_media_level, cp_life_hour, cp_newsy_score, cp_word_count, cp_category, cp_category_weight, cp_interests, cp_interests_weight, cp_location, cp_location_weight, cp_is_local, cp_is_local_publisher

rs_p1_score, rs_gactr, rs_channel, rs_tag_interest, rs_tag_interest_dactr, rs_tag_cate, rs_tag_cate_dactr

In [32]:
# Fixed-length inputs of the model.
# An earlier variant of this cell also listed cp_is_local,
# cp_is_local_publisher, cp_life_hour and cp_word_count as dense features;
# they are currently excluded.
sparse_features = ['u_level', 't_channel', 'cp_publisher', 'cp_media_level']
dense_features = ['rs_gactr', 'rs_p1_score', 'cp_newsy_score']
target = ['action']

In [33]:
# Maximum sequence lengths used when padding the variable-length features.
interest_len = 500      # user interest tags
cate_len = 50           # user categories
cp_interest_len = 10    # item interest tags
cp_cate_len = 4         # item categories
loc_len = 2             # cp_location / t_location
rs_tag_len = 10         # NOTE(review): not referenced elsewhere in the visible notebook
rs_channel_len = 32     # rs_channel
rs_cate_len = 5         # rs_tag_cate
rs_its_len = 10         # rs_tag_interest

In [34]:
emb_size = 16

# One row per variable-length feature: (label, max length, key -> index map).
# Only the u_uli_* labels are registered for user interests / categories;
# the u_umi_* and u_usi_* columns have no entry of their own.
_var_specs = [
    ('u_uli_interest', interest_len, itsmap),
    ('u_uli_cate', cate_len, cate_map),
    ('cp_interests', cp_interest_len, itsmap),
    ('cp_category', cp_cate_len, cate_map),
    ('cp_location', loc_len, locmap),
    ('t_location', loc_len, locmap),
    ('rs_channel', rs_channel_len, rschannlemap),
    ('rs_tag_interest', rs_its_len, itsmap),
    ('rs_tag_cate', rs_cate_len, cate_map),
]
var_info = [
    {'label': label, 'len': max_len, 'map': vocab_map, 'weight': None, 'emb_size': emb_size}
    for label, max_len, vocab_map in _var_specs
]

In [35]:
# Fixed-length columns: one embedding per sparse feature, one scalar per
# dense feature.
sparse_columns = [
    SparseFeat(feat, vocabulary_size=len(vocabs[feat]), embedding_dim=emb_size)
    for feat in sparse_features
]
dense_columns = [DenseFeat(feat, 1) for feat in dense_features]
fixlen_feature_columns = sparse_columns + dense_columns

# Variable-length columns, one per var_info entry; the vocabulary is sized
# len(map) + 1.
varlen_feature_columns = [
    VarLenSparseFeat(
        SparseFeat(item['label'], vocabulary_size=len(item['map']) + 1, embedding_dim=item['emb_size']),
        maxlen=item['len'],
        combiner='mean',
        weight_name=item['weight'],
    )
    for item in var_info
]

# The linear and DNN parts of the model receive the same set of columns.
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [36]:
# Inspect the fixed-length feature columns.
fixlen_feature_columns

[SparseFeat(name='u_level', vocabulary_size=10, embedding_dim=16, use_hash=False, dtype='int32', embedding_name='u_level', group_name='default_group'),
 SparseFeat(name='t_channel', vocabulary_size=18, embedding_dim=16, use_hash=False, dtype='int32', embedding_name='t_channel', group_name='default_group'),
 SparseFeat(name='cp_publisher', vocabulary_size=32414, embedding_dim=16, use_hash=False, dtype='int32', embedding_name='cp_publisher', group_name='default_group'),
 SparseFeat(name='cp_media_level', vocabulary_size=10, embedding_dim=16, use_hash=False, dtype='int32', embedding_name='cp_media_level', group_name='default_group'),
 DenseFeat(name='rs_gactr', dimension=1, dtype='float32'),
 DenseFeat(name='rs_p1_score', dimension=1, dtype='float32'),
 DenseFeat(name='cp_newsy_score', dimension=1, dtype='float32')]

In [37]:
# Build the xDeepFM binary classifier and compile it with the Adam
# optimiser and binary cross-entropy loss; cross-entropy, accuracy and AUC
# are tracked as metrics.
model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy', 'binary_accuracy', tf.keras.metrics.AUC()])

In [38]:
def lower_sample_data(df, most_label,  percent=1, random_state=None):
    """Down-sample the majority class of ``df``.

    Args:
        df: DataFrame with an 'action' label column.
        most_label: Value of 'action' that marks the majority class.
        percent: Number of majority rows to keep, as a ratio of the number
            of minority rows.
        random_state: Optional seed (or RandomState) forwarded to
            ``DataFrame.sample`` for reproducible sampling.

    Returns:
        DataFrame with the sampled majority rows followed by all minority
        rows. Majority rows are drawn without replacement, so no row is
        duplicated and at most all of them are returned.
    """
    is_majority = df['action'] == most_label
    most_data = df[is_majority]       # majority-class rows
    minority_data = df[~is_majority]  # every other row

    wanted = int(percent * len(minority_data))
    # Cap at what is available; this also covers an empty majority class.
    n_sample = max(0, min(wanted, len(most_data)))
    if n_sample == 0:
        lower_data = most_data.iloc[0:0]
    else:
        lower_data = most_data.sample(n=n_sample, replace=False, random_state=random_state)
    return pd.concat([lower_data, minority_data])

In [39]:
from sklearn.utils import shuffle

In [40]:
def load_data_from_files(files):
    """Load and down-sample the corpora of several data directories.

    For every name in ``files`` the 'train' and 'test' splits are read (in
    that order), all frames are concatenated and the rows with action == 0
    are down-sampled with a ratio of 1.
    """
    base_dir = '/home/recsys/dataset/train_csv_v1/'
    corpus = [
        load_corpus(base_dir + name + '/' + split)
        for name in files
        for split in ('train', 'test')
    ]
    return lower_sample_data(pd.concat(corpus), 0, 1)

In [41]:
def train_by_batch(files):
    """Train the global ``model`` on the data of ``files`` and save its weights.

    Steps: load and down-sample the corpora, shuffle, clean with
    ``trans_data``, label-encode the sparse features, pad the
    variable-length features, fit the model, then write the weights to
    './checkpoints/<files[0]>'.

    Args:
        files: Non-empty list of data directory names under the training
            root; the first name is also used as the checkpoint name.
    """
    file = files[0]
    cur = int(time.time())
    choose_data = shuffle(load_data_from_files(files))
    print("concat data cost", (int(time.time()) - cur), 'choose data size:', len(choose_data), 'pos size:', len(choose_data[choose_data['action'] == 1]), 'neg size:', len(choose_data[choose_data['action'] != 1]))

    cur = int(time.time())
    trans_data(choose_data)
    print("trans_data cost", (int(time.time()) - cur))

    # Merge the three user profiles only after trans_data has replaced NaN
    # with ''. Concatenating first made the whole merged string NaN as soon
    # as one of the three columns was missing, dropping all of that user's
    # tags.
    choose_data['u_interests'] = choose_data['u_uli_interest']+","+choose_data['u_umi_interest']+","+choose_data['u_usi_interest']
    choose_data['u_cates'] = choose_data['u_uli_cate']+","+choose_data['u_umi_cate']+","+choose_data['u_usi_cate']

    for feat in sparse_features:
        lbe = lbes[feat]
        choose_data[feat] = lbe.transform(choose_data[feat])

    cur = int(time.time())
    # Labels fed from a merged column rather than the column of the same name.
    source_column = {'u_uli_interest': 'u_interests', 'u_uli_cate': 'u_cates'}
    # Drive the padding from var_info so lengths and maps always match the
    # feature columns the model was built with.
    padded = dict()
    for info in var_info:
        label = info['label']
        column = source_column.get(label, label)
        padded[label], _ = gen_pad_seq(choose_data[column], None, info['map'], info['len'])
    print("pad data cost", (int(time.time()) - cur))

    model_input = {name: choose_data[name] for name in feature_names}
    # Replace the raw string columns by their padded index arrays.
    model_input.update(padded)

    model.fit(model_input, choose_data[target].values,
                    batch_size=128, epochs=10, verbose=2, validation_split=0.1, callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_binary_accuracy', patience=1)],)
    model.save_weights('./checkpoints/'+file)

In [43]:
# Train over all data directories in chronological order, in groups of
# consecutive entries.
# NOTE(review): the test is `size > 72`, so each full group holds 73
# entries, not 72 - confirm which was intended. The recorded run of this
# cell ended with a MemoryError, so the group size may need to be smaller.
files = list_sort_files(dir)
tmps = []
size = 0
for file in files:
    size += 1
    tmps.append(file)
#     print('train model use data from ', file)
    if size > 72:
        train_by_batch(tmps)
        tmps = []
        size = 0
# Train on the remaining, incomplete group.
if size > 0:
    train_by_batch(tmps)
    size = 0


load data from  /home/recsys/dataset/train_csv_v1/2020-05-15-00/train/part-00000-fea9d6c6-ce27-478b-8b91-d4a47e366bd4-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-15-00/test/part-00000-2f154b48-f1b6-4b90-b886-fc2159d8f609-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-15-01/train/part-00000-28e9dfd3-c37b-4b09-b4d1-a782fb7b5ee4-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-15-01/test/part-00000-85858532-4699-447f-8fd3-eafeebb3752a-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-15-02/train/part-00000-9e21bd72-bf86-43c7-847a-272511b64eda-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-15-02/test/part-00000-f4f2a8a1-55a2-417e-ab43-6bc9b32dfa5d-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-15-03/train/part-00000-51f9b66f-65e2-40f1-bfe0-76c75f5b775e-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-15-03/test/part-00000-c0dec744-3dfb-441e-b6b8-171fd2b802

  if __name__ == '__main__':


load data from  /home/recsys/dataset/train_csv_v1/2020-05-15-07/test/part-00000-18ad8f18-df3a-46ad-8c25-9394a52b7034-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-15-08/train/part-00000-05a4490f-4621-4e40-a8dd-8b20f1be2499-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-15-08/test/part-00000-4387b142-4feb-4747-a52a-981551ce5d9e-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-15-09/train/part-00000-a221d3f0-1273-454f-875f-7e0c25de33fc-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-15-09/test/part-00000-bc44a5db-2473-4037-9c43-19816ff5c4af-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-15-10/train/part-00000-f17d8593-313a-4c67-93a6-870f65e4c379-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-15-10/test/part-00000-0dc9ce4c-0b43-4f58-b5dd-839cf58f1dc0-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-15-11/train/part-00000-a2fcea0b-f476-48f6-af4d-6194c15259

  if __name__ == '__main__':


load data from  /home/recsys/dataset/train_csv_v1/2020-05-15-20/test/part-00000-e3b772bb-e936-40ba-8d3a-3412b573a89f-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-15-21/train/part-00000-21ffe14b-2f68-4cf2-b0da-84fb8139b651-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-15-21/test/part-00000-a8686962-fb31-4d26-aef9-c4bd3235eb09-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-15-22/train/part-00000-bed86841-36ad-4771-8e79-790ab96f47eb-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-15-22/test/part-00000-fd2ab5d8-327d-4cb7-8dd7-45584647c9fb-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-15-23/train/part-00000-6be1095b-071b-4dd0-aa3d-a8d6fc24b5ec-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-15-23/test/part-00000-ef52a5f3-c3ec-4a9b-8d33-7db22eef6c1c-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-16-00/train/part-00000-aa88c22a-f819-490a-9e39-836b3a9d9b

  if __name__ == '__main__':


load data from  /home/recsys/dataset/train_csv_v1/2020-05-16-23/test/part-00000-52ac8132-db2e-4f46-9c7a-4dd885e077ef-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-17-00/train/part-00000-fed564a7-fae9-4a54-a2ae-c3b01b54fa1c-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-17-00/test/part-00000-b7a2eeb9-1e98-4d67-b980-b746b1abd663-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-17-01/train/part-00000-95233db0-97bf-4f5c-9758-b14247ccd01b-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-17-01/test/part-00000-66ca2c2b-b580-4e80-8218-18affa1497f0-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-17-02/train/part-00000-71236f0a-e796-4f23-89e6-a489e1672fac-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-17-02/test/part-00000-e20028e1-e483-48ca-b66a-d110ad0556f1-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-17-03/train/part-00000-022f5be5-29e2-4d6b-81bd-ba986bd769

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


1452391/1452391 - 524s - loss: 0.6280 - binary_crossentropy: 0.6235 - binary_accuracy: 0.6541 - auc_1: 0.7076 - val_loss: 0.6239 - val_binary_crossentropy: 0.6184 - val_binary_accuracy: 0.6586 - val_auc_1: 0.7141
Epoch 2/10
1452391/1452391 - 514s - loss: 0.6219 - binary_crossentropy: 0.6156 - binary_accuracy: 0.6612 - auc_1: 0.7174 - val_loss: 0.6231 - val_binary_crossentropy: 0.6168 - val_binary_accuracy: 0.6606 - val_auc_1: 0.7167
Epoch 3/10
1452391/1452391 - 519s - loss: 0.6198 - binary_crossentropy: 0.6129 - binary_accuracy: 0.6633 - auc_1: 0.7207 - val_loss: 0.6223 - val_binary_crossentropy: 0.6155 - val_binary_accuracy: 0.6604 - val_auc_1: 0.7178
load data from  /home/recsys/dataset/train_csv_v1/2020-05-22-01/train/part-00000-51fdc0ba-66fb-4236-8cd0-edb8e49bd309-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-22-01/test/part-00000-4b1dc326-d905-4024-88af-932e4d05802f-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-22-02/train/part-00000-b4f74

  if __name__ == '__main__':


load data from  /home/recsys/dataset/train_csv_v1/2020-05-22-07/test/part-00000-34b296cf-85d5-401e-a2d0-9984e3a6e71b-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-22-08/train/part-00000-5a6ef55b-edc3-40e7-96b8-a27e1f1295da-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-22-08/test/part-00000-9433c1c8-5172-4471-b01b-62c98762f011-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-22-09/train/part-00000-b3d309ab-456c-46d7-aa6d-038fdc536ce3-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-22-09/test/part-00000-ca3b9019-5cb2-40bf-a85e-4a30e90e1ea2-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-22-10/train/part-00000-a6116636-548e-4f39-877c-4b721b1302fb-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-22-10/test/part-00000-d07bac7f-5a4a-4f6c-a8bd-970813236da6-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-22-11/train/part-00000-9bfa4eb6-0031-4fdf-8a68-647ec6fe8e

  if __name__ == '__main__':


load data from  /home/recsys/dataset/train_csv_v1/2020-05-22-21/test/part-00000-4cbceab5-a374-43e6-ad13-42e4c3557635-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-22-22/train/part-00000-20986eaf-949e-47b2-8661-8e70fc215a37-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-22-22/test/part-00000-26d384fc-2e4f-4751-b288-c2f0d8f2166b-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-22-23/train/part-00000-7bcbaf6b-83d5-414f-a405-21a274c023c5-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-22-23/test/part-00000-ddb196e3-048d-4c8f-8303-cb4bdc0b5799-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-23-00/train/part-00000-d42629e4-8513-4808-bb1b-5bc10bbc6015-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-23-00/test/part-00000-bfeefb17-358c-44d7-8f53-1d1c059d54a1-c000.csv
load data from  /home/recsys/dataset/train_csv_v1/2020-05-23-01/train/part-00000-e76ecf01-9b11-4d3e-ba29-afb219d168

MemoryError: Unable to allocate 818. MiB for an array with shape (15, 7149295) and data type int64

In [121]:
# Reload a single hour of data (train + test) for the manual experiments
# below. This repeats the earlier single-file load cell.
file = '2020-05-21-18'
train_corpus = load_corpus('/home/recsys/dataset/train_csv_v1/' + file + '/train')
test_corpus = load_corpus('/home/recsys/dataset/train_csv_v1/' + file + '/test')
# NOTE(review): `cur` is assigned but not read in this cell.
cur = int(time.time())
choose_data =  pd.concat([train_corpus, test_corpus])

load data from  /home/recsys/dataset/train_csv_v1/2020-05-21-18/train/part-00000-f23867b5-57c8-40d8-98b1-fee2456c562a-c000.csv


  if (await self.run_code(code, result,  async_=asy)):


load data from  /home/recsys/dataset/train_csv_v1/2020-05-21-18/test/part-00000-d8afc62f-ef33-49eb-8086-d5af7504d48b-c000.csv


In [181]:
# Train on a single data directory.
train_by_batch(['2020-05-21-18'])

load data from  /home/recsys/dataset/train_csv_v1/2020-05-21-18/train/part-00000-f23867b5-57c8-40d8-98b1-fee2456c562a-c000.csv


  exec(code_obj, self.user_global_ns, self.user_ns)


load data from  /home/recsys/dataset/train_csv_v1/2020-05-21-18/test/part-00000-d8afc62f-ef33-49eb-8086-d5af7504d48b-c000.csv


Unnamed: 0,request_id,report_time,action,t_language,t_country,t_pid,t_ip,t_channel,t_scene,t_action,...,cp_product_type,cp_is_local,cp_is_local_publisher,rs_channel,rs_p1_score,rs_gactr,rs_tag_interest,rs_tag_interest_dactr,rs_tag_cate,rs_tag_cate_dactr
40194,11e87023e25d20b4aca7c85a6447059fef32526a,1590058552000,0,zh,cn,dfzx,115.217.11.145,all,list,1,...,6,0,0,123,0.414224,0.084038,颜值,0.1285650569701048,娱乐,0.084083
30449,392d2eb8f0a5391a80f48c4add3822423fe66969,1590055721000,0,zh,cn,dfpopup,122.192.186.142,all,list,1,...,6,0,0,6,0.253534,0.103816,,,汽车,
31719,076bce341395a67b13e559d3a6146987b52f2284,1590056245000,0,zh,cn,dfzx,117.136.40.222,all,list,1,...,6,0,0,12,0.336684,0.060736,,,娱乐,0.060037
71966,957f3bef906068a04d15aeb6a444e9c15aa18434,1590056452000,0,zh,cn,dfzx,113.46.227.90,all,list,1,...,6,0,0,63,0.357269,0.052729,北京市,0.26198693737799883,国内,0.053086
54776,f4c4a473fa879cc0641f35a4977040b0bbe86283,1590056405000,0,zh,cn,dfzx,118.114.11.24,all,list,1,...,6,0,0,31,0.091551,0.057561,"美国,特朗普","0.09482353261626067,0.10213215907310333",军事,0.057675
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21488,ce64271c17a6814c23c7c58d2a0e0eef4b5ca639,1590058744000,1,zh,cn,dfzx,27.17.42.122,car,list,1,...,6,0,0,148,0.865334,0.666667,汽车,0.1,汽车,0.666667
21519,be8171fb3a41b7211b6627f8156bf4731ae872dc,1590058422000,1,zh,cn,dfzx,106.121.179.87,all,list,23,...,6,0,0,6,0.062614,0.050790,,,历史,0.050766
21534,7b57234116d3a097f1fe699122c13de9cd408a75,1590055675000,1,zh,cn,dfzx,36.157.152.203,military,list,12,...,6,0,0,14,0.742447,0.113214,军事,0.1,军事,0.112840
21535,e89164a21f3925da0bc4437123727359ddbff5c7,1590056817000,1,zh,cn,dfzx,120.15.151.187,all,list,23,...,6,0,0,6,0.082919,0.075837,,,健康,0.075676


In [27]:
# Assemble the model input dict at module level and time it.
# NOTE(review): in the visible notebook `var_data` is only assigned inside
# train_by_batch, so on a fresh kernel this cell presumably fails with a
# NameError. The labels tested below ('rs_taginfo', 'u_uli', 'u_umi',
# 'u_usi') match none of the labels used by train_by_batch.
cur = int(time.time())
model_input = {name: choose_data[name] for name in feature_names}
for item in var_data:
    model_input[item['label']] = item['list']
    if item['label'] == 'rs_taginfo':
        model_input[item['label']+'_weight'] = item['weight']
    elif item['label'] == 'u_uli' or item['label'] == 'u_umi' or item['label'] == 'u_usi':
        model_input[item['label']+'_weight'] = item['weight']
print('cost', (int(time.time()) - cur))


cost 0


In [None]:
# Fit on the manually assembled `model_input` and save the weights under
# the current `file` name. This cell has no execution count.
# NOTE(review): train_by_batch uses batch_size=128, epochs=10 and early
# stopping; this cell uses different settings.
history = model.fit(model_input, choose_data[target].values,
                batch_size=64, epochs=5, verbose=2, validation_split=0.1)
model.save_weights('./checkpoints/'+file)

In [63]:
# Restore the weights saved for the current `file` name.
model.load_weights('./checkpoints/'+file)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f1db24615f8>

In [53]:
# List the checkpoint directory.
# NOTE(review): `checkpoint_dir` is not assigned anywhere in the visible
# notebook; define it before running this cell.
!ls {checkpoint_dir}

checkpoint		     cp.ckpt.data-00001-of-00002
cp.ckpt.data-00000-of-00002  cp.ckpt.index
