In [1]:
!pip install deepctr tensorflow==2.10.1



In [2]:
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model, load_model
from deepctr.models import DIN
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat, get_feature_names

from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler, Callback
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder
from tensorflow.keras.utils import get_custom_objects
from tensorflow.keras.optimizers import Adam,RMSprop
from tensorflow.keras.layers import Activation
from tensorflow.keras import backend as K
from tensorflow.keras import callbacks
from tensorflow.keras import utils
import tensorflow.keras as keras
import tensorflow as tf
import pandas as pd
import numpy as np
import warnings
# import pandas_profiling 
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
warnings.simplefilter('ignore')

In [3]:
raw_sample_df = pd.read_csv('E:\\UK\\新闻推荐\\广告点击率预测\\raw_sample.csv')
ad_feature_df = pd.read_csv('E:\\UK\\新闻推荐\\广告点击率预测\\ad_feature.csv')
user_profile_df=pd.read_csv('E:\\UK\\新闻推荐\\广告点击率预测\\user_profile.csv')

In [4]:
test_size_mb = raw_sample_df.memory_usage().sum() / 1024 / 1024
test_size_mb1 = ad_feature_df.memory_usage().sum() / 1024 / 1024
test_size_mb2 = user_profile_df.memory_usage().sum() / 1024 / 1024
print("raw_sample_df memory size: %.2f MB" % test_size_mb)
print("ad_feature_df memory size: %.2f MB" % test_size_mb1)
print("user_profile_df memory size: %.2f MB" % test_size_mb2)

raw_sample_df memory size: 1215.73 MB
ad_feature_df memory size: 38.76 MB
user_profile_df memory size: 72.91 MB


In [5]:
def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else: # we assume if not a df it's a series
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes
    return "{:03.2f} MB".format(usage_mb)

In [6]:
raw_sample_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26557961 entries, 0 to 26557960
Data columns (total 6 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   user        int64 
 1   time_stamp  int64 
 2   adgroup_id  int64 
 3   pid         object
 4   nonclk      int64 
 5   clk         int64 
dtypes: int64(5), object(1)
memory usage: 2.7 GB


In [7]:
optimized_gl = raw_sample_df.copy()

gl_int = raw_sample_df.select_dtypes(include=['int'])#gl_int是一个Dataframe,包含四列：[adgroup_id,cate_id,campaign_id,customer]
converted_int = gl_int.apply(pd.to_numeric,downcast='unsigned')
optimized_gl[converted_int.columns] = converted_int


gl_obj = raw_sample_df.select_dtypes(include=['object']).copy()
converted_obj = pd.DataFrame()
for col in gl_obj.columns:
    num_unique_values = len(gl_obj[col].unique())
    num_total_values = len(gl_obj[col])
    if num_unique_values / num_total_values < 0.5:
        converted_obj.loc[:,col] = gl_obj[col].astype('category')
    else:
        converted_obj.loc[:,col] = gl_obj[col]
optimized_gl[converted_obj.columns] = converted_obj
print("Original Ad Feature dataframe:{0}".format(mem_usage(raw_sample_df)))
print("Memory Optimised Ad Feature dataframe:{0}".format(mem_usage(optimized_gl)))

Original Ad Feature dataframe:2735.39 MB
Memory Optimised Ad Feature dataframe:379.92 MB


In [8]:
raw_sample_df = optimized_gl.copy()
raw_sample_df_new = raw_sample_df.rename(columns = {"user": "userid"})

In [9]:
ad_feature_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 846811 entries, 0 to 846810
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   adgroup_id   846811 non-null  int64  
 1   cate_id      846811 non-null  int64  
 2   campaign_id  846811 non-null  int64  
 3   customer     846811 non-null  int64  
 4   brand        600481 non-null  float64
 5   price        846811 non-null  float64
dtypes: float64(2), int64(4)
memory usage: 38.8 MB


In [10]:
optimized_g2 = ad_feature_df.copy()

g2_int = ad_feature_df.select_dtypes(include=['int'])
converted_int = g2_int.apply(pd.to_numeric,downcast='unsigned')
optimized_g2[converted_int.columns] = converted_int

g2_float = ad_feature_df.select_dtypes(include=['float'])
converted_float = g2_float.apply(pd.to_numeric,downcast='float')#downcast='float' 会将 float64 压缩为 float32 或更低位；
optimized_g2[converted_float.columns] = converted_float

print("Original Ad Feature dataframe:{0}".format(mem_usage(ad_feature_df)))
print("Memory Optimised Ad Feature dataframe:{0}".format(mem_usage(optimized_g2)))

Original Ad Feature dataframe:38.76 MB
Memory Optimised Ad Feature dataframe:21.00 MB


In [11]:
user_profile_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1061768 entries, 0 to 1061767
Data columns (total 9 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   userid                 1061768 non-null  int64  
 1   cms_segid              1061768 non-null  int64  
 2   cms_group_id           1061768 non-null  int64  
 3   final_gender_code      1061768 non-null  int64  
 4   age_level              1061768 non-null  int64  
 5   pvalue_level           485851 non-null   float64
 6   shopping_level         1061768 non-null  int64  
 7   occupation             1061768 non-null  int64  
 8   new_user_class_level   716848 non-null   float64
dtypes: float64(2), int64(7)
memory usage: 72.9 MB


In [12]:
optimized_g3 = user_profile_df.copy()

g3_int = user_profile_df.select_dtypes(include=['int'])
converted_int = g3_int.apply(pd.to_numeric,downcast='unsigned')
optimized_g3[converted_int.columns] = converted_int

g3_float = user_profile_df.select_dtypes(include=['float'])
converted_float = g3_float.apply(pd.to_numeric,downcast='float')
optimized_g3[converted_float.columns] = converted_float

print("Original User Feature dataframe:{0}".format(mem_usage(user_profile_df)))
print("Memory Optimised User Feature dataframe:{0}".format(mem_usage(optimized_g3)))

Original User Feature dataframe:72.91 MB
Memory Optimised User Feature dataframe:18.23 MB


##### 内存优化结束，合并结果

- `optimized_g3`也就是 `user_profile_df`得来的
- `optimized_g2`也就是 `ad_feature_df`得来的

In [13]:
raw_sample_df_new["userid"] = raw_sample_df_new["userid"].astype(np.int64).astype(np.uint32)
optimized_g3["userid"] = optimized_g3["userid"].astype(np.int64).astype(np.uint32)

df1 = pd.merge(optimized_g3, raw_sample_df_new, on="userid")

df1["adgroup_id"] = df1["adgroup_id"].astype(np.int64).astype(np.uint32)
optimized_g2["adgroup_id"] = optimized_g2["adgroup_id"].astype(np.int64).astype(np.uint32)

final_df = pd.merge(optimized_g2, df1,on="adgroup_id")
final_df.head()

Unnamed: 0,adgroup_id,cate_id,campaign_id,customer,brand,price,userid,cms_segid,cms_group_id,final_gender_code,age_level,pvalue_level,shopping_level,occupation,new_user_class_level,time_stamp,pid,nonclk,clk
0,63133,6406,83237,1,95471.0,170.0,172283,0,3,2,3,,3,0,4.0,1494386115,430548_1007,1,0
1,63133,6406,83237,1,95471.0,170.0,172283,0,3,2,3,,3,0,4.0,1494490600,430548_1007,1,0
2,63133,6406,83237,1,95471.0,170.0,172283,0,3,2,3,,3,0,4.0,1494304390,430548_1007,1,0
3,63133,6406,83237,1,95471.0,170.0,658198,35,4,2,4,2.0,3,0,2.0,1494675187,430539_1007,1,0
4,63133,6406,83237,1,95471.0,170.0,620600,0,12,1,6,,3,0,,1494602154,430548_1007,1,0


In [14]:
final_df['hist_cate_id'] = final_df['cate_id']
final_df['hist_adgroup_id'] = final_df['adgroup_id']

In [29]:
sparse_features = [feat for feat in final_df.columns if feat not in ['time_stamp','pid', 'nonclk','brand',
       'cms_segid', 'cms_group_id', 'age_level','clk',
       'pvalue_level', 'shopping_level', 'occupation', 'new_user_class_level ',
        'campaign_id', 'customer', 'price', 'hist_cate_id','hist_adgroup_id']]
sparse_features

['adgroup_id', 'cate_id', 'userid', 'final_gender_code']

In [30]:
dense_features = [feat for feat in final_df.columns if feat not in ['userid', 'time_stamp', 'adgroup_id', 'pid', 'nonclk', 'clk',
       'cms_segid', 'cms_group_id', 'final_gender_code', 'age_level',
       'pvalue_level', 'shopping_level', 'occupation', 'new_user_class_level ',
       'cate_id', 'campaign_id', 'customer', 'brand','hist_cate_id','hist_adgroup_id']]
dense_features

['price']

In [31]:
sequence_features = [feat for feat in final_df.columns if feat not in ['userid', 'time_stamp', 'adgroup_id', 'pid', 'nonclk', 'clk',
       'cms_segid', 'cms_group_id', 'final_gender_code', 'age_level',
       'pvalue_level', 'shopping_level', 'occupation', 'new_user_class_level ',
       'cate_id', 'campaign_id', 'customer', 'brand', 'price']]
sequence_features

['hist_cate_id', 'hist_adgroup_id']

In [32]:
behavior_feature_list = [feat for feat in final_df.columns if feat in ['adgroup_id', 'cate_id']]
behavior_feature_list

['adgroup_id', 'cate_id']

In [33]:
final_df[sparse_features] = final_df[sparse_features].fillna('-1', )
final_df[sequence_features] = final_df[sequence_features].fillna('-1', )
final_df[dense_features] = final_df[dense_features].fillna(0, )
target = ['clk']

In [34]:
mms = MinMaxScaler(feature_range=(0, 1))
final_df[dense_features] = mms.fit_transform(final_df[dense_features])

In [35]:
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat, get_feature_names
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=final_df[feat].nunique() + 1,embedding_dim=8) for feat in sparse_features] + [DenseFeat(feat, 1, )for feat in dense_features] + [VarLenSparseFeat(SparseFeat(feat, vocabulary_size=2000000,embedding_dim=8), maxlen=1) for feat in sequence_features] 

In [36]:
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns, )

In [37]:
train, test = train_test_split(final_df, test_size=0.2)
train_model_input = {name:train[name] for name in feature_names}
test_model_input = {name:test[name] for name in feature_names}

In [38]:
# behavior_feature_list,linear_feature_columns

In [39]:
#model = DIN(linear_feature_columns,behavior_feature_list, task='binary')
model = DIN(linear_feature_columns, behavior_feature_list, dnn_use_bn=True,
        dnn_hidden_units=(200, 80), dnn_activation='relu', att_hidden_size=(80, 40), att_activation="dice",
        att_weight_normalization=False, l2_reg_dnn=0, l2_reg_embedding=1e-6, dnn_dropout=0, seed=1024,
        task='binary')

In [40]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
adgroup_id (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
cate_id (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
hist_cate_id (InputLayer)       [(None, 1)]          0                                            
__________________________________________________________________________________________________
hist_adgroup_id (InputLayer)    [(None, 1)]          0                                            
____________________________________________________________________________________________

In [42]:
model.compile("adam", "binary_crossentropy",metrics=['binary_crossentropy'], )
history = model.fit(train_model_input, train[target].values,batch_size=5024, epochs=1, verbose=1, validation_split=0.25)

ResourceExhaustedError: Graph execution error:

Detected at node 'sparse_seq_emb_hist_cate_id/embeddings/Regularizer/Square' defined at (most recent call last):
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
      app.launch_new_instance()
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
      app.start()
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\site-packages\ipykernel\kernelapp.py", line 739, in start
      self.io_loop.start()
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\site-packages\tornado\platform\asyncio.py", line 205, in start
      self.asyncio_loop.run_forever()
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\asyncio\base_events.py", line 601, in run_forever
      self._run_once()
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\asyncio\base_events.py", line 1905, in _run_once
      handle._run()
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\site-packages\ipykernel\kernelbase.py", line 545, in dispatch_queue
      await self.process_one()
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\site-packages\ipykernel\kernelbase.py", line 534, in process_one
      await dispatch(*args)
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\site-packages\ipykernel\kernelbase.py", line 437, in dispatch_shell
      await result
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\site-packages\ipykernel\ipkernel.py", line 362, in execute_request
      await super().execute_request(stream, ident, parent)
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\site-packages\ipykernel\kernelbase.py", line 778, in execute_request
      reply_content = await reply_content
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\site-packages\ipykernel\ipkernel.py", line 449, in do_execute
      res = shell.run_cell(
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell
      return super().run_cell(*args, **kwargs)
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\site-packages\IPython\core\interactiveshell.py", line 3006, in run_cell
      result = self._run_cell(
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\site-packages\IPython\core\interactiveshell.py", line 3061, in _run_cell
      result = runner(coro)
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\site-packages\IPython\core\interactiveshell.py", line 3266, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\site-packages\IPython\core\interactiveshell.py", line 3445, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\site-packages\IPython\core\interactiveshell.py", line 3505, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\ASUS\AppData\Local\Temp\ipykernel_32024\392697638.py", line 2, in <module>
      history = model.fit(train_model_input, train[target].values,batch_size=5024, epochs=1, verbose=1, validation_split=0.25)
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\site-packages\tensorflow\python\keras\engine\training.py", line 1187, in fit
      tmp_logs = self.train_function(iterator)
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\site-packages\tensorflow\python\keras\engine\training.py", line 857, in train_function
      return step_function(self, iterator)
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\site-packages\tensorflow\python\keras\engine\training.py", line 847, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\site-packages\tensorflow\python\keras\engine\training.py", line 840, in run_step
      outputs = model.train_step(data)
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\site-packages\tensorflow\python\keras\engine\training.py", line 799, in train_step
      y, y_pred, sample_weight, regularization_losses=self.losses)
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\site-packages\tensorflow\python\keras\engine\base_layer.py", line 1407, in losses
      loss_tensor = regularizer()
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\site-packages\tensorflow\python\keras\engine\base_layer.py", line 1483, in _tag_callable
      loss = loss()
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\site-packages\tensorflow\python\keras\engine\base_layer.py", line 2444, in _loss_for_variable
      regularization = regularizer(v)
    File "c:\Users\ASUS\anaconda3\envs\UK1\lib\site-packages\tensorflow\python\keras\regularizers.py", line 316, in __call__
      return self.l2 * math_ops.reduce_sum(math_ops.square(x))
Node: 'sparse_seq_emb_hist_cate_id/embeddings/Regularizer/Square'
failed to allocate memory
	 [[{{node sparse_seq_emb_hist_cate_id/embeddings/Regularizer/Square}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_8033]

In [28]:
# pred_ans = model.predict(test_model_input, batch_size=256)
# print("test LogLoss", round(log_loss(test[target].values, pred_ans), 2))
# print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 2))
