In [2]:
# deepctr 라이브러리 예제 코드 복붙(참고용)
# https://deepctr-torch.readthedocs.io/en/latest/Examples.html#classification-criteo

import pandas as pd
import torch
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *

data = pd.read_csv('./criteo_sample.txt')

sparse_features = ['C' + str(i) for i in range(1, 27)]
dense_features = ['I' + str(i) for i in range(1, 14)]

data[sparse_features] = data[sparse_features].fillna('-1', )
data[dense_features] = data[dense_features].fillna(0, )
target = ['label']

# 1.Label Encoding for sparse features,and do simple Transformation for dense features
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

# 2.count #unique features for each sparse field,and record dense feature field name

fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique())
                            for feat in sparse_features] + [DenseFeat(feat, 1, )
                                                            for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns``

feature_names = get_feature_names(
    linear_feature_columns + dnn_feature_columns)

# 3.generate input data for model

train, test = train_test_split(data, test_size=0.2)

train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

# 4.Define Model,train,predict and evaluate

device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'

model = DeepFM(linear_feature_columns=linear_feature_columns, dnn_feature_columns=dnn_feature_columns,
                task='binary',
                l2_reg_embedding=1e-5, device=device)

model.compile("adagrad", "binary_crossentropy",
                metrics=["binary_crossentropy", "auc"], )
model.fit(train_model_input,train[target].values,batch_size=32,epochs=10,verbose=2,validation_split=0.0)

pred_ans = model.predict(test_model_input, 256)
print("")
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

cuda ready...
cuda:0
Train on 160 samples, validate on 0 samples, 5 steps per epoch
Epoch 1/10
1s - loss:  0.6226 - binary_crossentropy:  0.6226 - auc:  0.5160
Epoch 2/10
0s - loss:  0.4906 - binary_crossentropy:  0.4905 - auc:  0.9545
Epoch 3/10
0s - loss:  0.3614 - binary_crossentropy:  0.3613 - auc:  0.9901
Epoch 4/10
0s - loss:  0.1934 - binary_crossentropy:  0.1934 - auc:  0.9971
Epoch 5/10
0s - loss:  0.0953 - binary_crossentropy:  0.0953 - auc:  0.9971
Epoch 6/10
0s - loss:  0.0640 - binary_crossentropy:  0.0640 - auc:  1.0000
Epoch 7/10
0s - loss:  0.0461 - binary_crossentropy:  0.0461 - auc:  1.0000
Epoch 8/10
0s - loss:  0.0352 - binary_crossentropy:  0.0352 - auc:  1.0000
Epoch 9/10
0s - loss:  0.0282 - binary_crossentropy:  0.0282 - auc:  1.0000
Epoch 10/10
0s - loss:  0.0220 - binary_crossentropy:  0.0220 - auc:  1.0000

test LogLoss 0.7694
test AUC 0.6487


In [21]:
# 본 테스크: kaggle의 TalkingData AdTracking competition dataset 활용
# TODO: 아래 링크에서 데이터셋 파일(train_sample.csv) 다운받으세요
# https://www.kaggle.com/datasets/matleonard/feature-engineering-data?select=train_sample.csv

import pandas as pd
import numpy as np
import torch
from sklearn.metrics import log_loss, roc_auc_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *
from dateutil.parser import parse


data = pd.read_csv('./train_sample.csv')
#columns = [ip,app,device,os,channel,click_time,attributed_time,is_attributed]

########## PREPROCESSING ###############

# TODO: CTR과 무관할 것으로 추정되는 column이 있으면 사전에 drop 해줍니다.
# 예상 답안: os, attributed_time은 클릭률과 무관한 것으로 추정-> drop
data.drop(['os', 'attributed_time'], axis=1, inplace=True)


# click_time을 string으로 넣으면 one-hot encoding 시 차원이 폭발하게 됩니다. 
# 따라서 시간을 적절하게 분류해야하는데, 1) 광고 클릭에 영향을 주는 시간대가 존재할 것이며, 2) 달마다 광고 클릭률이 달라진다 라는 가설을 세울 수 있겠네요.
# 그러나 본 데이터셋은 11월 6일~11월 9일간의 데이터이므로 1)만 해도 될 것 같습니다.
# TODO: click_time 컬럼을 적정 시간대로 구분하여 categorical feature로 만들어주세요. (임의의 스트링을 할당해도 상관없습니다. 이후 one-hot encoding 시 int로 변환되기에 때문입니다.)
# 예상 답안: ['0-3', '4-7', '8-11', '12-15', '16-19', '20-23'] 중 하나로 mapping

classified_time = ['0-3', '4-7', '8-11', '12-15', '16-19', '20-23']
hour_map = lambda x: classified_time[parse(x).hour//4]

data['click_time'] = data['click_time'].apply(hour_map)


data.head()

Unnamed: 0,ip,app,device,channel,click_time,is_attributed
0,89489,3,1,379,12-15,0
1,204158,35,1,21,12-15,1
2,3437,6,1,459,12-15,0
3,167543,3,1,379,12-15,0
4,147509,3,1,379,12-15,0


In [32]:
# data 행이 100만이 넘어서 deepFM 학습이 오래걸리니 적절하게 slice한 데이터를 사용합시다.
sliced_data = data.iloc[:300000] 
categorical_features = ['ip', 'app', 'device', 'channel', 'click_time']


# 결측치 처리
sliced_data[categorical_features] = sliced_data[categorical_features].fillna('-1', )
target = ['is_attributed']

# 1.Label Encoding for sparse features,and do simple Transformation for dense features
label_encoders = {}
for feat in categorical_features:
    label_encoders[feat] = LabelEncoder()
    sliced_data[feat] = label_encoders[feat].fit_transform(sliced_data[feat])

# 2.count #unique features for each sparse field,and record dense feature field name

fixlen_feature_columns = [SparseFeat(feat, sliced_data[feat].nunique())
                            for feat in categorical_features] 

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(
    linear_feature_columns + dnn_feature_columns)

# 3.generate input sliced_data for model

train, test, y_train, y_test = train_test_split(sliced_data.iloc[:,:-1], sliced_data[target], test_size=0.2, stratify=sliced_data[target])

train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sliced_data[categorical_features] = sliced_data[categorical_features].fillna('-1', )
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sliced_data[feat] = label_encoders[feat].fit_transform(sliced_data[feat])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sliced_data[feat] = label_encoders[feat].fit_t

In [24]:
# 4.Define Model,train,predict and evaluate

device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'

model = DeepFM(linear_feature_columns=linear_feature_columns, dnn_feature_columns=dnn_feature_columns,
                task='binary',
                l2_reg_embedding=1e-5, device=device)

model.compile("adagrad", "binary_crossentropy",
                metrics=["binary_crossentropy", "auc"], )
model.fit(train_model_input, y_train.values, batch_size=128,epochs=10,verbose=2,validation_split=0.0)

pred_ans = model.predict(test_model_input, 256)
print("")
print("test LogLoss", round(log_loss(y_test.values, pred_ans), 4))
print("test AUC", round(roc_auc_score(y_test.values, pred_ans), 4))
print("test precision", round(precision_score(y_test.values, np.round(pred_ans) ), 4))
print("test recall", round(recall_score(y_test.values, np.round(pred_ans)), 4))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sliced_data[categorical_features] = sliced_data[categorical_features].fillna('-1', )
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sliced_data[feat] = lbe.fit_transform(sliced_data[feat])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sliced_data[feat] = lbe.fit_transform(sliced_data[feat])
A valu

cuda ready...
cuda:0
Train on 240000 samples, validate on 0 samples, 1875 steps per epoch
Epoch 1/10
17s - loss:  0.1738 - binary_crossentropy:  0.1738 - auc:  0.9534
Epoch 2/10
17s - loss:  0.1312 - binary_crossentropy:  0.1312 - auc:  0.9773
Epoch 3/10
18s - loss:  0.1040 - binary_crossentropy:  0.1040 - auc:  0.9861
Epoch 4/10
17s - loss:  0.0919 - binary_crossentropy:  0.0919 - auc:  0.9891
Epoch 5/10
17s - loss:  0.0857 - binary_crossentropy:  0.0856 - auc:  0.9904
Epoch 6/10
17s - loss:  0.0819 - binary_crossentropy:  0.0819 - auc:  0.9911
Epoch 7/10
17s - loss:  0.0794 - binary_crossentropy:  0.0794 - auc:  0.9916
Epoch 8/10
17s - loss:  0.0777 - binary_crossentropy:  0.0777 - auc:  0.9920
Epoch 9/10
17s - loss:  0.0763 - binary_crossentropy:  0.0763 - auc:  0.9922
Epoch 10/10
18s - loss:  0.0752 - binary_crossentropy:  0.0752 - auc:  0.9924

test LogLoss 0.2408
test AUC 0.941


In [39]:
# 학습된 모델을 통해 시간대 별로 CTR이 달라진다는 가설이 맞는지 확인해봅시다.
# click_time을 제외한 나머지 피처는 통제변인으로 설정, click_time에만 변화를 주고 model의 pCTR(predicted CTR)을 관찰해봅시다.

custom_test_data = {feat:np.array([data[feat][0]]*5) for feat in feature_names if feat != 'click_time'}
classified_time = ['0-3', '4-7', '12-15', '16-19', '20-23']  # sliced_data에 8-11시간대 데이터가 없어서 제외
custom_test_data['click_time'] = np.array(classified_time)

for feat in custom_test_data.keys():
    custom_test_data[feat] = label_encoders[feat].transform(custom_test_data[feat])

print(custom_test_data)

pred = model.predict(custom_test_data, 1)
for i in range(len(pred)):
    print(f'{classified_time[i]}: pCTR {pred[i]}')
    
    
    
# 선택) 시간대 말고 어떤 어플로 진입했는지 변화를 주고 pCTR이 어떻게 변화는지도 실험 ㄱㄴ
    


{'ip': array([18322, 18322, 18322, 18322, 18322], dtype=int64), 'app': array([3, 3, 3, 3, 3], dtype=int64), 'device': array([1, 1, 1, 1, 1], dtype=int64), 'channel': array([109, 109, 109, 109, 109], dtype=int64), 'click_time': array([0, 4, 1, 2, 3])}
0-3: pCTR [0.02559618]
4-7: pCTR [0.02572226]
12-15: pCTR [0.00451749]
16-19: pCTR [0.01822053]
20-23: pCTR [0.01983846]
