In [1]:
# gpu number setting
import os
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ["CUDA_VISIBLE_DEVICES"] = '1' ## gpu 번호 셋팅 윤건 :0, 기준 : 1, 준형 :2,

# Gpu growth setting
import tensorflow as tf
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)


# tensorflow & keras version check
print('tensorflow version : ' , tf.__version__)
print('keras version : ' , tf.keras.__version__)

# tensorflow gpu available check 
print('GPU available ? : ', tf.test.is_gpu_available(cuda_only=False, min_cuda_compute_capability=None))

tensorflow version :  2.3.1
keras version :  2.4.0
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
GPU available ? :  True


In [2]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr.models import *
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names

In [3]:
raw = pd.read_csv('../Data/tb_ipr_m_seamless_2nd_movie.txt', sep='', header=None)

  interactivity=interactivity, compiler=compiler, result=result)


* insert column names

In [4]:
raw.columns = ['tb_ipr_m_seamless_2nd_movie.sa_id',
       'tb_ipr_m_seamless_2nd_movie.s_time',
       'tb_ipr_m_seamless_2nd_movie.e_time',
       'tb_ipr_m_seamless_2nd_movie.watch_duration',
       'tb_ipr_m_seamless_2nd_movie.album_id',
       'tb_ipr_m_seamless_2nd_movie.album_name',
       'tb_ipr_m_seamless_2nd_movie.vod_s_point',
       'tb_ipr_m_seamless_2nd_movie.vod_e_point',
       'tb_ipr_m_seamless_2nd_movie.agree_yn',
       'tb_ipr_m_seamless_2nd_movie.category_id',
       'tb_ipr_m_seamless_2nd_movie.release_date',
       'tb_ipr_m_seamless_2nd_movie.run_time',
       'tb_ipr_m_seamless_2nd_movie.meta_genre',
       'tb_ipr_m_seamless_2nd_movie.genre_large',
       'tb_ipr_m_seamless_2nd_movie.genre_mid',
       'tb_ipr_m_seamless_2nd_movie.genre_small',
       'tb_ipr_m_seamless_2nd_movie.movie_meta_price',
       'tb_ipr_m_seamless_2nd_movie.buy_history_price',
       'tb_ipr_m_seamless_2nd_movie.i30_meta_price',
       'tb_ipr_m_seamless_2nd_movie.buy_tot',
       'tb_ipr_m_seamless_2nd_movie.view_no',
       'tb_ipr_m_seamless_2nd_movie.fod',
       'tb_ipr_m_seamless_2nd_movie.buy_1_2',
       'tb_ipr_m_seamless_2nd_movie.buy_3',
       'tb_ipr_m_seamless_2nd_movie.buy_seg',
       'tb_ipr_m_seamless_2nd_movie.amt_1_4',
       'tb_ipr_m_seamless_2nd_movie.amt_2_4',
       'tb_ipr_m_seamless_2nd_movie.amt_3_4',
       'tb_ipr_m_seamless_2nd_movie.amt_4_4',
       'tb_ipr_m_seamless_2nd_movie.amt_r_gabun',
       'tb_ipr_m_seamless_2nd_movie.weekdays',
       'tb_ipr_m_seamless_2nd_movie.weekends',
       'tb_ipr_m_seamless_2nd_movie.dawn',
       'tb_ipr_m_seamless_2nd_movie.morning',
       'tb_ipr_m_seamless_2nd_movie.afternoon',
       'tb_ipr_m_seamless_2nd_movie.evening',
       'tb_ipr_m_seamless_2nd_movie.watch_ratio',
       'tb_ipr_m_seamless_2nd_movie.current_rate',
       'tb_ipr_m_seamless_2nd_movie.ncf_rating']

In [5]:
raw.columns = list(map(lambda x: x.split('.')[-1], raw.columns))

* distinct user 968,091
* distinct album 10,570

* featrue 분리, sparse, dense, ambiguous, unnecessary

* meta_genre, 대,중,소장르의 관계
* target column을 뭘로할지
* how to sample negatively?

In [6]:
sparse_features = ['sa_id', 'album_id', 'buy_seg']
dense_features = ['release_date', 'run_time', 'movie_meta_price', 'i30_meta_price', 'buy_tot',
                  'amt_1_4', 'amt_2_4', 'amt_3_4', 'amt_4_4']
ambiguous_features = [ 'agree_yn', 'meta_genre', 'genre_large', 'genre_mid', 'genre_small']
unnecessary_features = ['category_id', # NaN 약 650만 중에 25만 정도.
                        's_time', 'e_time', # future
                        'watch_duration', # future
                        'buy_history_price', # \\N 값이 약 482만 개
                        'vod_s_point', 'vod_e_point', 
                        'album_name',
                        'view_no',
                        'fod', 'buy_1_2', 'buy_3',
                        'amt_r_gabun',
                        'watch_ratio',
                        'weekdays', 'weekends', 'dawn', 'morning', 'afternoon', 'evening',
                        'current_rate']
target = ['ncf_rating']

* fill na

In [7]:
raw['release_date'] = raw['release_date'].fillna(value = raw['release_date'].median())

* drop na ('\\N' 이 담긴 row 88개)

In [8]:
raw = raw[raw['amt_1_4'] != '\\N']

* change column type

In [9]:
raw[['sa_id', 'album_id']] = raw[['sa_id', 'album_id']].astype(str)

In [10]:
raw[['amt_1_4', 'amt_2_4', 'amt_3_4', 'amt_4_4']] = raw[['amt_1_4', 'amt_2_4', 'amt_3_4', 'amt_4_4']].astype(float)

In [221]:
print(len(raw.columns))
print(len(sparse_features))
print(len(dense_features))
print(len(sparse_features) +len(dense_features)+ len(ambiguous_features) + len(unnecessary_features) + len(target))

39
3
9
39


In [11]:
data = raw[sparse_features + dense_features + target]

In [12]:
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
    
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [13]:
data

Unnamed: 0,sa_id,album_id,buy_seg,release_date,run_time,movie_meta_price,i30_meta_price,buy_tot,amt_1_4,amt_2_4,amt_3_4,amt_4_4,ncf_rating
0,68,8282,3,0.987883,0.480106,0.051020,0.051020,0.000000,1.0,0.0,0.0,0.0,0.493201
1,103,1724,3,0.787911,0.305040,0.028571,0.028571,0.000000,1.0,0.0,0.0,0.0,0.062150
2,103,2542,3,0.800467,0.328912,0.000000,0.000000,0.000000,1.0,0.0,0.0,0.0,0.048566
3,103,257,3,0.811903,0.265252,0.000000,0.028571,0.000000,1.0,0.0,0.0,0.0,0.054389
4,114,8512,0,0.988456,0.305040,0.051020,0.051020,0.003003,0.0,1.0,0.0,0.0,0.657391
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6759188,968006,6451,1,0.975990,0.267905,0.028571,0.028571,0.042042,0.0,0.0,0.0,1.0,0.560919
6759189,968006,9625,1,0.999284,0.270557,0.202041,0.202041,0.042042,0.0,0.0,0.0,1.0,0.220670
6759190,968006,10308,1,0.999851,0.270557,0.102041,0.102041,0.042042,0.0,0.0,0.0,1.0,0.606863
6759191,968006,10376,1,0.999866,0.270557,0.202041,0.202041,0.042042,0.0,0.0,0.0,1.0,0.265359


In [14]:
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(),embedding_dim=4)
                           for i,feat in enumerate(sparse_features)]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [15]:
train, test = train_test_split(data, test_size=0.2, random_state=2020)
train_model_input = {name:train[name] for name in feature_names}
test_model_input = {name:test[name] for name in feature_names}

In [16]:
train

Unnamed: 0,sa_id,album_id,buy_seg,release_date,run_time,movie_meta_price,i30_meta_price,buy_tot,amt_1_4,amt_2_4,amt_3_4,amt_4_4,ncf_rating
6322629,398973,5346,2,0.964579,0.283820,0.028571,0.028571,0.000000,1.0,0.0,0.0,0.0,0.639616
4172940,726666,1274,2,0.835983,0.358090,0.024490,0.028571,0.000000,1.0,0.0,0.0,0.0,0.373951
4673716,674989,5343,2,0.964376,0.236074,0.028571,0.028571,0.000000,1.0,0.0,0.0,0.0,0.403121
3770874,335424,5443,2,0.964366,0.328912,0.028571,0.028571,0.000000,1.0,0.0,0.0,0.0,0.346304
4584683,174795,6239,2,0.975783,0.201592,0.028571,0.028571,0.000000,1.0,0.0,0.0,0.0,0.205811
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1509012,948227,8250,2,0.988237,0.244032,0.051020,0.051020,0.000000,1.0,0.0,0.0,0.0,0.602204
4737175,97781,8027,3,0.987769,0.297082,0.051020,0.051020,0.000000,1.0,0.0,0.0,0.0,0.614137
1948303,574877,10052,1,0.999745,0.286472,0.051020,0.051020,0.015015,0.0,0.0,0.2,0.8,0.094599
1482072,771059,1249,2,0.917815,0.421751,0.028571,0.028571,0.000000,1.0,0.0,0.0,0.0,0.175629


In [17]:
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile("adam", "mse", metrics=['mse'], )

history = model.fit(train_model_input, train[target].values,
                    batch_size=1024, epochs=20, verbose=2, validation_split=0.2, )
pred_ans = model.predict(test_model_input, batch_size=32)
print("test MSE", round(mean_squared_error(
    test[target].values, pred_ans), 4))

Epoch 1/20


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


4225/4225 - 55s - loss: 0.0396 - mse: 0.0351 - val_loss: 0.0398 - val_mse: 0.0339
Epoch 2/20
4225/4225 - 55s - loss: 0.0415 - mse: 0.0331 - val_loss: 0.0422 - val_mse: 0.0337
Epoch 3/20
4225/4225 - 55s - loss: 0.0413 - mse: 0.0316 - val_loss: 0.0441 - val_mse: 0.0338
Epoch 4/20
4225/4225 - 55s - loss: 0.0427 - mse: 0.0319 - val_loss: 0.0450 - val_mse: 0.0337
Epoch 5/20
4225/4225 - 55s - loss: 0.0427 - mse: 0.0316 - val_loss: 0.0452 - val_mse: 0.0338
Epoch 6/20
4225/4225 - 55s - loss: 0.0425 - mse: 0.0314 - val_loss: 0.0450 - val_mse: 0.0339
Epoch 7/20
4225/4225 - 55s - loss: 0.0429 - mse: 0.0316 - val_loss: 0.0454 - val_mse: 0.0339
Epoch 8/20
4225/4225 - 55s - loss: 0.0429 - mse: 0.0315 - val_loss: 0.0452 - val_mse: 0.0337
Epoch 9/20
4225/4225 - 55s - loss: 0.0426 - mse: 0.0314 - val_loss: 0.0453 - val_mse: 0.0340
Epoch 10/20
4225/4225 - 55s - loss: 0.0426 - mse: 0.0314 - val_loss: 0.0451 - val_mse: 0.0337
Epoch 11/20
4225/4225 - 55s - loss: 0.0428 - mse: 0.0314 - val_loss: 0.0455 - va