In [1]:
import numpy as np
import pandas as pd
import warnings
import os
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers
from itertools import repeat
from time import perf_counter
from tensorflow.keras.metrics import BinaryAccuracy, AUC
from tensorflow.python.client import device_lib
from imblearn.over_sampling import SMOTE

warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) 
pd.set_option('display.max_columns', None)
tf.keras.backend.set_floatx('float32')

In [12]:
BATCH_SIZE = 1024
EMBEDDING_SIZE = 10
cols = {'obj': [],
        'cat': [],
       'cont': []
        }
def data_split():
    file_name = 'final_data_v2.csv'
    file_path = os.getcwd()+'/drive/MyDrive/Colab Notebooks/' #[:-16]는 본인 경로에 맞게 있어도 되고 없어도 됨.
    df = pd.read_csv(file_path+file_name, encoding='utf-8')
    df.rename(columns={'category_id_1':'category1'}, inplace=True)

    # 데이터 유형별 분류하기
    for dt_idx, dt_val in zip(df.dtypes.index, df.dtypes.values):
        # if 'category' in dt_idx:
        #     df[['category1']] = LabelEncoder().fit_transform(df[['category1']])
        #     cols['cat'].append('category1')

        if dt_val == 'object':
            if ('id' in dt_idx) | ('time' in dt_idx) | ('name' in dt_idx) | ('keyword' in dt_idx) |('url' in dt_idx):
                df.drop(columns = dt_idx, axis=1, inplace=True)
            else:
                cols['obj'].append(dt_idx)

        else:
            if ('id' in dt_idx) | ('time' in dt_idx):
                df.drop(columns = dt_idx, axis=1, inplace=True)
            else:
                if len(df[dt_idx].value_counts()) <= 30: #연속형 데이터 중 30개 내의 범주로 나눌 수 있는 데이터 = category로 구분.
                    cols['cat'].append(dt_idx)
                else:
                    if ('hour' in dt_idx) | ('group' in dt_idx):
                        pass
                    else:
                        cols['cont'].append(dt_idx) 

    return cols

def reorganization(df):
  data = pd.DataFrame()
  cols = data_split()
  for k, v in cols.items():
    if k == 'obj':
      data = pd.concat([data, df[v]], axis=1)
    elif k == 'cont':
      data = pd.concat([data, df[v]], axis=1)
    else:
      data = pd.concat([data, df[v]], axis=1)

  return data

def preprocessing():
    file_name = 'final_data_v2.csv'
    file_path = os.getcwd()+'/drive/MyDrive/Colab Notebooks/' 
    df = pd.read_csv(file_path+file_name, encoding='utf-8')
    df.rename(columns={'category_id_1':'category1'}, inplace=True)
    # 데이터 유형별 분류하기
    data = reorganization(df)
    # cols = data_split()
    modified_df = pd.DataFrame()
    vec_dict = {idx: [] for idx in range(len(data.columns))}
    feature_index = []

    for i, c in enumerate(data.columns):
        if c in cols['obj']:
            obj_data = pd.get_dummies(data[c], prefix=c, prefix_sep = "/")
            modified_df = pd.concat([modified_df, obj_data], axis=1)
            vec_dict[i] = list(obj_data.columns)
            feature_index.extend(repeat(i, obj_data.shape[1]))

        elif c in cols['cat']:  # click_label 컬럼 = y 변수로 사용
            if 'click' in c:
                pass
            else:
                cat_data = pd.get_dummies(data[c], prefix=c, prefix_sep = "/")
                vec_dict[i] = list(cat_data.columns)
                feature_index.extend(repeat(i, cat_data.shape[1]))
                modified_df = pd.concat([modified_df, cat_data], axis=1)
        else:
            scaled_num_data = MinMaxScaler().fit_transform(df[[c]])
            scaled_num_data = pd.DataFrame(scaled_num_data, columns = [c])
            modified_df = pd.concat([modified_df,scaled_num_data], axis=1)
            vec_dict[i] = list(scaled_num_data.columns)
            feature_index.extend(repeat(i, scaled_num_data.shape[1]))

    print('---- Data info ----')
    # print(cols)
    print('Data Frame shape: {}'.format(modified_df.shape))
    print('# of Feature: {}'.format(len(feature_index)))
    print(f'# of Field: {len(vec_dict)}')
    # print(f'Modified DF columns: {modified_df.columns}')
    # print(vec_dict)
    return vec_dict, feature_index, modified_df

# 데이터 7:3으로 나누기 (혹은 8:2)
def split_data():
    file_name = 'final_data_v2.csv'
    file_path = os.getcwd()+'/drive/MyDrive/Colab Notebooks/' 
    df = pd.read_csv(file_path+file_name, encoding='utf-8')
    df.rename(columns={'category_id_1':'category1'}, inplace=True)

    vec_dict, feature_index, modified_df = preprocessing()

    X = modified_df#.astype('float')
    y = df['click_label']

    print(f"X features' name (10): {X.columns.to_list()[:10]}")
    print(f"y feature's name: {y.name}")
    print()
    
    oversample = SMOTE(random_state=2022) # 불균형 데이터 셋인 번개장터 데이터 셋 불균형 문제 완화
    X, y = oversample.fit_resample(X, y)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 2022, stratify = y) #y 비율에 따른 층화추출 및 데이터를 7:3의 비율로 나누기
    
    print(f"# of train_data's rows: {X_train.shape[0]} \n# of test_data's rows: {X_test.shape[0]}")
    print(f'train:test ratio = {round(X_train.shape[0]/(X_train.shape[0]+ X_test.shape[0]),2)}:{round(X_test.shape[0]/(X_train.shape[0]+ X_test.shape[0]), 2)}')

    # tf.data.Dataset.from_tensor_slices 함수: tf.data.Dataset 를 생성하는 함수로 입력된 텐서로부터 slices를 생성.
    # shuffle 함수는 고정된 버퍼 크기로 데이터를 섞는데, 데이터가 완전히 랜덤적으로 뒤섞기 위해서는 입력된 데이터 크기보다 큰 수를 입력.
    # tf.cast 함수: 뒤에 나온 형으로 값을 변환
    train_ds = tf.data.Dataset.from_tensor_slices( 
              (tf.cast(X_train.values, tf.float32), tf.cast(y_train, tf.float32))
            ).shuffle(600000).batch(BATCH_SIZE) 
    
    test_ds = tf.data.Dataset.from_tensor_slices(
              (tf.cast(X_test.values, tf.float32), tf.cast(y_test, tf.float32))
            ).shuffle(300000).batch(BATCH_SIZE)
    
    print(f'Current Batch Size: {BATCH_SIZE}')
    print(f'train_ds: {train_ds}')
    print(f'test_ds: {test_ds}')
    return train_ds, test_ds, vec_dict, feature_index

In [13]:
class FM_layer(tf.keras.layers.Layer):
    def __init__(self, num_feature, grouped_field, embedding_size, feature_index):
        super(FM_layer, self).__init__()
        self.embedding_size = embedding_size    # k: 임베딩 벡터의 차원(크기)
        self.num_feature = num_feature          # f: 원래 feature 개수
        self.grouped_field = grouped_field      # m: grouped field 개수
        self.feature_index = feature_index      # 인코딩된 X의 칼럼들이 본래 어디 소속이었는지

        # Parameters of FM Layer
        # w: capture 1st order interactions (linear - num_feature 만큼의 크기를 가진 벡터)
        # V: capture 2nd order interactions
        # tf.Variable: 모델링에서 weight나 bias와 같은 변수 값을 초기화하는 훈련가능한 변수
        self.w = tf.Variable(tf.random.normal(shape=[num_feature], mean=0.0, stddev=1.0)
                           , name='w'
                            )
        self.V = tf.Variable(tf.random.normal(shape=(grouped_field, embedding_size), 
                                              mean=0.0, stddev=0.01)
                           , name='V'
                            )

    def call(self, inputs):
          # print(f"input vector's shape: {inputs.shape}") #input vector's shape: (256, 101)
        x_batch = tf.reshape(tf.expand_dims(inputs, axis=-1), [-1, self.num_feature, 1]) #-1: 가로 vector로 생성, self.feature_index 만큼의 행, 1열
        # print(f"X's Batch Shape: {x_batch.shape}") #X's Batch Shape: (256, 101, 1)                 
        
        # Parameter V를 feature_index에 맞게 복사하여 num_feature에 맞게 늘림 (field 수만큼 embedding)
        embeds = tf.nn.embedding_lookup(params=self.V, ids=self.feature_index)
        # print(f"Embedding Layer Shape: {embeds.shape}") #Embedding Layer Shape: (101, 10)
        
        # Deep Component에서 쓸 Input
        # (batch_size, num_feature, embedding_size)
        # order-2 layer (inner product of respective feature latent vectors)
        vector_inputs = tf.math.multiply(x_batch, embeds) 
        # print(f'Input Layer Shape: {vector_inputs.shape}') #Input Layer Shape: (256, 101, 10)

        # (batch_size, ) -> order-1 layer (linear interactions among features)
        linear_terms = tf.reduce_sum(tf.math.multiply(self.w, inputs), axis=1, keepdims=False)

        # (batch_size, ) -> order-2 features (inner product units)
        # tf.math.pow(tf.matmul(inputs, self.V), 2) - tf.matmul(tf.math.pow(inputs, 2), tf.math.pow(self.V, 2)
        # tf.matmul: 행렬의 곱셈
        interactions = 0.5 * tf.subtract(
            tf.square(tf.reduce_sum(vector_inputs, [1, 2])),
            tf.reduce_sum(tf.square(vector_inputs), [1, 2])
        )
        
        # sigmoid function for CTR prediction
        linear_terms = tf.reshape(linear_terms, [-1, 1]) #벡터 -> tensor화된 벡터로 변환
        interactions = tf.reshape(interactions, [-1, 1])
        y_fm = tf.concat([linear_terms, interactions], 1)

        return y_fm, vector_inputs

In [None]:
class DeepFM(tf.keras.Model):
    def __init__(self, num_feature, grouped_field, embedding_size, feature_index):
        super(DeepFM, self).__init__()
        self.embedding_size = embedding_size    # k: 임베딩 벡터의 차원(크기)
        self.num_feature = num_feature          # f: 원래 feature 개수
        self.grouped_field = grouped_field      # m: grouped field 개수
        self.feature_index = feature_index      # 인코딩된 X의 칼럼들이 본래 어디 소속이었는지 (칼럼 인덱스)

        self.fm_layer = FM_layer(num_feature, grouped_field, embedding_size, feature_index)
        
        self.hidden_layer1 = tf.keras.layers.Dense(units=64, activation='relu') #tf.keras.layers.Dense(레이어 사이즈, 활성화 함수): 인공신경망 구조를 구현시켜주는 함수
        self.dropout1 = tf.keras.layers.Dropout(rate=0.4)
        self.hidden_layer2 = tf.keras.layers.Dense(units=32, activation='relu')
        self.dropout2 = tf.keras.layers.Dropout(rate=0.2)
        self.hidden_layer3 = tf.keras.layers.Dense(units=16, activation='relu')
        self.dropout3 = tf.keras.layers.Dropout(rate=0.1)
        self.hidden_layer4 = tf.keras.layers.Dense(units=2, activation='relu')

        self.output_layer = tf.keras.layers.Dense(units=1, activation='sigmoid')

    def __repr__(self):
        return "DeepFM Model: # Field: {}, # Feature: {}, Embedding: {}".format(self.grouped_field, self.num_feature, self.embedding_size)
    
    def call(self, inputs):
        # 1) FM Component: (num_batch, 2)
        y_fm, vector_inputs = self.fm_layer(inputs)
        # print(f'Sigmoid Function shape: {y_fm.shape}') #Sigmoid Function shape: (256, 2)
        # print(f'input vector shape: {vector_inputs.shape}') #input vector shape: (256, 101, 10)

        # retrieve Dense Vectors: (num_batch, num_feature*embedding_size)
        vector_inputs = tf.reshape(vector_inputs, [-1, self.num_feature*self.embedding_size])

        # 2) Deep Component
        y_deep = self.hidden_layer1(vector_inputs)
        y_deep = self.dropout1(y_deep)
        y_deep = self.hidden_layer2(y_deep)
        y_deep = self.dropout2(y_deep)
        y_deep = self.hidden_layer3(y_deep)
        y_deep = self.dropout3(y_deep)
        y_deep = self.hidden_layer4(y_deep)

        # Concatenation
        y_pred = tf.concat([y_fm, y_deep], 1)
        y_pred = self.output_layer(y_pred)
        y_pred = tf.reshape(y_pred, [-1, ])
        
        return y_pred

In [15]:
def training_per_batch(model, x, y, opt, train_acc, train_auc):
  with tf.GradientTape() as gt: #GradientTapes can be nested to compute higher-order derivatives. (자동으로 미분 실행.)
      y_pred = model(x)
      loss = tf.keras.losses.binary_crossentropy(from_logits=False, y_true=y, y_pred=y_pred)

  grads = gt.gradient(target=loss, sources=model.trainable_variables)

  # apply_gradients()를 통해 processed gradients를 적용함
  opt.apply_gradients(zip(grads, model.trainable_variables))

  # accuracy & auc 성능
  train_acc.update_state(y, y_pred)
  train_auc.update_state(y, y_pred)

  return loss

# 반복 학습 함수
def training(epoch):
    # X_train, X_test, y_train, y_test = split_data()
    train_ds, test_ds, vec_dict, feature_index = split_data()
    
    model = DeepFM(embedding_size= EMBEDDING_SIZE, 
                   num_feature=len(feature_index),
                   grouped_field=len(vec_dict), 
                   feature_index=feature_index)
    '''
        self.embedding_size = embedding_size    # k: 임베딩 벡터의 차원(크기)
        self.num_feature = num_feature          # f: 원래 feature 개수 (shape[1]에 해당)
        self.grouped_field = grouped_field              # m: grouped field 개수
        self.field_index = field_index          # 인코딩된 X의 칼럼들이 본래 어디 소속이었는지 (칼럼 인덱스)
        
        print('X shape: {}'.format(X_modified.shape))  
        print('# of Feature: {}'.format(len(field_index)))
        print('# of Field: {}'.format(len(field_dict)))
    '''
    # Momentum 장점 + AdaGrad 장점 = Adam (모멘텀 방식보다 좌우 흔들림이 덜 함. 좌우흔들림이 덜 함.)
    # 추가로 논문에 나온 대로 FTRL 기법 사용
    opt = tf.keras.optimizers.Adam(learning_rate=0.01) #learning_rate은 별도로 조절
    ''' 
    var1 = tf.Variable(10.0)
    loss = lambda: (var1 ** 2)/2.0       # d(loss)/d(var1) == var1
    step_count = opt.minimize(loss, [var1]).numpy()
    # The first step is `-learning_rate*sign(grad)`
    var1.numpy(): 9.9
    '''
  
    start = perf_counter()
    print("Start Training: Batch Size: {}, Embedding Size: {}".format(BATCH_SIZE, EMBEDDING_SIZE))

    for i in range(epoch):
      train_acc = BinaryAccuracy(threshold=0.5) #threshold값도 조절 (0.4~0.6 사이 값)
      train_auc = AUC()
      loss_history = []

      for x, y in train_ds:
          loss = training_per_batch(model, x, y, opt, train_acc, train_auc)
          loss_history.append(loss)
      
      # if i % 10 == 9:
      print("Epoch {}: 누적 Loss: {:.4f}, Acc: {:.4f}, AUC: {:.4f}".format(i+1, np.mean(loss_history), train_acc.result().numpy(), train_auc.result().numpy()))
      # else:
      #   pass

    print("End of Training")
    test_acc = BinaryAccuracy(threshold=0.5)
    test_auc = AUC()
    for x, y in test_ds:
        y_pred = model(x)
        test_acc.update_state(y, y_pred)
        test_auc.update_state(y, y_pred)
    
    print("테스트 ACC: {:.4f}, AUC: {:.4f}".format(test_acc.result().numpy(), test_auc.result().numpy()))
    print("Batch Size: {}, Embedding Size: {}".format(BATCH_SIZE, EMBEDDING_SIZE))
    print(f"걸린 시간: {round((perf_counter() - start)//60)}분 {round((perf_counter() - start)%60)}초")
    print()

In [None]:
# 학습 10번 후 확인해보기 (relu 활성화 함수 사용)
if __name__ == '__main__':
  training(10)

---- Data info ----
Data Frame shape: (842463, 101)
# of Feature: 101
# of Field: 27
X features' name (10): ['user_gender/F', 'user_gender/M', 'imp_hour/0', 'imp_hour/1', 'imp_hour/2', 'imp_hour/3', 'imp_hour/4', 'imp_hour/5', 'imp_hour/6', 'imp_hour/7']
y feature's name: click_label

# of train_data's rows: 1144850 
# of test_data's rows: 490650
train:test ratio = 0.7:0.3
Current Batch Size: 256
train_ds: <BatchDataset shapes: ((None, 101), (None,)), types: (tf.float32, tf.float32)>
test_ds: <BatchDataset shapes: ((None, 101), (None,)), types: (tf.float32, tf.float32)>
Start Training: Batch Size: 256, Embedding Size: 10
Epoch 1: 누적 Loss: 0.5004, Acc: 0.7263, AUC: 0.8169
Epoch 2: 누적 Loss: 0.4723, Acc: 0.7464, AUC: 0.8399
Epoch 3: 누적 Loss: 0.4586, Acc: 0.7558, AUC: 0.8496
Epoch 4: 누적 Loss: 0.4485, Acc: 0.7639, AUC: 0.8575
Epoch 5: 누적 Loss: 0.4402, Acc: 0.7704, AUC: 0.8638
Epoch 6: 누적 Loss: 0.4505, Acc: 0.7673, AUC: 0.8588
Epoch 7: 누적 Loss: 0.4346, Acc: 0.7763, AUC: 0.8689
Epoch 8: 누적 Lo

In [None]:
if __name__ == '__main__':
  training(10) 

---- Data info ----
Data Frame shape: (842463, 101)
# of Feature: 101
# of Field: 27
X features' name (10): ['user_gender/F', 'user_gender/M', 'imp_hour/0', 'imp_hour/1', 'imp_hour/2', 'imp_hour/3', 'imp_hour/4', 'imp_hour/5', 'imp_hour/6', 'imp_hour/7']
y feature's name: click_label

# of train_data's rows: 1144850 
# of test_data's rows: 490650
train:test ratio = 0.7:0.3
Current Batch Size: 512
train_ds: <BatchDataset shapes: ((None, 101), (None,)), types: (tf.float32, tf.float32)>
test_ds: <BatchDataset shapes: ((None, 101), (None,)), types: (tf.float32, tf.float32)>
Start Training: Batch Size: 512, Embedding Size: 10
Epoch 1: 누적 Loss: 0.5006, Acc: 0.7323, AUC: 0.8195
Epoch 2: 누적 Loss: 0.4643, Acc: 0.7548, AUC: 0.8463
Epoch 3: 누적 Loss: 0.4522, Acc: 0.7652, AUC: 0.8562
Epoch 4: 누적 Loss: 0.4420, Acc: 0.7732, AUC: 0.8640
Epoch 5: 누적 Loss: 0.4334, Acc: 0.7793, AUC: 0.8700
Epoch 6: 누적 Loss: 0.4255, Acc: 0.7853, AUC: 0.8759
Epoch 7: 누적 Loss: 0.4198, Acc: 0.7894, AUC: 0.8798
Epoch 8: 누적 Lo

In [None]:
if __name__ == '__main__':
  training(10) 

---- Data info ----
Data Frame shape: (842463, 101)
# of Feature: 101
# of Field: 27
X features' name (10): ['user_gender/F', 'user_gender/M', 'imp_hour/0', 'imp_hour/1', 'imp_hour/2', 'imp_hour/3', 'imp_hour/4', 'imp_hour/5', 'imp_hour/6', 'imp_hour/7']
y feature's name: click_label

# of train_data's rows: 1144850 
# of test_data's rows: 490650
train:test ratio = 0.7:0.3
Current Batch Size: 512
train_ds: <BatchDataset shapes: ((None, 101), (None,)), types: (tf.float32, tf.float32)>
test_ds: <BatchDataset shapes: ((None, 101), (None,)), types: (tf.float32, tf.float32)>
Start Training: Batch Size: 512, Embedding Size: 50
Epoch 1: 누적 Loss: 0.5072, Acc: 0.7275, AUC: 0.8143
Epoch 2: 누적 Loss: 0.4692, Acc: 0.7510, AUC: 0.8424
Epoch 3: 누적 Loss: 0.4544, Acc: 0.7625, AUC: 0.8548
Epoch 4: 누적 Loss: 0.4443, Acc: 0.7701, AUC: 0.8617
Epoch 5: 누적 Loss: 0.4379, Acc: 0.7750, AUC: 0.8665
Epoch 6: 누적 Loss: 0.4318, Acc: 0.7793, AUC: 0.8705
Epoch 7: 누적 Loss: 0.4277, Acc: 0.7831, AUC: 0.8737
Epoch 8: 누적 Lo

In [None]:
if __name__ == '__main__':
  training(10) 

---- Data info ----
Data Frame shape: (842463, 101)
# of Feature: 101
# of Field: 27
X features' name (10): ['user_gender/F', 'user_gender/M', 'imp_hour/0', 'imp_hour/1', 'imp_hour/2', 'imp_hour/3', 'imp_hour/4', 'imp_hour/5', 'imp_hour/6', 'imp_hour/7']
y feature's name: click_label

# of train_data's rows: 1144850 
# of test_data's rows: 490650
train:test ratio = 0.7:0.3
Current Batch Size: 1024
train_ds: <BatchDataset shapes: ((None, 101), (None,)), types: (tf.float32, tf.float32)>
test_ds: <BatchDataset shapes: ((None, 101), (None,)), types: (tf.float32, tf.float32)>
Start Training: Batch Size: 1024, Embedding Size: 10
Epoch 1: 누적 Loss: 0.4957, Acc: 0.7292, AUC: 0.8174
Epoch 2: 누적 Loss: 0.4587, Acc: 0.7538, AUC: 0.8462
Epoch 3: 누적 Loss: 0.4458, Acc: 0.7648, AUC: 0.8567
Epoch 4: 누적 Loss: 0.4375, Acc: 0.7715, AUC: 0.8629
Epoch 5: 누적 Loss: 0.4319, Acc: 0.7756, AUC: 0.8673
Epoch 6: 누적 Loss: 0.4273, Acc: 0.7793, AUC: 0.8706
Epoch 7: 누적 Loss: 0.4238, Acc: 0.7817, AUC: 0.8730
Epoch 8: 누적 

In [14]:
# 활성화 함수 = tanh일 때
class DeepFM(tf.keras.Model):
    def __init__(self, num_feature, grouped_field, embedding_size, feature_index):
        super(DeepFM, self).__init__()
        self.embedding_size = embedding_size    # k: 임베딩 벡터의 차원(크기)
        self.num_feature = num_feature          # f: 원래 feature 개수
        self.grouped_field = grouped_field      # m: grouped field 개수
        self.feature_index = feature_index      # 인코딩된 X의 칼럼들이 본래 어디 소속이었는지 (칼럼 인덱스)

        self.fm_layer = FM_layer(num_feature, grouped_field, embedding_size, feature_index)
        
        self.hidden_layer1 = tf.keras.layers.Dense(units=64, activation='tanh') #tf.keras.layers.Dense(레이어 사이즈, 활성화 함수): 인공신경망 구조를 구현시켜주는 함수
        self.dropout1 = tf.keras.layers.Dropout(rate=0.5)
        self.hidden_layer2 = tf.keras.layers.Dense(units=32, activation='tanh')
        self.dropout2 = tf.keras.layers.Dropout(rate=0.5)
        self.hidden_layer3 = tf.keras.layers.Dense(units=16, activation='tanh')
        self.dropout3 = tf.keras.layers.Dropout(rate=0.1)
        self.hidden_layer4 = tf.keras.layers.Dense(units=2, activation='tanh')

        self.output_layer = tf.keras.layers.Dense(units=1, activation='sigmoid')

    def __repr__(self):
        return "DeepFM Model: # Field: {}, # Feature: {}, Embedding: {}".format(self.grouped_field, self.num_feature, self.embedding_size)
    
    def call(self, inputs):
        # 1) FM Component: (num_batch, 2)
        y_fm, vector_inputs = self.fm_layer(inputs)
        # print(f'Sigmoid Function shape: {y_fm.shape}') #Sigmoid Function shape: (256, 2)
        # print(f'input vector shape: {vector_inputs.shape}') #input vector shape: (256, 101, 10)

        # retrieve Dense Vectors: (num_batch, num_feature*embedding_size)
        vector_inputs = tf.reshape(vector_inputs, [-1, self.num_feature*self.embedding_size])

        # 2) Deep Component
        y_deep = self.hidden_layer1(vector_inputs)
        y_deep = self.dropout1(y_deep)
        y_deep = self.hidden_layer2(y_deep)
        y_deep = self.dropout2(y_deep)
        y_deep = self.hidden_layer3(y_deep)
        y_deep = self.dropout3(y_deep)
        y_deep = self.hidden_layer4(y_deep)

        # Concatenation
        y_pred = tf.concat([y_fm, y_deep], 1)
        y_pred = self.output_layer(y_pred)
        y_pred = tf.reshape(y_pred, [-1, ])
        
        return y_pred

In [6]:
# 활성화 함수 = 탄젠트일 때 결과
if __name__ == '__main__':
  training(10) 

---- Data info ----
Data Frame shape: (842463, 101)
# of Feature: 101
# of Field: 27
X features' name (10): ['user_gender/F', 'user_gender/M', 'imp_hour/0', 'imp_hour/1', 'imp_hour/2', 'imp_hour/3', 'imp_hour/4', 'imp_hour/5', 'imp_hour/6', 'imp_hour/7']
y feature's name: click_label

# of train_data's rows: 1144850 
# of test_data's rows: 490650
train:test ratio = 0.7:0.3
Current Batch Size: 512
train_ds: <BatchDataset shapes: ((None, 101), (None,)), types: (tf.float32, tf.float32)>
test_ds: <BatchDataset shapes: ((None, 101), (None,)), types: (tf.float32, tf.float32)>
Start Training: Batch Size: 512, Embedding Size: 10
Epoch 1: 누적 Loss: 0.4975, Acc: 0.7251, AUC: 0.8135
Epoch 2: 누적 Loss: 0.4666, Acc: 0.7463, AUC: 0.8383
Epoch 3: 누적 Loss: 0.4576, Acc: 0.7540, AUC: 0.8459
Epoch 4: 누적 Loss: 0.4523, Acc: 0.7591, AUC: 0.8505
Epoch 5: 누적 Loss: 0.4490, Acc: 0.7622, AUC: 0.8535
Epoch 6: 누적 Loss: 0.4473, Acc: 0.7641, AUC: 0.8551
Epoch 7: 누적 Loss: 0.4452, Acc: 0.7660, AUC: 0.8569
Epoch 8: 누적 Lo

In [11]:
if __name__ == '__main__':
  training(10) 

---- Data info ----
Data Frame shape: (842463, 101)
# of Feature: 101
# of Field: 27
X features' name (10): ['user_gender/F', 'user_gender/M', 'imp_hour/0', 'imp_hour/1', 'imp_hour/2', 'imp_hour/3', 'imp_hour/4', 'imp_hour/5', 'imp_hour/6', 'imp_hour/7']
y feature's name: click_label

# of train_data's rows: 1144850 
# of test_data's rows: 490650
train:test ratio = 0.7:0.3
Current Batch Size: 512
train_ds: <BatchDataset shapes: ((None, 101), (None,)), types: (tf.float32, tf.float32)>
test_ds: <BatchDataset shapes: ((None, 101), (None,)), types: (tf.float32, tf.float32)>
Start Training: Batch Size: 512, Embedding Size: 50
Epoch 1: 누적 Loss: 0.5063, Acc: 0.7210, AUC: 0.8078
Epoch 2: 누적 Loss: 0.4831, Acc: 0.7409, AUC: 0.8302
Epoch 3: 누적 Loss: 0.4685, Acc: 0.7484, AUC: 0.8388
Epoch 4: 누적 Loss: 0.4668, Acc: 0.7502, AUC: 0.8401
Epoch 5: 누적 Loss: 0.4679, Acc: 0.7490, AUC: 0.8393
Epoch 6: 누적 Loss: 0.4685, Acc: 0.7488, AUC: 0.8392
Epoch 7: 누적 Loss: 0.4700, Acc: 0.7482, AUC: 0.8382
Epoch 8: 누적 Lo

In [16]:
if __name__ == '__main__':
  training(10) 

---- Data info ----
Data Frame shape: (842463, 101)
# of Feature: 101
# of Field: 27
X features' name (10): ['user_gender/F', 'user_gender/M', 'imp_hour/0', 'imp_hour/1', 'imp_hour/2', 'imp_hour/3', 'imp_hour/4', 'imp_hour/5', 'imp_hour/6', 'imp_hour/7']
y feature's name: click_label

# of train_data's rows: 1144850 
# of test_data's rows: 490650
train:test ratio = 0.7:0.3
Current Batch Size: 1024
train_ds: <BatchDataset shapes: ((None, 101), (None,)), types: (tf.float32, tf.float32)>
test_ds: <BatchDataset shapes: ((None, 101), (None,)), types: (tf.float32, tf.float32)>
Start Training: Batch Size: 1024, Embedding Size: 10
Epoch 1: 누적 Loss: 0.5091, Acc: 0.7173, AUC: 0.8038
Epoch 2: 누적 Loss: 0.4716, Acc: 0.7444, AUC: 0.8357
Epoch 3: 누적 Loss: 0.4560, Acc: 0.7570, AUC: 0.8485
Epoch 4: 누적 Loss: 0.4472, Acc: 0.7641, AUC: 0.8553
Epoch 5: 누적 Loss: 0.4414, Acc: 0.7688, AUC: 0.8597
Epoch 6: 누적 Loss: 0.4374, Acc: 0.7722, AUC: 0.8629
Epoch 7: 누적 Loss: 0.4352, Acc: 0.7741, AUC: 0.8644
Epoch 8: 누적 