In [1]:
import config
from preprocess import get_modified_data
from DeepFM import DeepFM

import numpy as np
import pandas as pd
from time import perf_counter
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.metrics import BinaryAccuracy, AUC

In [2]:
def get_data():
    file = pd.read_csv('../../data/adult.data', header=None)
    X = file.loc[:, 0:13]
    Y = file.loc[:, 14].map({' <=50K': 0, ' >50K': 1})
    
    X.columns = config.ALL_FIELDS
    field_dict, field_index, X_modified = get_modified_data(X,
                                                           config.ALL_FIELDS,
                                                           config.CONT_FIELDS,
                                                           config.CAT_FIELDS,
                                                           False)
    
    X_train, X_test, Y_train, Y_test = train_test_split(X_modified, Y, test_size=0.2)
    
    train_ds = tf.data.Dataset.from_tensor_slices(
        (tf.cast(X_train.values, tf.float32),
         tf.cast(Y_train, tf.float32))).shuffle(30000).batch(config.BATCH_SIZE)
    
    test_ds = tf.data.Dataset.from_tensor_slices(
        (tf.cast(X_test.values, tf.float32), 
         tf.cast(Y_test, tf.float32))).shuffle(10000).batch(config.BATCH_SIZE)
    
    return train_ds, test_ds, field_dict, field_index

In [3]:
def train_on_batch(model, optimizer, acc, auc, inputs, targets):
    with tf.GradientTape() as tape:
        y_pred = model(inputs)
        loss = tf.keras.losses.binary_crossentropy(from_logits=False, y_true=targets, y_pred=y_pred)
        
        grads = tape.gradient(target=loss, sources=model.trainable_variables)
        
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        
        acc.update_state(targets, y_pred)
        auc.update_state(targets, y_pred)
        
        return loss

In [4]:
def train(epochs):
    train_ds, test_ds, field_dict, field_index = get_data()
    
    model = DeepFM(embedding_size=config.EMBEDDING_SIZE, num_feature=len(field_index),
                   num_field=len(field_dict), field_index=field_index)
    
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
    
    print("Start Training: Batch Size: {}, Embedding Size: {}".format(config.BATCH_SIZE, config.EMBEDDING_SIZE))
    
    start = perf_counter()
    for i in range(epochs):
        acc = BinaryAccuracy(threshold=0.5)
        auc = AUC()
        loss_history = []

        for x, y in train_ds:
            loss = train_on_batch(model, optimizer, acc, auc, x, y)
            loss_history.append(loss)

        print("Epoch {:03d}: 누적 Loss: {:.4f}, Acc: {:.4f}, AUC: {:.4f}".format(
            i, np.mean(loss_history), acc.result().numpy(), auc.result().numpy()))

    test_acc = BinaryAccuracy(threshold=0.5)
    test_auc = AUC()
    for x, y in test_ds:
        y_pred = model(x)
        test_acc.update_state(y, y_pred)
        test_auc.update_state(y, y_pred)
        
    print("테스트 ACC: {:.4f}, AUC: {:.4f}".format(test_acc.result().numpy(), test_auc.result().numpy()))
    print("Batch Size: {}, Embedding Size: {}".format(config.BATCH_SIZE, config.EMBEDDING_SIZE))
    print("걸린 시간: {:.3f}".format(perf_counter() - start))
    model.save_weights('./weights/weights-epoch({})-batch({})-embedding({}).h5'.format(
        epochs, config.BATCH_SIZE, config.EMBEDDING_SIZE))


if __name__ == '__main__':
    train(epochs=100)


Data Prepared...
X shape: (32561, 108)
# of Feature: 108
# of Field: 14
Start Training: Batch Size: 256, Embedding Size: 10
Epoch 000: 누적 Loss: 0.7733, Acc: 0.5849, AUC: 0.5029
Epoch 001: 누적 Loss: 0.5728, Acc: 0.7594, AUC: 0.4190
Epoch 002: 누적 Loss: 0.5680, Acc: 0.7594, AUC: 0.4249
Epoch 003: 누적 Loss: 0.5645, Acc: 0.7595, AUC: 0.4310
Epoch 004: 누적 Loss: 0.5615, Acc: 0.7594, AUC: 0.4389
Epoch 005: 누적 Loss: 0.5592, Acc: 0.7593, AUC: 0.4473
Epoch 006: 누적 Loss: 0.5571, Acc: 0.7593, AUC: 0.4582
Epoch 007: 누적 Loss: 0.5549, Acc: 0.7594, AUC: 0.4766
Epoch 008: 누적 Loss: 0.5530, Acc: 0.7594, AUC: 0.5051
Epoch 009: 누적 Loss: 0.5513, Acc: 0.7593, AUC: 0.5494
Epoch 010: 누적 Loss: 0.5492, Acc: 0.7595, AUC: 0.6402
Epoch 011: 누적 Loss: 0.5468, Acc: 0.7595, AUC: 0.7277
Epoch 012: 누적 Loss: 0.5440, Acc: 0.7594, AUC: 0.8126
Epoch 013: 누적 Loss: 0.5399, Acc: 0.7594, AUC: 0.8426
Epoch 014: 누적 Loss: 0.5343, Acc: 0.7594, AUC: 0.8389
Epoch 015: 누적 Loss: 0.5270, Acc: 0.7594, AUC: 0.8489
Epoch 016: 누적 Loss: 0.5175, 

OSError: Unable to create file (unable to open file: name = 'weights/weights-epoch(100)-batch(256)-embedding(10).h5', errno = 2, error message = 'No such file or directory', flags = 13, o_flags = 242)

In [6]:
file = pd.read_csv('../../data/adult.data', header=None)

In [9]:
B = pd.DataFrame()

In [12]:
B.columns = config.ALL_FIELDS

ValueError: Length mismatch: Expected axis has 0 elements, new values have 14 elements

In [7]:
X = file.loc[:, 0:13]
Y = file.loc[:, 14].map({' <=50K': 0, ' >50K': 1})
X.columns = config.ALL_FIELDS

In [8]:
X

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States


In [4]:
X

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States


In [5]:
field_dict, field_index, X_modified = get_modified_data(X,
                                                       config.ALL_FIELDS,
                                                       config.CONT_FIELDS,
                                                       config.CAT_FIELDS,
                                                       False)

Data Prepared...
X shape: (32561, 108)
# of Feature: 108
# of Field: 14


In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X_modified, Y, test_size=0.2)
train_ds = tf.data.Dataset.from_tensor_slices(
    (tf.cast(X_train.values, tf.float32),
     tf.cast(Y_train, tf.float32))).shuffle(30000).batch(config.BATCH_SIZE)

test_ds = tf.data.Dataset.from_tensor_slices(
    (tf.cast(X_test.values, tf.float32), 
     tf.cast(Y_test, tf.float32))).shuffle(10000).batch(config.BATCH_SIZE)

In [7]:
X_modified

Unnamed: 0,age,workclass- ?,workclass- Federal-gov,workclass- Local-gov,workclass- Never-worked,workclass- Private,workclass- Self-emp-inc,workclass- Self-emp-not-inc,workclass- State-gov,workclass- Without-pay,...,country- Portugal,country- Puerto-Rico,country- Scotland,country- South,country- Taiwan,country- Thailand,country- Trinadad&Tobago,country- United-States,country- Vietnam,country- Yugoslavia
0,0.301370,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,0.452055,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0.287671,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0.493151,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0.150685,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.136986,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32557,0.315068,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32558,0.561644,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32559,0.068493,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
from sklearn.datasets import load_breast_cancer

In [8]:
cancer = load_breast_cancer()

In [4]:
file

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
