In [1]:
import numpy as np
import pandas as pd
import keras

from keras.models import Sequential, Model
from keras.layers import Input, Dense, Concatenate

from sklearn.preprocessing import MinMaxScaler

Using TensorFlow backend.


In [2]:
# all columns
COLUMNS = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "gender",
    "capital_gain", "capital_loss", "hours_per_week",
    "native_country", "income_bracket"
]

# label column
LABEL_COLUMN = "label"

# categorical columns
CATEGORICAL_COLUMNS = [
    "workclass", "education", "marital_status", "occupation",
    "relationship", "race", "gender", "native_country"
]

# continuous columns
CONTINUOUS_COLUMNS = [
    "age", "education_num", "capital_gain", "capital_loss", "hours_per_week"
]


In [3]:
# load file
def load(filename):
    with open(filename, 'r') as f:
        skiprows = 1 if 'test' in filename else 0
        df = pd.read_csv(f, names=COLUMNS,
                        skipinitialspace=True,
                        skiprows=skiprows, engine='python')
        # drop null value
        df = df.dropna(how='any', axis=0)
    return df

In [4]:
# preprocessing
def preprocess(df):
    
    df[LABEL_COLUMN] = df['income_bracket'].apply(lambda x: ">50K" in x).astype(int)
    df.pop("income_bracket")
    
    y = df[LABEL_COLUMN].values
    df.pop(LABEL_COLUMN)
    
    df = pd.get_dummies(df, columns=[x for x in CATEGORICAL_COLUMNS])
    
    # TODO: select the samples
    
    # TODO: make cross features
#     from sklearn.preprocessing import PolynomialFeatures
#     X = PolynmialFeatures(degree=2, interaction_only=True, include_bias=False).fit_transform(X)
    
    df = pd.DataFrame(MinMaxScaler().fit_transform(df.astype('float64')), columns=df.columns)

    X = df.values
    
    return X, y

In [5]:
# main function
    
df_train = load('../../data/adult.data')
df_test = load('../../data/adult.test')
    
df = pd.concat([df_train, df_test])
print(df.head())

train_len = len(df_train)

X, y = preprocess(df)

X_train = X[:train_len]
y_train = y[:train_len]
X_test = X[train_len:]
y_test = y[train_len:]

print(X_train)  
print(y_train)

   age         workclass  fnlwgt  education  education_num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital_status         occupation   relationship   race  gender  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital_gain  capital_loss  hours_per_week native_country income_bracket  
0          2174             0              40  United-States          <=50K  
1             0           

In [6]:
# wide model
wide = Sequential()
wide.add(Dense(1, input_dim=X_train.shape[1]))
print(X_train.shape)
print(wide.summary())

wide_input = Input(shape=X_train.shape)
wide_output = Dense(1)(wide_input)
wide_model = Model(inputs=wide_input, outputs=wide_output)
print(wide_model.summary())

(32561, 108)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1)                 109       
Total params: 109
Trainable params: 109
Non-trainable params: 0
_________________________________________________________________
None
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 32561, 108)        0         
_________________________________________________________________
dense_2 (Dense)              (None, 32561, 1)          109       
Total params: 109
Trainable params: 109
Non-trainable params: 0
_________________________________________________________________
None


In [7]:
# deep part
deep = Sequential()

# input layer
deep.add(Dense(1, input_dim=X_train.shape[1], activation='relu'))
deep.add(Dense(100, activation='relu'))
# deep.add(Dense(input_dim=100, output_dim=32, activation='relu'))
deep.add(Dense(32, activation='relu'))
# deep.add(Dense(input_dim=32, output_dim=8))
deep.add(Dense(1, activation='sigmoid'))

print(deep.summary())

deep_input = Input(shape=X_train.shape)
deep_output = Dense(1)(deep_input)
deep_output = Dense(100, activation='relu')(deep_output)
deep_output = Dense(32, activation='relu')(deep_output)
deep_output = Dense(1, activation='sigmoid')(deep_output)

deep_model = Model(inputs=deep_input, outputs=deep_output)

print(deep_model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 1)                 109       
_________________________________________________________________
dense_4 (Dense)              (None, 100)               200       
_________________________________________________________________
dense_5 (Dense)              (None, 32)                3232      
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 33        
Total params: 3,574
Trainable params: 3,574
Non-trainable params: 0
_________________________________________________________________
None
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 32561, 108)        0         
_________________________________________________________________
den

In [None]:
# combine wide and deep 
merged_input = Input(shape=wide_model.input.shape)
merged_output = Concatenate()([wide_model.output, deep_model.output])
merged_output = Dense(1, activation='sigmoid')(merged_output)

model = Model(inputs=merged_input, outputs=merged_output)
# model.summary()

In [None]:
# model compile
model.compile(optimizer='rmsprop',
             loss='binary_crossentropy',
             metrics=['accuracy'])

# model training
history = model.fit([X_train, X_train], y_train, epochs=10, batch_size=32)

# loss and accuracy
loss, accuracy = model.evaluate([X_test, X_test], y_test)
print('\nTest accuracy:', accuracy)    