In [1]:
# 用Wide&Deep模型，对Census数据做2分类预测。
# 没有使用criteo数据集，主要是因为该数据集的类别特征C1, ..., C26经过onehot后有1万多特征，
# 而我们又不知道特征代表的具体含义，在wide部分特征交叉时会遇到困难。

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, PolynomialFeatures
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten, Activation, concatenate, BatchNormalization
from tensorflow.keras.models import Model

In [3]:
# Read the Census train and test splits from their CSV files.
train_data, test_data = map(pd.read_csv, ("../Census/train.csv", "../Census/test.csv"))

In [4]:
# Preview the first rows of the training split.
train_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# Stack the train and test rows into one frame so that both splits are
# encoded against the same set of categorical levels.
# ignore_index=True gives the combined frame a fresh 0..N-1 index; both files
# are read with the default 0-based index, so without it every test row would
# reuse the label of a training row. Later cells split the frame by position.
all_data = pd.concat([train_data, test_data], ignore_index=True)

In [6]:
# Turn "income_bracket" into a 0/1 target: 1 when the text contains '>50K'.
# A plain substring match is used (regex=False), so labels carrying extra
# characters around '>50K' are still recognised.
is_high_income = all_data['income_bracket'].str.contains('>50K', regex=False)
all_data['label'] = is_high_income.astype('int64')

# The raw text column is dropped once the numeric label exists.
all_data = all_data.drop(columns='income_bracket')

In [7]:
# Names of the categorical feature columns (one-hot encoded for the wide part,
# label-encoded and embedded for the deep part).
categorical_columns = [
    "workclass",
    "education",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "native_country",
]

# Names of the continuous feature columns (scaled and fed to the deep part).
continuous_columns = [
    "age",
    "education_num",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
]

# 构造Wide部分

In [8]:
# One-hot encode every categorical column in a single call.
# get_dummies returns a new frame (all_data itself is left untouched): the
# encoded source columns are removed and their indicator columns, named
# "<column>.<level>", are appended after the remaining columns in the order
# given by categorical_columns. This replaces a per-column loop that dropped
# the column in place and re-concatenated the whole frame on every iteration.
wide_data = pd.get_dummies(all_data, columns=categorical_columns, prefix_sep='.')

In [9]:
# Preview the frame after one-hot encoding the categorical columns.
wide_data.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,label,workclass. ?,workclass. Federal-gov,workclass. Local-gov,...,native_country. Portugal,native_country. Puerto-Rico,native_country. Scotland,native_country. South,native_country. Taiwan,native_country. Thailand,native_country. Trinadad&Tobago,native_country. United-States,native_country. Vietnam,native_country. Yugoslavia
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Keep only the one-hot indicator columns.
# They are selected by their "<column>." name prefix rather than by a fixed
# column position, so the selection stays correct if a non-categorical column
# is added to or removed from the frame.
onehot_prefixes = tuple(col + "." for col in categorical_columns)
onehot_columns = [name for name in wide_data.columns if name.startswith(onehot_prefixes)]
cate_features = wide_data[onehot_columns]

In [11]:
# Split the one-hot features back into train and test rows. The combined frame
# was built as train followed by test, so the first train_size rows are train.
train_size = train_data.shape[0]

train_cate_features, test_cate_features = (
    cate_features.iloc[:train_size],
    cate_features.iloc[train_size:],
)

In [12]:
# Build simple second-order crosses of the one-hot categorical features.
# interaction_only=True keeps products of distinct features only.
poly = PolynomialFeatures(degree=2, interaction_only=True)
poly.fit(train_cate_features)

# The transformer is fitted on the training rows and applied to both splits.
train_cate_poly = poly.transform(train_cate_features)
test_cate_poly = poly.transform(test_cate_features)

In [13]:
# (rows, one-hot columns) of the training one-hot features.
train_cate_features.shape

(32561, 102)

In [14]:
train_cate_poly.shape     # 5254 = 1 (constant term) + 102 (one-hot features) + 102 * 101 / 2 (pairwise products)

(32561, 5254)

In [15]:
# Input of the wide part: one crossed-feature vector per sample, passed on
# unchanged to the final logistic (sigmoid) layer.
wide_feature_count = train_cate_poly.shape[1]
wide_input = Input(shape=(wide_feature_count,))

In [16]:
# Remove the wide-part intermediates from the namespace; the one-hot train/test
# slices and the crossed arrays built from them are kept.
del wide_data, cate_features

# 构造Deep部分

In [17]:
# Replace each categorical column by integer codes. The encoder is fitted on
# the combined train+test column, so both splits share one code table.
for col in categorical_columns:
    le = LabelEncoder().fit(all_data[col])
    all_data[col] = le.transform(all_data[col])

In [18]:
# Separate the continuous and the label-encoded categorical columns, then cut
# each into its train rows (first train_size rows) and test rows (the rest).
conti_features = all_data[continuous_columns]
cate_features = all_data[categorical_columns]

train_rows = slice(None, train_size)
test_rows = slice(train_size, None)

train_conti_features, test_conti_features = conti_features.iloc[train_rows], conti_features.iloc[test_rows]
train_cate_features, test_cate_features = cate_features.iloc[train_rows], cate_features.iloc[test_rows]

In [19]:
# Take the label column out of the combined frame (pop removes it from
# all_data) and split it positionally into train and test targets.
y = all_data.pop('label')

y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

In [20]:
# Min-max scale the continuous features. The scaler is fitted on the training
# rows only and then applied to both splits.
scaler = MinMaxScaler()
scaler.fit(train_conti_features)

train_conti_features = scaler.transform(train_conti_features)
test_conti_features = scaler.transform(test_conti_features)

In [21]:
# Create one Input layer and one Embedding layer per categorical feature.
embed_dim = 8         # every feature is embedded into the same width; tunable

cate_inputs = []
cate_embeds = []

for col in categorical_columns:
    # The column holds label-encoded integer codes, so the number of distinct
    # values is the vocabulary size of its embedding table.
    dim = all_data[col].nunique()
    input_i = Input(shape=(1,), dtype='int32')
    # No input_length argument: it is deprecated in current Keras, and the
    # sequence length is already fixed to 1 by the Input shape above.
    embed_i = Embedding(dim, embed_dim)(input_i)
    # (batch, 1, embed_dim) -> (batch, embed_dim)
    flatten_i = Flatten()(embed_i)
    cate_inputs.append(input_i)
    cate_embeds.append(flatten_i)

In [22]:
# All continuous features enter together through a single bias-free dense layer.
n_continuous = len(continuous_columns)
conti_input = Input(shape=(n_continuous,))

conti_projection = Dense(256, use_bias=False)
conti_dense = conti_projection(conti_input)

In [23]:
# Join the dense projection of the continuous features with every flattened
# embedding, apply ReLU to the joined vector, then batch-normalise it.
merged = concatenate([conti_dense, *cate_embeds])
concat_embeds = Activation('relu')(merged)
bn_concat = BatchNormalization()(concat_embeds)

In [24]:
# Stack three fully connected ReLU layers (512 -> 256 -> 128 units), with batch
# normalisation after the first two.
fc1 = Dense(units=512, activation='relu')(bn_concat)
bn1 = BatchNormalization()(fc1)

fc2 = Dense(units=256, activation='relu')(bn1)
bn2 = BatchNormalization()(fc2)

fc3 = Dense(units=128, activation='relu')(bn2)

In [25]:
# The output of the last dense layer is what the deep part contributes to the
# final layer (hence "input" from the final layer's point of view).
deep_input = fc3

# Wide&Deep

In [26]:
# Concatenate what the deep part and the wide part feed into the final layer.
out_layer = concatenate([deep_input, wide_input])

In [27]:
# Model inputs, in order: continuous features, one input per categorical
# feature, and the wide crossed features.
inputs = [conti_input, *cate_inputs, wide_input]

# Single sigmoid unit producing the predicted probability of the positive class.
output = Dense(units=1, activation='sigmoid')(out_layer)

In [28]:
# Build the model from the input list and the sigmoid output defined above.
model = Model(inputs=inputs, outputs=output)

In [29]:
# Binary classification setup: Adam optimiser, binary cross-entropy loss,
# accuracy reported during training.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [30]:
def build_model_inputs(conti_features, cate_features, wide_features):
    """Assemble the list of arrays passed to the model for one data split.

    Args:
        conti_features: array of the scaled continuous features.
        cate_features: DataFrame of label-encoded categorical features; each
            column becomes its own 1-D array in the result.
        wide_features: array of crossed one-hot features for the wide part.

    Returns:
        list: [conti_features, <one array per categorical column>, wide_features].
    """
    cate_values = cate_features.values
    per_column = [cate_values[:, idx] for idx in range(cate_features.shape[1])]
    return [conti_features] + per_column + [wide_features]


input_data = build_model_inputs(train_conti_features, train_cate_features, train_cate_poly)

input_data_test = build_model_inputs(test_conti_features, test_cate_features, test_cate_poly)

In [31]:
# Train for 10 epochs with mini-batches of 128 samples.
# NOTE: the test split is passed as validation_data, so the validation metrics
# reported per epoch are computed on the test set.
model.fit(
    x=input_data,
    y=y_train.values,
    validation_data=(input_data_test, y_test.values),
    batch_size=128,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1b3e3ccc390>