In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import log_loss, roc_auc_score, mean_squared_error
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import get_custom_objects
from deepctr.layers import custom_objects

In [2]:
# Load the preprocessed Instacart table from the project's data directory.
# A forward slash is used as the separator: "\P" inside a normal string
# literal is an invalid escape sequence (it triggers a warning on recent
# Python versions), and a backslash-separated path only resolves on Windows.
# Forward slashes are accepted on Windows as well.
original_df = pd.read_csv("data/Preprocessed_instacart.csv")
original_df.head()

Unnamed: 0,product_id,order_dow,order_hour_of_day,aisle_id,department_id,target
0,315,0,0,20,1,1
1,103,0,0,11,4,1
2,300,0,0,5,4,1
3,881,0,0,30,3,1
4,50,0,0,11,4,1


In [3]:
# Take an independent copy restricted to the five feature columns plus the
# label, so later in-place edits to `df` leave `original_df` untouched.
df = original_df.loc[
    :,
    ['product_id', 'order_dow', 'order_hour_of_day',
     'aisle_id', 'department_id', 'target'],
].copy()
df.head()

Unnamed: 0,product_id,order_dow,order_hour_of_day,aisle_id,department_id,target
0,315,0,0,20,1,1
1,103,0,0,11,4,1
2,300,0,0,5,4,1
3,881,0,0,30,3,1
4,50,0,0,11,4,1


In [4]:
# Summarise the working frame: column dtypes, non-null counts and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 647410 entries, 0 to 647409
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype
---  ------             --------------   -----
 0   product_id         647410 non-null  int64
 1   order_dow          647410 non-null  int64
 2   order_hour_of_day  647410 non-null  int64
 3   aisle_id           647410 non-null  int64
 4   department_id      647410 non-null  int64
 5   target             647410 non-null  int64
dtypes: int64(6)
memory usage: 29.6 MB


In [5]:
# Number of distinct product ids present in the working frame.
df['product_id'].nunique()

2965

## Checking for missing values

In [6]:
# Count of missing values in the product_id column.
df['product_id'].isna().sum()

0

In [7]:
# Count of missing values in the order_dow column.
df['order_dow'].isna().sum()

0

In [8]:
# Count of missing values in the order_hour_of_day column.
df['order_hour_of_day'].isna().sum()

0

In [9]:
# Count of missing values in the aisle_id column.
df['aisle_id'].isna().sum()

0

In [10]:
# Count of missing values in the department_id column.
df['department_id'].isna().sum()

0

In [11]:
# Count of missing values in the target (label) column.
df['target'].isna().sum()

0

## Encoding

In [12]:
# Categorical input columns; each one is label-encoded and embedded later on.
sparse_features = [
    'product_id',
    'order_dow',
    'order_hour_of_day',
    'aisle_id',
    'department_id',
]

# Label column name, kept in a list because later cells index with it
# directly (e.g. train[target]).
target = [
    'target',
]

In [13]:
# Integer-encode every categorical column in place (codes run 0..n_classes-1).
# The fitted encoders are retained in `label_encoders`, keyed by column name,
# so data scored later can be mapped to the same integer codes the model is
# trained on. Without this, each encoder is overwritten on the next iteration
# and the raw-id -> code mapping is lost.
label_encoders = {}
for feat in sparse_features:
    lbe = LabelEncoder()
    df[feat] = lbe.fit_transform(df[feat])
    label_encoders[feat] = lbe

# One sparse (embedding) column per categorical feature. The vocabulary size
# is the number of distinct codes plus one spare slot; embeddings are 4-d.
fixlen_feature_columns = [SparseFeat(feat, df[feat].nunique() + 1, embedding_dim=4)
                        for feat in sparse_features]

# The linear part and the DNN part are fed the same feature columns.
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns

# Names used as keys when building the model input dicts.
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [14]:
# Hold out 20% of the rows as the test split; the fixed random_state keeps
# the partition the same from run to run.
train, test = train_test_split(df, random_state=2020, test_size=0.2)

# Model inputs are dicts mapping each feature name to that column's values.
train_model_input = {
    column: train[column].values
    for column in feature_names
}
test_model_input = {
    column: test[column].values
    for column in feature_names
}

In [15]:
# Display both input dicts (train, test) to inspect the encoded feature arrays.
train_model_input, test_model_input

({'product_id': array([  88,   49, 2917, ...,  265, 2192,   11], dtype=int64),
  'order_dow': array([0, 4, 3, ..., 4, 5, 4], dtype=int64),
  'order_hour_of_day': array([ 7, 17, 16, ..., 17, 17, 13], dtype=int64),
  'aisle_id': array([ 1, 10, 11, ...,  0, 30,  7], dtype=int64),
  'department_id': array([ 3,  3, 18, ..., 15, 19,  6], dtype=int64)},
 {'product_id': array([  80,   16, 1373, ...,  113,  802, 2693], dtype=int64),
  'order_dow': array([2, 6, 0, ..., 3, 4, 5], dtype=int64),
  'order_hour_of_day': array([19,  9, 16, ...,  8, 18, 11], dtype=int64),
  'aisle_id': array([10,  1, 20, ...,  4, 21,  4], dtype=int64),
  'department_id': array([ 3,  3, 13, ...,  3, 15,  3], dtype=int64)})

In [16]:
# Build a DeepFM binary classifier; the linear and DNN parts share the same
# feature columns.
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'])

# Train on the training split; validation_split=0.2 reserves a fifth of it
# for the per-epoch validation metrics.
history = model.fit(train_model_input, train[target].values,
                    batch_size=100, epochs=20, verbose=2, validation_split=0.2)

# Predicted probabilities for the held-out test split. `pred_ans` keeps the
# shape returned by model.predict because later cells index into it.
pred_ans = model.predict(test_model_input, batch_size=256)

# Flattened 1-D views used only for the classification metrics below.
y_test = test[target].values.ravel()
y_prob = pred_ans.ravel()

# MSE is kept for continuity with earlier runs. LogLoss and AUC are added
# because this is a binary task optimised with binary cross-entropy.
print("test MSE", round(mean_squared_error(
    test[target].values, pred_ans), 4))
print("test LogLoss", round(log_loss(y_test, y_prob), 4))
print("test AUC", round(roc_auc_score(y_test, y_prob), 4))

Epoch 1/20
4144/4144 - 17s - loss: 0.4438 - binary_crossentropy: 0.4434 - val_loss: 0.4335 - val_binary_crossentropy: 0.4328
Epoch 2/20
4144/4144 - 14s - loss: 0.4302 - binary_crossentropy: 0.4291 - val_loss: 0.4278 - val_binary_crossentropy: 0.4264
Epoch 3/20
4144/4144 - 14s - loss: 0.4272 - binary_crossentropy: 0.4253 - val_loss: 0.4250 - val_binary_crossentropy: 0.4226
Epoch 4/20
4144/4144 - 15s - loss: 0.4220 - binary_crossentropy: 0.4192 - val_loss: 0.4223 - val_binary_crossentropy: 0.4191
Epoch 5/20
4144/4144 - 15s - loss: 0.4175 - binary_crossentropy: 0.4138 - val_loss: 0.4208 - val_binary_crossentropy: 0.4167
Epoch 6/20
4144/4144 - 16s - loss: 0.4130 - binary_crossentropy: 0.4085 - val_loss: 0.4208 - val_binary_crossentropy: 0.4159
Epoch 7/20
4144/4144 - 15s - loss: 0.4096 - binary_crossentropy: 0.4044 - val_loss: 0.4229 - val_binary_crossentropy: 0.4173
Epoch 8/20
4144/4144 - 15s - loss: 0.4071 - binary_crossentropy: 0.4012 - val_loss: 0.4219 - val_binary_crossentropy: 0.4156


In [17]:
# Save the model
# Writes the trained model to the "savedmodel/" path, then prints a
# confirmation message.
model.save("savedmodel/")

print("Model saved.")



INFO:tensorflow:Assets written to: savedmodel/assets


INFO:tensorflow:Assets written to: savedmodel/assets


Model saved.




In [18]:
# Spot-check one entry (index 132) of the test-set predictions.
pred_ans[132]

array([0.99961567], dtype=float32)

## For making predictions

In [19]:
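# # Template for scoring new data (kept commented out). `new_data` is not
# # defined in this notebook; it must be a DataFrame containing the columns in
# # `feature_names`, encoded the same way as the training data, before this
# # cell is uncommented.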

# # Generate input data for the new data
# new_data_input = {name: new_data[name].values for name in feature_names}

# # Make predictions
# predictions = model.predict(new_data_input)

# # Print the predictions
# print(predictions)