In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import mean_squared_error

from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names
from deepctr.models import DeepFM

In [2]:
df = pd.read_csv('./data/movielens.csv')

In [3]:
df

Unnamed: 0,userId,title,genres,tag,rating,target
0,1,Toy Story (1995),Adventure,toys,3.92,0
1,2,Toy Story (1995),Adventure,toys,3.92,0
2,3,Toy Story (1995),Adventure,toys,3.92,1
3,4,Toy Story (1995),Adventure,toys,3.92,0
4,5,Toy Story (1995),Adventure,toys,3.92,0
...,...,...,...,...,...,...
99995,996,"Lord of the Rings: The Return of the King, The...",Action,trilogy,4.14,0
99996,997,"Lord of the Rings: The Return of the King, The...",Action,trilogy,4.14,1
99997,998,"Lord of the Rings: The Return of the King, The...",Action,trilogy,4.14,0
99998,999,"Lord of the Rings: The Return of the King, The...",Action,trilogy,4.14,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   userId  100000 non-null  int64  
 1   title   100000 non-null  object 
 2   genres  100000 non-null  object 
 3   tag     100000 non-null  object 
 4   rating  100000 non-null  float64
 5   target  100000 non-null  int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 4.6+ MB


In [5]:
sparse_features = ['userId', 'title', 'genres', 'tag']
dense_features = ['rating']

In [6]:
data = df.copy()

In [7]:
encoders = []

for i in range(len(sparse_features)):
    encoders.append(i)
    encoders[i] = LabelEncoder()
    data[sparse_features[i]] = encoders[i].fit_transform(data[sparse_features[i]])

In [8]:
encoders

[LabelEncoder(), LabelEncoder(), LabelEncoder(), LabelEncoder()]

In [9]:
data

Unnamed: 0,userId,title,genres,tag,rating,target
0,0,92,1,65,3.92,0
1,1,92,1,65,3.92,0
2,2,92,1,65,3.92,1
3,3,92,1,65,3.92,0
4,4,92,1,65,3.92,0
...,...,...,...,...,...,...
99995,995,50,0,67,4.14,0
99996,996,50,0,67,4.14,1
99997,997,50,0,67,4.14,0
99998,998,50,0,67,4.14,0


In [10]:
(np.array(data['rating'])).reshape(-1, 1)

array([[3.92],
       [3.92],
       [3.92],
       ...,
       [4.14],
       [4.14],
       [4.14]])

In [11]:
mms = MinMaxScaler()
data['rating'] = mms.fit_transform((np.array(data['rating'])).reshape(-1, 1))

In [12]:
data

Unnamed: 0,userId,title,genres,tag,rating,target
0,0,92,1,65,0.66875,0
1,1,92,1,65,0.66875,0
2,2,92,1,65,0.66875,1
3,3,92,1,65,0.66875,0
4,4,92,1,65,0.66875,0
...,...,...,...,...,...,...
99995,995,50,0,67,0.80625,0
99996,996,50,0,67,0.80625,1
99997,997,50,0,67,0.80625,0
99998,998,50,0,67,0.80625,0


In [13]:
sparse_feat = [SparseFeat(feat, data[feat].max() + 1,embedding_dim=4)
                            for feat in sparse_features]
dense_feat = [DenseFeat(feat,1,) for feat in dense_features]

fixlen_feature_columns = sparse_feat + dense_feat

In [14]:
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [15]:
linear_feature_columns

[SparseFeat(name='userId', vocabulary_size=1000, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x0000016CE02A6888>, embedding_name='userId', group_name='default_group', trainable=True),
 SparseFeat(name='title', vocabulary_size=100, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x0000016CE0597AC8>, embedding_name='title', group_name='default_group', trainable=True),
 SparseFeat(name='genres', vocabulary_size=10, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x0000016CE05859C8>, embedding_name='genres', group_name='default_group', trainable=True),
 SparseFeat(name='tag', vocabulary_size=71, embedding_dim=4, use_hash=False

In [16]:
feature_names

['userId', 'title', 'genres', 'tag', 'rating']

In [17]:
train, test = train_test_split(data, test_size=0.2, random_state=777)
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}

In [18]:
train_model_input

{'userId': array([925, 987, 647, ..., 134, 467, 591], dtype=int64),
 'title': array([ 5, 72,  5, ..., 43, 71, 67]),
 'genres': array([4, 6, 4, ..., 0, 0, 0]),
 'tag': array([39, 34, 39, ...,  3,  7,  2]),
 'rating': array([0.81875, 0.9125 , 0.81875, ..., 0.325  , 0.75625, 0.85625])}

In [19]:
input_col = ['userId','title','genres','tag', 'rating']
target_col = ['target']

In [20]:
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile("adam", "mse", metrics=['mse'], )

In [21]:
history = model.fit(train_model_input, train[target_col].values,
                    batch_size=256, epochs=10, verbose=2, validation_split=0.2, )

Epoch 1/10
250/250 - 1s - loss: 0.1845 - mse: 0.1844 - val_loss: 0.1416 - val_mse: 0.1416
Epoch 2/10
250/250 - 0s - loss: 0.1425 - mse: 0.1424 - val_loss: 0.1409 - val_mse: 0.1408
Epoch 3/10
250/250 - 0s - loss: 0.1408 - mse: 0.1407 - val_loss: 0.1395 - val_mse: 0.1394
Epoch 4/10
250/250 - 0s - loss: 0.1379 - mse: 0.1378 - val_loss: 0.1334 - val_mse: 0.1333
Epoch 5/10
250/250 - 0s - loss: 0.1273 - mse: 0.1272 - val_loss: 0.1232 - val_mse: 0.1231
Epoch 6/10
250/250 - 0s - loss: 0.1210 - mse: 0.1209 - val_loss: 0.1216 - val_mse: 0.1214
Epoch 7/10
250/250 - 0s - loss: 0.1186 - mse: 0.1184 - val_loss: 0.1204 - val_mse: 0.1202
Epoch 8/10
250/250 - 0s - loss: 0.1167 - mse: 0.1165 - val_loss: 0.1223 - val_mse: 0.1221
Epoch 9/10
250/250 - 0s - loss: 0.1154 - mse: 0.1151 - val_loss: 0.1197 - val_mse: 0.1194
Epoch 10/10
250/250 - 0s - loss: 0.1138 - mse: 0.1136 - val_loss: 0.1198 - val_mse: 0.1195


In [22]:
pred_ans = model.predict(test_model_input, batch_size=256)
print("test MSE", round(mean_squared_error(test[target_col].values, pred_ans), 4))

test MSE 0.1211


In [23]:
test

Unnamed: 0,userId,title,genres,tag,rating,target
96275,275,49,1,27,0.80625,0
22712,712,81,1,18,0.30000,0
31942,942,17,0,25,0.13125,0
87283,283,78,6,24,0.75000,1
13342,342,97,0,22,0.00000,0
...,...,...,...,...,...,...
99354,354,50,0,67,0.80625,0
81073,73,71,0,7,0.75625,0
64055,55,4,0,3,0.72500,1
84730,730,82,0,55,0.14375,0


In [24]:
test[target_col].values[:10]

array([[0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0]], dtype=int64)

In [25]:
pred_ans[:10]

array([[0.05457076],
       [0.02762723],
       [0.1764965 ],
       [0.6135593 ],
       [0.10704888],
       [0.08178648],
       [0.9934853 ],
       [0.44048956],
       [0.7603214 ],
       [0.08697441]], dtype=float32)

In [26]:
test['pred'] = pred_ans

In [27]:
test

Unnamed: 0,userId,title,genres,tag,rating,target,pred
96275,275,49,1,27,0.80625,0,0.054571
22712,712,81,1,18,0.30000,0,0.027627
31942,942,17,0,25,0.13125,0,0.176497
87283,283,78,6,24,0.75000,1,0.613559
13342,342,97,0,22,0.00000,0,0.107049
...,...,...,...,...,...,...,...
99354,354,50,0,67,0.80625,0,0.013650
81073,73,71,0,7,0.75625,0,0.196653
64055,55,4,0,3,0.72500,1,0.325892
84730,730,82,0,55,0.14375,0,0.084627


In [28]:
for i in range(len(sparse_features)):
    test[sparse_features[i]] = encoders[i].inverse_transform(test[sparse_features[i]])

In [29]:
test

Unnamed: 0,userId,title,genres,tag,rating,target,pred
96275,276,"Lord of the Rings: The Fellowship of the Ring,...",Adventure,high fantasy,0.80625,0,0.054571
22712,713,Star Trek: Generations (1994),Adventure,franchise,0.30000,0,0.027627
31942,943,Cliffhanger (1993),Action,good action,0.13125,0,0.176497
87283,284,"Sixth Sense, The (1999)",Drama,ghosts/afterlife,0.75000,1,0.613559
13342,343,Waterworld (1995),Action,futuristic,0.00000,0,0.107049
...,...,...,...,...,...,...,...
99354,355,"Lord of the Rings: The Return of the King, The...",Action,trilogy,0.80625,0,0.013650
81073,74,Saving Private Ryan (1998),Action,best war films,0.75625,0,0.196653
64055,56,Aliens (1986),Action,alien,0.72500,1,0.325892
84730,731,Star Wars: Episode I - The Phantom Menace (1999),Action,space opera,0.14375,0,0.084627


In [30]:
test.sort_values(by = ['userId', 'title']).reset_index(drop = True)

Unnamed: 0,userId,title,genres,tag,rating,target,pred
0,1,Alien (1979),Horror,alien,0.74375,1,0.498975
1,1,American Beauty (1999),Comedy,midlife crisis,0.81875,0,0.855571
2,1,Babe (1995),Children,talking animals,0.49375,0,0.084797
3,1,Back to the Future (1985),Adventure,future,0.66875,0,0.533933
4,1,"Clockwork Orange, A (1971)",Crime,masterpiece,0.71875,0,0.427973
...,...,...,...,...,...,...,...
19995,1000,Saving Private Ryan (1998),Action,best war films,0.75625,0,-0.053661
19996,1000,Schindler's List (1993),Drama,jews,0.91250,0,0.072953
19997,1000,Sleepless in Seattle (1993),Comedy,romantic comedy,0.41875,0,-0.024456
19998,1000,Star Trek: Generations (1994),Adventure,franchise,0.30000,0,0.014046
