In [1]:
import pandas as pd
import torch
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from deepctr_torch.models import DeepFM
from deepctr_torch.inputs import SparseFeat,get_feature_names

이 노트북에서 다루는 샘플 데이터는 MovieLens 데이터이며, 영화 평점을 수치(numeric)로 예측하는 것이 목적입니다. 따라서 Task는 Regression입니다.

In [2]:
# Read the MovieLens sample ratings file (path is relative to the working directory).
data = pd.read_csv(filepath_or_buffer="./examples/movielens_sample.txt")

In [3]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,3299,235,4,968035345,Ed Wood (1994),Comedy|Drama,F,25,4,19119
1,3630,3256,3,966536874,Patriot Games (1992),Action|Thriller,M,18,4,77005
2,517,105,4,976203603,"Bridges of Madison County, The (1995)",Drama|Romance,F,25,14,55408
3,785,2115,3,975430389,Indiana Jones and the Temple of Doom (1984),Action|Adventure,M,18,19,29307
4,5848,909,5,957782527,"Apartment, The (1960)",Comedy|Drama,M,50,20,20009


In [4]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(200, 10)

### features, target 지정

In [5]:
# Columns fed to the model as categorical (sparse) inputs.
sparse_features = [
    "movie_id",
    "user_id",
    "gender",
    "age",
    "occupation",
    "zip",
]
# Regression target column, kept as a list so that `frame[target]` stays 2-D.
target = ["rating"]

### 1. Label encoding(as.factor)

In [6]:
# Replace every sparse column with integer codes, using a fresh encoder
# per column (learn the classes first, then map the column onto them).
for feat in sparse_features:
    lbe = LabelEncoder().fit(data[feat])
    data[feat] = lbe.transform(data[feat])

In [7]:
# Preview the encoded sparse columns only.
data.loc[:, sparse_features].head(n=5)

Unnamed: 0,movie_id,user_id,gender,age,occupation,zip
0,12,107,0,2,4,35
1,169,123,1,1,4,118
2,6,12,0,2,13,99
3,112,21,1,1,18,55
4,45,187,1,5,19,41


### 2. sparse feature별 고유값(중복 제거) 개수로 feature column 정의

In [8]:
# One SparseFeat per categorical column; the second argument is the number of
# distinct codes in that column.
fixlen_feature_columns = [
    SparseFeat(column, data[column].nunique()) for column in sparse_features
]
fixlen_feature_columns

[SparseFeat(name='movie_id', dimension=187, use_hash=False, dtype='int32', embedding_name='movie_id', embedding=True),
 SparseFeat(name='user_id', dimension=193, use_hash=False, dtype='int32', embedding_name='user_id', embedding=True),
 SparseFeat(name='gender', dimension=2, use_hash=False, dtype='int32', embedding_name='gender', embedding=True),
 SparseFeat(name='age', dimension=7, use_hash=False, dtype='int32', embedding_name='age', embedding=True),
 SparseFeat(name='occupation', dimension=20, use_hash=False, dtype='int32', embedding_name='occupation', embedding=True),
 SparseFeat(name='zip', dimension=188, use_hash=False, dtype='int32', embedding_name='zip', embedding=True)]

In [9]:
# The linear part and the DNN part share the very same column definitions.
linear_feature_columns = dnn_feature_columns = fixlen_feature_columns
# Names of the model inputs derived from both column lists.
feature_names = get_feature_names(
    linear_feature_columns + dnn_feature_columns
)
feature_names

['movie_id', 'user_id', 'gender', 'age', 'occupation', 'zip']

### 3. model 생성을 하기 위한 input data 생성하기

In [10]:
# Hold out 20% of the rows for evaluation. random_state pins the shuffle so
# the same rows land in train/test on every Restart & Run All; without it the
# split (and therefore every number reported later) changes from run to run.
train, test = train_test_split(data, test_size=0.2, random_state=42)
# Sanity check: the two parts together cover every row exactly once.
assert len(train) + len(test) == len(data)

# The model is fed a dict that maps each feature name to its column of codes.
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

### 4. Define Model

In [11]:
# Pick the device: use the first GPU only when requested and actually available.
use_cuda = True
if not (use_cuda and torch.cuda.is_available()):
    device = 'cpu'
else:
    print('cuda ready...')
    device = 'cuda:0'

# DeepFM configured for regression on the shared sparse feature columns.
model = DeepFM(
    linear_feature_columns=linear_feature_columns,
    dnn_feature_columns=dnn_feature_columns,
    task='regression',
    device=device,
)

# Adam optimizer, MSE loss, and MSE reported as the tracked metric.
model.compile(optimizer="adam", loss="mse", metrics=['mse'])

In [12]:
# Display the module tree of the model built in the previous cell.
model

DeepFM(
  (embedding_dict): ModuleDict(
    (age): Embedding(7, 8)
    (gender): Embedding(2, 8)
    (movie_id): Embedding(187, 8)
    (occupation): Embedding(20, 8)
    (user_id): Embedding(193, 8)
    (zip): Embedding(188, 8)
  )
  (linear_model): Linear(
    (embedding_dict): ModuleDict(
      (age): Embedding(7, 1)
      (gender): Embedding(2, 1)
      (movie_id): Embedding(187, 1)
      (occupation): Embedding(20, 1)
      (user_id): Embedding(193, 1)
      (zip): Embedding(188, 1)
    )
  )
  (out): PredictionLayer()
  (fm): FM()
  (dnn): DNN(
    (dropout): Dropout(p=0)
    (linears): ModuleList(
      (0): Linear(in_features=48, out_features=256, bias=True)
      (1): Linear(in_features=256, out_features=128, bias=True)
    )
  )
  (dnn_linear): Linear(in_features=128, out_features=1, bias=False)
)

### 5. Train Model

In [13]:
%%time
history = model.fit(train_model_input, 
          train[target].values,
          batch_size=256, 
          epochs=10, verbose=2, 
          validation_split=0.2, 
          )

cpu
Train on 128 samples, validate on 32 samples, 1 steps per epoch
Epoch 1/10
0s - loss:  14.1646 - mse:  14.1646 - val_mse:  13.6581
Epoch 2/10
0s - loss:  13.9130 - mse:  13.9130 - val_mse:  13.4481
Epoch 3/10
0s - loss:  13.6743 - mse:  13.6743 - val_mse:  13.2491
Epoch 4/10
0s - loss:  13.4467 - mse:  13.4467 - val_mse:  13.0580
Epoch 5/10
0s - loss:  13.2289 - mse:  13.2289 - val_mse:  12.8720
Epoch 6/10
0s - loss:  13.0165 - mse:  13.0165 - val_mse:  12.6911
Epoch 7/10
0s - loss:  12.8080 - mse:  12.8080 - val_mse:  12.5129
Epoch 8/10
0s - loss:  12.6000 - mse:  12.6000 - val_mse:  12.3268
Epoch 9/10
0s - loss:  12.3815 - mse:  12.3815 - val_mse:  12.1316
Epoch 10/10
0s - loss:  12.1517 - mse:  12.1517 - val_mse:  11.9263
Wall time: 613 ms


### 6. Predict and Evaluate Model

In [14]:
# Score the held-out rows; the result is displayed as the cell output.
pred_ans = model.predict(x=test_model_input, batch_size=256)
pred_ans

array([[0.3096913 ],
       [0.29650345],
       [0.30972636],
       [0.2974062 ],
       [0.29696402],
       [0.32171726],
       [0.29665202],
       [0.2973509 ],
       [0.29701048],
       [0.30732864],
       [0.29756182],
       [0.29718927],
       [0.2966463 ],
       [0.29690212],
       [0.29712787],
       [0.29687274],
       [0.29722014],
       [0.29647702],
       [0.2968    ],
       [0.2965187 ],
       [0.29679227],
       [0.29672378],
       [0.2968709 ],
       [0.29673246],
       [0.3216006 ],
       [0.2946208 ],
       [0.2972101 ],
       [0.2941858 ],
       [0.2956073 ],
       [0.2971223 ],
       [0.29571357],
       [0.29734743],
       [0.2968275 ],
       [0.29719478],
       [0.2964598 ],
       [0.29762253],
       [0.29748103],
       [0.2972415 ],
       [0.2965003 ],
       [0.29680303]], dtype=float32)

In [15]:
# Mean squared error on the held-out rows, rounded to four decimals.
print(
    "test MSE",
    round(mean_squared_error(y_true=test[target].values, y_pred=pred_ans), 4),
)

test MSE 11.9855
