# pyFMでmovie lensしてみる

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
logs = pd.read_csv("./ml-100k/u.data", names=["uid", "mid", "rating", "timestamp"], sep="\t")
y=np.array(logs["rating"],dtype=np.float64)
logs=logs.drop(["timestamp","rating"], axis=1)
logs.tail(10)

Unnamed: 0,uid,mid
99990,806,421
99991,676,538
99992,721,262
99993,913,209
99994,378,78
99995,880,476
99996,716,204
99997,276,1090
99998,13,225
99999,12,203


In [2]:
users = pd.read_csv("./ml-100k/u.user", names=["uid", "age" , "gender", "occupation" ,"zip_code"], sep="|")
users=users.drop(["age","zip_code"], axis=1)
users.tail(10)

Unnamed: 0,uid,gender,occupation
933,934,M,engineer
934,935,M,doctor
935,936,M,other
936,937,M,educator
937,938,F,technician
938,939,F,student
939,940,M,administrator
940,941,M,student
941,942,F,librarian
942,943,M,student


In [3]:
# itemのcontext追加情報の読み込み
#read_csvだとUnicodeDecodeErrorが起きてしまったのでhttps://qiita.com/niwaringo/items/d2a30e04e08da8eaa643で解決
# 後ろのcolumnはジャンルを表す.(複数ジャンルに属するitemもあるらしい)

import codecs
with codecs.open("./ml-100k/u.item", "r", "Shift-JIS", "ignore") as f:
    items = pd.read_table(f,names=["mid" , "title" , "released", "video_released", "IMDb_URL" ,
                                               "unknown" , "Action" , "Adventure" , "Animation" ,
                                               "Children" , "Comedy" ,"Crime" , "Documentary" , "Drama" , "Fantasy" ,
                                               "Film_Noir" ,"Horror" , "Musical" , "Mystery", "Romance" , "Sci_Fi",
                                               "Thriller", "War", "Western"],delimiter="|")
items=items.drop(["title", "released","video_released", "IMDb_URL" ],axis=1)
items.tail(10)

Unnamed: 0,mid,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film_Noir,Horror,Musical,Mystery,Romance,Sci_Fi,Thriller,War,Western
1672,1673,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1673,1674,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1674,1675,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1675,1676,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1676,1677,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1677,1678,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1678,1679,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1679,1680,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1680,1681,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1681,1682,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [4]:
log_user=pd.merge(logs,users,on="uid")
log_user_dummied=pd.get_dummies(log_user)
merged_df= pd.merge(log_user_dummied,items,on="mid")
features_df=pd.get_dummies(merged_df,columns=["uid","mid"])
features_df.tail(10)

Unnamed: 0,gender_F,gender_M,occupation_administrator,occupation_artist,occupation_doctor,occupation_educator,occupation_engineer,occupation_entertainment,occupation_executive,occupation_healthcare,...,mid_1673,mid_1674,mid_1675,mid_1676,mid_1677,mid_1678,mid_1679,mid_1680,mid_1681,mid_1682
99990,1,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
99991,0,1,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
99992,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
99993,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
99994,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
99995,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
99996,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
99997,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
99998,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
99999,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [5]:
from  scipy.sparse import csr_matrix
X=csr_matrix(features_df.values,dtype=np.float64)

In [6]:
X.toarray().shape

(100000, 2667)

In [7]:
X.toarray()

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 0., 1.]])

In [8]:
from sklearn.model_selection import train_test_split
test_rate=0.1
valid_rate=0.1

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=test_rate)
#X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train,test_size=valid_rate)

In [12]:
from pyfm import pylibfm
fm = pylibfm.FM(num_factors=5, num_iter=50, task="regression")

In [13]:
fm.fit(X_train,y_train)

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.68464
-- Epoch 2
Training MSE: 0.65743
-- Epoch 3
Training MSE: 0.64769
-- Epoch 4
Training MSE: 0.64552
-- Epoch 5
Training MSE: 0.64510
-- Epoch 6
Training MSE: 0.64319
-- Epoch 7
Training MSE: 0.64238
-- Epoch 8
Training MSE: 0.64114
-- Epoch 9
Training MSE: 0.64068
-- Epoch 10
Training MSE: 0.63975
-- Epoch 11
Training MSE: 0.64027
-- Epoch 12
Training MSE: 0.64070
-- Epoch 13
Training MSE: 0.64121
-- Epoch 14
Training MSE: 0.64052
-- Epoch 15
Training MSE: 0.64054
-- Epoch 16
Training MSE: 0.63969
-- Epoch 17
Training MSE: 0.64099
-- Epoch 18
Training MSE: 0.64031
-- Epoch 19
Training MSE: 0.64063
-- Epoch 20
Training MSE: 0.64024
-- Epoch 21
Training MSE: 0.64138
-- Epoch 22
Training MSE: 0.64073
-- Epoch 23
Training MSE: 0.64106
-- Epoch 24
Training MSE: 0.64128
-- Epoch 25
Training MSE: 0.64150
-- Epoch 26
Training MSE: 0.64147
-- Epoch 27
Training MSE: 0.64143
-- Epoch 28
Tra

In [14]:
from sklearn.metrics import mean_squared_error
import numpy as np
preds = fm.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test,preds))
print(f"FM RMSE: {rmse:.4f}")

FM RMSE: 1.1279
