# Matrix Factorization

通过矩阵分解将物品、用户协同矩阵，分解为两个低维空间矩阵，通过低维空间向量内积来作为相似度。

In [1]:
import os
import pandas as pd
import numpy as np
import itertools

## 1. 导入数据文件路径

In [2]:
path = '../datasets/ml-1m'
rating_path = os.path.join(path, 'ratings.dat')
movie_path = os.path.join(path, 'movies.dat')

In [3]:
print('评分文件路径是否存在：%s' % (os.path.exists(rating_path)))
print('电影参数路径是否存在：%s' % (os.path.exists(movie_path)))

评分文件路径是否存在：True
电影参数路径是否存在：True


## 2. 构造电影ID到电影名的映射表

In [4]:
movie_id2name = {}
with open(movie_path, 'rt', errors='ignore') as f:
    for line in f:
        line = line.strip('\n')
        movie_id, movie_name, _ = line.split('::')
        movie_id = int(movie_id)-1
        movie_id2name[movie_id] = movie_name

In [5]:
def id2name(ids):
    return [movie_id2name.get(movie, '电影名信息缺失') for movie in ids]

In [6]:
len(movie_id2name)

3883

发现`movies.dat`中有信息缺失

In [7]:
print('缺失电影数：{}'.format(3952 - len(movie_id2name)))

缺失电影数：69


## 3.构造算法模型

In [8]:
class MF:
    def __init__(self, path, user_len, item_len, hidden_dim):
        self.train = np.zeros(shape=(user_len, item_len), dtype=np.float32)
        self.test = {}
        self.train_mask = set()
        self.test_mask = set()
        self.Q = np.random.rand(user_len, hidden_dim)
        self.V = np.random.rand(item_len, hidden_dim)
        self.mu = 0
        self.bu = np.random.rand(user_len)
        self.bi = np.random.rand(item_len)
        self.losses = []
        self.generate_data(path)
    
    def load_data(self, path):
        with open(path, 'rt', encoding='utf-8') as f:
            for line in f:
                yield line.strip('\n')
                
    def generate_data(self, path, pivot=0.8):
        for line in self.load_data(path):
            user, movie, rating, _ = line.split('::')
            user, movie, rating = int(user)-1, int(movie)-1, int(rating)
            if np.random.rand() < pivot:
                self.train[user][movie] = rating
                self.train_mask.add((user, movie))
            else:
                self.test.setdefault(user, {})
                self.test[user][movie] = rating
                self.test_mask.add((user, movie))
        print('{:=^40}'.format('数据装载完毕'))

    def train_model(self, epochs=100, lamb=0.1, learning_rate=0.01):
        epoch = 1
        self.mu = self.train.mean()
        while epoch <= epochs:
            cum_loss = 0
            for i, j in self.train_mask:
                error = self.train[i][j] - self.Q[i] @ self.V[j].T - self.mu - self.bu[i] - self.bi[j]
                loss = (error)**2 + lamb * (np.linalg.norm(self.Q[i], 2)**2 +
                       np.linalg.norm(self.V[j], 2)**2 +
                       self.bu[i]**2  + self.bi[j]**2)
                loss *= 0.5
                cum_loss += loss
                self.Q[i] -= learning_rate * (-1 * error * self.V[j] + lamb * self.Q[i])
                self.V[j] -= learning_rate * (-1 * error * self.Q[i] + lamb * self.V[j])
                self.bu[i] -= learning_rate * (-1 * error + lamb * self.bu[i])
                self.bi[j] -= learning_rate * (-1 * error + lamb * self.bi[j])
            cum_loss /= len(self.train_mask)
            if epoch % 5 == 0:
                print('epoch:{}; loss:{}'.format(epoch, cum_loss))
            self.losses.append(cum_loss)
            epoch += 1
        print('{:=^40}'.format('训练结束！'))        
    
    def predict(self, user, movie):
        '''
        预测用户对电影的评分
        '''
        return self.mu + self.bu[user] + self.bi[movie] + self.Q[user] @ self.V[movie].T
    
    def recommend(self, user, N=10):
        '''
        展示推荐结果，推荐结果中除掉已看电影，使用迭代器进行了优化
        '''
        watched = set(np.argwhere(self.train[user]).flatten())
        similar = iter((self.Q[user] @ self.V.T).argsort()[::-1])
        return list(itertools.islice(filter(lambda x:(x not in watched), similar), N))
    
    def evaluate(self):
        '''
        基于所有测试数据对模型进行评估，返回准确率和召回率
        '''
        hits, precision, recall = 0, 0, 0
        for user,item_rating in self.test.items():
            recommended = self.recommend(user)
            hits += set(recommended) & set(item_rating.keys())
            precision += len(recommended)
            recall += len(item_rating.keys())
        precision, recall = hits/precision, hits/recall
        return precision, recall

In [9]:
mf = MF(path=rating_path, user_len=6040, item_len=3952, hidden_dim=64)



In [10]:
'训练集：{}'.format(len(mf.train_mask))

'训练集：801136'

In [11]:
'测试集：{}'.format(len(mf.test_mask))

'测试集：199073'

In [12]:
mf.train.shape

(6040, 3952)

## 4. 模型训练

In [13]:
mf.train_model()

epoch:5; loss:0.7836267863857456
epoch:10; loss:0.7069094899996432
epoch:15; loss:0.6799557390593624
epoch:20; loss:0.6661813055696568
epoch:25; loss:0.6578836065660846
epoch:30; loss:0.6524442541390613
epoch:35; loss:0.648689917494726
epoch:40; loss:0.6459977892389706
epoch:45; loss:0.644006914957214
epoch:50; loss:0.6424966247478808
epoch:55; loss:0.6413263345502985
epoch:60; loss:0.6404031716849529
epoch:65; loss:0.6396637604097131
epoch:70; loss:0.6390636151716823
epoch:75; loss:0.6385707523070577
epoch:80; loss:0.6381617075780774
epoch:85; loss:0.6378189691646978
epoch:90; loss:0.6375292724031959
epoch:95; loss:0.6372824374202547
epoch:100; loss:0.637070559559776


## 5. 评分预测

In [14]:
print('{:=^40}'.format('预测用户32对电影45评分：'))
score = mf.predict(32, 20)
print('评分为：', score)

评分为： 3.1737573905023213


## 6. 推荐演示

In [15]:
print('{:=^40}'.format('给用户100推荐以下电影：'))
rec = id2name(mf.recommend(100))
for i, movie in enumerate(rec, 1):
    print('No {}: {}'.format(i, movie))

No 1: Prerokbe Ognja (1995)
No 2: Raining Stones (1993)
No 3: Hellhounds on My Trail (1999)
No 4: Happy Go Lovely (1951)
No 5: 电影名信息缺失
No 6: Pharaoh's Army (1995)
No 7: Charm's Incidents (1996)
No 8: Hostile Intentions (1994)
No 9: 电影名信息缺失
No 10: Every Other Weekend (1990)
