# Recommendation System
- baseline estimator
- SVD
- ...

In [1]:
import numpy as np
import pickle
from tqdm import tqdm

In [2]:
def read_data(path):
    # 读取train.txt格式的数据，返回字典
    data = {}
    with open(path, 'r') as f:
        while True:
            line = f.readline().strip()
            if not line:  # EOF
                break
            # 读取user_id和rate_num
            user_id, rate_num = line.split('|')
            rate_num = int(rate_num)
            user_id = int(user_id)
            # 读取用户的评分数据
            rate_data = {}
            for i in range(rate_num):
                item_id, score = f.readline().strip().split()
                item_id = int(item_id)
                score = int(score)
                rate_data[item_id] = score
            # 保存该用户的数据
            data[user_id] = rate_data
    return data

In [3]:
train_path ="data/train_data.txt"
train_data = read_data(train_path)
print("len(train_data):", len(train_data))

len(train_data): 19835


In [4]:
valid_path ="data/validate_data.txt"
valid_data = read_data(valid_path)
print("len(valid_data):", len(valid_data))

len(valid_data): 19835


## baseline estimator

### μ : overall mean rating

In [5]:
# 计算全局平均分
def cal_global_avg(data):
    sum_score = 0
    sum_num = 0
    for user_id, rate_data in data.items():
        sum_score += sum(rate_data.values())
        sum_num += len(rate_data)
    return sum_score / sum_num

In [9]:
global_avg = cal_global_avg(train_data)
print("global_avg:", global_avg)

global_avg: 49.47062753677192


### b_x : rating deviation of user x (ave.rating of user x - μ)

In [6]:
# 统计每个用户的平均评分，用户偏差
def cal_user_bias(data, average_score):
    # 每个用户的平均评分
    user_average_score = {}
    for user_id, rate_data in data.items():
        total_score = 0
        for score in rate_data.values():
            total_score += score
        user_average_score[user_id] = total_score / len(rate_data)
    # 每个用户与全局平均评分的偏差
    user_bias = {}
    for user_id, u_ave_score in user_average_score.items():
        user_bias[user_id] = u_ave_score - average_score
    # 最小偏差，最大偏差，平均偏差
    max_bias = max(user_bias.items(), key=lambda x: x[1])
    min_bias = min(user_bias.items(), key=lambda x: x[1])
    total_bias = 0
    for bias in user_bias.values():
        total_bias += bias
    average_bias = total_bias / len(user_bias)
    return user_average_score, user_bias, max_bias, min_bias, average_bias

In [10]:
user_average_score, user_bias, max_bias, min_bias, average_bias = cal_user_bias(train_data, global_avg)
print("max_bias:", max_bias)
print("min_bias:", min_bias)
print("average_bias:", average_bias)

max_bias: (547, 50.52937246322808)
min_bias: (413, -49.47062753677192)
average_bias: 20.37407831847788


### b_i : rating deviation of item i (ave.rating of item i - μ)

In [7]:
# 统计每个物品的平均评分，物品偏差
def cal_item_bias(data, average_score):
    # 统计物品得分
    item_scores = {}
    for user_id, rate_data in data.items():
        for item_id, score in rate_data.items():
            if item_id in item_scores:
                item_scores[item_id].append(score)
            else:
                item_scores[item_id] = [score]
    # 计算物品平均得分
    item_average_score = {}
    for item_id, scores in item_scores.items():
        item_average_score[item_id] = sum(scores) / len(scores)
    # 计算物品偏差
    item_bias = {}
    for item_id, i_ave_score in item_average_score.items():
        item_bias[item_id] = i_ave_score - average_score
    # 最大偏差，最小偏差，平均偏差
    max_bias = max(item_bias.items(), key=lambda x: x[1])
    min_bias = min(item_bias.items(), key=lambda x: x[1])
    total_bias = 0
    for bias in item_bias.values():
        total_bias += bias
    average_bias = total_bias / len(item_bias)
    return item_average_score, item_bias, max_bias, min_bias, average_bias

In [11]:
item_average_score, item_bias, max_bias, min_bias, average_bias = cal_item_bias(train_data, global_avg)
print("max_bias:", max_bias)
print("min_bias:", min_bias)
print("average_bias:", average_bias)

max_bias: (210761, 50.52937246322808)
min_bias: (211658, -49.47062753677192)
average_bias: -5.654850049099554


### save model

In [12]:
class BaselineEstimator:
    def __init__(self, global_avg, user_bias, item_bias):
        self.global_avg = global_avg
        self.user_bias = user_bias
        self.item_bias = item_bias
        self.baseline_estimator = {}

    def fit(self, train_data):
        for user_id, rate_data in train_data.items():
            for item_id, score in rate_data.items():
                self.baseline_estimator[(user_id, item_id)] = self.global_avg + self.user_bias[user_id] + self.item_bias[item_id]

    def save_model(self, path="models/baseline_estimator.pkl"):
        # 保存自身模型
        with open(path, "wb") as f:
            pickle.dump(self, f)

    def predict(self, user_id, item_id):
        return self.baseline_estimator.get((user_id, item_id), self.global_avg)  # 不存在则使用global_avg

In [13]:
estimator = BaselineEstimator(global_avg, user_bias, item_bias)
estimator.fit(train_data)
estimator.save_model()  # 保存模型
print(estimator.predict(0,0))  # 预测

49.47062753677192


In [6]:
baseline_data = {
    "global_avg": global_avg,
    "user_average_score": user_average_score,
    "user_bias": user_bias,
    "item_average_score": item_average_score,
    "item_bias": item_bias
}

with open("models/baseline_data.pkl", "wb") as f:
    pickle.dump(baseline_data, f)

print("baseline_data:", len(baseline_data))

baseline_data: 5


### evaluate model

In [8]:
def RMSE(data, model):
    rmse, count = 0.0, 0
    for user_id, rate_data in data.items():
        for item_id, score in rate_data.items():
            predict = model.predict(user_id, item_id)
            rmse += (predict - score) ** 2
            count += 1
    rmse = np.sqrt((rmse / count))
    return rmse 

In [17]:
baseline_rmse = RMSE(train_data, estimator)
print("baseline_rmse:", baseline_rmse)

baseline_rmse = RMSE(valid_data, estimator)
print("baseline_rmse:", baseline_rmse)

baseline_rmse: 28.927723508286146
len(valid_data): 19835
baseline_rmse: 38.20913906323089


## SVD

### 尝试将分数压缩到0-10之间(没啥用)

In [9]:
def compress_data(data):
    """
    压缩数据
    Args:
        data:数据
    Returns:
        compressed_data:压缩后的数据
    """
    compressed_data={}
    for user_id, rate_data in data.items():
        for item_id, score in rate_data.items():
            if user_id not in compressed_data:
                compressed_data[user_id] = {}
            compressed_data[user_id][item_id] = (float)(score/10)
    return compressed_data

In [10]:
compress_train_data = compress_data(train_data)
print("len(compress_train_data):", len(compress_train_data))
compress_valid_data = compress_data(valid_data)
print("len(compress_valid_data):", len(compress_valid_data))

len(compress_train_data): 19835
len(compress_valid_data): 19835


In [26]:
global_avg = cal_global_avg(compress_train_data)
print("global_avg:", global_avg)

user_average_score, user_bias, max_bias, min_bias, average_bias = cal_user_bias(compress_train_data, global_avg)
print("max_bias:", max_bias)
print("min_bias:", min_bias)
print("average_bias:", average_bias)

item_average_score, item_bias, max_bias, min_bias, average_bias = cal_item_bias(compress_train_data, global_avg)
print("max_bias:", max_bias)
print("min_bias:", min_bias)
print("average_bias:", average_bias)

global_avg: 4.947062753677194
max_bias: (547, 5.052937246322806)
min_bias: (413, -4.947062753677194)
average_bias: 2.037407831847696
max_bias: (210761, 5.052937246322806)
min_bias: (211658, -4.947062753677194)
average_bias: -0.5654850049047325


### 定义SVD model

In [18]:
baseline_data = {}
with open("models/baseline_data.pkl", "rb") as f:
    baseline_data = pickle.load(f)
# baseline_data["global_avg"] = global_avg
# baseline_data["user_bias"] = user_bias
# baseline_data["item_bias"] = item_bias

In [20]:
class SVD:
    def __init__(self, baseline_data, factor = 50, lambda_p = 1e-2, lambda_q = 1e-2, 
                 lambda_bx = 1e-2, lambda_bi = 1e-2):
        """
        初始化SVD模型
        Args:
            baseline_data: dict, baseline数据
            factor: int, 隐向量的维度
            lambda_p: float, 正则化参数
            lambda_q: float, 正则化参数
            lambda_bx: float, 正则化参数
            lambda_bi: float, 正则化参数
        """
        self.factor = factor  # 隐向量的维度
        # 正则化参数
        self.lambda_p = lambda_p
        self.lambda_q = lambda_q
        self.lambda_bx = lambda_bx
        self.lambda_bi = lambda_bi
        # 用户与物品偏置
        self.global_avg = baseline_data["global_avg"]
        self.bx = baseline_data["user_bias"]
        self.bi = baseline_data["item_bias"]
        # overall max_item_id: 624960 max_user_id: 19834
        max_item_id = 624960
        max_user_id = 19834
        # 随机初始化P(user) Q(item)矩阵
        self.P = np.random.normal(0, 0.1, size=(factor, max_user_id + 1))
        self.Q = np.random.normal(0, 0.1, size=(factor, max_item_id + 1))

    def predict(self, user_id, item_id):
        """
        预测用户user对物品item的评分
        Args:
            user_id: 用户id
            item_id: 物品id
        Returns:
            预测评分
        """
        if user_id in self.bx.keys():
            bx = self.bx[user_id]
        else:
            bx = 0
        if item_id in self.bi.keys():
            bi = self.bi[item_id]
        else:
            bi = 0
        p = self.P[:, user_id]
        q = self.Q[:, item_id]
        return self.global_avg + bx + bi + np.dot(p, q)
    
    def loss(self, data):
        """
        计算loss
        Args:
            data: dict, 训练数据
        Returns:
            loss
        """
        loss, count = 0.0, 0
        for user_id, rate_data in data.items():
            for item_id, score in rate_data.items():
                predict = self.predict(user_id, item_id)
                loss += (predict - score) ** 2
                count += 1
        # 添加正则化项
        loss += self.lambda_p * np.linalg.norm(self.P) ** 2
        loss += self.lambda_q * np.linalg.norm(self.Q) ** 2
        loss += self.lambda_bx * np.linalg.norm(list(self.bx.values())) ** 2
        loss += self.lambda_bi * np.linalg.norm(list(self.bi.values())) ** 2
        return np.sqrt(loss / count)

    def train(self, epoches, lr, data, valid_data):
        """
        训练模型
        Args:
            epoches: int, 迭代次数
            lr: float, 学习率
            data: dict, 训练数据
        """
        for epoch in range(epoches):
            # 使用tqdm显示训练进度
            for user_id, rate_data in tqdm(data.items(), desc="Epoch {}".format(epoch)):
                for item_id, score in rate_data.items():
                    bx = self.bx[user_id]
                    bi = self.bi[item_id]
                    p = self.P[:, user_id]
                    q = self.Q[:, item_id]
                    # 计算梯度
                    error = score - self.predict(user_id, item_id)
                    self.bx[user_id] += lr * (error - self.lambda_bx * bx)
                    self.bi[item_id] += lr * (error - self.lambda_bi * bi)
                    self.P[:, user_id] += lr * (error * q - self.lambda_p * p)
                    self.Q[:, item_id] += lr * (error * p - self.lambda_q * q)
            # 计算loss
            epoch_loss = self.loss(valid_data)
            print("Epoch {} finished: validate loss={}".format(epoch, epoch_loss))
            # 学习率衰减
            lr *= 0.9
        

In [25]:
SVD_model = SVD(baseline_data, factor=100)

In [26]:
SVD_model.train(10, 0.0005, train_data, valid_data)

Epoch 0: 100%|██████████| 19835/19835 [01:18<00:00, 253.54it/s]


Epoch 0 finished: validate loss=30.33555533969568


Epoch 1: 100%|██████████| 19835/19835 [01:13<00:00, 269.10it/s]


Epoch 1 finished: validate loss=29.783277701772967


Epoch 2: 100%|██████████| 19835/19835 [01:30<00:00, 218.58it/s]


Epoch 2 finished: validate loss=29.215482336577743


Epoch 3: 100%|██████████| 19835/19835 [01:18<00:00, 254.25it/s]


Epoch 3 finished: validate loss=28.734161356662582


Epoch 4: 100%|██████████| 19835/19835 [01:18<00:00, 252.00it/s]


Epoch 4 finished: validate loss=28.459166189718015


Epoch 5: 100%|██████████| 19835/19835 [01:17<00:00, 254.42it/s]


Epoch 5 finished: validate loss=28.39569389395577


Epoch 6: 100%|██████████| 19835/19835 [01:14<00:00, 265.49it/s]


Epoch 6 finished: validate loss=28.456305222288922


Epoch 7: 100%|██████████| 19835/19835 [01:14<00:00, 266.88it/s]


Epoch 7 finished: validate loss=28.567826725412083


Epoch 8: 100%|██████████| 19835/19835 [01:15<00:00, 263.24it/s]


Epoch 8 finished: validate loss=28.695097502748464


Epoch 9: 100%|██████████| 19835/19835 [01:12<00:00, 273.02it/s]


Epoch 9 finished: validate loss=28.825384129262765


In [27]:
train_RMSE = RMSE(train_data, SVD_model)
print("train RMSE: ", train_RMSE)
valid_RMSE = RMSE(valid_data, SVD_model)
print("valid RMSE: ", valid_RMSE)

train RMSE:  13.3524420804163
valid RMSE:  28.75117941933756


In [28]:
with open("models/SVD_100factor.pkl", "wb") as f:
    pickle.dump(SVD_model, f)