# Recommendation System
- baseline estimator
- SVD
- SVD + bias
- SVD + bias + attributes

In [1]:
import numpy as np
import pickle
from tqdm import tqdm

In [2]:
def read_data(path):
    # 读取train.txt格式的数据，返回字典
    data = {}
    with open(path, 'r') as f:
        while True:
            line = f.readline().strip()
            if not line:  # EOF
                break
            # 读取user_id和rate_num
            user_id, rate_num = line.split('|')
            rate_num = int(rate_num)
            user_id = int(user_id)
            # 读取用户的评分数据
            rate_data = {}
            for i in range(rate_num):
                item_id, score = f.readline().strip().split()
                item_id = int(item_id)
                score = int(score)
                rate_data[item_id] = score
            # 保存该用户的数据
            data[user_id] = rate_data
    return data

In [3]:
train_path ="data/train_data.txt"
train_data = read_data(train_path)
print("len(train_data):", len(train_data))

len(train_data): 19835


In [4]:
valid_path ="data/validate_data.txt"
valid_data = read_data(valid_path)
print("len(valid_data):", len(valid_data))

len(valid_data): 19835


## baseline estimator

### μ : overall mean rating

In [1]:
# 计算全局平均分
def cal_global_avg(data):
    sum_score = 0
    sum_num = 0
    for user_id, rate_data in data.items():
        sum_score += sum(rate_data.values())
        sum_num += len(rate_data)
    return sum_score / sum_num

In [2]:
global_avg = cal_global_avg(train_data)
print("global_avg:", global_avg)

NameError: name 'train_data' is not defined

### b_x : rating deviation of user x (ave.rating of user x - μ)

In [47]:
# 统计每个用户的平均评分，用户偏差
def cal_user_bias(data, average_score):
    # 每个用户的平均评分
    user_average_score = {}
    for user_id, rate_data in data.items():
        total_score = 0
        for score in rate_data.values():
            total_score += score
        user_average_score[user_id] = total_score / len(rate_data)
    # 每个用户与全局平均评分的偏差
    user_bias = {}
    for user_id, u_ave_score in user_average_score.items():
        user_bias[user_id] = u_ave_score - average_score
    # 最小偏差，最大偏差，平均偏差
    max_bias = max(user_bias.items(), key=lambda x: x[1])
    min_bias = min(user_bias.items(), key=lambda x: x[1])
    total_bias = 0
    for bias in user_bias.values():
        total_bias += bias
    average_bias = total_bias / len(user_bias)
    return user_average_score, user_bias, max_bias, min_bias, average_bias

In [48]:
user_average_score, user_bias, max_bias, min_bias, average_bias = cal_user_bias(train_data, global_avg)
print("max_bias:", max_bias)
print("min_bias:", min_bias)
print("average_bias:", average_bias)

max_bias: (547, 50.52937246322808)
min_bias: (413, -49.47062753677192)
average_bias: 20.37407831847788


### b_i : rating deviation of item i (ave.rating of item i - μ)

In [49]:
# 统计每个物品的平均评分，物品偏差
def cal_item_bias(data, average_score):
    # 统计物品得分
    item_scores = {}
    for user_id, rate_data in data.items():
        for item_id, score in rate_data.items():
            if item_id in item_scores:
                item_scores[item_id].append(score)
            else:
                item_scores[item_id] = [score]
    # 计算物品平均得分
    item_average_score = {}
    for item_id, scores in item_scores.items():
        item_average_score[item_id] = sum(scores) / len(scores)
    # 计算物品偏差
    item_bias = {}
    for item_id, i_ave_score in item_average_score.items():
        item_bias[item_id] = i_ave_score - average_score
    # 最大偏差，最小偏差，平均偏差
    max_bias = max(item_bias.items(), key=lambda x: x[1])
    min_bias = min(item_bias.items(), key=lambda x: x[1])
    total_bias = 0
    for bias in item_bias.values():
        total_bias += bias
    average_bias = total_bias / len(item_bias)
    return item_average_score, item_bias, max_bias, min_bias, average_bias

In [50]:
item_average_score, item_bias, max_bias, min_bias, average_bias = cal_item_bias(train_data, global_avg)
print("max_bias:", max_bias)
print("min_bias:", min_bias)
print("average_bias:", average_bias)

max_bias: (210761, 50.52937246322808)
min_bias: (211658, -49.47062753677192)
average_bias: -5.654850049099554


### save model

In [61]:
class BaselineEstimator:
    def __init__(self, global_avg, user_bias, item_bias):
        self.global_avg = global_avg
        self.user_bias = user_bias
        self.item_bias = item_bias
        self.baseline_estimator = {}

    def save_model(self, path="models/baseline_estimator.pkl"):
        # 保存自身模型
        with open(path, "wb") as f:
            pickle.dump(self, f)

    def predict(self, user_id, item_id):
        score = self.global_avg
        if user_id in self.user_bias:
            score += self.user_bias[user_id]
        if item_id in self.item_bias:
            score += self.item_bias[item_id]
        score = max(0, score)
        score = min(100, score)
        return score

In [65]:
estimator = BaselineEstimator(global_avg, user_bias, item_bias)
print(estimator.predict(1,127640))  # 预测
print(estimator.user_bias[1])
print(estimator.item_bias[127640])
print(estimator.global_avg)

100
40.474727654484916
33.31725125110686
49.47062753677192


In [66]:
estimator.save_model()

In [6]:
baseline_data = {
    "global_avg": global_avg,
    "user_average_score": user_average_score,
    "user_bias": user_bias,
    "item_average_score": item_average_score,
    "item_bias": item_bias
}

with open("models/baseline_data.pkl", "wb") as f:
    pickle.dump(baseline_data, f)

print("baseline_data:", len(baseline_data))

baseline_data: 5


### evaluate model

In [5]:
def RMSE(data, model):
    rmse, count = 0.0, 0
    for user_id, rate_data in data.items():
        for item_id, score in rate_data.items():
            predict = model.predict(user_id, item_id)
            rmse += (predict - score) ** 2
            count += 1
    rmse = np.sqrt((rmse / count))
    return rmse 

In [67]:
with open("models/baseline_estimator.pkl", "rb") as f:
    estimator = pickle.load(f)

baseline_rmse = RMSE(train_data, estimator)
print("baseline_rmse:", baseline_rmse)

baseline_rmse = RMSE(valid_data, estimator)
print("baseline_rmse:", baseline_rmse)

baseline_rmse: 27.332504852224535
baseline_rmse: 29.993169296413154


## SVD
various SVD models with 50 latent factors:
- basic SVD
  - time of 1 epoch: 50s
  - train RMSE:  17.82015769020438
  - valid RMSE:  29.4390823320279
- SVD + bias
  - time of 1 epoch: 75s
  - train RMSE:  16.57224154523797
  - valid RMSE:  27.77936363893447
- SVD + bias + attributes (k=3 portion=7:3)
  - time of 1 epoch: 100s
  - train RMSE:  16.812805151033487
  - valid RMSE:  27.056434665845632
- factors = 200, k = 5, portion = 6:4, lambda = 0.1
  - train RMSE:  13.99533877730795
  - valid RMSE:  26.50446973316574

### 尝试将分数压缩到0-10之间(没啥用)

In [9]:
def compress_data(data):
    """
    压缩数据
    Args:
        data:数据
    Returns:
        compressed_data:压缩后的数据
    """
    compressed_data={}
    for user_id, rate_data in data.items():
        for item_id, score in rate_data.items():
            if user_id not in compressed_data:
                compressed_data[user_id] = {}
            compressed_data[user_id][item_id] = (float)(score/10)
    return compressed_data

In [10]:
compress_train_data = compress_data(train_data)
print("len(compress_train_data):", len(compress_train_data))
compress_valid_data = compress_data(valid_data)
print("len(compress_valid_data):", len(compress_valid_data))

len(compress_train_data): 19835
len(compress_valid_data): 19835


In [26]:
global_avg = cal_global_avg(compress_train_data)
print("global_avg:", global_avg)

user_average_score, user_bias, max_bias, min_bias, average_bias = cal_user_bias(compress_train_data, global_avg)
print("max_bias:", max_bias)
print("min_bias:", min_bias)
print("average_bias:", average_bias)

item_average_score, item_bias, max_bias, min_bias, average_bias = cal_item_bias(compress_train_data, global_avg)
print("max_bias:", max_bias)
print("min_bias:", min_bias)
print("average_bias:", average_bias)

global_avg: 4.947062753677194
max_bias: (547, 5.052937246322806)
min_bias: (413, -4.947062753677194)
average_bias: 2.037407831847696
max_bias: (210761, 5.052937246322806)
min_bias: (211658, -4.947062753677194)
average_bias: -0.5654850049047325


### 定义SVD model

In [6]:
baseline_data = {}
with open("models/baseline_data.pkl", "rb") as f:
    baseline_data = pickle.load(f)
# baseline_data["global_avg"] = global_avg
# baseline_data["user_bias"] = user_bias
# baseline_data["item_bias"] = item_bias

In [27]:
class SVD:
    def __init__(self, baseline_data, factor = 50, lambda_p = 1e-2, lambda_q = 1e-2, 
                 lambda_bx = 1e-2, lambda_bi = 1e-2):
        """
        初始化SVD模型
        Args:
            baseline_data: dict, baseline数据
            factor: int, 隐向量的维度
            lambda_p: float, 正则化参数
            lambda_q: float, 正则化参数
            lambda_bx: float, 正则化参数
            lambda_bi: float, 正则化参数
        """
        self.factor = factor  # 隐向量的维度
        # 正则化参数
        self.lambda_p = lambda_p
        self.lambda_q = lambda_q
        self.lambda_bx = lambda_bx
        self.lambda_bi = lambda_bi
        # 用户与物品偏置
        self.global_avg = baseline_data["global_avg"]
        self.bx = baseline_data["user_bias"]
        self.bi = baseline_data["item_bias"]
        # overall max_item_id: 624960 max_user_id: 19834
        max_item_id = 624960
        max_user_id = 19834
        # 随机初始化P(user) Q(item)矩阵
        self.P = np.random.normal(0, 0.1, size=(factor, max_user_id + 1))
        self.Q = np.random.normal(0, 0.1, size=(factor, max_item_id + 1))

    def predict(self, user_id, item_id):
        """
        预测用户user对物品item的评分
        Args:
            user_id: 用户id
            item_id: 物品id
        Returns:
            预测评分
        """
        if user_id in self.bx.keys():
            bx = self.bx[user_id]
        else:
            bx = 0
        if item_id in self.bi.keys():
            bi = self.bi[item_id]
        else:
            bi = 0
        p = self.P[:, user_id]
        q = self.Q[:, item_id]
        score = self.global_avg + bx + bi + np.dot(p, q)
        score = min(score, 100)
        score = max(score, 0)
        return score
    
    def loss(self, data):
        """
        计算loss
        Args:
            data: dict, 训练数据
        Returns:
            loss
        """
        loss, count = 0.0, 0
        for user_id, rate_data in data.items():
            for item_id, score in rate_data.items():
                predict = self.predict(user_id, item_id)
                loss += (predict - score) ** 2
                count += 1
        # 添加正则化项
        loss += self.lambda_p * np.linalg.norm(self.P) ** 2
        loss += self.lambda_q * np.linalg.norm(self.Q) ** 2
        loss += self.lambda_bx * np.linalg.norm(list(self.bx.values())) ** 2
        loss += self.lambda_bi * np.linalg.norm(list(self.bi.values())) ** 2
        return np.sqrt(loss / count)

    def train(self, epoches, lr, data, valid_data):
        """
        训练模型
        Args:
            epoches: int, 迭代次数
            lr: float, 学习率
            data: dict, 训练数据
        """
        for epoch in range(epoches):
            # 使用tqdm显示训练进度
            for user_id, rate_data in tqdm(data.items(), desc="Epoch {}".format(epoch)):
                for item_id, score in rate_data.items():
                    bx = self.bx[user_id]
                    bi = self.bi[item_id]
                    p = self.P[:, user_id]
                    q = self.Q[:, item_id]
                    # 计算梯度
                    error = score - self.predict(user_id, item_id)
                    self.bx[user_id] += lr * (error - self.lambda_bx * bx)
                    self.bi[item_id] += lr * (error - self.lambda_bi * bi)
                    self.P[:, user_id] += lr * (error * q - self.lambda_p * p)
                    self.Q[:, item_id] += lr * (error * p - self.lambda_q * q)
            # 计算loss
            epoch_loss = self.loss(valid_data)
            print("Epoch {} finished: validate loss={}".format(epoch, epoch_loss))
            # 学习率衰减
            lr *= 0.9
        

In [12]:
SVD_model = SVD(baseline_data, factor=50)

### train and evaluate model

In [14]:
SVD_model.train(5, 0.0005, train_data, valid_data)

Epoch 0: 100%|██████████| 19835/19835 [01:14<00:00, 265.78it/s]


Epoch 0 finished: validate loss=27.93013527620241


Epoch 1: 100%|██████████| 19835/19835 [01:23<00:00, 238.93it/s]


Epoch 1 finished: validate loss=27.79854495676883


Epoch 2: 100%|██████████| 19835/19835 [01:16<00:00, 258.99it/s]


Epoch 2 finished: validate loss=27.777485680470978


Epoch 3: 100%|██████████| 19835/19835 [01:13<00:00, 270.29it/s]


Epoch 3 finished: validate loss=27.807101586190697


Epoch 4: 100%|██████████| 19835/19835 [01:15<00:00, 264.14it/s]


Epoch 4 finished: validate loss=27.856668487707616


In [15]:
train_RMSE = RMSE(train_data, SVD_model)
print("train RMSE: ", train_RMSE)
valid_RMSE = RMSE(valid_data, SVD_model)
print("valid RMSE: ", valid_RMSE)

train RMSE:  16.57224154523797
valid RMSE:  27.77936363893447


In [26]:
with open("models/SVD_50factor.pkl", "wb") as f:
    pickle.dump(SVD_model, f)

In [14]:
with open("models/SVD_50factor.pkl", "rb") as f:
    model = pickle.load(f)

train_RMSE = RMSE(train_data, model)
print("train RMSE: ", train_RMSE)
valid_RMSE = RMSE(valid_data, model)
print("valid RMSE: ", valid_RMSE)

train RMSE:  16.062878369561574
valid RMSE:  28.68405924929402


## using the attributes of the items

In [9]:
class SVD_attribute:
    def __init__(self, baseline_data, similar_nodes, k = 3, factor = 50, lambda_p = 1e-2, lambda_q = 1e-2, 
                 lambda_bx = 1e-2, lambda_bi = 1e-2):
        """
        初始化SVD模型
        Args:
            baseline_data: dict, baseline数据
            similar_nodes: dict, 每个节点的相似节点
            k: int, 使用的相似节点个数
            factor: int, 隐向量的维度
            lambda_p: float, 正则化参数
            lambda_q: float, 正则化参数
            lambda_bx: float, 正则化参数
            lambda_bi: float, 正则化参数
        """
        self.factor = factor  # 隐向量的维度
        # 正则化参数
        self.lambda_p = lambda_p
        self.lambda_q = lambda_q
        self.lambda_bx = lambda_bx
        self.lambda_bi = lambda_bi
        # 用户与物品偏置
        self.global_avg = baseline_data["global_avg"]
        self.bx = baseline_data["user_bias"]
        self.bi = baseline_data["item_bias"]
        # overall max_item_id: 624960 max_user_id: 19834
        max_item_id = 624960
        max_user_id = 19834
        # 随机初始化P(user) Q(item)矩阵
        self.P = np.random.normal(0, 0.1, size=(factor, max_user_id + 1))
        self.Q = np.random.normal(0, 0.1, size=(factor, max_item_id + 1))
        # 相似节点
        self.similar_nodes = similar_nodes
        self.k = k

    def predict(self, user_id, item_id):
        """
        预测用户user对物品item的评分
        Args:
            user_id: 用户id
            item_id: 物品id
        Returns:
            预测评分
        """
        if user_id in self.bx.keys():
            bx = self.bx[user_id]
        else:
            bx = 0
        if item_id in self.bi.keys():
            bi = self.bi[item_id]
        else:
            bi = 0
        p = self.P[:, user_id]
        q = self.Q[:, item_id]
        # 直接得分由SVD模型得到
        direct_score = self.global_avg + bx + bi + np.dot(p, q)
        # indrect_score由相似节点得分平均得到
        indirect_score, count = 0, 0
        if item_id in self.similar_nodes:
            for node_id in self.similar_nodes[item_id]:
                temp_q = self.Q[:, node_id]
                indirect_score += np.dot(p, temp_q) + self.global_avg + bx
                if node_id in self.bi.keys():
                    indirect_score += self.bi[node_id]
                count += 1
                if count == self.k:
                    break
        if count == 0:
            score = direct_score
        else:
            score = direct_score * 0.6 + (indirect_score / count) * 0.4
        score = min(score, 100)
        score = max(score, 0)
        return score

    def loss(self, data):
        """
        计算loss
        Args:
            data: dict, 训练数据
        Returns:
            loss
        """
        loss, count = 0.0, 0
        for user_id, rate_data in data.items():
            for item_id, score in rate_data.items():
                predict = self.predict(user_id, item_id)
                loss += (predict - score) ** 2
                count += 1
        # 添加正则化项
        loss += self.lambda_p * np.linalg.norm(self.P) ** 2
        loss += self.lambda_q * np.linalg.norm(self.Q) ** 2
        loss += self.lambda_bx * np.linalg.norm(list(self.bx.values())) ** 2
        loss += self.lambda_bi * np.linalg.norm(list(self.bi.values())) ** 2
        return np.sqrt(loss / count)

    def train(self, epoches, lr, data, valid_data):
        """
        训练模型
        Args:
            epoches: int, 迭代次数
            lr: float, 学习率
            data: dict, 训练数据
        """
        for epoch in range(epoches):
            # 使用tqdm显示训练进度
            for user_id, rate_data in tqdm(data.items(), desc="Epoch {}".format(epoch)):
                for item_id, score in rate_data.items():
                    bx = self.bx[user_id]
                    bi = self.bi[item_id]
                    p = self.P[:, user_id]
                    q = self.Q[:, item_id]
                    # 计算梯度
                    error = score - self.predict(user_id, item_id)
                    self.bx[user_id] += lr * (error - self.lambda_bx * bx)
                    self.bi[item_id] += lr * (error - self.lambda_bi * bi)
                    self.P[:, user_id] += lr * (error * q - self.lambda_p * p)
                    self.Q[:, item_id] += lr * (error * p - self.lambda_q * q)
            # 计算loss
            epoch_loss = self.loss(valid_data)
            print("Epoch {} finished: validate loss={}".format(epoch, epoch_loss))
            # 学习率衰减
            lr *= 0.95

In [8]:
with open("data/similar_nodes.pkl", "rb") as f:
    similar_nodes = pickle.load(f)
print("len(similar_nodes):", len(similar_nodes))

len(similar_nodes): 507172


In [10]:
SVD_attribute_model = SVD_attribute(baseline_data, similar_nodes, k = 5, factor = 200, lambda_bi=1e-1, lambda_bx=1e-1, lambda_p=1e-1, lambda_q=1e-1)

In [11]:
SVD_attribute_model.train(10, 0.0005, train_data, valid_data)

Epoch 0:   0%|          | 0/19835 [00:00<?, ?it/s]

Epoch 0: 100%|██████████| 19835/19835 [04:39<00:00, 71.06it/s] 


Epoch 0 finished: validate loss=29.74155052825229


Epoch 1: 100%|██████████| 19835/19835 [03:29<00:00, 94.85it/s] 


Epoch 1 finished: validate loss=29.260608661309202


Epoch 2: 100%|██████████| 19835/19835 [03:24<00:00, 97.07it/s] 


Epoch 2 finished: validate loss=28.773075826218733


Epoch 3: 100%|██████████| 19835/19835 [03:20<00:00, 98.75it/s] 


Epoch 3 finished: validate loss=28.26915588825707


Epoch 4: 100%|██████████| 19835/19835 [03:44<00:00, 88.36it/s] 


Epoch 4 finished: validate loss=27.862518942243195


Epoch 5: 100%|██████████| 19835/19835 [03:32<00:00, 93.26it/s] 


Epoch 5 finished: validate loss=27.581582139240712


Epoch 6: 100%|██████████| 19835/19835 [03:12<00:00, 103.13it/s]


Epoch 6 finished: validate loss=27.415648977219295


Epoch 7: 100%|██████████| 19835/19835 [02:57<00:00, 111.95it/s]


Epoch 7 finished: validate loss=27.332609384701918


Epoch 8: 100%|██████████| 19835/19835 [02:51<00:00, 115.69it/s]


Epoch 8 finished: validate loss=27.303067738773155


Epoch 9: 100%|██████████| 19835/19835 [03:37<00:00, 91.03it/s] 


Epoch 9 finished: validate loss=27.30618087375584


In [12]:
train_RMSE = RMSE(train_data, SVD_attribute_model)
print("train RMSE: ", train_RMSE)
valid_RMSE = RMSE(valid_data, SVD_attribute_model)
print("valid RMSE: ", valid_RMSE)

train RMSE:  13.99533877730795
valid RMSE:  26.50446973316574


In [13]:
with open("models/SVD_attribute_200_5_64.pkl", "wb") as f:
    pickle.dump(SVD_attribute_model, f)

## basic SVD

In [72]:
class SVD_basic:
    def __init__(self, factor = 50, lambda_p = 1e-2, lambda_q = 1e-2):
        """
        初始化SVD模型
        Args:
            factor: int, 隐向量的维度
            lambda_p: float, 正则化参数
            lambda_q: float, 正则化参数
        """
        self.factor = factor  # 隐向量的维度
        # 正则化参数
        self.lambda_p = lambda_p
        self.lambda_q = lambda_q
        # overall max_item_id: 624960 max_user_id: 19834
        max_item_id = 624960
        max_user_id = 19834
        # 随机初始化P(user) Q(item)矩阵
        self.P = np.random.normal(0, 0.1, size=(factor, max_user_id + 1))
        self.Q = np.random.normal(0, 0.1, size=(factor, max_item_id + 1))

    def predict(self, user_id, item_id):
        """
        预测用户user对物品item的评分
        Args:
            user_id: 用户id
            item_id: 物品id
        Returns:
            预测评分
        """
        p = self.P[:, user_id]
        q = self.Q[:, item_id]
        score = np.dot(p, q)
        score = min(score, 100)
        score = max(score, 0)
        return score
    
    def loss(self, data):
        """
        计算loss
        Args:
            data: dict, 训练数据
        Returns:
            loss
        """
        loss, count = 0.0, 0
        for user_id, rate_data in data.items():
            for item_id, score in rate_data.items():
                predict = self.predict(user_id, item_id)
                loss += (predict - score) ** 2
                count += 1
        # 添加正则化项
        loss += self.lambda_p * np.linalg.norm(self.P) ** 2
        loss += self.lambda_q * np.linalg.norm(self.Q) ** 2
        return np.sqrt(loss / count)

    def train(self, epoches, lr, data, valid_data):
        """
        训练模型
        Args:
            epoches: int, 迭代次数
            lr: float, 学习率
            data: dict, 训练数据
        """
        for epoch in range(epoches):
            # 使用tqdm显示训练进度
            for user_id, rate_data in tqdm(data.items(), desc="Epoch {}".format(epoch)):
                for item_id, score in rate_data.items():
                    p = self.P[:, user_id]
                    q = self.Q[:, item_id]
                    # 计算梯度
                    error = score - self.predict(user_id, item_id)
                    self.P[:, user_id] += lr * (error * q - self.lambda_p * p)
                    self.Q[:, item_id] += lr * (error * p - self.lambda_q * q)
            # 计算loss
            epoch_loss = self.loss(valid_data)
            print("Epoch {} finished: validate loss={}".format(epoch, epoch_loss))
            # 学习率衰减
            lr *= 0.95

In [73]:
SVD_basic_model = SVD_basic(factor=50)
SVD_basic_model.train(10, 0.0005, train_data, valid_data)

Epoch 0: 100%|██████████| 19835/19835 [00:54<00:00, 367.24it/s]


Epoch 0 finished: validate loss=39.034601580231886


Epoch 1: 100%|██████████| 19835/19835 [00:51<00:00, 388.71it/s]


Epoch 1 finished: validate loss=33.41025828980945


Epoch 2: 100%|██████████| 19835/19835 [00:56<00:00, 352.05it/s]


Epoch 2 finished: validate loss=31.63125541116009


Epoch 3: 100%|██████████| 19835/19835 [00:54<00:00, 360.87it/s]


Epoch 3 finished: validate loss=30.653772853958973


Epoch 4: 100%|██████████| 19835/19835 [00:57<00:00, 342.17it/s]


Epoch 4 finished: validate loss=30.10287798728087


Epoch 5: 100%|██████████| 19835/19835 [00:50<00:00, 395.97it/s]


Epoch 5 finished: validate loss=29.813117403437783


Epoch 6: 100%|██████████| 19835/19835 [00:54<00:00, 365.81it/s]


Epoch 6 finished: validate loss=29.671782586278294


Epoch 7: 100%|██████████| 19835/19835 [00:55<00:00, 356.38it/s]


Epoch 7 finished: validate loss=29.614579967370098


Epoch 8: 100%|██████████| 19835/19835 [00:53<00:00, 368.63it/s]


Epoch 8 finished: validate loss=29.602995458974654


Epoch 9: 100%|██████████| 19835/19835 [00:52<00:00, 376.97it/s]


Epoch 9 finished: validate loss=29.61521374311972


In [74]:
train_RMSE = RMSE(train_data, SVD_basic_model)
print("train RMSE: ", train_RMSE)
valid_RMSE = RMSE(valid_data, SVD_basic_model)
print("valid RMSE: ", valid_RMSE)

train RMSE:  16.024254197472054
valid RMSE:  29.61382622170249


In [22]:
with open("models/SVD_basic_50factor.pkl", "wb") as f:
    pickle.dump(SVD_basic_model, f)

In [6]:
with open("models/SVD_basic_50factor.pkl", "rb") as f:
    model = pickle.load(f)

print(model.predict(0,0))
print(model.P.shape)
print(model.P[0])
print(model.Q.shape)
print(model.Q[0])

3.7942438495214534
(50, 19835)
[-0.71174277 -0.85639551  1.41811758 ...  1.84501671 -0.89567605
 -0.77031656]
(50, 624961)
[ 0.0712336   0.00737961 -0.01848777 ... -0.1626112   0.04042626
  0.04377546]


# get test result

In [14]:
def read_test_data(path):
    '''
    读取test.txt格式的数据，返回字典
    '''
    data = {}
    with open(path, 'r') as f:
        while True:
            line = f.readline().strip()
            if not line:
                break
            user_id, item_num = line.split('|')
            item_num = int(item_num)
            user_id = int(user_id)
            # 读取待预测的物品
            item_list = []
            for i in range(item_num):
                item_id = int(f.readline().strip())
                item_list.append(item_id)
            data[user_id] = item_list
    return data

In [15]:
test_data = read_test_data("data/test.txt")

In [68]:
with open("models/baseline_estimator.pkl", "rb") as f:
    model = pickle.load(f)

In [69]:
results = {}
for user_id, item_list in test_data.items():
    for item_id in item_list:
        if results.get(user_id) is None:
            results[user_id] = {}
        results[user_id][item_id] = model.predict(user_id, item_id)
print("predict done")

predict done


In [70]:
with open("./results/result_baseline.txt", "w") as f:
    for user_id, rate_data in results.items():
        f.write(str(user_id) + "|" + str(len(rate_data)) + "\n")
        for item_id, score in rate_data.items():
            f.write(str(item_id) + " " + str(score) + "\n")
print("write done")

write done
