# Recommendation System 1
- collaborative filtering (user-based)
- collaborative filtering (item-based)

In [1]:
import numpy as np
import pickle
from tqdm import tqdm

In [2]:
def read_data(path):
    # 读取train.txt格式的数据，返回字典
    data = {}
    with open(path, 'r') as f:
        while True:
            line = f.readline().strip()
            if not line:  # EOF
                break
            # 读取user_id和rate_num
            user_id, rate_num = line.split('|')
            rate_num = int(rate_num)
            user_id = int(user_id)
            # 读取用户的评分数据
            rate_data = {}
            for i in range(rate_num):
                item_id, score = f.readline().strip().split()
                item_id = int(item_id)
                score = int(score)
                rate_data[item_id] = score
            # 保存该用户的数据
            data[user_id] = rate_data
    return data

In [3]:
train_path ="data/train_data.txt"
train_data = read_data(train_path)
print("len(train_data):", len(train_data))

len(train_data): 19835


In [4]:
valid_path ="data/validate_data.txt"
valid_data = read_data(valid_path)
print("len(valid_data):", len(valid_data))

len(valid_data): 19835


## baseline estimator

### μ : overall mean rating

In [5]:
# 计算全局平均分
def cal_global_avg(data):
    sum_score = 0
    sum_num = 0
    for user_id, rate_data in data.items():
        sum_score += sum(rate_data.values())
        sum_num += len(rate_data)
    return sum_score / sum_num

global_avg = cal_global_avg(train_data)
print("global_avg:", global_avg)

global_avg: 49.471345245393245


### b_x : rating deviation of user x (ave.rating of user x - μ)

In [6]:
# 统计每个用户的平均评分，用户偏差
def cal_user_bias(data, average_score):
    # 每个用户的平均评分
    user_average_score = {}
    for user_id, rate_data in data.items():
        total_score = 0
        for score in rate_data.values():
            total_score += score
        user_average_score[user_id] = total_score / len(rate_data)
    # 每个用户与全局平均评分的偏差
    user_bias = {}
    for user_id, u_ave_score in user_average_score.items():
        user_bias[user_id] = u_ave_score - average_score
    # 最小偏差，最大偏差，平均偏差
    max_bias = max(user_bias.items(), key=lambda x: x[1])
    min_bias = min(user_bias.items(), key=lambda x: x[1])
    total_bias = 0
    for bias in user_bias.values():
        total_bias += bias
    average_bias = total_bias / len(user_bias)
    return user_average_score, user_bias, max_bias, min_bias, average_bias

user_average_score, user_bias, max_bias, min_bias, average_bias = cal_user_bias(train_data, global_avg)
print("max_bias:", max_bias)
print("min_bias:", min_bias)
print("average_bias:", average_bias)

max_bias: (547, 50.528654754606755)
min_bias: (413, -49.471345245393245)
average_bias: 20.3759866979849


### b_i : rating deviation of item i (ave.rating of item i - μ)

In [7]:
# 统计每个物品的平均评分，物品偏差
def cal_item_bias(data, average_score):
    # 统计物品得分
    item_scores = {}
    for user_id, rate_data in data.items():
        for item_id, score in rate_data.items():
            if item_id in item_scores:
                item_scores[item_id].append(score)
            else:
                item_scores[item_id] = [score]
    # 计算物品平均得分
    item_average_score = {}
    for item_id, scores in item_scores.items():
        item_average_score[item_id] = sum(scores) / len(scores)
    # 计算物品偏差
    item_bias = {}
    for item_id, i_ave_score in item_average_score.items():
        item_bias[item_id] = i_ave_score - average_score
    # 最大偏差，最小偏差，平均偏差
    max_bias = max(item_bias.items(), key=lambda x: x[1])
    min_bias = min(item_bias.items(), key=lambda x: x[1])
    total_bias = 0
    for bias in item_bias.values():
        total_bias += bias
    average_bias = total_bias / len(item_bias)
    return item_average_score, item_bias, max_bias, min_bias, average_bias

item_average_score, item_bias, max_bias, min_bias, average_bias = cal_item_bias(train_data, global_avg)
print("max_bias:", max_bias)
print("min_bias:", min_bias)
print("average_bias:", average_bias)

max_bias: (319866, 50.528654754606755)
min_bias: (112993, -49.471345245393245)
average_bias: -5.650348061258569


In [6]:
baseline_data = {
    "global_avg": global_avg,
    "user_average_score": user_average_score,
    "user_bias": user_bias,
    "item_average_score": item_average_score,
    "item_bias": item_bias
}

with open("models/baseline_data.pkl", "wb") as f:
    pickle.dump(baseline_data, f)

print("baseline_data:", len(baseline_data))

baseline_data: 5


### evaluate model

In [7]:
def RMSE(data, model):
    rmse, count = 0.0, 0
    for user_id, rate_data in data.items():
        for item_id, score in rate_data.items():
            predict = model.predict(user_id, item_id)
            rmse += (predict - score) ** 2
            count += 1
    rmse = np.sqrt((rmse / count))
    return rmse 

## collaborative filtering (user-based)

### calculate similarity between users

In [6]:
import math

def pearson(x, y, x_ave, y_ave):
    """
    calculate pearson correlation coefficient
    Args:
        x: the score list of x (user1)
        y: the score list of y (user2)
        x_ave: the average score of x (user1)
        y_ave: the average score of y (user2)
    Returns:
        sim(x, y): the pearson correlation coefficient between x and y
    """
    # 找到两个用户共同评分的物品
    shared_items = set(x.keys()) & set(y.keys())

    # 如果没有共同元素，返回0
    if not shared_items:
        return 0

    # 计算pearson相关系数
    sim, sum1, sum2 = 0, 0, 0
    for item in shared_items:
        temp1 = x[item] - x_ave
        temp2 = y[item] - y_ave
        # 为了避免分母为0的情况，对将打分值做一个微调
        if temp1 == 0:
            temp1 = 0.1
        if temp2 == 0:
            temp2 = 0.1
        sim += temp1 * temp2  # 分子
        # 计算分母
        sum1 += temp1**2
        sum2 += temp2**2
    sim = sim / ((sum1**0.5) * (sum2**0.5))
    return sim

In [7]:
from tqdm import tqdm

# 计算两两用户的相似度
def cal_similarity(train_set, user_average_score):
    """
    calculate the similarity between users
    Args:
        train_set: the train data
    Returns:
        similarity: the similarity matrix
    """
    similarity = {key:{} for key in train_set.keys()}
    for i, user1 in tqdm(enumerate(train_set.keys()), desc="Outer Loop"):
        for j, user2 in enumerate(list(train_set.keys())[i+1:], start=i+1):
            pearson_sim = pearson(train_set[user1], train_set[user2], user_average_score[user1], user_average_score[user2])
            similarity[user1][user2] = pearson_sim
            similarity[user2][user1] = pearson_sim
    return similarity

In [17]:
similarity = cal_similarity(train_data, user_average_score)

Outer Loop: 19835it [1:00:12,  5.49it/s] 


In [18]:
# 保存相似度
similarity_path = "models/similarity.pkl"
with open(similarity_path, "wb") as f:
    pickle.dump(similarity, f)

### estimate rating r_xi as the weighted average 

In [8]:
sorted_neighbors = {}
# 协同过滤算法 (user的好像不用baseline，没说咋用)
def cf_user_predict(user_id, item_list, data, similarity, user_average_score, neighbor_k):
    neighbor = sorted(similarity[user_id], key=lambda x: similarity[user_id][x], reverse=True)
    sorted_neighbors[user_id] = neighbor
    score={}
    for item_id in item_list:
        num=0
        index=0
        predict=0
        sum=0
        while index<len(neighbor):
            if num>=neighbor_k:
                break
            if item_id in data[neighbor[index]]:
                if similarity[user_id][neighbor[index]]<0:
                    break
                num+=1
                predict+=similarity[user_id][neighbor[index]]*(data[neighbor[index]][item_id])
                sum+=similarity[user_id][neighbor[index]]
            index+=1
        if sum!=0:
            predict=predict/sum
        else:
            predict=user_average_score[user_id]
        score[item_id]=predict
    return score

### evaluate the model

In [25]:
valid_path = "data/validate_data.txt"
valid_data = read_data(valid_path)

In [26]:
RMSE=0
num=0
for user in tqdm(valid_data.keys(), desc="Processing users"):
    item_list=list(valid_data[user].keys())
    predict=cf_user_predict(user,item_list,train_data,similarity,user_average_score,10)
    for item in item_list:
        RMSE+=(predict[item]-valid_data[user][item])**2
        num+=1
RMSE=(RMSE/num)**0.5


Processing users: 100%|██████████| 19835/19835 [1:28:02<00:00,  3.76it/s]  


In [27]:
print(RMSE)

34.889065303098945


In [None]:
# 保存相似度和排序后的相似用户
sorted_neighbors_path = "models/sorted_neighbors.pkl"

with open(sorted_neighbors_path, "wb") as f:
    pickle.dump(sorted_neighbors, f)

In [30]:
test_data={}
test_path="data/test.txt"

with open(test_path,"r") as f:
    while True:
        data=f.readline()
        if not data:
            break
        data=data.split('|')
        user_id,rate_nums=int(data[0]),int(data[1])
        user_rate={}
        for i in range(int(rate_nums)):
            rate=int(f.readline())
            user_rate[rate]=0
        test_data[user_id]=user_rate

for user in test_data.keys():
    item_list=list(test_data[user].keys())
    predict=cf_user_predict(user,item_list,train_data,similarity,user_average_score,10)
    for item in item_list:
        test_data[user][item]=predict[item]

KeyboardInterrupt: 

In [None]:
def write_to_file(file_path,data):
    with open(file_path,"w") as f:
        for user in data.keys():
            f.write(str(user)+"|"+str(len(data[user]))+'\n')
            for item in data[user].keys():
                f.write(str(item)+" "+str(data[user][item])+'\n')

write_to_file("data/result.txt",test_data)

## collaborative filtering (item-based)

In [38]:
# 读取item属性
attribute_path = "data\itemAttribute.txt"
item_attribute = {}
with open(attribute_path, "r") as f:
    while True:
        line = f.readline().strip()
        if not line:
            break
        item_id, attribute1, attribute2= line.split('|')
        item_id = int(item_id)
        
        if attribute1 == "None":
            attribute1 = 0
        else:
            attribute1 = int(attribute1)
            
        if attribute2 == "None":
            attribute2 = 0
        else:
            attribute2 = int(attribute2)
        item_attribute[item_id] = (attribute1, attribute2)

In [39]:
print("len(item_attribute):", len(item_attribute))

len(item_attribute): 507172
len(attribute): 71878


In [40]:
# 将属性相同的item放在一起
all_attribute = {}
for item_id, attribute in item_attribute.items():
    if attribute[0]!=0:
        if attribute[0] not in all_attribute:
            all_attribute[attribute[0]] = set()
            all_attribute[attribute[0]].add(item_id)
            all_attribute[attribute[0]].add(attribute[0])
        else:
            all_attribute[attribute[0]].add(item_id)

    if attribute[1]!=0:
        if attribute[1] not in attribute:
            all_attribute[attribute[1]] = set()
            all_attribute[attribute[1]].add(item_id)
            all_attribute[attribute[1]].add(attribute[1])
        else:
            all_attribute[attribute[1]].add(item_id)


In [61]:
train_data={}
valid_data={}

train_path ="data/train_data.txt"
valid_path ="data/validate_data.txt"

with open(train_path, "r") as f:
    while True:
        data=f.readline()
        if not data:
            break
        data=data.split('|')
        user_id,rate_nums= int(data[0]),data[1]
        for i in range(int(rate_nums)):
            rate=f.readline()
            rate=rate.split()
            item_id=int(rate[0])
            if item_id not in train_data:
              train_data[item_id]={}
            train_data[item_id][user_id]=float(rate[1])

with open(valid_path,"r") as f:
    while True:
        data=f.readline()
        if not data:
            break;
        data=data.split('|')
        user_id,rate_nums=int(data[0]),data[1]
        for i in range(int(rate_nums)):
            rate=f.readline()
            rate=rate.split()
            item_id=int(rate[0])
            if item_id not in valid_data:
              valid_data[item_id]={}
            valid_data[item_id][user_id]=float(rate[1])
# 输出user_list的大小
print(len(train_data))
# 输出valid_list的大小
print(len(valid_data))

422558
227064


In [62]:
# 复用user的pearson相关系数，传参的时候传item即可
def pearson(item1,item2,average1,average2):
    """
    计算pearson correlation coefficient
    Args:
        user1:用户1的打分列表
        user2:用户2的打分列表
    Returns:
        pearson相关系数
    """
    # 获得共有的item
    shared=set(item1.keys()) & set(item2.keys())

    # 如果没有共同元素，返回无穷
    if not shared:
        return 0

    # 计算pearson相关系数
    sim=0
    sum1=0
    sum2=0
    for user in shared:
        temp1=item1[user]-average1
        temp2=item2[user]-average2
        # 为了避免分母为0的情况，对将打分值做一个微调
        if temp1==0:
            temp1=0.1
        if temp2==0:
            temp2=0.1
        sim+=temp1*temp2
        sum1+=temp1**2
        sum2+=temp2**2
    sim=sim/((sum1**0.5)*(sum2**0.5))
    return sim

In [77]:
RMSE=0
num=0
from tqdm import tqdm

for item_id, user in tqdm(valid_data.items(), desc="Processing items"):
    atrribute=(0,0)
    if item_id in item_attribute:
        atrribute=item_attribute[item_id]
    neighbor=set()
    if atrribute[0]!=0:
        for item in all_attribute[atrribute[0]]:
            if item in train_data and user_id in train_data[item]:
                neighbor.add(item)
    if atrribute[1]!=0:
        for item in all_attribute[atrribute[1]]:
            if item in train_data and user_id in train_data[item]:
                neighbor.add(item)

    for user_id in user.keys():
        predict=global_avg+user_bias[user_id]
        if item_id in train_data:
            predict+=item_bias[item_id]
            sum=0
            temp_predict=0
            for item in neighbor:
                if item not in train_data:
                    continue
                pear=pearson(train_data[item_id],train_data[item],item_average_score[item_id],item_average_score[item])
                if pear<0:
                    continue
                if user_id not in train_data[item]:
                    continue
                temp_predict+=pear*(train_data[item][user_id]-item_average_score[item])
                sum+=pear
            if sum==0:
                # 计算与其他物品的相似度，并排序
                sim={}
                sim_num=0
                for item in train_data.keys():
                    if sim_num>=30:
                        break
                    if item==item_id:
                        continue
                    if user_id not in train_data[item]:
                        continue
                    pear=pearson(train_data[item_id],train_data[item],item_average_score[item_id],item_average_score[item])
                    sim_num+=1
                    sim[item]=pear
                if len(sim)!=0:
                    neighbor_k = sorted(sim, key=lambda x:sim[x], reverse=True)
                    index=0
                    sum_pearson=0
                    sim_predict=0
                    while index<len(neighbor_k):
                        if index>=10:
                            break
                        temp_item=neighbor_k[index]
                        sim_predict+=sim[temp_item]*(train_data[temp_item][user_id]-item_average_score[temp_item])
                        sum_pearson+=sim[temp_item]
                        index+=1
                    if sum_pearson!=0:
                        sim_predict=sim_predict/sum_pearson
                    predict+=sim_predict
            else:
                temp_predict=temp_predict/sum
                predict+=temp_predict
        num+=1
        RMSE+=(predict-valid_data[item_id][user_id])**2
    print((RMSE/num)**0.5)
RMSE=(RMSE/num)**0.5         

Processing items:   0%|          | 0/227064 [00:00<?, ?it/s]

Processing items:   0%|          | 1/227064 [00:35<2228:23:43, 35.33s/it]

36.33193971777155
36.70865876820201


Processing items:   0%|          | 2/227064 [00:45<1433:52:25, 22.73s/it]


KeyboardInterrupt: 

In [None]:
print(RMSE)

In [2]:
import pickle

class BaselineEstimator:
    def __init__(self, global_avg, user_bias, item_bias):
        self.global_avg = global_avg
        self.user_bias = user_bias
        self.item_bias = item_bias
        self.baseline_estimator = {}

    def fit(self, train_data):
        for user_id, rate_data in train_data.items():
            for item_id, score in rate_data.items():
                self.baseline_estimator[(user_id, item_id)] = self.global_avg + self.user_bias[user_id] + self.item_bias[item_id]

    def save_model(self, path="baseline_estimator.pkl"):
        # 保存自身模型
        with open(path, "wb") as f:
            pickle.dump(self, f)

    def predict(self, user_id, item_id):
        return self.baseline_estimator.get((user_id, item_id), self.global_avg)

# # 示例：创建并保存模型
# global_avg = 3.5
# user_bias = {'user1': 0.1, 'user2': -0.2}
# item_bias = {'item1': 0.05, 'item2': -0.05}
# estimator = BaselineEstimator(global_avg, user_bias, item_bias)

# train_data = {
#     'user1': {'item1': 5, 'item2': 3},
#     'user2': {'item1': 4, 'item2': 2}
# }
# estimator.fit(train_data)
# estimator.save_model("models/baseline_estimator.pkl")

# 读取已经保存的模型
baseline_path = 'models/baseline_estimator.pkl'
with open(baseline_path, 'rb') as f:
    baseline_data = pickle.load(f)

print("Global Average:", baseline_data.global_avg)


Global Average: 49.471345245393245


AttributeError: 'dict' object has no attribute 'global_avg'