# Recommendation System
- baseline estimator
- collaborative filtering (user-based)
- collaborative filtering (item-based)
- ...

In [1]:
def read_data(path):
    # 读取train.txt格式的数据，返回字典
    data = {}
    with open(path, 'r') as f:
        while True:
            line = f.readline().strip()
            if not line:  # EOF
                break
            # 读取user_id和rate_num
            user_id, rate_num = line.split('|')
            rate_num = int(rate_num)
            user_id = int(user_id)
            # 读取用户的评分数据
            rate_data = {}
            for i in range(rate_num):
                item_id, score = f.readline().strip().split()
                item_id = int(item_id)
                score = float(score)
                rate_data[item_id] = score
            # 保存该用户的数据
            data[user_id] = rate_data
    return data

In [2]:
train_path ="data/train_data.txt"
train_data = read_data(train_path)
print("len(train_data):", len(train_data))

len(train_data): 19835


## baseline estimator

### μ : overall mean rating

In [3]:
# 计算全局平均分
def cal_global_avg(data):
    sum_score = 0
    sum_num = 0
    for user_id, rate_data in data.items():
        sum_score += sum(rate_data.values())
        sum_num += len(rate_data)
    return sum_score / sum_num

global_avg = cal_global_avg(train_data)
print("global_avg:", global_avg)

global_avg: 49.47062753677192


### b_x : rating deviation of user x (ave.rating of user x - μ)

In [4]:
# 统计每个用户的平均评分，用户偏差
def cal_user_bias(data, average_score):
    # 每个用户的平均评分
    user_average_score = {}
    for user_id, rate_data in data.items():
        total_score = 0
        for score in rate_data.values():
            total_score += score
        user_average_score[user_id] = total_score / len(rate_data)
    # 每个用户与全局平均评分的偏差
    user_bias = {}
    for user_id, u_ave_score in user_average_score.items():
        user_bias[user_id] = u_ave_score - average_score
    # 最小偏差，最大偏差，平均偏差
    max_bias = max(user_bias.items(), key=lambda x: x[1])
    min_bias = min(user_bias.items(), key=lambda x: x[1])
    total_bias = 0
    for bias in user_bias.values():
        total_bias += bias
    average_bias = total_bias / len(user_bias)
    return user_average_score, user_bias, max_bias, min_bias, average_bias

user_average_score, user_bias, max_bias, min_bias, average_bias = cal_user_bias(train_data, global_avg)
print("max_bias:", max_bias)
print("min_bias:", min_bias)
print("average_bias:", average_bias)

max_bias: (547, 50.52937246322808)
min_bias: (413, -49.47062753677192)
average_bias: 20.37407831847788


### b_i : rating deviation of item i (ave.rating of item i - μ)

In [5]:
# 统计每个物品的平均评分，物品偏差
def cal_item_bias(data, average_score):
    # 统计物品得分
    item_scores = {}
    for user_id, rate_data in data.items():
        for item_id, score in rate_data.items():
            if item_id in item_scores:
                item_scores[item_id].append(score)
            else:
                item_scores[item_id] = [score]
    # 计算物品平均得分
    item_average_score = {}
    for item_id, scores in item_scores.items():
        item_average_score[item_id] = sum(scores) / len(scores)
    # 计算物品偏差
    item_bias = {}
    for item_id, i_ave_score in item_average_score.items():
        item_bias[item_id] = i_ave_score - average_score
    # 最大偏差，最小偏差，平均偏差
    max_bias = max(item_bias.items(), key=lambda x: x[1])
    min_bias = min(item_bias.items(), key=lambda x: x[1])
    total_bias = 0
    for bias in item_bias.values():
        total_bias += bias
    average_bias = total_bias / len(item_bias)
    return item_average_score, item_bias, max_bias, min_bias, average_bias

item_average_score, item_bias, max_bias, min_bias, average_bias = cal_item_bias(train_data, global_avg)
print("max_bias:", max_bias)
print("min_bias:", min_bias)
print("average_bias:", average_bias)

max_bias: (210761, 50.52937246322808)
min_bias: (211658, -49.47062753677192)
average_bias: -5.654850049099554


### save model

In [7]:
# 保存baseline estimator
import pickle

baseline_estimator = {
    "global_avg": global_avg,
    "user_average_score": user_average_score,
    "user_bias": user_bias,
    "item_average_score": item_average_score,
    "item_bias": item_bias
}
with open("models/baseline_estimator.pkl", "wb") as f:
    pickle.dump(baseline_estimator, f)
print("baseline_estimator saved.")

baseline_estimator saved.


# 协同过滤(User)

In [2]:
user_average = {}
# 计算每个用户的平均打分
for user in user_list.keys():
    sum=0
    for item in user_list[user].keys():
        sum+=user_list[user][item]
    user_average[user]=sum/len(user_list[user])
    for item in user_list[user].keys():
      user_list[user][item]-=user_average[user]

print(user_average)



{'0': 81.05263157894737, '1': 89.83783783783784, '2': 44.875, '3': 88.88888888888889, '4': 61.80525164113786, '5': 30.928270042194093, '6': 55.744680851063826, '7': 72.72727272727273, '8': 95.23809523809524, '9': 86.33333333333333, '10': 84.66666666666667, '11': 52.0, '12': 88.57142857142857, '13': 90.0, '14': 15.014409221902017, '15': 51.61904761904762, '16': 84.28571428571429, '17': 83.42105263157895, '18': 56.40845070422535, '19': 85.38461538461539, '20': 89.0909090909091, '21': 82.60416666666667, '22': 78.0, '23': 69.10714285714286, '24': 70.82191780821918, '25': 82.10526315789474, '26': 86.9047619047619, '27': 59.26696832579186, '28': 87.14285714285714, '29': 50.0, '30': 83.09433962264151, '31': 76.95035460992908, '32': 53.10126582278481, '33': 72.47236180904522, '34': 13.124338624338625, '35': 81.24087591240875, '36': 62.38461538461539, '37': 48.84987168520102, '38': 89.41176470588235, '39': 46.35514018691589, '40': 38.2271762208068, '41': 46.0, '42': 83.50649350649351, '43': 70.

In [3]:
import math
def pearson(user1,user2):
    """
    计算pearson correlation coefficient
    Args:
        user1:用户1的打分列表
        user2:用户2的打分列表
    Returns:
        pearson相关系数
    """
    # 获得共有的item
    shared=set(user1.keys()) & set(user2.keys())

    # 如果没有共同元素，返回无穷
    if not shared:
        return math.inf

    # 计算pearson相关系数
    sim=0
    sum1=0
    sum2=0
    for item in shared:
        temp1=user1[item]
        temp2=user2[item]
        # 为了避免分母为0的情况，对将打分值做一个微调
        if temp1==0:
            temp1=0.1
        if temp2==0:
            temp2=0.1
        sim+=temp1*temp2
        sum1+=temp1**2
        sum2+=temp2**2
    sim=sim/((sum1**0.5)*(sum2**0.5))
    return sim

In [7]:
import heapq
from tqdm import tqdm
import time
# 计算两两用户的相似度
def cal_similarity(train_set):
    """
    计算每个用户其最相似的k个用户

    Args:
        train_set:训练集

    Returns:
        similarity:相似度矩阵
    """
    similarity={key:{} for key in train_set.keys()}
    for i, user1 in tqdm(enumerate(train_set.keys()), desc="Outer Loop"):
        for j, user2 in enumerate(list(train_set.keys())[i+1:], start=i+1):
            pearson_sim = pearson(train_set[user1], train_set[user2])
            similarity[user1][user2] = pearson_sim
            similarity[user2][user1] = pearson_sim
    return similarity

# 为每个用户找最相似的k个用户
def find_k_nearest(similarity,k):
    """
    为每个用户找最相似的k个用户

    Args:
        similarity:相似度矩阵
        k:最相似的k个用户

    Returns:
        nearest:最相似的k个用户
    """
    nearest={}
    for user in similarity.keys():
        nearest[user]=heapq.nlargest(k,similarity[user],key=similarity[user].get)
    return nearest

nearest=cal_similarity(user_list)

print(len(nearest))

Outer Loop: 19835it [1:05:01,  5.08it/s] 


19835


In [14]:
import heapq

test={'a':{'a':1,'b':2,'c':1.5},'b':{'a':1.2,'b':2.2,'c':1.5},'c':{'a':4.1,'b':5.2,'c':1.5}}

for item in nearest.keys():
    for item2 in nearest[item].keys():
        if nearest[item][item2]>1:
            nearest[item][item2]=0

def find_k_nearest(train_set, k):
    top_k_keys = {}
    
    for outer_key, inner_map in train_set.items():
        # 使用heapq.nlargest找到前k个值最大的键
        top_k = heapq.nlargest(k, inner_map.items(), key=lambda item: item[1])
        # 将结果存储在top_k_keys中，只保留键
        top_k_keys[outer_key] = {key:value for key, value in top_k}
    
    return top_k_keys

k_test=find_k_nearest(test,2)
k_nearest=find_k_nearest(nearest,10)
print(k_test)

{'a': {'b': 2, 'c': 1.5}, 'b': {'b': 2.2, 'c': 1.5}, 'c': {'b': 5.2, 'a': 4.1}}


In [18]:
result_path ="data/nearest.txt"
def write_to_file(file_path,data):
    with open(file_path,"w") as f:
        for item1 in data.keys():
            f.write(str(item1))
            for item2 in data[item1].keys():
                f.write(str(item2)+" "+str(data[item1][item2])+'\n')
write_to_file(result_path,nearest)

In [15]:

RMSE=0
num=0
for user in valid_list.keys():
    similar=k_nearest[user]
    for item in valid_list[user].keys():
        sum=0
        predict=0
        for sim_user in similar.keys():
            if item in user_list[sim_user].keys():
                predict+=similar[sim_user]*user_list[sim_user][item]
                sum+=similar[sim_user]
        if sum==0:
            print(similar)
        predict/=sum
        predict+=user_average[user]
        if predict>100:
            predict=100
        if predict<0:
            predict=0
        RMSE+=(predict-valid_list[user][item])**2
        num+=1
RMSE=(RMSE/num)**0.5


{'2': 1.0, '3': 1.0, '9': 1.0, '20': 1.0, '25': 1.0, '26': 1.0, '34': 1.0, '38': 1.0, '46': 1.0, '47': 1.0}


ZeroDivisionError: division by zero

In [None]:
test_list={}
test_path="data/test.txt"

with open(test_path,"r") as f:
    while True:
        data=f.readline()
        if not data:
            break
        data=data.split('|')
        user_id,rate_nums=data[0],data[1]
        user_rate={}
        for i in range(int(rate_nums)):
            rate=f.readline()
            user_rate[rate[0]]=0
        test_list[user_id]=user_rate

for user in test_list.keys():
    similar=k_nearest[user]
    for item in test_list[user].keys():
        sum=0
        for sim_user in similar.keys():
            if item in user_list[sim_user].keys():
                test_list[user][item]+=similar[sim_user]*user_list[sim_user][item]
                sum+=similar[sim_user]
        test_list[user][item]/=sum
        test_list[user][item]+=user_average[user]
        if test_list[user][item]>100:
            test_list[user][item]=100
        if test_list[user][item]<0:
            test_list[user][item]=0

In [6]:
def write_to_file(file_path,data):
    with open(file_path,"w") as f:
        for user in data.keys():
            f.write(str(user)+"|"+str(len(data[user]))+'\n')
            for item in data[user].keys():
                f.write(str(item)+" "+str(data[user][item])+'\n')

write_to_file("data/result.txt",test_list)

NameError: name 'test_list' is not defined

# 协同过滤(Item)

In [None]:
train_set={}
valid_set={}

train_path ="/content/drive/MyDrive/RecSys/data/train_set.txt"
valid_path ="/content/drive/MyDrive/RecSys/data/valid_set.txt"


with open(train_path, "r") as f:
    while True:
        data=f.readline()
        if not data:
            break
        data=data.split('|')
        user_id,rate_nums= data[0],data[1]
        for i in range(int(rate_nums)):
            rate=f.readline()
            rate=rate.split()
            if rate[0] not in train_set:
              train_set[rate[0]]={}
            train_set[rate[0]][user_id]=float(rate[1])

with open(valid_path,"r") as f:
    while True:
        data=f.readline()
        if not data:
            break;
        data=data.split('|')
        user_id,rate_nums=data[0],data[1]
        for i in range(int(rate_nums)):
            rate=f.readline()
            rate=rate.split()
            if rate[0] not in valid_set:
              valid_set[rate[0]]={}
            valid_set[rate[0]][user_id]=float(rate[1])
# 输出user_list的大小
print(len(train_set))
# 输出valid_list的大小
print(len(valid_set))

In [None]:
item_average = {}
# 计算每个Item的平均打分
for item in train_set.keys():
    sum=0
    for user in train_set[item].keys():
        sum+=train_set[item][user]
    item_average[item]=sum/len(train_set[item])
    for user in train_set[item].keys():
      train_set[item][user]-=user_average[user]

In [None]:
# 复用user的pearson相关系数，传参的时候传item即可
def pearson(user1,user2):
    """
    计算pearson correlation coefficient
    Args:
        user1:用户1的打分列表
        user2:用户2的打分列表
    Returns:
        pearson相关系数
    """
    # 获得共有的item
    shared=set(user1.keys()) & set(user2.keys())

    # 如果没有共同元素，返回无穷
    if not shared:
        return math.inf

    # 计算pearson相关系数
    sim=0
    sum1=0
    sum2=0
    for item in shared:
        temp1=user1[item]
        temp2=user2[item]
        # 为了避免分母为0的情况，对将打分值做一个微调
        if temp1==0:
            temp1=0.1
        if temp2==0:
            temp2=0.1
        sim+=temp1*temp2
        sum1+=temp1**2
        sum2+=temp2**2
    sim=sim/((sum1**0.5)*(sum2**0.5))
    return sim

In [None]:
import heapq
from tqdm import tqdm
import time


# 计算两两物品的相似度
def cal_similarity(train_set):
    """
    计算每个用户其最相似的k个用户

    Args:
        train_set:训练集

    Returns:
        similarity:相似度矩阵
    """
    similarity={key:{} for key in train_set.keys()}
    for i, user1 in tqdm(enumerate(train_set.keys()), desc="Outer Loop"):
        for j, user2 in enumerate(list(train_set.keys())[i+1:], start=i+1):
            pearson_sim = pearson(train_set[user1], train_set[user2])
            similarity[user1][user2] = pearson_sim
            similarity[user2][user1] = pearson_sim
    return similarity
similarity=cal_similarity(train_set)

In [None]:
def find_k_nearest(train_set, k):
    top_k_keys = {}

    for outer_key, inner_map in train_set.items():
        # 使用heapq.nlargest找到前k个值最大的键
        top_k = heapq.nlargest(k, inner_map.items(), key=lambda item: item[1])
        # 将结果存储在top_k_keys中，只保留键
        top_k_keys[outer_key] = {key:value for key, value in top_k}

    return top_k_keys

k_nearest=find_k_nearest(nearest,10)