# 数据预处理
将train.txt进行二八划分为训练集和验证集。

In [17]:
import random
file_path ="data/train.txt"

user_list = []

# 读取train.txt文件
with open(file_path, "r") as f:
    while True:
        data=f.readline()
        if not data:
            break
        data=data.split('|')

        user_id,rate_nums= data[0],data[1]
        user_rate={} # 保存用户的打分表
        for i in range(int(rate_nums)):
            rate=f.readline()
            rate=rate.split()
            user_rate[rate[0]]=float(rate[1])
        
        user_list.append(user_rate)


# 对train集合进行二八划分，其中20%作为验证集，80%作为训练集，验证集和中的数据用来测试模型的准确性
def split(train_set):
    """
    对train集合进行二八划分
    
    Args:
        train_set:训练集

    Returns:
        train_set:训练集
        valid_set:测试集
    """
    # 设置一个种子，保证每次运行的结果一致
    random.seed(0)
    valid_set={}
    for user in range(len(train_set)):
        test={}
        for item in train_set[user].keys():
            if random.random()<0.2:
                test[item]=train_set[user][item]
        for item in test.keys():
            del train_set[user][item]
        valid_set[user]=test

    return train_set,valid_set
# 将处理结果写入文件中，方便不同算法读取

def write_to_file(file_path,data):
    with open(file_path,"w") as f:
        for user in range(len(data)):
            f.write(str(user)+"|"+str(len(data[user]))+'\n')
            for item in data[user].keys():
                f.write(str(item)+" "+str(data[user][item])+'\n')

train_set,valid_set=split(user_list)
write_to_file("data/train_set.txt",train_set)
write_to_file("data/valid_set.txt",valid_set)


# 协同过滤（Collaborative Filtering）

## User-based

In [18]:
user_average = []
print(len(train_set))
# 计算每个用户的平均打分
for user in train_set:
    sum=0
    for item in user.keys():
        sum+=user[item]
    user_average.append(sum/len(user))


19835


In [30]:
import math

def pearson(user1,user2,user1_id,user2_id):
    """
    计算pearson correlation coefficient
    
    Args:
        user1:用户1的打分列表
        user2:用户2的打分列表
    
    Returns:
        pearson相关系数
    
    """
    average1=user_average[user1_id]
    average2=user_average[user2_id]
    
    # 获得共有的item
    shared=set(user1.keys()) & set(user2.keys())

    # 如果没有共同元素，返回无穷
    if not shared:
        return math.inf
        
    # 计算pearson相关系数
    sim=0
    temp1=0
    temp2=0
    for item in shared:
        sim+=(user1[item]-average1)*(user2[item]-average2)
        temp1+=(user1[item]-average1)**2
        temp2+=(user2[item]-average2)**2
        
    if(temp1==0):
        print("temp1=0")
    if(temp2==0):
        print(shared)
        for item in shared:
            print(user1[item],user2[item],average1,average2)
        print("temp2=0")

    sim=sim/((temp1**0.5)*(temp2**0.5))
    return sim
    

In [31]:
import heapq

# 计算两两用户的相似度
def cal_similarity(train_set):
    """
    计算每个用户其最相似的k个用户
    
    Args:
        train_set:训练集
    
    Returns:
        similarity:相似度矩阵
    """
    similarity={}
    for user1 in range(len(train_set)):
        similarity[user1]={}
        for user2 in range(user1+1,len(train_set)):
            similarity[user1][user2]=pearson(train_set[user1],train_set[user2],user1,user2)
    return similarity

# 为每个用户找最相似的k个用户
def find_k_nearest(similarity,k):
    """
    为每个用户找最相似的k个用户
    
    Args:
        similarity:相似度矩阵
        k:最相似的k个用户
    
    Returns:
        nearest:最相似的k个用户
    """
    nearest={}
    for user in similarity.keys():
        nearest[user]=heapq.nlargest(k,similarity[user],key=similarity[user].get)
    return nearest

nearest=cal_similarity(train_set)
print(nearest)

{'608376', '550452'}
90.0 90.0 81.05263157894737 90.0
90.0 90.0 81.05263157894737 90.0
temp2=0


ZeroDivisionError: float division by zero