# In [3]:
import json
import math
import os
import random

from sklearn.model_selection import train_test_split


class NewUserCF:
    """User-based collaborative filtering with time-aware weighting.

    Ratings are read from ``datafile`` (one ``user::item::rating::timestamp``
    record per line), split 90/10 into train/test sets, and a user-user
    similarity matrix is built from the training set.

    Attributes:
        alpha: Time-decay factor used when computing user similarity.
        beta: Time-decay factor used when scoring recommendations.
        train / test: ``{user: {item: {"rate": int, "time": int}}}``.
        max_data: Largest timestamp seen in the data file.
        users_sim: ``{user: {other_user: similarity}}``.
    """

    # Where the user-similarity matrix is cached as JSON.
    sim_cache_path = "../dataSets/user_sim.json"
    # Timestamp differences are converted to days in the similarity formula.
    SECONDS_PER_DAY = 24 * 60 * 60
    # Upper bound on the number of users sampled by precision().
    PRECISION_SAMPLE_SIZE = 10

    def __init__(self, datafile):
        """Load the ratings in ``datafile`` and build the similarity matrix."""
        self.alpha = 0.5
        self.beta = 0.8
        self.datafile = datafile
        self.train, self.test, self.max_data = self.loadData()
        self.users_sim = self.UserSimilarityBest()

    def loadData(self):
        """Read the ratings file and split it into train and test dicts.

        Returns:
            ``(train_dict, test_dict, max_timestamp)``.
        """
        print("Start load Data 和 拆分数据 ...")
        data = []
        max_data = 0
        # Use a context manager so the file handle is always released.
        with open(self.datafile) as ratings:
            for line in ratings:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                userid, itemid, record, timestamp = line.split("::")
                timestamp = int(timestamp)
                data.append((userid, itemid, int(record), timestamp))
                max_data = max(max_data, timestamp)

        # Fixed random_state keeps the split reproducible between runs.
        train_list, test_list = train_test_split(data, test_size=0.1, random_state=40)
        return self.transform(train_list), self.transform(test_list), max_data

    def transform(self, data):
        """Convert ``(user, item, rate, time)`` tuples into a nested dict.

        Returns:
            ``{user: {item: {"rate": rate, "time": time}}}``; a later record
            for the same (user, item) pair overwrites an earlier one.
        """
        data_dict = {}
        for user, item, record, timestamp in data:
            data_dict.setdefault(user, {})[item] = {"rate": record, "time": timestamp}
        return data_dict

    def UserSimilarityBest(self):
        """Return the user-user similarity matrix, using the JSON cache if present.

        Similarity is accumulated over co-rated items via an item -> users
        inverted index. Each shared item contributes
        ``1 / (1 + alpha * days_between_ratings) / log(1 + item_popularity)``,
        and the sum is normalised by ``sqrt(|items(u)| * |items(v)|)``.

        Returns:
            ``{user: {other_user: similarity}}``.
        """
        print("计算用户与用户之间的相似度...")
        if os.path.exists(self.sim_cache_path):
            print("从文件加载...")
            with open(self.sim_cache_path, "r") as cache:
                return json.load(cache)

        # Inverted index: item -> set of users who rated it positively.
        item_eval_by_users = {}
        for u, items in self.train.items():
            for i, info in items.items():
                if info["rate"] > 0:
                    item_eval_by_users.setdefault(i, set()).add(u)

        count = {}
        # Number of positively rated items per user.
        user_eval_item_count = {}
        for i, users in item_eval_by_users.items():
            # Popular items say less about taste, so they are down-weighted.
            popularity_penalty = 1 / math.log(1 + len(users))
            for u in users:
                user_eval_item_count[u] = user_eval_item_count.get(u, 0) + 1
                row = count.setdefault(u, {})
                time_u = self.train[u][i]["time"]
                for v in users:
                    if u == v:
                        continue
                    days_apart = abs(time_u - self.train[v][i]["time"]) / self.SECONDS_PER_DAY
                    row[v] = row.get(v, 0.0) + popularity_penalty / (1 + self.alpha * days_apart)

        userSim = {
            u: {
                v: cuv / math.sqrt(user_eval_item_count[u] * user_eval_item_count[v])
                for v, cuv in related_users.items()
            }
            for u, related_users in count.items()
        }

        # Write the cache once, after the whole matrix is built. A failed
        # write must not discard the computed result.
        try:
            with open(self.sim_cache_path, "w") as cache:
                json.dump(userSim, cache)
        except OSError as err:
            print("could not write similarity cache {}: {}".format(self.sim_cache_path, err))

        return userSim

    def recommend(self, user, k=8, nitems=40):
        """Recommend items for ``user`` from the ``k`` most similar users.

        Args:
            user: User id to recommend for.
            k: Number of nearest neighbours to consult.
            nitems: Maximum number of items to return.

        Returns:
            ``{item: score}`` ordered by descending score; items the user has
            already interacted with in the training set are excluded. Unknown
            users yield an empty dict.
        """
        rank = {}
        interacted_items = self.train.get(user, {})
        neighbours = sorted(
            self.users_sim.get(user, {}).items(), key=lambda x: x[1], reverse=True
        )[:k]
        for v, wuv in neighbours:
            # A stale cache may name users absent from this training split.
            for i, rv in self.train.get(v, {}).items():
                if i in interacted_items:
                    continue
                # NOTE(review): the elapsed time here is in raw timestamp
                # units, whereas the similarity formula converts to days.
                recency = 1 / (1 + self.beta * (self.max_data - abs(rv["time"])))
                rank[i] = rank.get(i, 0) + wuv * rv["rate"] * recency

        return dict(sorted(rank.items(), key=lambda x: x[1], reverse=True)[:nitems])

    def precision(self, k=8, nitems=10):
        """Estimate precision on a random sample of training users.

        Up to ``PRECISION_SAMPLE_SIZE`` users are sampled. The denominator is
        ``nitems`` per sampled user, regardless of how many items were
        actually recommended.

        Returns:
            ``hits / (sampled_users * nitems)``, or ``0.0`` when there is
            nothing to sample.
        """
        # random.sample needs a sequence, not a dict keys view.
        users = list(self.train)
        sampled = random.sample(users, min(self.PRECISION_SAMPLE_SIZE, len(users)))
        if not sampled or nitems <= 0:
            return 0.0

        hit = 0
        for user in sampled:
            tu = self.test.get(user, {})
            rank = self.recommend(user, k=k, nitems=nitems)
            hit += sum(1 for item in rank if item in tu)

        return hit / (len(sampled) * nitems * 1.0)


if __name__ == '__main__':
    # Demo run, currently disabled: build the user-based recommender from the
    # ml-1m ratings file and print the recommendations for user "1".
    # cf = NewUserCF("../dataSets/ml-1m/ratings.dat")
    # result = cf.recommend("1")
    # print("user '1' recommend result is {}".format(result))

    # Disabled as well: print the sampled precision of the recommender.
    # pre = cf.precision()
    # print("pre is {}".format(pre))

    pass




# In [None]:
import json
import math
import os
import random

from sklearn.model_selection import train_test_split

class ItemBasedCF:
    """Item-based collaborative filtering with time-aware weighting.

    Ratings are read from ``datafile`` (one ``user::item::rating::timestamp``
    record per line), split 90/10 into train/test sets, and an item-item
    similarity matrix is built from the training set.

    Attributes:
        alpha: Time-decay factor used when computing item similarity.
        beta: Time-decay factor used when scoring recommendations.
        train / test: ``{user: {item: {"rate": int, "time": int}}}``.
        max_data: Largest timestamp seen in the data file.
        items_sim: ``{item: {other_item: similarity}}``.
    """

    # Single location used for both reading and writing the similarity cache.
    sim_cache_path = "../dataSets/ml-1m/item_sim.json"
    # Timestamp differences are converted to days in the similarity formula.
    SECONDS_PER_DAY = 24 * 60 * 60
    # Upper bound on the number of users sampled by precision().
    PRECISION_SAMPLE_SIZE = 10

    def __init__(self, datafile):
        """Load the ratings in ``datafile`` and build the similarity matrix."""
        self.alpha = 0.5
        self.beta = 0.8
        self.datafile = datafile
        self.train, self.test, self.max_data = self.loadData()
        self.items_sim = self.ItemSimilarityBest()

    def loadData(self):
        """Read the ratings file and split it into train and test dicts.

        Returns:
            ``(train_dict, test_dict, max_timestamp)``.
        """
        print("开始加载数据集合拆分数据集...")
        data = []
        max_data = 0
        with open(self.datafile) as ratings:
            for line in ratings:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                userid, itemid, record, timestamp = line.split("::")
                timestamp = int(timestamp)
                data.append((userid, itemid, int(record), timestamp))
                max_data = max(max_data, timestamp)

        # Split only after the whole file has been read.
        train_list, test_list = train_test_split(data, test_size=0.1, random_state=40)
        return self.transform(train_list), self.transform(test_list), max_data

    def transform(self, data):
        """Convert ``(user, item, rate, time)`` tuples into a nested dict.

        Returns:
            ``{user: {item: {"rate": rate, "time": time}}}``; a later record
            for the same (user, item) pair overwrites an earlier one.
        """
        data_dict = {}
        for user, item, record, timestamp in data:
            data_dict.setdefault(user, {})[item] = {"rate": record, "time": timestamp}
        return data_dict

    def ItemSimilarityBest(self):
        """Return the item-item similarity matrix, using the JSON cache if present.

        For every user, each pair of distinct positively rated items
        contributes ``1 / (1 + alpha * days_between_ratings)`` to their
        co-occurrence score. The score is then normalised by
        ``sqrt(|users(i)| * |users(j)|)``. Only pairs with a non-zero
        co-occurrence are stored.

        Returns:
            ``{item: {other_item: similarity}}``.
        """
        print("开始计算物品之间的相似度")
        if os.path.exists(self.sim_cache_path):
            print("从文件中加载数据")
            with open(self.sim_cache_path, "r") as cache:
                return json.load(cache)

        # Number of users who rated each item positively.
        item_eval_by_user_count = {}
        # Time-weighted co-occurrence matrix.
        count = {}
        for items in self.train.values():
            # Only positive ratings count, so every item that reaches the
            # normalisation step has a non-zero user count.
            rated = [(i, info["time"]) for i, info in items.items() if info["rate"] > 0.0]
            for i, time_i in rated:
                item_eval_by_user_count[i] = item_eval_by_user_count.get(i, 0) + 1
                row = count.setdefault(i, {})
                for j, time_j in rated:
                    if i == j:
                        continue
                    days_apart = abs(time_i - time_j) / self.SECONDS_PER_DAY
                    row[j] = row.get(j, 0.0) + 1 / (1 + self.alpha * days_apart)

        itemSim = {
            i: {
                j: num / math.sqrt(item_eval_by_user_count[i] * item_eval_by_user_count[j])
                for j, num in related_items.items()
            }
            for i, related_items in count.items()
        }

        # A failed cache write must not discard the computed result.
        try:
            with open(self.sim_cache_path, "w") as cache:
                json.dump(itemSim, cache)
        except OSError as err:
            print("could not write similarity cache {}: {}".format(self.sim_cache_path, err))

        return itemSim

    def recommend(self, user, k=8, nitems=40):
        """Recommend items similar to those ``user`` has already rated.

        Args:
            user: User id to recommend for.
            k: Number of most similar items consulted per rated item.
            nitems: Maximum number of items to return.

        Returns:
            ``{item: score}`` ordered by descending score; items the user has
            already interacted with in the training set are excluded. Unknown
            users yield an empty dict.
        """
        result = {}
        u_items = self.train.get(user, {})
        for i, rate_time in u_items.items():
            # More recently rated items weigh more.
            # NOTE(review): the elapsed time is in raw timestamp units,
            # whereas the similarity formula converts to days.
            recency = 1 / (1 + self.beta * (self.max_data - abs(rate_time["time"])))
            similar = sorted(
                self.items_sim.get(i, {}).items(), key=lambda x: x[1], reverse=True
            )[:k]
            for j, wj in similar:
                if j in u_items:
                    continue
                result[j] = result.get(j, 0) + rate_time["rate"] * wj * recency

        return dict(sorted(result.items(), key=lambda x: x[1], reverse=True)[:nitems])

    def precision(self, k=8, nitems=10):
        """Estimate precision on a random sample of test-set users.

        Up to ``PRECISION_SAMPLE_SIZE`` users are sampled. The denominator is
        ``nitems`` per sampled user, regardless of how many items were
        actually recommended.

        Returns:
            ``hits / (sampled_users * nitems)``, or ``0.0`` when there is
            nothing to sample.
        """
        # random.sample needs a sequence, not a dict keys view.
        users = list(self.test)
        print(len(users))
        sampled = random.sample(users, min(self.PRECISION_SAMPLE_SIZE, len(users)))
        if not sampled or nitems <= 0:
            return 0.0

        hit = 0
        for user in sampled:
            tu = self.test.get(user, {})
            rank = self.recommend(user, k=k, nitems=nitems)
            hit += sum(1 for item in rank if item in tu)

        return hit / (len(sampled) * nitems * 1.0)

if __name__ == '__main__':
    # Build the item-based recommender from the ml-1m ratings file and print
    # the recommendations for user "1".
    cf = ItemBasedCF("../dataSets/ml-1m/ratings.dat")
    result = cf.recommend("1")
    print("user '1' recommend result is {}".format(result))

    # Print the precision measured on a random sample of users.
    pre = cf.precision()
    print("pre is {}".format(pre))



# In [None]:
# 创建一个基于地域和热度的酒店推荐系统
import pandas as pd

class RecBasedAH:
    """Hotel recommender based on district (address) and popularity.

    Hotels are loaded from a CSV file, restricted to one district, and ranked
    either by a single column or by a combined score.

    Attributes:
        path: CSV file to read.
        addr: Value of the ``addr`` column to keep.
        type: One of ``SORTABLE_COLUMNS`` or ``"combine"``.
        k: Number of hotels to return.
        sort: Passed to ``sort_values`` as ``ascending``.
        data: The rows of the CSV whose ``addr`` equals ``addr``.
    """

    # Columns that can be ranked on directly.
    SORTABLE_COLUMNS = ("score", "comment_num", "lowest_price", "decoration_time", "open_time")
    # Year used to turn decoration/opening years into relative values.
    REFERENCE_YEAR = 2018
    # Weights of the min-max normalised columns in the combined score.
    # NOTE(review): these weights are placeholders (equal weights, price
    # counted as a cost) -- confirm the intended formula.
    COMBINE_WEIGHTS = {
        "score": 1.0,
        "comment_num": 1.0,
        "decoration_time": 1.0,
        "open_time": 1.0,
        "lowest_price": -1.0,
    }

    def __init__(self, path=None, addr="朝阳区", type="score", k=10, sort=False):
        """Store the settings and load the hotels of district ``addr``."""
        self.path = path
        self.addr = addr
        self.type = type
        self.k = k
        self.sort = sort
        self.data = self.load_mess()

    def load_mess(self):
        """Read the CSV at ``self.path`` and keep the rows for ``self.addr``."""
        data = pd.read_csv(self.path, header=0, sep=",", encoding="gbk")
        return data[data["addr"] == self.addr]

    def reccomond(self):
        """Return the top ``k`` hotels as ``{name: value}``.

        For a single-column ``type`` the value is that column, with
        ``lowest_price`` as the tie-breaker. For ``"combine"`` it is the
        weighted sum of the normalised columns in ``COMBINE_WEIGHTS``.

        Raises:
            ValueError: If ``self.type`` is not a supported ranking type.
        """
        if self.type in self.SORTABLE_COLUMNS:
            # Avoid passing the same column twice when ranking on price.
            sort_keys = [self.type]
            if self.type != "lowest_price":
                sort_keys.append("lowest_price")
            top = self.data.sort_values(by=sort_keys, ascending=self.sort)[:self.k]
            return dict(zip(top["name"].tolist(), top[self.type].tolist()))

        if self.type == "combine":
            return self._combined_ranking()

        raise ValueError("unsupported recommendation type: {!r}".format(self.type))

    # Alias with the conventional spelling; the original name is kept so
    # existing callers keep working.
    recommend = reccomond

    def _combined_ranking(self):
        """Rank hotels by the weighted sum of min-max normalised columns."""
        # Work on a copy so the column rewrites below never touch self.data.
        data = self.data.filter(items=["name", *self.COMBINE_WEIGHTS]).copy()
        # Express the years relative to the reference year.
        data["decoration_time"] = data["decoration_time"].apply(
            lambda x: int(x) - self.REFERENCE_YEAR
        )
        data["open_time"] = data["open_time"].apply(
            lambda x: self.REFERENCE_YEAR - int(x)
        )

        combined = pd.Series(0.0, index=data.index)
        for column, weight in self.COMBINE_WEIGHTS.items():
            values = data[column].astype(float)
            low = values.min()
            span = values.max() - low
            if span > 0:
                normalised = (values - low) / span
            else:
                # A constant (or empty) column carries no ranking signal.
                normalised = pd.Series(0.0, index=values.index)
            combined = combined + weight * normalised

        data["combine"] = combined
        top = data.sort_values(by="combine", ascending=self.sort)[:self.k]
        return dict(zip(top["name"].tolist(), top["combine"].tolist()))



if __name__ == "__main__":
    # CSV file with the hotel records.
    path = "../dataSets/hotel-mess/hotel-mess.csv"

    # Rank the hotels of one district using the "combine" type and print the
    # top 10 results.
    hotel_rec = RecBasedAH(path,addr="丰台区",type="combine",k=10,sort=False)
    results = hotel_rec.reccomond()
    print(results)

