In [1]:
import random
import time
import operator
import pandas as pd

In [2]:
# Location of the tab-separated tagging log, plus the module-level containers
# that the cells below fill in and share.
file_path = "./user_taggedbookmarks-timestamps.dat"
# user -> item -> [tags]; filled by load_data()
records = dict()
# user -> item -> [tags]; the two halves produced by train_test_split()
train_data = {}
test_data = {}
# Count matrices filled by initSata():
# user -> tag -> count, tag -> item -> count, user -> item -> count
user_tags = {}
tag_items = {}
user_items = {}

In [3]:
# Use the held-out test set to derive precision and recall.
def precisionAndRecall(N):
    """Evaluate top-N recommendations against the module-level ``test_data``.

    Only users that appear in both ``test_data`` and ``train_data`` are
    evaluated; for each of them ``recommend(user, N)`` is called and every
    recommended item that the user holds in the test set counts as a hit.

    Args:
        N: Number of recommendations requested per user.

    Returns:
        A ``(precision, recall)`` tuple of floats, where
        precision = hits / (N * evaluated users) and
        recall = hits / (total test items of the evaluated users).
        Precision is normalised by ``N`` even when ``recommend`` returns
        fewer than ``N`` items. Both values are ``0.0`` when there is
        nothing to evaluate, instead of raising ``ZeroDivisionError``.
    """
    hit = 0
    relevant_total = 0
    recommended_total = 0
    for user, items in test_data.items():
        # A user without any training history cannot be given recommendations.
        if user not in train_data:
            continue
        rank = recommend(user, N)
        hit += sum(1 for item, _score in rank if item in items)
        relevant_total += len(items)
        recommended_total += N
    # Guard the denominators: an empty test set, no overlapping users, or
    # N == 0 would otherwise divide by zero.
    precision = hit / float(recommended_total) if recommended_total else 0.0
    recall = hit / float(relevant_total) if relevant_total else 0.0
    return (precision, recall)

In [4]:
# Recommend the top-N items to a user.
def recommend(user, N):
    """Rank items the user has not tagged yet by a tag-based score.

    For every tag the user has used, each item carrying that tag gains
    ``user_tags[user][tag] * tag_items[tag][item]``; contributions from
    different tags are summed. Items already present in
    ``user_items[user]`` are excluded.

    Args:
        user: Key into the module-level ``user_tags`` / ``user_items``.
        N: Maximum number of recommendations to return.

    Returns:
        A list of ``(item, score)`` tuples sorted by descending score,
        truncated to the first ``N`` entries. Users absent from the
        matrices (cold start) get an empty list instead of a ``KeyError``.
    """
    # .get() with an empty default keeps unknown users / tags from raising.
    tagged_items = user_items.get(user, {})
    recommend_items = dict()
    for tag, wut in user_tags.get(user, {}).items():
        for item, wti in tag_items.get(tag, {}).items():
            if item in tagged_items:
                continue
            recommend_items[item] = recommend_items.get(item, 0) + wut * wti
    # sorted() is stable, so equally scored items keep their insertion order.
    return sorted(recommend_items.items(), key=operator.itemgetter(1), reverse=True)[0:N]

In [5]:
# use the test_dataset to evaluate the recommendation results
def testRecommend():
    print("推荐结果评估")
    print("%3s %10s %10s" % ('N', "精确率","召回率"))
    for n in [5,10,20,40,60,80,100]:
        precision, recall = precisionAndRecall(n)
        print("%3d %10.3f%% %10.3f%%" % (n, precision * 100, recall * 100))

In [6]:
# Load the tagging log into the module-level `records` mapping.
def load_data():
    """Read ``file_path`` and rebuild ``records`` as user -> item -> [tags].

    The file is parsed as tab-separated text and must provide the columns
    ``userID``, ``bookmarkID`` and ``tagID``. Each row appends its tag to
    ``records[userID][bookmarkID]`` in file order.

    ``records`` is emptied before it is refilled, so running the cell again
    does not duplicate tags. Progress and summary lines are printed to
    stdout.

    Returns:
        None. The result is stored in the module-level ``records``.
    """
    print("开始加载数据...")
    df = pd.read_csv(file_path, sep='\t')
    # Clear only after the read succeeded, so a failed load keeps old data.
    records.clear()
    # Walk the three columns positionally instead of one df[col][i] lookup
    # per cell; tolist() also yields plain Python scalars as dictionary keys.
    rows = zip(df['userID'].tolist(), df['bookmarkID'].tolist(), df['tagID'].tolist())
    for uid, iid, tag in rows:
        records.setdefault(uid, {}).setdefault(iid, []).append(tag)
    print("数据集大小为 %d." % (len(df)))
    print("设置tag的人数 %d." % (len(records)))
    print("数据加载完成\n")

In [7]:
def train_test_split(ratio, seed=100):
    """Randomly split ``records`` into ``train_data`` and ``test_data``.

    Each (user, item) pair of ``records`` draws one random number; the pair
    goes to ``test_data`` when the draw is below ``ratio`` and to
    ``train_data`` otherwise. The pair's tag list is copied into the chosen
    split.

    Both splits are emptied first, so calling the function again (for
    example with another ratio or seed) replaces the previous split instead
    of appending to it, which previously duplicated tags and could place
    the same item in both train and test.

    Args:
        ratio: Probability that a (user, item) pair is assigned to the
            test split.
        seed: Seed passed to ``random.seed`` to make the split repeatable.

    Returns:
        None. The split is stored in the module-level ``train_data`` and
        ``test_data``. The printed numbers are the lengths of those
        dictionaries, i.e. how many users each split contains.
    """
    random.seed(seed)
    # Discard any earlier split so repeated calls cannot leak or duplicate.
    train_data.clear()
    test_data.clear()
    for u, items in records.items():
        for i, tags in items.items():
            target = test_data if random.random() < ratio else train_data
            # Copy the list so a split never aliases the lists in `records`.
            target.setdefault(u, {})[i] = list(tags)
    print("训练集样本数 %d, 测试集样本数 %d" % (len(train_data),len(test_data)))

In [8]:
def addValueToMat(mat, index, item, value=1):
    """Add ``value`` to ``mat[index][item]`` in a dict-of-dicts, in place.

    A missing row ``mat[index]`` is created on demand, and a missing cell
    starts at ``value``. Nothing is returned.
    """
    row = mat.setdefault(index, {})
    if item in row:
        row[item] += value
    else:
        row[item] = value

In [9]:
def initSata():
    """Rebuild the three count matrices from ``train_data``.

    For every (user, item, tag) triple in ``train_data`` the function adds 1
    to ``user_tags[user][tag]``, ``tag_items[tag][item]`` and
    ``user_items[user][item]`` through ``addValueToMat``.

    The matrices are emptied first, so running the cell again yields the
    same counts instead of doubling them. Summary lines are printed to
    stdout.

    Returns:
        None. Results are stored in the module-level ``user_tags``,
        ``tag_items`` and ``user_items``.
    """
    # Reset in place so every existing reference sees the rebuilt matrices.
    user_tags.clear()
    tag_items.clear()
    user_items.clear()
    # Iterate train_data directly; a local alias named `records` would
    # shadow the module-level `records` and mislead readers.
    for u, items in train_data.items():
        for i, tags in items.items():
            for tag in tags:
                addValueToMat(user_tags, u, tag, 1)
                addValueToMat(tag_items, tag, i, 1)
                addValueToMat(user_items, u, i, 1)
    print("user_tags, tag_items, user_items初始化完成。")
    print("user_tags大小 %d, tag_items大小 %d, user_items大小 %d" % (len(user_tags), len(tag_items), len(user_items)))

In [10]:
# Run the pipeline and take a wall-clock timestamp around each stage.
start = time.time()
load_data()
mid1 = time.time()
# Each (user, item) pair goes to the test split with probability 0.2.
train_test_split(ratio=0.2)
# The remaining stages are currently disabled; uncomment them to build the
# count matrices and print the precision/recall table.
# mid2 = time.time()
# initSata()
# mid3 = time.time()
# testRecommend()
# end = time.time()

开始加载数据...
数据集大小为 437593.
设置tag的人数 1867.
数据加载完成

训练集样本数 1860, 测试集样本数 1793


In [36]:
# Inspect the test split for user 8: bookmark ID -> list of tag IDs.
test_data[8]

{1: [1],
 15: [1, 19],
 17: [1, 20],
 26: [1, 29],
 27: [1, 30],
 41: [43],
 43: [44],
 44: [45],
 52: [57],
 63: [70],
 96: [111]}

In [37]:
# Inspect the training split for user 8: bookmark ID -> list of tag IDs.
train_data[8]

{2: [1],
 7: [1, 6, 7],
 8: [1, 8, 9],
 9: [1, 10],
 10: [1, 11],
 11: [1, 12, 13],
 14: [1, 15, 16, 17, 18],
 16: [1, 5],
 18: [1, 21],
 19: [1, 22],
 20: [1, 23],
 22: [1, 10, 25],
 24: [1, 27],
 25: [1, 28],
 30: [1, 33],
 32: [1, 35],
 33: [1, 36],
 34: [1, 37],
 35: [1, 38],
 36: [1, 39],
 37: [1, 40],
 38: [1, 41],
 42: [43],
 45: [46],
 46: [47],
 48: [49, 50, 51, 52],
 53: [51, 58, 59, 60, 61],
 58: [2, 66],
 59: [56],
 60: [2, 67],
 61: [1, 68],
 62: [69],
 64: [71],
 65: [71],
 68: [45, 76, 77],
 71: [44],
 72: [45, 79],
 73: [24, 56, 80],
 74: [24, 81, 82, 83],
 75: [45, 76, 84],
 76: [45, 76],
 78: [45, 76, 86, 87],
 79: [45, 88, 89, 90, 91, 92, 93],
 80: [45, 88, 94, 95],
 81: [96, 97],
 82: [68, 98, 99, 100],
 83: [2, 101, 102],
 84: [2, 103],
 85: [104, 105],
 87: [2, 25, 82, 108],
 88: [2, 25, 66, 82, 106],
 89: [82, 109],
 90: [24],
 91: [24],
 92: [24],
 93: [24, 25, 66],
 94: [2],
 98: [112],
 99: [113]}