In [94]:
#基础环境配置
import os
import numpy as np
from sklearn import  preprocessing 
import pandas as pd
from collections import defaultdict
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
import operator
import math
# Relative data directory and the two input files read by the later cells.
data_dir = '../data'
train_data_name, test_data_name = 'train.txt', 'test.txt'
train_data_path, test_data_path = [
    os.path.join(data_dir, file_name)
    for file_name in (train_data_name, test_data_name)
]


In [2]:
#量化评测指标
def Recall(train, test, N, get_recommendation=None):
    """Recall of top-N recommendations over users present in both sets.

    train: mapping user -> {group: value}; only its keys are iterated here.
    test: mapping user -> {group: value} holding the held-out groups.
    N: length of the recommendation list requested per user.
    get_recommendation: optional callable ``(user, N) -> {group: score}``.
        Defaults to the notebook-level ``GetRecommendation``.

    Returns hits / total number of held-out groups, or 0.0 when no train
    user has test data (the original raised ZeroDivisionError there).
    """
    if get_recommendation is None:
        get_recommendation = GetRecommendation
    hit = 0
    relevant = 0
    for user in train.keys():
        if user not in test:
            continue
        tu = test[user]
        rank = get_recommendation(user, N)
        # A hit is a recommended group that the user really has in the test set.
        hit += sum(1 for group in rank if group in tu)
        relevant += len(tu)
    if relevant == 0:
        return 0.0
    return float(hit) / relevant

def Precision(train, test, N, get_recommendation=None):
    """Precision of top-N recommendations over users present in both sets.

    train: mapping user -> {group: value}; only its keys are iterated here.
    test: mapping user -> {group: value} holding the held-out groups.
    N: length of the recommendation list requested per user; it is also the
        per-user denominator, even if fewer groups come back.
    get_recommendation: optional callable ``(user, N) -> {group: score}``.
        Defaults to the notebook-level ``GetRecommendation``.

    Returns hits / (N * evaluated users), or 0.0 when that denominator is
    zero (the original raised ZeroDivisionError there).

    NOTE(review): this notebook defines ``Precision`` a second time in the
    KNN cell, scoring ``GetRecommendationBasedKnn``; whichever definition
    was executed last is the one in effect.
    """
    if get_recommendation is None:
        get_recommendation = GetRecommendation
    hit = 0
    recommended = 0
    for user in train.keys():
        if user not in test:
            continue
        tu = test[user]
        rank = get_recommendation(user, N)
        hit += sum(1 for gid in rank if gid in tu)
        recommended += N
    if recommended == 0:
        return 0.0
    return float(hit) / recommended

def Coverage(train, test, N, get_recommendation=None):
    """Share of training groups that appear in at least one recommendation list.

    train: mapping user -> {group: value}.
    test: unused; kept so all metric functions share one signature.
    N: length of the recommendation list requested per user.
    get_recommendation: optional callable ``(user, N) -> {group: score}``.
        Defaults to the notebook-level ``GetRecommendation``.

    Returns len(recommended groups) / len(groups seen in train), or 0.0 when
    train contains no groups (the original raised ZeroDivisionError there).
    """
    if get_recommendation is None:
        get_recommendation = GetRecommendation
    recommend_groups = set()
    all_gid = set()
    for user in train.keys():
        all_gid.update(train[user].keys())
        recommend_groups.update(get_recommendation(user, N))
    if not all_gid:
        return 0.0
    return float(len(recommend_groups)) / len(all_gid)

def Popularity(train, test, N, get_recommendation=None):
    """Mean log-popularity of the recommended groups.

    A group's popularity is the number of training users that have it; each
    recommended group contributes ``log(1 + popularity)``.

    train: mapping user -> {group: value}.
    test: unused; kept so all metric functions share one signature.
    N: length of the recommendation list requested per user.
    get_recommendation: optional callable ``(user, N) -> {group: score}``.
        Defaults to the notebook-level ``GetRecommendation``.

    Returns the average over all recommended (user, group) pairs, or 0.0 if
    nothing was recommended (the original raised ZeroDivisionError there).
    """
    if get_recommendation is None:
        get_recommendation = GetRecommendation
    group_popularity = dict()
    for groups in train.values():
        for group in groups.keys():
            group_popularity[group] = group_popularity.get(group, 0) + 1
    ret = 0.0
    n = 0
    for user in train.keys():
        rank = get_recommendation(user, N)
        for group in rank:
            # A recommended group may have no training members at all; count
            # it with popularity 0 instead of failing with KeyError.
            ret += math.log(1 + group_popularity.get(group, 0))
            n += 1
    if n == 0:
        return 0.0
    return ret / n

In [57]:
# Pre-processing of the training and test sets. This cell sets up:
#   inverse_train - nested dict filled from the training file
#   inverse_test  - nested dict filled from the test file
#   org_fea_mat   - feature matrix of the raw data (one row per group,
#                   one column per user)

csv_header = ['uid', 'gid', 'online']
df_train = pd.read_csv(train_data_path, names=csv_header)
df_test = pd.read_csv(test_data_path, names=csv_header)

# Ids are collected from both files so a single encoder covers either set.
gidset = set(df_train.gid.unique()) | set(df_test.gid.unique())
uidset = set(df_train.uid.unique()) | set(df_test.uid.unique())
num_gids, num_uids = len(gidset), len(uidset)

inverse_train, inverse_test = defaultdict(defaultdict), defaultdict(defaultdict)
org_fea_mat = np.zeros((num_gids, num_uids))

# Encoders turning raw group / user ids into consecutive integer indices.
le_gid = preprocessing.LabelEncoder().fit(list(gidset))
le_uid = preprocessing.LabelEncoder().fit(list(uidset))

def init_train_data(df):
    '''Record one training row in ``org_fea_mat`` and ``inverse_train``.

    ``df`` is a single row (as handed over by ``DataFrame.apply(axis=1)``)
    exposing ``gid``, ``uid`` and ``online``. Nothing is returned; the
    notebook-level containers are updated in place.
    '''
    # LabelEncoder.transform expects a 1-d array-like, so wrap the scalar id
    # and take the single encoded index back out.
    gididx = le_gid.transform([df.gid])[0]
    uididx = le_uid.transform([df.uid])[0]
    # Index (row, column) in one step so the write always lands in the matrix
    # itself rather than in a temporary produced by a first indexing step.
    org_fea_mat[gididx, uididx] = df.online
    inverse_train[uididx][gididx] = df.online
    
def init_test_data(df):
    '''Record one test row in ``inverse_test``.

    ``df`` is a single row (as handed over by ``DataFrame.apply(axis=1)``)
    exposing ``gid``, ``uid`` and ``online``. Nothing is returned.
    '''
    # LabelEncoder.transform expects a 1-d array-like, so wrap the scalar id
    # and take the single encoded index back out.
    gididx = le_gid.transform([df.gid])[0]
    uididx = le_uid.transform([df.uid])[0]
    inverse_test[uididx][gididx] = df.online
    
# Run the two row handlers over every row. They work purely through side
# effects (filling org_fea_mat / inverse_train / inverse_test) and return
# nothing, so tmp1 and tmp2 carry no useful data.
tmp1 = df_train.apply(init_train_data, axis=1)
tmp2 = df_test.apply(init_test_data, axis=1)

# L2-normalised copy of the feature matrix; the KNN cells fit on this one.
normed_fea_mat = preprocessing.normalize(org_fea_mat,norm='l2')


In [55]:
# Train the group-to-group similarity matrix (1 - cosine distance).
sim_mat = 1-pairwise_distances(org_fea_mat, metric="cosine") # This works on a dense matrix; for a sparse matrix an efficient version should be re-implemented
#sim_mat = 1-pairwise_distances(normed_fea_mat, metric="euclidean") # Same remark: dense only; a sparse matrix needs an efficient re-implementation

In [90]:
#用knn
from sklearn.neighbors import NearestNeighbors
# Fit a 10-nearest-neighbour index on the normalised group vectors, then query
# it with those same vectors.
nbrs = NearestNeighbors(algorithm='ball_tree', n_neighbors=10)
nbrs.fit(normed_fea_mat)
W_knn = defaultdict(defaultdict)  # intended for the k most similar groups found by KNN; not filled here
distance, indx = nbrs.kneighbors(X=normed_fea_mat)
print(indx)


[[  0 264 209 ..., 379 189  59]
 [  1   3   4 ...,  27  32  59]
 [  2 264 209 ..., 189 379  59]
 ..., 
 [740 573  59 ..., 317 132 193]
 [258 741 738 ..., 264  59 132]
 [742 301 298 ..., 249 280 290]]


In [107]:
def RecommendationBasedKnn(train, user_id, distance, indx, K):
    """Rank groups the user has not joined, using each group's KNN neighbours.

    train: mapping user -> {group index: online value}.
    user_id: user to recommend for.
    distance, indx: the two arrays returned by ``NearestNeighbors.kneighbors``;
        ``indx[i]`` lists the neighbours of group ``i`` and ``distance[i]``
        the matching distances. The distances are expected to be cosine
        distances, which is what the evaluation loop in this cell produces.
    K: maximum number of groups to return.

    Returns a dict {group index: score} with the K best-scoring candidates.

    Each liked neighbour ``j`` of a candidate ``i`` contributes
    ``online(j) * (1 - distance)``. The original multiplied by the raw
    distance, which gave *farther* neighbours more weight; ``1 - distance``
    is the cosine similarity, the same quantity ``sim_mat`` is built from.
    """
    liked = train[user_id]
    like_gidset = set(liked.keys())
    rank = defaultdict(float)

    for i in le_gidset - like_gidset:
        for idx, j in enumerate(indx[i]):
            # Only groups the user already has may vote. This also skips the
            # candidate itself, which kneighbors returns as its own neighbour.
            if j not in like_gidset:
                continue
            similarity = 1.0 - distance[i][idx]
            rank[i] += float(liked[j]) * similarity

    return dict(sorted(rank.items(), key=operator.itemgetter(1), reverse=True)[0:K])

def GetRecommendationBasedKnn(user, N):
    """Top-N KNN recommendations for ``user``, using the notebook-level
    ``inverse_train``, ``distance`` and ``indx`` as they are at call time."""
    return RecommendationBasedKnn(
        train=inverse_train,
        user_id=user,
        distance=distance,
        indx=indx,
        K=N,
    )

def Precision(train, test, N, get_recommendation=None):
    """Precision of top-N recommendations, scoring the KNN recommender by default.

    train: mapping user -> {group: value}; only its keys are iterated here.
    test: mapping user -> {group: value} holding the held-out groups.
    N: length of the recommendation list requested per user; it is also the
        per-user denominator, even if fewer groups come back.
    get_recommendation: optional callable ``(user, N) -> {group: score}``.
        Defaults to the notebook-level ``GetRecommendationBasedKnn``.

    Returns hits / (N * evaluated users), or 0.0 when that denominator is
    zero (the original raised ZeroDivisionError there).

    NOTE(review): this redefines the ``Precision`` from the metrics cell,
    which scores ``GetRecommendation``. Once this cell has run, every later
    ``Precision(...)`` call without ``get_recommendation`` scores the KNN
    recommender.
    """
    if get_recommendation is None:
        get_recommendation = GetRecommendationBasedKnn
    hit = 0
    recommended = 0
    for user in train.keys():
        if user not in test:
            continue
        tu = test[user]
        rank = get_recommendation(user, N)
        hit += sum(1 for gid in rank if gid in tu)
        recommended += N
    if recommended == 0:
        return 0.0
    return float(hit) / recommended

# Sweep the neighbourhood size k and the list length n, printing one
# "k,n,precision" line per combination. `distance` and `indx` are rebound on
# every k because GetRecommendationBasedKnn reads them as globals.
for k in (5, 10, 15, 20, 40):
    nbrs = NearestNeighbors(metric='cosine', n_neighbors=k)
    nbrs.fit(normed_fea_mat)
    W_knn = defaultdict(defaultdict)  # intended for the k most similar groups found by KNN; not filled here
    distance, indx = nbrs.kneighbors(normed_fea_mat)
    for n in (5, 10, 15, 20, 40):
        pre = Precision(inverse_train, inverse_test, n)
        print(','.join([str(k), str(n), str(pre)]))

ValueError: Metric 'cosine' not valid for algorithm 'ball_tree'

In [106]:
# Load the saved precision results and index them by (K, N).
# header=0 states explicitly that the first line holds the column names
# (K, N, P); a boolean `header` is rejected by current pandas.
# NOTE(review): the path is an absolute, machine-specific location; move the
# file next to the other data files so the notebook can be re-run elsewhere.
df_pre = pd.read_csv('/Users/yajun/Desktop/test.csv', header=0)
preg = df_pre.groupby(['K', 'N'])

preg.first()

<class 'pandas.core.groupby.DataFrameGroupBy'>


Unnamed: 0_level_0,Unnamed: 1_level_0,P
K,N,Unnamed: 2_level_1
5,5,0.013651
5,10,0.010385
5,15,0.007887
5,20,0.006317
5,40,0.003419
10,5,0.017345
10,10,0.011804
10,15,0.008833
10,20,0.007147
10,40,0.003828


In [54]:
#推荐算法
le_gidset = set(range(num_gids))  # label-encoded group ids: {0, 1, 2, ...}
N = 8  # number of most similar groups kept per group

# W[g] maps each of the N groups most similar to g onto that similarity.
W = defaultdict(defaultdict)
for g, sims in enumerate(sim_mat):
    ranked = sorted(enumerate(sims), key=operator.itemgetter(1), reverse=True)
    # Drop g itself by index rather than by position. Slicing off the first
    # entry assumed g always sorts first, which fails when another group
    # ties with it: that neighbour was discarded and g kept instead.
    neighbours = [(u, s) for u, s in ranked if u != g][:N]
    for u, s in neighbours:
        W[g][u] = s

def Recommendation(train, user_id, W, K):
    """Score every group the user has not joined and keep the K best.

    train: mapping user -> {group index: online value}.
    user_id: user to recommend for.
    W: mapping group -> {neighbour group: similarity weight}.
    K: maximum number of groups to return.

    A candidate's score is the sum, over those of its neighbours in ``W``
    that the user already has, of ``online * weight``. Returns a dict
    {group index: score}.
    """
    liked = train[user_id]
    like_gidset = set(liked.keys())
    other_gidset = set(le_gidset - like_gidset)
    rank = defaultdict(lambda: 0)

    for candidate in other_gidset:
        neighbours = W[candidate]
        for neighbour in neighbours:
            if neighbour in like_gidset:
                rank[candidate] += float(liked[neighbour]) * neighbours[neighbour]

    ordered = sorted(rank.items(), key=operator.itemgetter(1), reverse=True)
    return dict(ordered[:K])
        
def GetRecommendation(user, N):
    """Top-N recommendations for ``user`` from the notebook-level
    ``inverse_train`` and similarity table ``W``."""
    return Recommendation(train=inverse_train, user_id=user, W=W, K=N)

# Evaluate the recommender for list lengths 5, 10, 15 and 20.
# Popularity is not evaluated here; its call was left disabled.
# NOTE(review): Precision is defined in two cells of this notebook, so the
# figure printed below depends on which definition was executed last.
print('begin...')
for i in range(5, 25, 5):
    recall = Recall(inverse_train, inverse_test, i)
    precision = Precision(inverse_train, inverse_test, i)
    coverge = Coverage(inverse_train, inverse_test, i)
    print('当N = ' + str(i)
          + '时， Precision = ' + str(precision)
          + ', Recall = ' + str(recall)
          + ', Coverge = ' + str(coverge))

print('end...')

#
#实验结果显示，准确率没有采用集合为特征的算法好
#可能的原因：
#从目前的用户数据看，每个群组内平均下来加入的用户不是很多，因此如果用向量空间模型来表示物品特征，会导致特征矩阵很稀疏

begin...
当N = 5时， Precision = 0.0480064222638, Recall = 0.139242471282, Coverge = 0.74965034965
当N = 10时， Precision = 0.0360717152796, Recall = 0.20925178516, Coverge = 0.87972027972
当N = 15时， Precision = 0.030505753278, Recall = 0.265445513816, Coverge = 0.914685314685
当N = 20时， Precision = 0.0258496119882, Recall = 0.299906861223, Coverge = 0.935664335664
end...
