In [1]:
import pandas as pd
import numpy as np
import re
import math
from operator import itemgetter
from sklearn.feature_extraction import DictVectorizer
from sklearn import metrics

In [2]:
# 化为字典形式，并删除值为零的键值对
def DF_to_dict(dataframe):
    new_dataframe = dataframe.T
    dict = new_dataframe.to_dict()
    list1 = []
    list2 = []
    for i,n in dict.items():
        for j,k in n.items():
            list1.append(i)
            list2.append(j)
    for m in range(len(list1)):
        if dict[list1[m]][list2[m]] == 0:
            del dict[list1[m]][list2[m]]

    return dict

# 计算稀疏矩阵与物品相似度
def item_similarity(userSet, cluster = None):
    C = dict()
    N = dict()
    for u, items in userSet.items():
        for i in items:
            N.setdefault(i, 0)
            N[i] += 1
            for j in items:
                if i == j:
                    continue
                C.setdefault(i, {})
                C[i].setdefault(j, 0)
                C[i][j] += 1 / math.log(1 + len(items))

    # if cluster != None:
    #     for i, related_j in C.items():
    #         for j, similarity in related_j.items():
    #             count = 0
    #             for cate, cate_items in cluster.items():
    #                 if i in cate_items and j in cate_items:
    #                     count = 1
    #                     break
    #             if count == 0:
    #                 del C[i][j]
    # print(C)

    # print("稀疏矩阵: ", C)
    W = dict()
    for i, related_items in C.items():
        for j, cij in related_items.items():
            W.setdefault(i, {})
            W[i].setdefault(j, 0)
            W[i][j] = cij / math.sqrt(N[i] * N[j]) * 100

    # print("物品相似度: ", W)
    return W


def calc_rating(user_id, item_id, test, W):
    rank = dict()
    interacted_items = test[user_id]  # 该用户的浏览情况
    # print(interacted_items)
    for item in interacted_items: # 根据该资讯寻找相似资讯
        if int(item) not in W.keys():
            continue  # 若该资讯无相似资讯则跳过？
        related_item = []
        for re_item, score in W[int(item)].items():
            related_item.append((re_item, score))

        for j, v in sorted(related_item, key=itemgetter(1), reverse=True):  # j:资讯名；v：该资讯权重；  根据权重从大到小排序
            # print(j, ":", v)
            if j in interacted_items:
                continue  # 若用户浏览过该资讯则跳过
            if j not in rank.keys():
                rank[j] = 0  # 若该咨询第一次出现则初始化
            pi = interacted_items[item]
            rank[j] += pi * v
    if rank:
        if item_id in rank:
            rank_max = max(rank.values())
            rank_min = min(rank.values())
            for key in rank.keys():
                rank[key] = (rank[key] - rank_min) / (rank_max - rank_min) * 4 + 1
            result = rank[item_id]
        else:
            result = 0
    else:
        result = 0
    return result

# 计算DCG指数
def getDCG(scores):
    return np.sum(
        np.divide(np.power(2, scores) - 1, np.log(np.arange(scores.shape[0], dtype=np.float32) + 2)),
        dtype=np.float32)

def getNDCG(rank_list, pos_items):
    relevance = np.ones_like(pos_items)
    it2rel = {it: r for it, r in zip(pos_items, relevance)}
    rank_scores = np.asarray([it2rel.get(it, 0.0) for it in rank_list], dtype=np.float32)

    idcg = getDCG(relevance)

    dcg = getDCG(rank_scores)

    if dcg == 0.0:
        return 0.0
    ndcg = dcg / idcg
    return ndcg

In [8]:
# 载入数据
LD = pd.read_csv('zuoxiajiao_df.csv', index_col=0)
RD = pd.read_csv('youxiajiao_df.csv', index_col=0)
LU = pd.read_csv('zuoshangjiao_df.csv', index_col=0)
RU = pd.read_csv('youshangjiao_df.csv', index_col=0)

# 预处理数据
Down = pd.concat([LD, RD], axis=1)
Left = pd.concat([LD, LU])
train = DF_to_dict(Down)
test = DF_to_dict(LU)

# 计算物品相似度矩阵，另存为文件
# W = item_similarity(train)
# W_df = pd.DataFrame(W)
# W_df.to_csv('W_df.csv',float_format='%.2f')
W =pd.read_csv('W_SOM_df.csv', index_col=0)
W2 = W.fillna(0)
new_W = DF_to_dict(W2)
print(new_W)


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [9]:
# 生成ratings预测值文件
test_userid = RU.index.values.tolist()
expected_itemid = RU.columns.values.tolist()
predict_rating = {}
all_pred_label = []
all_true_label = []
index_all = []
for userid in test_userid[:600]:
    sub_pred_label = []
    sub_true_label=[]
    print(1)
    for itemid in expected_itemid:
        a=RU.iloc[test_userid.index(userid)][itemid]
        if (a==0):
            continue
        b = calc_rating(userid, itemid, test, new_W)
        sub_pred_label.append(b)
        sub_true_label.append(a)
        all_pred_label.append(b)
        all_true_label.append(a)
    index_all.append(cal_4_index(sub_pred_label, sub_true_label))


1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


  U_recall.append(sum(T[recommend_N[:N]]>=4)/sum(T>=4))
  U_recall.append(sum(T[recommend_N[:N]]>=4)/sum(T>=4))
  U_recall.append(sum(T[recommend_N[:N]]>=4)/sum(T>=4))
  U_recall.append(sum(T[recommend_N[:N]]>=4)/sum(T>=4))
  U_recall.append(sum(T[recommend_N[:N]]>=4)/sum(T>=4))
  U_recall.append(sum(T[recommend_N[:N]]>=4)/sum(T>=4))
  U_recall.append(sum(T[recommend_N[:N]]>=4)/sum(T>=4))
  U_recall.append(sum(T[recommend_N[:N]]>=4)/sum(T>=4))
  U_recall.append(sum(T[recommend_N[:N]]>=4)/sum(T>=4))


1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


In [10]:
data_all_1 = index_all
np.save('data_all_SOM_1', data_all_1)

In [19]:
RU.iloc[:4]

Unnamed: 0,1,6,9,15,18,19,23,24,29,35,...,3671,3674,3678,3686,3689,3692,3698,3700,3701,3704
5,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,5,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
16,0,0,0,0,0,0,0,3,0,0,...,0,0,0,3,0,0,0,0,0,0
29,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
def cal_4_index(U, T):
    T=np.array(T)
    N=2
    U_MAE=[]
    U_prec=[]
    U_recall=[]
    U_DCG=[]
    recommend_N = list(np.array(U).argsort()[::-1][0:10]) 
    trans_index = [sum(T>=2)+1,sum(T>=3)+1,sum(T>=4)+1,sum(T>=5)+1,1]
    while N<=10:
        U_MAE.append(metrics.mean_absolute_error(T, U))
        U_prec.append(sum(T[recommend_N[:N]]>=4)/min(N,len(T)))
        U_recall.append(sum(T[recommend_N[:N]]>=4)/sum(T>=4))
        true_rank = [trans_index[T[i]-1] for i in recommend_N[:N]]
        U_DCG.append(getNDCG(true_rank, list(range(1, min(len(T),N)+1))))
        N = N+1
    return [U_MAE, U_prec, U_recall, U_DCG]

In [5]:
cal_4_index([1.2,3.1,4.9], [1, 3, 5])

[[0.13333333333333322,
  0.13333333333333322,
  0.13333333333333322,
  0.13333333333333322,
  0.13333333333333322,
  0.13333333333333322,
  0.13333333333333322,
  0.13333333333333322,
  0.13333333333333322],
 [0.5,
  0.3333333333333333,
  0.3333333333333333,
  0.3333333333333333,
  0.3333333333333333,
  0.3333333333333333,
  0.3333333333333333,
  0.3333333333333333,
  0.3333333333333333],
 [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
 [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]]

In [12]:
# 读入数据
LD= pd.read_csv('zuoxiajiao_df.csv',index_col=[0])
RD= pd.read_csv('youxiajiao_df.csv',index_col=[0])
LU= pd.read_csv('zuoshangjiao_df.csv',index_col=[0])
RU= pd.read_csv('youshangjiao_df.csv',index_col=[0])

#预处理数据
Down=pd.concat([LD,RD],axis=1)
Left=pd.concat([LD,LU])
user_id_list=list(Left.index)
n=755
user_id_list_1=user_id_list[0:n-1]

#print (user_id_list)
def cal_sim_list(i):
    def cal_user_sim(j):
        def distance(vector1,vector2):  
            d=0
            for a,b in zip(vector1,vector2):  
                d+=(a-b)**2
            return d**0.5
        vec_1=list(Left.loc[i])
        vec_2=list(Left.loc[j])
        dist=distance(vec_1,vec_2)
        user_sim=1/(1+dist)
        return user_sim
    user_sim_list=list(map(cal_user_sim,user_id_list))
    print('1')
    return user_sim_list
user_sim_mat=list(map(cal_sim_list,user_id_list_1))
user_sim_df = pd.DataFrame(user_sim_mat)
user_sim_df.index = user_id_list_1
user_sim_df.columns = user_id_list
user_sim_df.to_csv('user_sim_mat_1.csv',index=True)
print("输出完成")

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


KeyboardInterrupt: 

In [None]:
# 读入数据
LD= pd.read_csv('zuoxiajiao_df.csv',index_col=[0])
RD= pd.read_csv('youxiajiao_df.csv',index_col=[0])
LU= pd.read_csv('zuoshangjiao_df.csv',index_col=[0])
RU= pd.read_csv('youshangjiao_df.csv',index_col=[0])

#预处理数据
Down=pd.concat([LD,RD],axis=1)
Left=pd.concat([LD,LU])
user_id_list=list(Left.index)
n=755
user_id_list_2=user_id_list[n:2*n-1]

#print (user_id_list)
def cal_sim_list(i):
    def cal_user_sim(j):
        def distance(vector1,vector2):  
            d=0
            for a,b in zip(vector1,vector2):  
                d+=(a-b)**2
            return d**0.5
        vec_1=list(Left.loc[i])
        vec_2=list(Left.loc[j])
        dist=distance(vec_1,vec_2)
        user_sim=1/(1+dist)
        return user_sim
    user_sim_list=list(map(cal_user_sim,user_id_list))
    print('1')
    return user_sim_list
user_sim_mat=list(map(cal_sim_list,user_id_list_2))
user_sim_df = pd.DataFrame(user_sim_mat)
user_sim_df.index = user_id_list_2
user_sim_df.columns = user_id_list
user_sim_df.to_csv('user_sim_mat_2.csv',index=True)
print("输出完成")

In [None]:
# 读入数据
LD= pd.read_csv('zuoxiajiao_df.csv',index_col=[0])
RD= pd.read_csv('youxiajiao_df.csv',index_col=[0])
LU= pd.read_csv('zuoshangjiao_df.csv',index_col=[0])
RU= pd.read_csv('youshangjiao_df.csv',index_col=[0])

#预处理数据
Down=pd.concat([LD,RD],axis=1)
Left=pd.concat([LD,LU])
user_id_list=list(Left.index)
n=755
user_id_list_3=user_id_list[2*n:3*n-1]

#print (user_id_list)
def cal_sim_list(i):
    def cal_user_sim(j):
        def distance(vector1,vector2):  
            d=0
            for a,b in zip(vector1,vector2):  
                d+=(a-b)**2
            return d**0.5
        vec_1=list(Left.loc[i])
        vec_2=list(Left.loc[j])
        dist=distance(vec_1,vec_2)
        user_sim=1/(1+dist)
        return user_sim
    user_sim_list=list(map(cal_user_sim,user_id_list))
    print('1')y
    return user_sim_list
user_sim_mat=list(map(cal_sim_list,user_id_list_3))
user_sim_df = pd.DataFrame(user_sim_mat)
user_sim_df.index = user_id_list_3
user_sim_df.columns = user_id_list
user_sim_df.to_csv('user_sim_mat_3.csv',index=True)
print("输出完成")