In [250]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, math, sklearn.metrics as skm, scipy
np.set_printoptions(linewidth=400)

In [251]:
data = pd.read_table('qws2.CSV', header=None, delimiter=',')

pd_data = data.iloc[1:, 0:9]
pd_data

Unnamed: 0,0,1,2,3,4,5,6,7,8
1,302.75,89,7.1,90,73,78,80,187.75,32
2,482,85,16,95,73,100,84,1,2
3,3321.4,89,1.4,96,73,78,80,2.6,96
4,126.17,98,12,100,67,78,82,22.77,89
5,107,87,1.9,95,73,89,62,58.33,93
...,...,...,...,...,...,...,...,...,...
2503,200.8,93,2.4,98,73,100,84,7.4,41
2504,56.17,97,11.3,97,83,78,91,7.17,3
2505,93.93,80,2.1,80,67,78,82,3.72,60
2506,106.75,86,1.3,95,80,78,87,1.25,96


In [252]:
pd_data = pd_data.values.astype(float)
item_size = pd_data.shape[0]
feature_dimension = pd_data.shape[1]
max_length = 10
epsilon = 1E-10

In [253]:
feature_vectors = pd_data
print('feature_vectors:', feature_vectors, sep='\n')

feature_vectors:
[[3.0275e+02 8.9000e+01 7.1000e+00 ... 8.0000e+01 1.8775e+02 3.2000e+01]
 [4.8200e+02 8.5000e+01 1.6000e+01 ... 8.4000e+01 1.0000e+00 2.0000e+00]
 [3.3214e+03 8.9000e+01 1.4000e+00 ... 8.0000e+01 2.6000e+00 9.6000e+01]
 ...
 [9.3930e+01 8.0000e+01 2.1000e+00 ... 8.2000e+01 3.7200e+00 6.0000e+01]
 [1.0675e+02 8.6000e+01 1.3000e+00 ... 8.7000e+01 1.2500e+00 9.6000e+01]
 [3.1650e+02 7.2000e+01 1.5900e+01 ... 8.7000e+01 5.5000e+00 8.0000e+00]]


In [254]:
feature_vectors = feature_vectors / np.linalg.norm(feature_vectors, axis=1, keepdims=True)
print('l2-normalized feature_vectors:', feature_vectors, sep='\n')

l2-normalized feature_vectors:
[[7.52615874e-01 2.21247937e-01 1.76501163e-02 ... 1.98874550e-01 4.66733709e-01 7.95498199e-02]
 [9.25520798e-01 1.63214249e-01 3.07226821e-02 ... 1.61294081e-01 1.92016763e-03 3.84033526e-03]
 [9.98003821e-01 2.67424399e-02 4.20667595e-04 ... 2.40381483e-02 7.81239819e-04 2.88457779e-02]
 ...
 [4.55423370e-01 3.87883207e-01 1.01819342e-02 ... 3.97580287e-01 1.80365691e-02 2.90912405e-01]
 [4.46778607e-01 3.59934053e-01 5.44086360e-03 ... 3.64119333e-01 5.23159961e-03 4.01786850e-01]
 [8.68583913e-01 1.97592549e-01 4.36350212e-02 ... 2.38757663e-01 1.50938753e-02 2.19547277e-02]]


In [255]:
similarities = np.dot(feature_vectors, feature_vectors.T)
print('similarities:', similarities, sep='\n')

similarities:
[[1.         0.86997638 0.77945642 ... 0.75845166 0.73581933 0.88588136]
 [0.86997638 1.         0.94487737 ... 0.73931327 0.71484563 0.98982433]
 [0.77945642 0.94487737 1.         ... 0.51003225 0.50232576 0.8947768 ]
 ...
 [0.75845166 0.73931327 0.51003225 ... 1.         0.9913373  0.81457884]
 [0.73581933 0.71484563 0.50232576 ... 0.9913373  1.         0.78706589]
 [0.88588136 0.98982433 0.8947768  ... 0.81457884 0.78706589 1.        ]]


In [256]:
class User:
    call_num = np.random.randint(5, 15 + 1) # [low, hight + 1)
    call_history = np.random.choice(item_size, call_num, replace=False) # 从item_size中随机挑选call_num个数，不重复
    representative_item_index = call_history[np.random.randint(0, call_num)] # 随机选择一个item作为参考服务
    representative_item_value = pd_data[representative_item_index - 1] # pd_data [0, 2506]
    def __init__(self):
        print(f'call_num: {self.call_num}')
        print(f'call_history: {self.call_history}')
        print(f'representative_item_index: {self.representative_item_index}')
        print(f'representative_item_value: {self.representative_item_value}')
    def __str__(self):
        return f'call_num: {self.call_num} \ncall_history: {self.call_history} \nrepresentative_item_index: {self.representative_item_index} \nrepresentative_item_value: {self.representative_item_value}'

In [257]:
# 生成用户
user = User()

call_num: 14
call_history: [1086  211 1559  620  871  417 2399 2343  690 2489  110  739 1660 1542]
representative_item_index: 1660
representative_item_value: [2.17517e+03 4.70000e+01 2.30000e+00 4.80000e+01 7.30000e+01 1.00000e+02 8.40000e+01 1.84000e+00 5.00000e+00]


In [258]:
# 所有item与参考服务的相似度
similarities_with_rep = similarities[user.representative_item_index]
print('similarities_with_rep', similarities_with_rep, sep='\n')

similarities_with_rep
[0.79709122 0.82272562 0.65714158 ... 0.95761948 0.96169689 0.8810324 ]


In [259]:
# 参考服务与自身的相似度为1
similarities[user.representative_item_index, user.representative_item_index]

0.9999999999999998

In [260]:
# 处理分数
scores = np.exp(0.01 * similarities_with_rep + 0.2)
print('scores', scores, sep='\n')

scores
[1.23117736 1.231493   1.22945553 ... 1.23315533 1.23320561 1.23221126]


In [261]:
# 生成核矩阵
kernel_matrix = scores.reshape((item_size, 1)) * similarities * scores.reshape((1, item_size))
print('kernel_matrix', kernel_matrix, sep='\n')

kernel_matrix
[[1.51579768 1.31904627 1.1798459  ... 1.15150628 1.11719068 1.34394457]
 [1.31904627 1.51657501 1.43060659 ... 1.12273752 1.08562468 1.50201837]
 [1.1798459  1.43060659 1.51156091 ... 0.77326482 0.76161199 1.35554117]
 ...
 [1.15150628 1.12273752 0.77326482 ... 1.52067207 1.50756041 1.23775897]
 [1.11719068 1.08562468 0.76161199 ... 1.50756041 1.52079608 1.19600158]
 [1.34394457 1.50201837 1.35554117 ... 1.23775897 1.19600158 1.51834458]]


In [262]:
# 生成推荐列表
cis = np.zeros((max_length, item_size))
di2s = np.copy(np.diag(kernel_matrix))
selected_items = list()
selected_item = np.argmax(di2s)
selected_items.append(selected_item)

while len(selected_items) < max_length:
    k = len(selected_items) - 1
    ci_optimal = cis[:k, selected_item]
    di_optimal = math.sqrt(di2s[selected_item])
    elements = kernel_matrix[selected_item, :]
    eis = (elements - np.dot(ci_optimal, cis[:k, :])) / di_optimal
    cis[k, :] = eis
    di2s -= np.square(eis)
    di2s[selected_item] = -np.inf
    selected_item = np.argmax(di2s)
    if di2s[selected_item] < epsilon:
        break
    selected_items.append(selected_item)
    
print('rec_items_index:', selected_items)
print("rec_items_value:", pd_data[selected_items], sep='\n')

rec_items_index: [1660, 373, 1160, 1323, 1628, 1329, 2168, 1864, 448]
rec_items_value:
[[1.50000e+02 5.60000e+01 1.10000e+01 5.60000e+01 8.30000e+01 8.90000e+01 9.10000e+01 5.00000e-01 9.20000e+01]
 [4.63761e+03 3.00000e+01 5.00000e-01 3.00000e+01 6.70000e+01 6.70000e+01 8.20000e+01 4.14035e+03 8.00000e+00]
 [4.10000e+01 9.70000e+01 4.31000e+01 9.90000e+01 7.30000e+01 1.00000e+02 8.40000e+01 1.00000e+00 5.00000e+00]
 [4.98967e+03 9.30000e+01 1.60000e+00 9.30000e+01 7.30000e+01 1.00000e+02 8.00000e+01 1.40000e+01 5.00000e+00]
 [4.83500e+01 1.00000e+01 1.49000e+01 1.00000e+01 6.70000e+01 7.80000e+01 8.20000e+01 2.92000e+00 4.00000e+00]
 [1.17670e+02 9.40000e+01 3.00000e+00 9.50000e+01 7.80000e+01 3.30000e+01 8.90000e+01 7.84000e+00 4.00000e+00]
 [1.22000e+02 3.90000e+01 1.10000e+00 3.90000e+01 3.30000e+01 7.80000e+01 7.10000e+01 7.85000e+00 4.00000e+00]
 [8.05000e+01 8.80000e+01 1.10000e+00 9.60000e+01 7.30000e+01 7.80000e+01 6.20000e+01 2.95000e+01 7.00000e+00]
 [9.00000e+01 8.50000e+01

In [263]:
# 如上，我们得到了根据用户的服务调用历史记录生成的推荐列表
print('user:', user, sep='\n')

user:
call_num: 14 
call_history: [1086  211 1559  620  871  417 2399 2343  690 2489  110  739 1660 1542] 
representative_item_index: 1660 
representative_item_value: [2.17517e+03 4.70000e+01 2.30000e+00 4.80000e+01 7.30000e+01 1.00000e+02 8.40000e+01 1.84000e+00 5.00000e+00]


In [264]:
print('rec_items_index:', selected_items)
print("rec_items_value:", pd_data[selected_items], sep='\n')

rec_items_index: [1660, 373, 1160, 1323, 1628, 1329, 2168, 1864, 448]
rec_items_value:
[[1.50000e+02 5.60000e+01 1.10000e+01 5.60000e+01 8.30000e+01 8.90000e+01 9.10000e+01 5.00000e-01 9.20000e+01]
 [4.63761e+03 3.00000e+01 5.00000e-01 3.00000e+01 6.70000e+01 6.70000e+01 8.20000e+01 4.14035e+03 8.00000e+00]
 [4.10000e+01 9.70000e+01 4.31000e+01 9.90000e+01 7.30000e+01 1.00000e+02 8.40000e+01 1.00000e+00 5.00000e+00]
 [4.98967e+03 9.30000e+01 1.60000e+00 9.30000e+01 7.30000e+01 1.00000e+02 8.00000e+01 1.40000e+01 5.00000e+00]
 [4.83500e+01 1.00000e+01 1.49000e+01 1.00000e+01 6.70000e+01 7.80000e+01 8.20000e+01 2.92000e+00 4.00000e+00]
 [1.17670e+02 9.40000e+01 3.00000e+00 9.50000e+01 7.80000e+01 3.30000e+01 8.90000e+01 7.84000e+00 4.00000e+00]
 [1.22000e+02 3.90000e+01 1.10000e+00 3.90000e+01 3.30000e+01 7.80000e+01 7.10000e+01 7.85000e+00 4.00000e+00]
 [8.05000e+01 8.80000e+01 1.10000e+00 9.60000e+01 7.30000e+01 7.80000e+01 6.20000e+01 2.95000e+01 7.00000e+00]
 [9.00000e+01 8.50000e+01

In [265]:
# 参考服务 与 推荐列表的相似度
similarities_between_rep_and_rec = similarities[user.representative_item_index, selected_items]
print('similarities_between_rep_and_rec:', similarities_between_rep_and_rec, sep='\n')

similarities_between_rep_and_rec:
[1.         0.47146496 0.77828537 0.63660976 0.82865239 0.86685197 0.91704186 0.85197143 0.94014186]


In [266]:
# 推荐列表 与 参考服务的平均相似度
similarities_avg_between_rec_and_rep = np.mean(similarities_between_rep_and_rec)
print('similarities_avg_between_rec_and_rep:', similarities_avg_between_rec_and_rep, sep='\n')

similarities_avg_between_rec_and_rep:
0.810113288013395


In [267]:
# 如何衡量推荐列表的准确度， cg, dcg
# DCG penalizes highly relevant documents that appear lower in the search 
# by reducing the graded relevance value 
# logarithimically proportional to the position of the result



使用如下方式计算相似度， dis 为欧氏距离，KRCC 为肯德尔系数

肯德尔系数：

$\tau=\frac{\text { (number of concordant pairs })-(\text { number of discordant pairs })}{n(n-1) / 2}$

$\operatorname{Sim}\left(s_{i}, s_{j}\right)=\alpha\left(1-\frac{d i s\left(s_{i}, s_{j}\right)}{\sqrt{2}}\right)+(1-\alpha) \operatorname{KRCC}\left(s_{i}, s_{j}\right)$

In [272]:
# t = scipy.stats.kendalltau(user.representative_item_value, pd_data[selected_items[1]])
# t.correlation
taus = np.array([scipy.stats.kendalltau(user.representative_item_value, pd_data[item]).correlation for item in user.call_history])
print('taus:', taus, sep='\n')

taus:
[0.47892074 0.25354628 0.81698245 0.81698245 0.36623351 0.72222222 0.42257713 0.92966968 0.44444444 0.83333333 0.19720266 0.47892074 0.6479516  0.61111111]


In [275]:
# 计算欧式距离
distances = np.array([np.linalg.norm(user.representative_item_value - pd_data[item]) for item in user.call_history])
distances = distances / np.max(distances) # 归一化
distances = 1 - distances
print('distances:', distances, sep='\n')

distances:
[1897.69659179 2125.25681848 2058.9227027   475.26067005 1861.15024407 1871.28672429 1877.04284195 2057.49681519 2006.98952476  359.64511814 1850.70024977 1966.84432167 2027.1593461  2050.1823047 ]


In [270]:
# 计算综合相似度
alpha = 0.5
similarities_with_combination = np.array([alpha * distances[i] + (1-alpha) * taus[i] for i in range(len(taus))])

print('similarities_original:', similarities_between_rep_and_rec, sep='\n')
print('similarities_avg_orginal:', similarities_avg_between_rec_and_rep, sep='\n')
print('similarities_with_combination:', similarities_with_combination, sep='\n')
print('similarities_avg_with_combination:', np.mean(similarities_with_combination), sep='\n')

similarities_original:
[1.         0.47146496 0.77828537 0.63660976 0.82865239 0.86685197 0.91704186 0.85197143 0.94014186]
similarities_avg_orginal:
0.810113288013395
similarities_with_combination:
[0.61350934 0.25724788 0.47270326 0.5316856  0.57490507 0.50837888 0.66709085 0.47682701 0.52267614]
similarities_avg_with_combination:
0.5138915577569442


In [271]:
# 如上得到的是user的推荐结果列表，该方法认为每个用户具有相同的多样性偏好
# 有的用户调用了3个不同的服务，有的用户调用了30个不同的服务，这两者的多样性偏好显然是不同的
# 我们需要根据用户的服务调用历史记录来计算用户的多样性偏好, user.call_num
# 在计算核矩阵时，需要考虑用户的历史调用多样性偏好
print('user.call_num:', user.call_num, sep='\n')

user.call_num:
14
