In [28]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, scipy, math, sklearn.preprocessing

In [29]:
np.set_printoptions(linewidth=400)

In [30]:
# 读取数据集
data = pd.read_table('qws2.CSV', header=None, delimiter=',')
pd_data = data.iloc[1:, 0:9]
pd_data

Unnamed: 0,0,1,2,3,4,5,6,7,8
1,302.75,89,7.1,90,73,78,80,187.75,32
2,482,85,16,95,73,100,84,1,2
3,3321.4,89,1.4,96,73,78,80,2.6,96
4,126.17,98,12,100,67,78,82,22.77,89
5,107,87,1.9,95,73,89,62,58.33,93
...,...,...,...,...,...,...,...,...,...
2503,200.8,93,2.4,98,73,100,84,7.4,41
2504,56.17,97,11.3,97,83,78,91,7.17,3
2505,93.93,80,2.1,80,67,78,82,3.72,60
2506,106.75,86,1.3,95,80,78,87,1.25,96


In [31]:
# 根据数据集设置参数
pd_data = pd_data.values.astype(float) # 转换为float类型
item_size = pd_data.shape[0] # 获取数据集的行数
feature_dimension = pd_data.shape[1] # 获取数据集的列数，特征维度
max_length = 10 # 推荐列表的长度

In [40]:
# 归一化操作
feature_vectors = pd_data
feature_vectors = feature_vectors / np.linalg.norm(feature_vectors, axis=1, keepdims=True)
# scaler = sklearn.preprocessing.MinMaxScaler()
# feature_vectors = scaler.fit_transform(feature_vectors)
print('feature_vectors:', feature_vectors, sep='\n')

feature_vectors:
[[7.52615874e-01 2.21247937e-01 1.76501163e-02 ... 1.98874550e-01 4.66733709e-01 7.95498199e-02]
 [9.25520798e-01 1.63214249e-01 3.07226821e-02 ... 1.61294081e-01 1.92016763e-03 3.84033526e-03]
 [9.98003821e-01 2.67424399e-02 4.20667595e-04 ... 2.40381483e-02 7.81239819e-04 2.88457779e-02]
 ...
 [4.55423370e-01 3.87883207e-01 1.01819342e-02 ... 3.97580287e-01 1.80365691e-02 2.90912405e-01]
 [4.46778607e-01 3.59934053e-01 5.44086360e-03 ... 3.64119333e-01 5.23159961e-03 4.01786850e-01]
 [8.68583913e-01 1.97592549e-01 4.36350212e-02 ... 2.38757663e-01 1.50938753e-02 2.19547277e-02]]


In [41]:
# 定义用户类
class User:
    call_num = np.random.randint(5, 15 + 1) # [low, hight + 1)
    call_history = np.random.choice(item_size, call_num, replace=False) # 从item_size中随机挑选call_num个数，不重复
    representative_item_index = call_history[np.random.randint(0, call_num)] # 随机选择一个item作为参考服务
    representative_item_value_origin = pd_data[representative_item_index - 1]
    representative_item_value = feature_vectors[representative_item_index - 1] # pd_data [0, 2506]
    def __init__(self):
        print(f'call_num: {self.call_num}')
        print(f'call_history: {self.call_history}')
        print(f'representative_item_index: {self.representative_item_index}')
        print(f'representative_item_value: {self.representative_item_value}')
        print(f'representative_item_value_origin: {self.representative_item_value_origin}')
    def __str__(self):
        return f'call_num: {self.call_num} \ncall_history: {self.call_history} \nrepresentative_item_index: {self.representative_item_index} \nrepresentative_item_value: {self.representative_item_value}'

In [45]:
# 生成用户
user = User()

call_num: 6
call_history: [1279 2324 1460  311  359 2406]
representative_item_index: 1279
representative_item_value: [0.46354925 0.37470231 0.01892826 0.38242813 0.30903284 0.38629105 0.33607321 0.00645106 0.37470231]
representative_item_value_origin: [120.    97.     4.9   99.    80.   100.    87.     1.67  97.  ]


In [43]:
# 如上我们构造了一个用户，为他随机生成了服务的调用历史记录
# 我们根据服务服务调用历史记录随机选择了一个参考服务
similarities = np.dot(feature_vectors, feature_vectors.T)
print('similarities:', similarities, sep='\n')

similarities:
[[1.         0.86997638 0.77945642 ... 0.75845166 0.73581933 0.88588136]
 [0.86997638 1.         0.94487737 ... 0.73931327 0.71484563 0.98982433]
 [0.77945642 0.94487737 1.         ... 0.51003225 0.50232576 0.8947768 ]
 ...
 [0.75845166 0.73931327 0.51003225 ... 1.         0.9913373  0.81457884]
 [0.73581933 0.71484563 0.50232576 ... 0.9913373  1.         0.78706589]
 [0.88588136 0.98982433 0.8947768  ... 0.81457884 0.78706589 1.        ]]


使用如下方式计算相似度， dis 为欧氏距离，KRCC 为肯德尔系数

肯德尔系数：

$\tau=\frac{\text { (number of concordant pairs })-(\text { number of discordant pairs })}{n(n-1) / 2}$

$\operatorname{Sim}\left(s_{i}, s_{j}\right)=\alpha\left(1-\frac{d i s\left(s_{i}, s_{j}\right)}{\sqrt{2}}\right)+(1-\alpha) \operatorname{KRCC}\left(s_{i}, s_{j}\right)$

In [51]:
# 现在我们拥有了相似度向量，还需要对于 item 的评分
taus = np.array([scipy.stats.kendalltau(user.representative_item_value, item).correlation for item in feature_vectors])
print('taus:', taus, sep='\n')

taus:
[0.30988989 0.81698245 0.68571429 ... 0.51428571 0.59160798 0.57142857]


In [57]:
# 欧氏距离
distances = np.array([np.linalg.norm(user.representative_item_value - item) for item in feature_vectors])
print('distances:', distances, sep='\n')

distances:
[0.7094904  0.72983024 0.98141925 ... 0.10770356 0.08172501 0.62638311]


In [66]:
# alpha 是一个平衡参数
alpha = 0.5
scores = np.array([alpha * distances[i] + (1-alpha) * taus[i] for i in range(len(taus))])
print('scores:', scores, sep='\n')

scores:
[0.50969015 0.77340634 0.83356677 ... 0.31099464 0.33666649 0.59890584]


In [67]:
# 根据特征矩阵和 scores 生成核矩阵
kernel_matrix = scores.reshape((item_size, 1)) * similarities * scores.reshape((1, item_size))
print('kernel_matrix', kernel_matrix, sep='\n')

kernel_matrix
[[0.25978405 0.34294259 0.33116045 ... 0.12022286 0.12626335 0.27042096]
 [0.34294259 0.59815737 0.60914904 ... 0.17782349 0.18613151 0.45848423]
 [0.33116045 0.60914904 0.69483355 ... 0.13221811 0.14096969 0.44669763]
 ...
 [0.12022286 0.17782349 0.13221811 ... 0.09671766 0.10379448 0.15172061]
 [0.12626335 0.18613151 0.14096969 ... 0.10379448 0.11334433 0.1586973 ]
 [0.27042096 0.45848423 0.44669763 ... 0.15172061 0.1586973  0.3586882 ]]


In [68]:
# dpp 核心算法
def dpp(kernel_matrix, max_length, epsilon=1E-10):
    cis = np.zeros((max_length, item_size))
    di2s = np.copy(np.diag(kernel_matrix))
    selected_items = list()
    selected_item = np.argmax(di2s)
    selected_items.append(selected_item)

    while len(selected_items) < max_length:
        k = len(selected_items) - 1
        ci_optimal = cis[:k, selected_item]
        di_optimal = math.sqrt(di2s[selected_item])
        elements = kernel_matrix[selected_item, :]
        eis = (elements - np.dot(ci_optimal, cis[:k, :])) / di_optimal
        cis[k, :] = eis
        di2s -= np.square(eis)
        di2s[selected_item] = -np.inf
        selected_item = np.argmax(di2s)
        if di2s[selected_item] < epsilon:
            break
        selected_items.append(selected_item)
    return selected_items

In [105]:
# 调用 dpp 算法
selected_items = dpp(kernel_matrix, max_length)
print('selected_items:', selected_items, sep='\n')

selected_items:
[77, 786, 1497, 1070, 1628, 1548, 2168, 2165, 570]


In [107]:
print('user.representative_item_index:', user.representative_item_index, sep='\n')

user.representative_item_index:
1279


In [109]:
print(scores[1278])

0.5


In [111]:
for (index, item) in enumerate(selected_items):
    print(f"index: {index}, item: {item}, scores: {scores[item - 1]}")

index: 0, item: 77, scores: 0.556329505357628
index: 1, item: 786, scores: 0.6963184615889936
index: 2, item: 1497, scores: 0.5871837464332773
index: 3, item: 1070, scores: 0.7139475193111698
index: 4, item: 1628, scores: 0.5574859690999936
index: 5, item: 1548, scores: 0.4972302963115882
index: 6, item: 2168, scores: 0.5832973779928448
index: 7, item: 2165, scores: 0.6397542619643823
index: 8, item: 570, scores: 0.5574365748343221


In [112]:
# 计算推荐列表的准确性
dcg_value = np.sum([2 ** scores[item - 1] / np.log2(index + 2) for (index, item) in enumerate(selected_items)])
print('dcg_value:', dcg_value, sep='\n')

dcg_value:
6.456515902069293
