In [45]:
import pandas as pd, os
import numpy as np, pickle, math, time
from itertools import combinations, permutations, product
from scipy import spatial
from scipy import stats
from tqdm import tqdm
from collections import Counter

In [2]:
# 设置显示宽度
np.set_printoptions(linewidth=400)

In [3]:
# 定义数据集路径
QWS_file_path = '../code/qws2.CSV'

In [4]:
# 读取数据
data = pd.read_table(QWS_file_path, header=None, delimiter=',')
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,Response Time,Availability,Throughput,Successability,Reliability,Compliance,Best Practices,Latency\t,Documentation,Service Name ...,WSDL Address
1,302.75,89,7.1,90,73,78,80,187.75,32,MAPPMatching ...,http://xml.assessment.com/service/MAPPMatching...
2,482,85,16,95,73,100,84,1,2,Compound2 ...,http://www.mssoapinterop.org/asmx/WSDL/compoun...
3,3321.4,89,1.4,96,73,78,80,2.6,96,USDAData ...,http://www.strikeiron.com/webservices/usdadata...
4,126.17,98,12,100,67,78,82,22.77,89,GBNIRHolidayDates ...,http://www.holidaywebservice.com/Holidays/GBNI...
...,...,...,...,...,...,...,...,...,...,...,...
2503,200.8,93,2.4,98,73,100,84,7.4,41,garnierService ...,http://genome.dkfz-heidelberg.de/menu/hobit/em...
2504,56.17,97,11.3,97,83,78,91,7.17,3,AWSAlexa ...,http://awis.amazonaws.com/AWSAlexa/AWSAlexa.wsdl
2505,93.93,80,2.1,80,67,78,82,3.72,60,interop2 ...,http://www.cs.fsu.edu/~engelen/interop2.wsdl
2506,106.75,86,1.3,95,80,78,87,1.25,96,SailboatCalcsWS ...,http://pooh.poly.asu.edu/cst556-sailboatcalcsw...


In [5]:
# 获取数据，忽略标题行，和非功能属性
pd_data = data.iloc[1:, :8].values.astype(float)
item_size = pd_data.shape[0]
data_dimensions = pd_data.shape[1]
print(f'item_size: {item_size}\ndata_dimensions: {data_dimensions}')

item_size: 2507
data_dimensions: 8


In [11]:
# 每一列的最大值和最小值
column_max = np.max(pd_data, axis=0)
column_min = np.min(pd_data, axis=0)
# print(f'column_max: {column_max}\ncolumn_min: {column_min}')

# 正负属性，如延迟越高越不好，归一化
pos_or_neg = ['-', '+', '+', '+', '+', '+', '+', '-']
for (index, value) in enumerate(pos_or_neg):
    if value == '-':  # 如果是负属性，改变方向
        pd_data[:, index] = (pd_data[:, index] - column_min[index]) / (column_max[index] - column_min[index])
    else:
        pd_data[:, index] = (column_max[index] - pd_data[:, index]) / (column_max[index] - column_min[index])

all_services = pd_data
print(f'all_services: \n{all_services}\nall_services.shape: {all_services.shape}')
# 数据集的划分
constrains_index = np.random.choice(item_size, 6, replace=False)  # 随机选择6个索引
constrains = pd_data[constrains_index, :]  # 选择6个索引的数据，作为约束集
print(f"constrains_service: \n{constrains}\nconstrains_service.shape:{constrains.shape}")

candidates = np.delete(pd_data, constrains_index, axis=0)  # 删除6个索引的数据，作为候选集
print(f"candidates_service: \n{candidates}\ncandidates_service.shape:{candidates.shape}")

all_services: 
[[5.36579259e-02 8.81720430e-01 1.62790698e-01 ... 6.71641791e-01 6.66666667e-01 4.52887611e-02]
 [8.98505251e-02 8.38709677e-01 3.69767442e-01 ... 1.00000000e+00 7.55555556e-01 1.81155045e-04]
 [6.63157448e-01 8.81720430e-01 3.02325581e-02 ... 6.71641791e-01 6.66666667e-01 5.67619140e-04]
 ...
 [1.14948099e-02 7.84946237e-01 4.65116279e-02 ... 6.71641791e-01 7.11111111e-01 8.38144006e-04]
 [1.40833126e-02 8.49462366e-01 2.79069767e-02 ... 6.71641791e-01 8.22222222e-01 2.41540059e-04]
 [5.64342062e-02 6.98924731e-01 3.67441860e-01 ... 8.35820896e-01 8.22222222e-01 1.26808531e-03]]
all_services.shape: (2507, 8)
constrains_service: 
[[4.52281295e-02 1.00000000e+00 2.95348837e-01 1.00000000e+00 7.14285714e-01 1.00000000e+00 7.55555556e-01 2.01685950e-03]
 [1.13877969e-03 6.12903226e-01 1.39534884e-01 6.08695652e-01 6.07142857e-01 8.35820896e-01 4.88888889e-01 3.69556291e-04]
 [4.31686343e-02 1.00000000e+00 3.02325581e-02 1.00000000e+00 3.03571429e-01 1.00000000e+00 4.888888

In [14]:
# 生成历史调用记录，随机从所有服务中挑选出 10 - 15 个服务
gen_histories = lambda all_services: all_services[np.random.choice(all_services.shape[0], np.random.randint(10, 15 + 1), replace=False)]

In [33]:
# 生成很多个用户的历史调用记录
gen_users_histories = lambda user_count, all_services: np.array([gen_histories(all_services) for _ in range(user_count)], dtype=list)

In [34]:
users_histories =gen_users_histories(6, all_services)

In [35]:
# 获取用户历史记录的香农信息熵，表示多样化程度
def get_shannon_entropies(histories):
    indexs = [[np.argmax(i) for i in item] for item in histories]
    shannon_entropies = []
    for item in indexs:
        shannon_entropy = np.abs(sum([count / len(item) * (math.log2(count / len(item))) for count in Counter(item).values()]))
        shannon_entropies.append(shannon_entropy)
    return np.array(shannon_entropies)

In [36]:
# 归一化香农熵，获取用户的多样化 fu 参数; H0 表示超参数，需要调参
fus = lambda shannon_entropies, H0 : np.asarray([(item - np.min(shannon_entropies) + H0) / (np.max(shannon_entropies) - np.min(shannon_entropies) + H0) for item in shannon_entropies])
fus(get_shannon_entropies(users_histories), H0=1)

array([1.        , 0.96097304, 0.85150711, 0.85150711, 0.71556066, 0.98499253])

In [37]:
# 获取两个列表之间的相似度，综合相似度，alpha 是一个固定的参数，dimensions 表示 item 的维度
def get_similarity(item1, item2, alpha=0.5, dimensions=8):
    distance = spatial.distance.euclidean(item1, item2)
    tau = stats.kendalltau(item1, item2).correlation
    similarity = alpha * (1.0 - distance / np.sqrt(8.0)) + (1.0 - alpha) * tau
    return similarity

In [43]:
# 根据约束服务和候选服务，生成推荐列表，fu 是个性参数，alpha是一个超参数
def get_kernel_matrix(constraints, candidates, fu, alpha):
    similarities  = np.asarray([get_similarity(item, constraints) for item in candidates])
    kernel_matrix = np.diag(np.square(similarities))
    comb = [(i,j) for (i, j) in list(combinations(range(len(candidates)), 2))]
    for (i, j) in tqdm(comb):
        kernel_matrix[i, j] = fu * alpha * similarities[i] * similarities[j] * get_similarity(candidates[i], candidates[j])
        kernel_matrix[j, i] = kernel_matrix[i, j]
    return kernel_matrix

In [None]:
# dpp 核心算法，max_length 是 topK
def dpp(kernel_matrix, max_length, epsilon=1E-10):
    item_size = kernel_matrix.shape[0]
    cis = np.zeros((max_length, item_size))
    di2s = np.copy(np.diag(kernel_matrix))
    selected_items = list()
    selected_item = np.argmax(di2s)
    selected_items.append(selected_item)
    while len(selected_items) < max_length:
        k = len(selected_items) - 1
        ci_optimal = cis[:k, selected_item]
        di_optimal = math.sqrt(di2s[selected_item])
        elements = kernel_matrix[selected_item, :]
        eis = (elements - np.dot(ci_optimal, cis[:k, :])) / di_optimal
        cis[k, :] = eis
        di2s -= np.square(eis)
        di2s[selected_item] = -np.inf
        selected_item = np.argmax(di2s)
        if di2s[selected_item] < epsilon:
            break
        selected_items.append(selected_item)
    return selected_items

In [38]:
# 获取推荐列表与参考服务相比较精确度
def get_dcg_value(constraint, result_list):
    gain = lambda score, rank: (np.power(2, score) - 1) / np.log2(1 + rank)
    dcg = np.sum([gain(get_similarity(item, constraint), index+1) for (index, item) in enumerate(result_list)])
    return dcg

In [39]:
# 获取某个列表的多样性，用累计不相似度来表示，alpha 是一个固定的参数，dimensions 表示 item 的维度
def get_diversity_of_list(hlist, alpha=0.5, dimensions=8):
    return 2 / (len(hlist) * (len(hlist) - 1)) * np.sum([1 - get_similarity(hlist[i], hlist[j], alpha, dimensions) for i, j in list(permutations(range(len(hlist)), 2))])

In [40]:
# 获取两个列表的多样性均方根误差，top_k 表示推荐的服务数量
def get_rmdse_of_lists(historical_list, recommend_list, top_k):
    historical_diversity = get_diversity_of_list(historical_list)
    recommend_diversity = get_diversity_of_list(recommend_list)
    return np.sum(np.square(historical_diversity - recommend_diversity)) / top_k

In [41]:
# 控制变量法比较候选集数量，属性维度，推荐数量
n_list = [1000, 1300, 1600, 1900, 2200, 2500]
d_list = [3, 4, 5, 6, 7, 8]
top_k = [3, 4, 5, 6, 7, 8]

In [42]:
# 使用 dpp 评判
dpp_res = {}

In [None]:
# 根据控制变量法，设置设置实验参数变化的影响结果
exp_list = list(product(n_list, d_list, top_k))

In [48]:
for (n, dimension, topK) in exp_list:
    pass

array([0.04522813, 1.        , 0.29534884])

In [None]:
def dpp_eva(n, dimension, topK):
    _constraint = constrains[0, :dimension]
    _candidates = candidates[:n, :dimension]
    _kernel_matrix = get_kernel_matrix(_constraint, _candidates, 1, 1)
    _result_list = dpp(_kernel_matrix, topK)
    _dcg = get_dcg_value(_constraint, _result_list)
    _diversity = get_diversity_of_list(_result_list)
    _rmdse = get_rmdse_of_lists(_constraint, _result_list, topK)
    return _dcg, _diversity, _rmdse