## 数据集大小和标签相关性的关系

In [1]:
import skmultilearn
# from skmultilearn.dataset import load_dataset
from leemultilearn.dataset import available_datasets, load_dataset
import leemultilearn
import numpy as np
import scipy.sparse as sp
import scipy
import pandas as pd

根据 ProXML (MM 2019), 度量数据集标签相关性强弱的方法有这样一种: 

计算标签相关性矩阵 $A$，

$$
a_{jk} = \langle y_{\bullet j},  y_{\bullet k} \rangle
$$

得到其拉普拉斯矩阵 $L = I - D^{1/2} A D^{1/2}$, 
记其特征值 

$$
\lambda_1 \leq \lambda_2 \leq \dots \leq \lambda_q
$$

其中 $\lambda_2$ (第二小的特征值) 和图的连通性关联非常大, 这个特征值被单独命名为the algebraic connectivity of the graph. 

我们尝试用 algebraic connectivity 指标来评估不同的数据集, 同时展示
数据集的样本数 $N$, 
每个样本的平均标签数 $\mathrm{avg.}{|\mathcal{Y_i}|}$, 
每个标签的平均样本数 $\mathrm{avg.}{|\mathcal{X^j}|}$. 

$\mathrm{avg.}{|\mathcal{Y_i}|}$ 可以用来评估标签的稀疏程度, 
$\mathrm{avg.}{|\mathcal{X^j}|}$ 可以用来评估标签的长尾程度.  

In [2]:
def algebraic_connectivity(Y):
    n, q = Y.shape
    A = (Y.T @ Y) + 0.001 * np.eye(q)
    D1 = np.diag(A.sum(axis=1).A.ravel() ** (-1/2)) 
    I = np.eye(q)
    L = I - D1 @ A @ D1
    
    eigval, eigvec = np.linalg.eigh(L)
    return min(eigval[1], eigval[-2])
    

In [3]:
dataset_names = {
    name for name, split in 
    available_datasets()
}

datasets = [
    [name] + list(load_dataset(name, 'undivided'))
    for name in dataset_names
]

In [4]:
# print('''
# dataset               N       Q       Algebraic     avg_Yi    avg_Xj
#                                       connectivity
# --------------------------------------------------------------------
# ''')

dataset_info = pd.DataFrame(columns=['name', 'N', 'Q', 'Algebraic connectivity', 'avg_Yi', 'avg_Xj'])

for name, X, Y, _, _ in datasets:
    n, m = X.shape
    _, q = Y.shape
#     Y = Y.toarray()
    c = abs(algebraic_connectivity(Y))
    avg_Yi = Y.sum(axis=0).mean()
    avg_Xj = Y.sum(axis=1).mean()
    
    dataset_info.loc[len(dataset_info)] = [name, n, q, c, avg_Yi, avg_Xj]
    # print(f"{name:<20s}  {n:<6d}  {q:<6d}  {c:<12.4f}  {avg_Yi:<8.1f}  {avg_Xj:<8.1f}")

In [5]:
pd.options.display.float_format = '{:.3f}'.format
dataset_info.sort_values(by='Q')[-10:]

Unnamed: 0,name,N,Q,Algebraic connectivity,avg_Yi,avg_Xj
9,Yahoo-Social,12111,39,0.0,397.256,1.279
1,Yahoo-Science,6428,40,0.186,232.975,1.45
10,Medical,978,45,0.0,27.067,1.245
19,Enron,1702,53,0.794,137.283,4.275
14,rcv1subset1,6000,101,0.193,171.069,2.88
22,rcv1subset5,6000,101,0.136,156.921,2.642
16,mediamill,43907,101,0.385,1902.158,4.376
11,Bibtex,7395,159,0.04,111.711,2.402
26,Corel5k,5000,374,0.129,47.086,3.522
13,delicious,16105,983,0.109,311.614,19.02


结果发现并不能很好地观测到 Algebraic connectivity 随 Q 增大而下降的趋势. 
有可能是因为这里没有 Q 足够大的数据集. 

In [6]:
dataset_names = ['Medical', 'Enron']

In [7]:
from skmultilearn.problem_transform import BinaryRelevance
from leemultilearn.models import MetaBinaryRelevance
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier
from leemultilearn.models import MLPClassifier

from leemultilearn.metrics import (
    hamming_loss, 
    label_ranking_loss,
    average_precision_score,
    coverage_error,
    one_error,
    top_k_accuracy_score
)

In [17]:
for dataset_name in ['Bibtex']:
    X_train, Y_train, _, _ = load_dataset(dataset_name, 'train')
    X_test, Y_test, _, _ = load_dataset(dataset_name, 'test')
    Y_test = Y_test.toarray()
    
    n, q =  Y_train.shape
    c = abs(algebraic_connectivity(Y_train))
    
    # without label correlations
    model1 = BinaryRelevance(classifier=SVC(probability=True))
    model1.fit(X_train, Y_train)
    Y_pred1 = model1.predict(X_test).toarray()
    Y_pred_proba1 = model1.predict_proba(X_test).toarray()
    
    # with label correlations
    model2 = MetaBinaryRelevance(classifier=SVC(probability=True))
    model2.fit(X_train, Y_train)
    Y_pred2 = model2.predict(X_test).toarray()
    Y_pred_proba2 = model2.predict_proba(X_test).toarray()
    
    print('dataset: %s (n=%d, q=%d, c=%.4f)' % (dataset_name, n, q, c))
    print('  BR:')
    print('  %15s: %.4f' % ('hamming_loss', hamming_loss(Y_test, Y_pred1)))
    print('  %15s: %.4f' % ('ranking_loss', label_ranking_loss(Y_test, Y_pred_proba1)))
    print('  %15s: %.4f' % ('precision@5', top_k_accuracy_score(Y_test, Y_pred1, k=5)))
    print('  MBR:')
    print('  %15s: %.4f' % ('hamming_loss', hamming_loss(Y_test, Y_pred2)))
    print('  %15s: %.4f' % ('ranking_loss', label_ranking_loss(Y_test, Y_pred_proba2)))
    print('  %15s: %.4f' % ('precision@5', top_k_accuracy_score(Y_test, Y_pred2, k=5)))

dataset: Bibtex (n=4880, q=159, c=0.0412)
  BR:
     hamming_loss: 0.0131
     ranking_loss: 0.0649
      precision@5: 0.0859
  MBR:
     hamming_loss: 0.0124
     ranking_loss: 0.0785
      precision@5: 0.1183


```
dataset: Enron (n=1141, q=53, c=0.8120)
  BR:
     hamming_loss: 0.0426
     ranking_loss: 0.1097
      precision@5: 0.1561
  MBR:
     hamming_loss: 0.0413
     ranking_loss: 0.1096
      precision@5: 0.1651

```