In [1]:
from typing import List, Dict, Tuple, Any
import argparse

try:
    from .utils import isnotebook
except ImportError:
    try:
        from utils import isnotebook
    except:
        isnotebook = lambda: True
from pathlib import Path
import json
import sys
sys.path.append('../')
from skmultilearn.dataset import load_from_arff

In [6]:
def get_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Read an .arff dataset file")
    parser.add_argument("-i", "--input-file", type=Path)
    parser.add_argument(
        "-f",
        "--format",
        default="meka",
        choices=["meka", "mulan"],
        help="Format. See http://scikit.ml/datasets.html#ARFF-files",
    )
    parser.add_argument(
        "-t",
        "--train_percent",
        default=70.0,
        type=float,
        help="Percent (t) to keep for train",
    )
    parser.add_argument(
        "-o",
        "--output_dir",
        required=True,
        type=Path,
    )
    parser.add_argument(
        "-p",
        "--filename_prefix",
        help="prefix in prefix_train.arff and prefix_dev.arff",
    )
    if isnotebook():
        import shlex  # noqa

        args_str = ("-i ../.data/bibtex_meka/all.arff "
                    "-o ../.data/bibtex_meka"
                   )
        args = parser.parse_args(shlex.split(args_str))
    else:
        args = parser.parse_args()
    return args

In [7]:
#def main(args: argparse.Namespace)->None:
args = get_args()
    

In [8]:
X, y, feature_names, label_names =load_from_arff(args.input_file, label_count=159, return_attribute_definitions=True)

In [9]:
y

<7395x159 sparse matrix of type '<class 'numpy.int64'>'
	with 17762 stored elements in List of Lists format>

In [10]:
from skmultilearn.model_selection import IterativeStratification

In [11]:
k_fold = IterativeStratification(n_splits=2, order=1)

In [12]:
folds = list(k_fold.split(X, y))

In [15]:
folds[0][1]

array([   0,    2,    4, ..., 7389, 7391, 7394])

In [None]:

if __name__=="__main__":
    main()

In [34]:
import numpy as np

In [62]:
def dataset(
    x: np.ndarray,
    y: np.ndarray,
    feature_names: List[Tuple[str, Any]],
    label_names: List[Tuple[str, Any]],
) -> List[Dict]:
    num_features = len(feature_names)
    assert x.shape[-1] == num_features
    num_total_labels = len(label_names)
    assert y.shape[-1] == num_total_labels
    all_labels = np.array([l[0] for l in label_names])
    data = [{'x': xi.tolist(), 'y': (all_labels[yi ==1]).tolist()} for xi, yi in zip(x, y)]
    return data

In [66]:
imdb = dataset(X.toarray(),y.toarray(), feature_names, label_names)

In [70]:
len(imdb[1]['x'])

1001

In [78]:
def cardinality(y: List[List])->float:
    return np.mean([len(i) for i in y])

In [79]:
cardinality([i['y'] for i in imdb ])

1.999669200043004

In [102]:
def diversity(y: List[List]) -> float:
    vocab = set([v for l in y for v in l])
    l2i = {label: i for i, label in enumerate(vocab)}
    i2l = {i: label for label, i in l2i.items()}
    label_sets_present = set()
    for yi in y:
        yi_onehot = [0]*len(vocab)
        for label in yi:
            yi_onehot[l2i[label]]=1
        label_sets_present.add(tuple(yi_onehot))
    return len(label_sets_present)/np.power(2, len(vocab))
    

In [103]:
diversity([i['y'] for i in imdb])

1.6774982213974e-05

In [None]:
from allennlp.data