In [1]:
!pip install milwrap

Collecting milwrap
  Downloading milwrap-0.1.1-py3-none-any.whl (4.8 kB)
Installing collected packages: milwrap
Successfully installed milwrap-0.1.1


In [None]:
import pandas as pd
import numpy as np
from milwrap.countbase import MilCountBasedMultiClassLearner

In [4]:
# Generate dataset

def generate_class_ratios(n_classes):
    """Draw a random, sparse class-probability vector of length ``n_classes``.

    Each class is switched on independently with probability 0.2; the draw is
    repeated until at least one class is on. Active classes receive a
    uniform [0, 1) weight raised to the 4th power, inactive classes get 0,
    and the result is normalised so that it sums to 1.

    Uses NumPy's global random state (``np.random``).

    Parameters
    ----------
    n_classes : int
        Number of classes (length of the returned vector).

    Returns
    -------
    numpy.ndarray
        1-D array of ``n_classes`` non-negative ratios summing to 1.
    """
    # Re-draw the on/off mask until at least one class is active; otherwise
    # the normalisation below would divide by zero.
    active_mask = np.random.choice([0, 1], size=n_classes, p=[.8, .2])
    while not active_mask.any():
        active_mask = np.random.choice([0, 1], size=n_classes, p=[.8, .2])

    # The 4th power exaggerates differences between the active classes.
    weights = (active_mask * np.random.uniform(low=0, high=1, size=n_classes)) ** 4

    return weights / weights.sum()

def generate_instance(n_classes, n_instances_of_each_bags):
    """Sample per-instance class labels for every bag.

    For each bag a fresh class-ratio vector is drawn with
    ``generate_class_ratios`` and then used as the sampling distribution for
    that bag's instance labels.

    Parameters
    ----------
    n_classes : int
        Number of classes; labels are drawn from ``0 .. n_classes - 1``.
    n_instances_of_each_bags : iterable of int
        Number of instances to draw for each bag.

    Returns
    -------
    list of numpy.ndarray
        One 1-D label array per bag, in the same order as the input sizes.
    """
    class_ids = np.arange(n_classes)
    labels_per_bag = []
    for bag_size in n_instances_of_each_bags:
        # Ratios are drawn before the labels, one fresh vector per bag.
        bag_ratios = generate_class_ratios(n_classes)
        labels_per_bag.append(
            np.random.choice(class_ids, size=bag_size, p=bag_ratios))

    return labels_per_bag

# Seed NumPy's global RNG so the synthetic dataset is reproducible.
np.random.seed(123)

n_classes = 15
n_bags = 100
n_max_instance_in_one_bag = 1000
# Bag sizes are drawn from [0, n_max_instance_in_one_bag); note that a size
# of 0 (an empty bag) is possible because low=0.
n_instances_of_each_bags = [np.random.randint(low=0, high=n_max_instance_in_one_bag) for _ in range(n_bags)]
class_labels_of_instance_in_bags = generate_instance(n_classes, n_instances_of_each_bags)

# Per-bag {class label: number of instances} dictionaries.
count_each_class_of_instance_in_bags = [
    pd.Series(x).value_counts().to_dict() for x in class_labels_of_instance_in_bags
]
# (n_bags, n_classes) count matrix. reindex() is used instead of plain column
# selection so that a class which never occurs in any bag yields a NaN column
# (turned into 0 below) rather than a KeyError.
count_each_class_of_instance_in_bags_matrix = (
    pd.DataFrame(count_each_class_of_instance_in_bags)
    .reindex(columns=range(n_classes))
    .values
)
# Classes absent from a bag show up as NaN; treat them as a count of 0.
count_each_class_of_instance_in_bags_matrix = np.nan_to_num(count_each_class_of_instance_in_bags_matrix)

# For every (bag, class) pair, record the [lower, upper) count interval that
# contains the true count. Entries whose count falls in no interval stay 0.
lower_threshold = np.zeros_like(count_each_class_of_instance_in_bags_matrix)
upper_threshold = np.zeros_like(count_each_class_of_instance_in_bags_matrix)
divisions = [0, 50, 100, 200, 1000, n_max_instance_in_one_bag]
for i_bag in range(n_bags):
    for i_class in range(n_classes):
        positive_count = count_each_class_of_instance_in_bags_matrix[i_bag, i_class]
        for i_division in range(len(divisions)-1):
            if divisions[i_division] <= positive_count < divisions[i_division+1]:
                lower_threshold[i_bag, i_class] = divisions[i_division]
                upper_threshold[i_bag, i_class] = divisions[i_division+1]

# Feature-space configuration for the synthetic instances.
n_fatures = 7
x_min = 0
x_max = 100
cov_diag = 0.1*40**2  # variance placed on every diagonal entry of the covariance

# One Gaussian per class: a uniformly drawn mean in [x_min, x_max) per feature
# and an isotropic covariance shared in value across classes.
means_of_classes = [np.random.uniform(low=x_min, high=x_max, size=n_fatures) for _ in range(n_classes)]
covs_of_classes = [np.eye(n_fatures)*cov_diag for _ in range(n_classes)]

# Each bag becomes an (n_instances, n_fatures) array, one sampled row per
# instance label. np.vstack raises ValueError on an empty sequence, so a bag
# with no labels is represented by an explicit (0, n_fatures) array instead.
# NOTE(review): whether downstream consumers accept empty bags is not
# visible here - confirm before relying on it.
bags = [
    np.vstack([
        np.random.multivariate_normal(
            means_of_classes[class_label],
            covs_of_classes[class_label],
            size=1) for class_label in class_labels_of_instance_in_bag
    ]) if len(class_labels_of_instance_in_bag) > 0 else np.empty((0, n_fatures))
    for class_labels_of_instance_in_bag in class_labels_of_instance_in_bags
]

len(bags): 100


In [14]:
# Show dataset structures

# (caption, value) pairs printed one per line; captions ending in "\n" put
# the value on the following line.
for caption, shown in [
    ("len(bags) =", len(bags)),
    ("bags[0].shape =", bags[0].shape),
    ("bags[0][:4, :5] =\n", bags[0][:4, :5]),
    ("lower_threshold.shape = ", lower_threshold.shape),
    ("lower_threshold[:3, :4] = \n", lower_threshold[:3, :4]),
    ("upper_threshold.shape = ", upper_threshold.shape),
    ("upper_threshold[:3, :4] = \n", upper_threshold[:3, :4]),
]:
    print(caption, shown)

len(bags) = 100
bags[0].shape = (510, 7)
bags[0][:4, :5] =
 [[ 60.72742676  24.91347932  -6.96853301   1.22269867  53.49671751]
 [ 72.63044984  16.14588346  18.953747     0.56814811  32.67886356]
 [ 72.94026896 107.30726053  69.86360443   8.86078304 103.95422024]
 [ 49.38941779  34.00583834  -1.34933507  41.55929543  72.78695625]]
lower_threshold.shape =  (100, 15)
lower_threshold[:3, :4] = 
 [[200.   0.   0.   0.]
 [  0.   0.   0.   0.]
 [  0.   0. 200.   0.]]
upper_threshold.shape =  (100, 15)
upper_threshold[:3, :4] = 
 [[1000.   50.   50.   50.]
 [  50.   50.   50.   50.]
 [  50.   50. 1000.   50.]]


In [15]:
from sklearn.tree import DecisionTreeClassifier

# `clf` is the fully supervised (SIL) baseline estimator, fitted further below.
clf = DecisionTreeClassifier(min_samples_leaf=10)

# Give the MIL learner its own estimator instance. If the learner hands back
# the very object it was given (not visible from this notebook), sharing `clf`
# would let the SIL fit below silently overwrite `clf_mil`.
learner = MilCountBasedMultiClassLearner(DecisionTreeClassifier(min_samples_leaf=10))
clf_mil, y_mil = learner.fit(
    bags,
    lower_threshold,
    upper_threshold,
    n_classes,
    debug_true_y=class_labels_of_instance_in_bags,
    max_iter=10)

# Flatten all bags once: one row per instance and its ground-truth label.
all_instances = np.vstack(bags)
true_instance_labels = np.hstack(class_labels_of_instance_in_bags)

# Accuracy of the MIL-trained classifier's predictions on every instance.
print("MIL instance unit accuracy")
print(np.mean(clf_mil.predict(all_instances) == true_instance_labels))
print("----")

# Accuracy of the per-instance labels returned by the learner itself.
print("MIL instance unit accuracy (label adjusted)")
print(np.mean(np.hstack(y_mil) == true_instance_labels))
print("----")

# Baseline: train directly on the true instance labels and score on the
# same (training) instances.
print("SIL instance unit accuracy")
clf.fit(all_instances, true_instance_labels)
print(np.mean(clf.predict(all_instances) == true_instance_labels))
print("----")


iter: 0
false negative instances
-43.0
false positive instances
13.0
num changes instances
1642
instance unit accuracy
0.8457921261834974
instance unit accuracy (label adjusted)
0.869177403369673
-----
iter: 1
false negative instances
-15.0
false positive instances
0.0
num changes instances
279
instance unit accuracy
0.897475908314531
instance unit accuracy (label adjusted)
0.9000695864875693
-----
iter: 2
false negative instances
-8.0
false positive instances
11.0
num changes instances
191
instance unit accuracy
0.9085886595112076
instance unit accuracy (label adjusted)
0.9106340805094575
-----
iter: 3
false negative instances
-8.0
false positive instances
1.0
num changes instances
171
instance unit accuracy
0.9104864728085528
instance unit accuracy (label adjusted)
0.9125318938068026
-----
iter: 4
false negative instances
-8.0
false positive instances
0.0
num changes instances
160
instance unit accuracy
0.9132277586824958
instance unit accuracy (label adjusted)
0.9147670961347869
---