In [10]:
from dfdpy.python import make_dfd, MermaidJsGraphExporter
# Build a data-flow diagram (DFD) description from the Python snippet passed
# as `source`. make_dfd returns three values, unpacked here and handed to the
# Mermaid exporter at the end of this cell.
#
# The snippet is supplied as text and uses names it never defines (np, pd,
# generate_instance, MiniBatchKMeans, OneHotEncoder, ...) -- presumably
# make_dfd only parses it and does not execute it; TODO confirm against dfdpy.
#
# NOTE(review): the literal is not a raw string and opens with a newline, so
# line 1 of the analysed text is empty and the trailing backslash after
# `count_each_class_of_instance_in_bags_matrix = ` is consumed by Python,
# joining that line with the next one. The "[Line N-M]" labels in the exported
# graph therefore appear to count lines of the resulting text, not lines of
# this cell.
#
# NOTE(review): inside the snippet, `n_fatures` looks like a typo for
# `n_features`, and `divisions` ends in two equal values (1000 and
# n_max_instance_in_one_bag, which is also 1000), so its last interval is
# empty. Left untouched here because editing the text would change the
# generated graph.
#
# hidden_id_list is passed as an empty list; its meaning is not visible here.
process_node_list, data_store_node_list, edges = make_dfd(source="""
np.random.seed(123)

n_classes = 15
n_bags = 200
n_max_instance_in_one_bag = 1000
n_instances_of_each_bags = [np.random.randint(low=0, high=n_max_instance_in_one_bag) for _ in range(n_bags)]
class_labels_of_instance_in_bags = generate_instance(n_classes, n_instances_of_each_bags)
count_each_class_of_instance_in_bags = [
    pd.Series(x).value_counts().to_dict() for x in class_labels_of_instance_in_bags
]
count_each_class_of_instance_in_bags_matrix = \
    pd.DataFrame(count_each_class_of_instance_in_bags)[list(range(n_classes))].values
count_each_class_of_instance_in_bags_matrix = np.nan_to_num(count_each_class_of_instance_in_bags_matrix)
lower_threshold = np.zeros_like(count_each_class_of_instance_in_bags_matrix)
upper_threshold = np.zeros_like(count_each_class_of_instance_in_bags_matrix)
divisions = [0, 50, 100, 200, 1000, n_max_instance_in_one_bag]
for i_bag in range(n_bags):
    for i_class in range(n_classes):
        positive_count = count_each_class_of_instance_in_bags_matrix[i_bag, i_class]
        for i_division in range(len(divisions)-1):
            if divisions[i_division] <= positive_count and positive_count < divisions[i_division+1]:
                lower_threshold[i_bag, i_class] = divisions[i_division]
                upper_threshold[i_bag, i_class] = divisions[i_division+1]

n_fatures = 7
x_min = 0
x_max = 100
cov_diag = 0.1*40**2

means_of_classes = [np.random.uniform(low=x_min, high=x_max, size=n_fatures) for _ in range(n_classes)]
covs_of_classes = [np.eye(n_fatures)*cov_diag for _ in range(n_classes)]
bags = [
    np.vstack([
        np.random.multivariate_normal(
            means_of_classes[class_label],
            covs_of_classes[class_label],
            size=1) for class_label in class_labels_of_instance_in_bag
    ]) for class_labels_of_instance_in_bag in class_labels_of_instance_in_bags
]

true_y = [np.array([class_label for class_label in class_labels_of_instance_in_bag]) for class_labels_of_instance_in_bag in class_labels_of_instance_in_bags]

flatten_features = np.vstack(bags)
max_n_clusters = 500
# cluster_generator = KMeans(n_clusters=max_n_clusters, random_state=0)
# cluster_generator = DBSCAN()# KMeans(n_clusters=n_clusters, random_state=0)
cluster_generator = MiniBatchKMeans(n_clusters=max_n_clusters, random_state=0)
insample_estimated_clusters = cluster_generator.fit_predict(flatten_features)
n_clusters = np.max(insample_estimated_clusters) + 1
print("n_clusters:", n_clusters)

cluster_encoder = OneHotEncoder(sparse=False)
cluster_encoder.fit(np.array([np.arange(n_clusters)]).T)

milclassifier = generate_mil_classifier(
    cluster_generator,
    cluster_encoder,
    bags,
    lower_threshold,
    upper_threshold,
    n_clusters)

df_confusion_matrix = pd.crosstab(np.hstack(true_y), milclassifier.predict(np.vstack(bags)))

print("confusion matrix")
print(df_confusion_matrix)
""", hidden_id_list=[])
# Serialise the three make_dfd results to Mermaid markup with a left-to-right
# ("LR") graph orientation, then print the markup as the cell's output.
exporter = MermaidJsGraphExporter(graph_orientation="LR")
mermaid_markup = exporter.export(
    process_node_list=process_node_list,
    data_store_node_list=data_store_node_list,
    edges=edges,
)
print(mermaid_markup)

graph LR;
L7-L7("n_instances_of_each_bags&nbsp;=&nbsp;[np.random.randint(low=0,&nbsp;high=n_max_instance_in_one_bag)&nbsp;for&nbsp;_&nbsp;in&nbsp;range(n_bags)]<br />[Line 7-7]");
style L7-L7 text-align:left;
L8-L8("class_labels_of_instance_in_bags&nbsp;=&nbsp;generate_instance(n_classes,&nbsp;n_instances_of_each_bags)<br />[Line 8-8]");
style L8-L8 text-align:left;
L9-L11("count_each_class_of_instance_in_bags&nbsp;=&nbsp;[<br />&nbsp;&nbsp;&nbsp;&nbsp;pd.Series(x).value_counts().to_dict()&nbsp;for&nbsp;x&nbsp;in&nbsp;class_labels_of_instance_in_bags<br />]<br />[Line 9-11]");
style L9-L11 text-align:left;
L12-L12("count_each_class_of_instance_in_bags_matrix&nbsp;=&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;pd.DataFrame(count_each_class_of_instance_in_bags)[list(range(n_classes))].values<br />[Line 12-12]");
style L12-L12 text-align:left;
L13-L13("count_each_class_of_instance_in_bags_matrix&nbsp;=&nbsp;np.nan_to_num(count_each_class_of_instance_in_bags_matrix)<br />[Line 13-13]");
style L13-L13 text