In [103]:
from keras.layers import Input, Activation, Dense, Conv1D, GlobalAveragePooling1D, GlobalMaxPooling1D, Masking, TimeDistributed
from keras.models import Model
from keras.losses import binary_crossentropy
from keras.optimizers import SGD, Adam
import keras

In [4]:
from sklearn.datasets import make_classification
import numpy as np

In [45]:
def create_mil_dataset(bag_sizes, prob_inst_positive=None, random_state=123456, **make_classification_kwargs):
    """This is not going to be efficient"""
    if prob_inst_positive is None:
        prob_inst_positive = 1 - np.exp(np.log(0.5)/np.mean(bag_sizes))
        
    X, y= make_classification(n_samples=np.asarray(bag_sizes).sum()*3, random_state=random_state)
    negative = X[y==0]
    positive = X[y==1]
    neg_pos = 0
    pos_pos = 0
    bags = []
    labels = []
    for i in range(len(bag_sizes)):
        bagdata = []
        baglabels = (np.random.uniform(size=bag_sizes[i]) > (1-prob_inst_positive)).astype(int)
        for lab in baglabels:
            if lab == 0:
                bagdata.append(negative[neg_pos])
                neg_pos += 1
            else:
                bagdata.append(positive[pos_pos])
                pos_pos += 1
        bagdata = np.array(bagdata)
#         print(bagdata.shape)
#         print(baglabels.shape)
        bags.append(np.hstack([bagdata, baglabels.reshape(-1, 1)]))
        labels.append(max(baglabels))
    bags = np.array(bags)
    labels = np.array(labels)
    return bags,labels

In [46]:
dataset, labels = create_mil_dataset(np.array([1,2,3,4]))
dataset

array([ array([[-0.31744079,  3.35742705,  1.51997029,  0.60017759, -0.49366211,
         0.27422987, -0.92999096,  0.01487095, -0.48760159,  0.38039557,
        -2.18293672,  0.89617065, -1.96978545,  0.08484421, -0.08224033,
         0.43238977, -1.02951028,  0.39702147,  2.39678042, -1.23626892,
         0.        ]]),
       array([[ 0.44115327, -1.07135667, -1.34089625, -1.32886541,  1.84688341,
         1.68270572, -1.74907225,  0.78183575,  0.22147063,  1.72968859,
         0.75852711,  0.58378655,  0.09461058, -0.96498   , -0.74447138,
        -0.8456957 ,  1.68408972, -1.15467063, -1.28278174,  2.35392486,
         0.        ],
       [-0.56085934, -0.75851402, -0.66804914,  0.31342068,  0.24250114,
         0.12092999, -1.04110844, -1.6240619 , -1.68110141,  0.38793581,
         0.60986195, -1.07067775,  1.07712214,  1.8150662 ,  0.40377646,
         0.70519992,  1.97515803, -1.19816859, -0.9173684 , -0.58900454,
         0.        ]]),
       array([[ 0.40330952,  0.17644434

In [47]:
np.mean(create_mil_dataset(np.array([6]*200))[1])  # balancing of classes is working

0.46999999999999997

In [49]:
dataset, labels = create_mil_dataset(np.random.randint(5, 20, size=200))
labels

array([1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1])

In [62]:
def extract_sil_dataset(mil_dataset):
    sil_dataset = None
    for bag in dataset:
        if sil_dataset is None:
            sil_dataset = bag
        sil_dataset = np.vstack([sil_dataset, bag])
    return sil_dataset[:, :-1], sil_dataset[:, -1]

In [63]:
X_sil, Y_sil = extract_sil_dataset(dataset)
X_sil.shape

(2497, 20)

In [50]:
nfeats = dataset[0][0].shape[0] - 1

In [104]:
inlayer = Input([None, nfeats])
masked_input = Masking()(inlayer)
layer2 = TimeDistributed(Dense(10, activation='relu'))(masked_input)
layer3 = TimeDistributed(Dense(1, activation='relu'))(layer2)
layer4 = GlobalAveragePooling1D()(layer3)
layer5 = Activation('sigmoid')(layer4)
model = Model([inlayer], [layer5])

TypeError: Layer global_average_pooling1d_4 does not support masking, but was passed an input_mask: Tensor("masking_2/Any_1:0", shape=(?, ?), dtype=bool)

In [87]:
model.predict(np.array([dataset[0][:,:-1]]))

array([[ 0.53886724]], dtype=float32)

In [88]:
model.compile('sgd', 'binary_crossentropy', metrics=['accuracy'])

In [89]:
np.array([d[:, :-1] for d in dataset]).shape

(200,)

In [82]:
size2bags = {k: [bag for bag in dataset if len(bag) == k] for k in range(5, 21)}

In [79]:
batchsize = 20
for model.fit(np.array([d[:, :-1] for d in dataset]), labels)

ValueError: Error when checking input: expected input_3 to have 3 dimensions, but got array with shape (200, 1)

In [94]:
np.max(X_sil, axis=0)

array([ 3.01881551,  3.05552074,  3.64355299,  3.6477442 ,  3.52298785,
        3.60181064,  3.3930971 ,  3.60523094,  3.137509  ,  4.27051298,
        3.09873802,  3.60147595,  3.18147832,  3.57764453,  4.10825088,
        3.55248442,  3.10293039,  3.04243421,  5.44919258,  3.38162514])

In [95]:
np.min(X_sil, axis=0)

array([-3.28418358, -4.71442206, -3.87594178, -3.59609428, -3.28798966,
       -3.76107614, -3.52572937, -3.68092065, -3.37180263, -4.33056449,
       -3.26702823, -3.91428866, -3.09552008, -3.86244063, -3.81579132,
       -2.92441762, -4.20076804, -4.07828847, -1.95134692, -3.38761966])

In [98]:
X = keras.preprocessing.sequence.pad_sequences([d[:, :-1] for d in dataset], dtype='float32', value=0)

In [99]:
X

array([[[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 1.32641292,  1.04845202, -2.30484843, ..., -0.04771969,
         -1.19255769, -0.77209359],
        ..., 
        [ 1.02215028,  1.60033703, -0.10369639, ...,  0.36812752,
         -0.98323721,  0.54205555],
        [-0.41213149, -0.57176757,  1.1794343 , ...,  0.15750039,
         -0.58093196,  0.27885589],
        [-0.91263419, -1.75275707, -0.14167933, ..., -0.9121604 ,
         -0.04578342,  0.58567148]],

       [[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        ..., 
        [-0.49138495, -1.60011649, -1.5429343 , ...,

model.fit(X, labels, batch_size=32, 