In [35]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
# import pandas as pd
from scipy.io import loadmat, savemat
from scipy.stats import binom

# Making Synthetic Dataset
The first order of business is making a dataset that we can test this on, and compare against baseline methods

The dataset will have the following structure
 * $K$ kinds of instances, each with $N_k$ instances.
 * Each kind will have $M_k$ features
 * There is an assumed latent binary label for each instance, $Y_{k, n}$.
 * The label of the total bag will be $0$ if and only if all $Y_{k, n} = 0$

In [70]:
1 - binom.cdf(0, 25, 0.03)

0.53302529474562832

In [73]:
K = 2
M = [30, 35]
N = [10, 15]
p = 0.03  # The probability of a unit being labeled positive, roughly causing an equal balance of classes
D = 1000  # The number of data points to generate

np.random.seed(123546)

covs_0 = []
covs_1 = []
means_0 = [[0]*M[k] for k in range(K)]
means_1 = []
for k in range(K):
    cov = np.random.uniform(low=0.05, high=3.0, size=(M[k], M[k]))
    cov = np.sqrt(np.dot(cov, cov.T))/3  # assuring the dataset is positive semi-definite, therefore a good covariance matrix
    covs_0.append(cov)
    
    cov = cov.copy()

    cov[0, 0] *= 2
    cov[0, 1] *= 1.5
    covs_1.append(cov)
    means_1.append(np.random.uniform(low=2, high=4, size=(M[k])))  # making the positive class have different distributions    
    
print(means_0)
print(covs_0)

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
[array([[ 3.03141043,  2.69307117,  2.86937817,  2.79199817,  2.73058002,
         2.57091856,  2.79050764,  2.61139162,  2.42478006,  2.58047853,
         2.848647  ,  2.65886174,  2.73117155,  2.68174382,  2.54797315,
         2.42700036,  2.78429051,  2.68504782,  3.03821315,  2.49891026,
         2.67097861,  2.54455371,  2.55507312,  2.32363233,  2.68221921,
         2.5964347 ,  2.69998645,  2.5683707 ,  2.47758088,  2.40886664],
       [ 2.69307117,  3.15029837,  2.96267697,  2.96069085,  2.93077314,
         2.62757329,  2.98621627,  2.79280711,  2.3949287 ,  2.75139397,
         2.76435607,  2.49378634,  2.86754068,  2.86529896,  2.76288148,
         2.40555236,  2.91285608,  2.64174356,  3.01323951,  2.56471846,
         2.71848596,  2.76489497,  2.81952245,  2.49026142,  2.75004

This will be super inefficient, but we only need one pass, so it's not too bad...

In [74]:
dataset = []
labels = []
for d in range(D):
    x = []
    y = []
    for k in range(K):
        ls = np.random.uniform(size=N[k]) < p 
        for n in range(N[k]):
            x_dkn = 0
            if ls[n] == 0:
                x_dkn = np.random.multivariate_normal(means_0[k], covs_0[k])
            else:
                x_dkn = np.random.multivariate_normal(means_1[k], covs_1[k])
                
            x.append(np.ravel(x_dkn))
            y.append(ls[n])
    dataset.append(np.concatenate(x))
    labels.append(np.array(y))
                                
dataset = np.array(dataset)
labels = np.array(labels)

  # This is added back by InteractiveShellApp.init_path()
  del sys.path[0]


In [75]:
print(dataset.shape)
print(labels.shape)
dataset

(1000, 825)
(1000, 25)


array([[ 2.79673501,  3.14282708,  2.91605277, ...,  2.46913029,
         3.11989531,  2.92351144],
       [ 0.96784125,  0.6794085 ,  0.3332632 , ..., -1.38753259,
        -2.27380915, -1.91793064],
       [ 0.62188813, -0.18138514,  0.49646997, ...,  2.05466305,
         2.07691908,  1.22200005],
       ..., 
       [ 6.09395614,  6.3277074 ,  6.0922489 , ...,  0.66595861,
        -0.09044591, -1.03655273],
       [-0.27015794, -0.17719692, -1.52203324, ..., -2.50215438,
        -1.57924752, -3.00361599],
       [ 0.97260619, -0.28193222,  1.27257004, ...,  1.16415252,
         0.70330398,  1.01755049]])

In [76]:
bag_labels = labels.max(axis=1)
np.unique(bag_labels, return_counts=True)

(array([False,  True], dtype=bool), array([464, 536]))

In [77]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [78]:
result = train_test_split(dataset, bag_labels)
trainX = result[0]
testX = result[1]
trainY = result[2]
testY = result[3]

In [79]:
lr = LogisticRegression()
grid = {'C': [0.01, 0.03, 0.06, 0.1, 0.5, 1.0]}
search = GridSearchCV(lr, grid, scoring='roc_auc', n_jobs=2, cv=3)
search.fit(trainX, trainY)

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=2,
       param_grid={'C': [0.01, 0.03, 0.06, 0.1, 0.5, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [80]:
search.score(testX, testY)

0.67961538461538462

In [81]:
rf = RandomForestClassifier()
grid = {'n_estimators': [2, 4, 10, 30, 100], 
        'max_features': ['auto', 3, 5, 10, 20]}
search = GridSearchCV(rf, grid, scoring='roc_auc', n_jobs=2, cv=3)
search.fit(trainX, trainY)

GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=2,
       param_grid={'n_estimators': [2, 4, 10, 30, 100], 'max_features': ['auto', 3, 5, 10, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [82]:
search.score(testX, testY)

0.6086217948717948

In [83]:
search.best_score_

0.65530815946515908

In [84]:
search.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=3, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [85]:
savemat('./synth_dataset.mat', {'train_X': trainX,
                                'train_Y': trainY,
                                'test_X': testX,
                                'test_Y': testY})