In [0]:
import numpy as np
from numpy import genfromtxt
from sklearn import svm
from sklearn.metrics import accuracy_score
import random
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
import numpy.linalg as la
import scipy.io as sio
import pickle
from cvxopt import matrix, solvers
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

**Code for datasets:**

The next two sections generate the `multi_datasets.p` and `multi_datasets.mat` files used for training and testing. You do not need to run them again if those files are already available.

In [0]:
def preprocess(W):
    """Min-max scale every column of ``W`` in place.

    Each column ``j`` is mapped to ``(W[:, j] - min_j) / (max_j - min_j)``,
    where ``min_j`` and ``max_j`` are that column's minimum and maximum.
    A column whose values are all identical has zero range; it is mapped to
    all zeros instead of producing NaN from a 0/0 division.

    Parameters
    ----------
    W : numpy.ndarray
        2-D array whose columns are scaled independently. The array is
        overwritten with the scaled values.

    Returns
    -------
    numpy.ndarray
        The same array object that was passed in, after scaling.
    """
    # Nothing to scale; np.min/np.max would raise on a zero-length axis.
    if W.size == 0:
        return W

    u = np.min(W, axis=0)
    v = np.max(W, axis=0)
    span = v - u
    # Constant columns have span 0: divide by 1 so they become 0, not NaN.
    span = np.where(span == 0, 1, span)

    # Write back into W so callers holding a reference see the scaled data.
    W[...] = (W - u) / span
    return W

# Each entry is a (name, array) pair. Downstream code in this notebook treats
# the last column of every array as the label and the rest as features.
datasets = []

# Breast Cancer
# The first column is dropped (presumably a sample ID -- confirm against the
# data file). The last column is recoded: 2.0 -> 0, anything else -> 1.
with open('/content/breast-cancer-wisconsin.data', 'r') as data_file:
  data = np.loadtxt(data_file, delimiter=",")[:, 1:]
# Compute both index sets before overwriting so the recoding cannot interfere
# with itself.
zero_indices = np.where(data[:, -1] == 2.0)
one_indices = np.where(data[:, -1] != 2.0)
data[:, -1][zero_indices] = 0
data[:, -1][one_indices] = 1
#data = preprocess(data)
cancer = data
datasets.append(('cancer', data))

# Pima Indians Diabetes
with open('/content/pima-indians-diabetes.csv', 'r') as data_file:
  data = np.loadtxt(data_file, delimiter=",")
#data = preprocess(data)
diabetes = data
datasets.append(('diabetes', data))

# Banknote
with open('/content/data_banknote_authentication.txt', 'r') as data_file:
  data = np.loadtxt(data_file, delimiter=",")
#data = preprocess(data)
banknote = data
datasets.append(('banknote', data))

# Ringnorm (whitespace-separated floats, one sample per line)
with open('/content/ringnorm.data', 'r') as file:
  data = np.array([[float(x) for x in line.split()] for line in file])
#data = preprocess(data)
ringnorm = data
datasets.append(('ringnorm', data))

# twonorm (whitespace-separated floats, one sample per line)
with open('/content/twonorm.data', 'r') as file:
  data = np.array([[float(x) for x in line.split()] for line in file])
#data = preprocess(data)
twonorm = data
datasets.append(('twonorm', data))

# Container that is later extended with train/test indices and written to disk.
mat_file = dict()
mat_file['cancer'] = cancer
mat_file['ringnorm'] = ringnorm
mat_file['twonorm'] = twonorm
mat_file['diabetes'] = diabetes
mat_file['banknote'] = banknote

Generate biased training sets

In [0]:
trials = 100 # trials per dataset
training_points = 100 # size of training dataset
testing_points = 500 # size of testing dataset

# For every dataset, draw `trials` pairs of (biased training indices, uniform
# test indices). Training rows are sampled with a weight that depends on
# whether the row's feature norm is above the dataset median; test rows are
# drawn uniformly from the rows not used for training.
for name, data in datasets:
  trial = 0
  training_sets = []
  testing_sets = []

  # These depend only on the dataset, so compute them once per dataset.
  # The last column is the label and is excluded from the norm.
  norms = np.linalg.norm(data[:, :-1], axis = 1)
  median = np.median(norms)
  # Rows whose feature norm is above the median get the biased weight.
  # (The norm must be taken per row *before* comparing with the median;
  # taking the norm of the boolean comparison collapses it to one scalar.)
  biased_inds = np.where(norms > median)
  inds = np.arange(len(data))

  while trial < trials:
    print("trial {}".format(trial))

    # Now decide whether to bias it up or down
    bias_direction = bool(random.getrandbits(1))

    # Above-median rows are sampled with x4 relative weight (factor 4.0) or
    # x1/4 relative weight (factor .25), chosen at random per trial.
    factor = 4.0 if bias_direction else .25

    train_sample_probs = np.ones(len(data))
    print(median)
    train_sample_probs[biased_inds] = factor

    # Normalise the weights into a probability vector for np.random.choice.
    train_sample_probs = train_sample_probs / train_sample_probs.sum()

    train_inds = np.random.choice(inds, training_points, replace = False, p=train_sample_probs)
    # Test rows come only from rows that were not picked for training.
    possible_test_inds = np.setdiff1d(inds, train_inds)
    test_inds = np.random.choice(possible_test_inds, testing_points, replace = False)

    # Check to make sure the *sampled training set* is well balanced.
    # NOTE(review): this only guards against too few positive labels and
    # assumes labels are encoded as 0/1 -- confirm for every dataset. If a
    # dataset can never satisfy it, this loop will not terminate.
    y_train = data[train_inds, -1]
    if y_train.sum() < 20:
      print("labels not balanced, skipping this set")
      continue

    trial += 1
    training_sets.append(train_inds)
    testing_sets.append(test_inds)

  train_inds_full = np.array(training_sets)
  test_inds_full = np.array(testing_sets)
  mat_file['train_inds_{}'.format(name)] = train_inds_full
  mat_file['test_inds_{}'.format(name)] = test_inds_full

# Persist the datasets plus all index sets in both MATLAB and pickle form.
sio.savemat('multi_datasets.mat', mat_file)
# Use a context manager so the pickle is flushed and the handle is closed.
with open('multi_datasets.p', 'wb') as dsets:
  pickle.dump(mat_file, dsets)



trial 0
6.557438524302
trial 1
6.557438524302
trial 2
6.557438524302
trial 3
6.557438524302
trial 4
6.557438524302
trial 5
6.557438524302
trial 6
6.557438524302
trial 7
6.557438524302
trial 8
6.557438524302
trial 9
6.557438524302
trial 10
6.557438524302
trial 11
6.557438524302
trial 12
6.557438524302
trial 13
6.557438524302
trial 14
6.557438524302
trial 15
6.557438524302
trial 16
6.557438524302
trial 17
6.557438524302
trial 18
6.557438524302
trial 19
6.557438524302
trial 20
6.557438524302
trial 21
6.557438524302
trial 22
6.557438524302
trial 23
6.557438524302
trial 24
6.557438524302
trial 25
6.557438524302
trial 26
6.557438524302
trial 27
6.557438524302
trial 28
6.557438524302
trial 29
6.557438524302
trial 30
6.557438524302
trial 31
6.557438524302
trial 32
6.557438524302
trial 33
6.557438524302
trial 34
6.557438524302
trial 35
6.557438524302
trial 36
6.557438524302
trial 37
6.557438524302
trial 38
6.557438524302
trial 39
6.557438524302
trial 40
6.557438524302
trial 41
6.557438524302
tr