In [0]:
import numpy as np
from numpy import genfromtxt
from sklearn import svm
from sklearn.metrics import accuracy_score
import random
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
import numpy.linalg as la
import scipy.io as sio
import pickle
from cvxopt import matrix, solvers
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

**Code for datasets:**

The next two sections generate the `.p` and `.mat` files used for training and testing. You do not need to run them again if those files already exist and can be loaded.

In [0]:
def preprocess(W):
    """Min-max scale every column of a 2-D array to the range [0, 1], in place.

    Each column ``j`` is transformed as ``(W[:, j] - min_j) / (max_j - min_j)``.
    A constant column (``max_j == min_j``) would otherwise divide by zero and
    fill the column with NaN; such columns are mapped to all zeros instead.

    Parameters
    ----------
    W : numpy.ndarray
        2-D array of shape (rows, columns). It is overwritten with the scaled
        values, so the caller's array is modified.

    Returns
    -------
    numpy.ndarray
        The same array object ``W`` that was passed in, now scaled.
    """
    # Nothing to scale; np.min would raise on a zero-size reduction.
    if W.size == 0:
        return W

    u = np.min(W, axis=0)
    span = np.max(W, axis=0) - u

    # Replace a zero range by 1 so constant columns become 0 / 1 == 0
    # rather than 0 / 0 == NaN.
    safe_span = np.where(span == 0, 1, span)

    # Write back into W so callers holding a reference see the scaled data,
    # exactly as the original element-by-element version did.
    W[...] = (W - u) / safe_span
    return W

# Load the five datasets, min-max scale each with preprocess(), and collect
# them both as a list of (name, array) pairs and as a dict keyed by name.
# Every file is opened in a `with` block so its handle is closed as soon as
# the data has been read.
datasets = []

# Breast Cancer
# The first column is dropped (presumably a sample id - TODO confirm) and the
# last column is recoded to binary labels: 2.0 -> 0, anything else -> 1.
with open('/content/breast-cancer-wisconsin.data', 'r') as data_file:
  data = np.loadtxt(data_file, delimiter=",")[:, 1:]
# Both index sets are computed before either assignment, so writing the 0s
# cannot change which rows are subsequently treated as "not 2.0".
zero_indices = np.where(data[:, -1] == 2.0)
one_indices = np.where(data[:, -1] != 2.0)
data[:, -1][zero_indices] = 0
data[:, -1][one_indices] = 1
data = preprocess(data)
cancer = data
datasets.append(('cancer', data))

# Pima Indians Diabetes
with open('/content/pima-indians-diabetes.csv', 'r') as data_file:
  data = np.loadtxt(data_file, delimiter=",")
data = preprocess(data)
diabetes = data
datasets.append(('diabetes', data))

# Banknote
with open('/content/data_banknote_authentication.txt', 'r') as data_file:
  data = np.loadtxt(data_file, delimiter=",")
data = preprocess(data)
banknote = data
datasets.append(('banknote', data))

# Ringnorm (whitespace-separated values, one row per line)
data = []
with open('/content/ringnorm.data', 'r') as file:
  for line in file:
    data.append([float(x) for x in line.split()])
data = np.array(data)
data = preprocess(data)
ringnorm = data
datasets.append(('ringnorm', data))

# twonorm (whitespace-separated values, one row per line)
data = []
with open('/content/twonorm.data', 'r') as file:
  for line in file:
    data.append([float(x) for x in line.split()])
data = np.array(data)
data = preprocess(data)
twonorm = data
datasets.append(('twonorm', data))

# Name -> scaled array mapping; later cells add index arrays to this dict
# and save it to disk.
mat_file = dict()
mat_file['cancer'] = cancer
mat_file['ringnorm'] = ringnorm
mat_file['twonorm'] = twonorm
mat_file['diabetes'] = diabetes
mat_file['banknote'] = banknote

[5. 1. 1. 1. 2. 1. 3. 1. 1. 0.]
[  6.    148.     72.     35.      0.     33.6     0.627  50.      1.   ]
[ 3.6216   8.6661  -2.8073  -0.44699  0.     ]
[ 0.8494  2.177   0.5982  1.6894  3.1137 -3.406   3.7986 -2.6421  1.5779
 -0.1808 -0.2118  1.6327  4.664   1.0808 -1.1717 -1.6605  0.5775  1.6638
  3.0895 -3.0276  0.    ]
[-1.2036 -2.624   0.5963  1.3859 -1.3597  0.6758  1.0008 -0.9589 -1.3487
 -0.5572 -0.4398 -1.1223 -0.1817 -1.317  -0.3551 -1.422   0.1983 -3.0514
 -1.065  -0.8541  1.    ]


Generate biased training sets

In [0]:
trials = 100 # trials per dataset
training_points = 100 # size of training dataset
testing_points = 500 # size of testing dataset
min_positive_labels = 20 # a training set needs at least this many label-1 rows


def _draw_biased_split(data, n_train, n_test):
  """Draw one biased training sample and a disjoint, uniformly drawn test sample.

  A feature column is picked at random, and rows whose value in that column
  lies above the column median are sampled with 4x (or, with equal chance,
  0.25x) the weight of the remaining rows.

  Parameters
  ----------
  data : numpy.ndarray
      2-D array whose last column is the label and whose other columns are
      features.
  n_train : int
      Number of training row indices to draw (without replacement).
  n_test : int
      Number of test row indices to draw (without replacement) from the rows
      not used for training.

  Returns
  -------
  (numpy.ndarray, numpy.ndarray)
      Training row indices and test row indices.

  Raises
  ------
  ValueError
      Raised by ``np.random.choice`` when more rows are requested than are
      available.
  """
  n_rows = len(data)

  # First select random feature. The last column is the label, so the valid
  # feature columns are 0 .. n_cols - 2 (random.randint includes both ends).
  feature_to_bias = random.randint(0, data.shape[1] - 2)

  # Now decide whether to bias it up or down:
  # larger values will be selected with x4 probability or smaller values with x4 probability
  factor = 4.0 if bool(random.getrandbits(1)) else .25

  train_sample_probs = np.ones(n_rows)
  median = np.median(data[:, feature_to_bias])
  train_sample_probs[data[:, feature_to_bias] > median] = factor
  train_sample_probs /= train_sample_probs.sum()

  train_inds = np.random.choice(n_rows, n_train, replace=False, p=train_sample_probs)

  # Test rows come uniformly from everything that was not picked for training.
  possible_test_inds = np.setdiff1d(np.arange(n_rows), train_inds)
  test_inds = np.random.choice(possible_test_inds, n_test, replace=False)
  return train_inds, test_inds


for name, data in datasets:
  trial = 0
  training_sets = []
  testing_sets = []
  while trial < trials:
    print("trial {}".format(trial))
    train_inds, test_inds = _draw_biased_split(data, training_points, testing_points)

    # Check to make sure its well balanced. The check must look at the labels
    # of the *sampled* training rows: testing every row of the dataset would
    # give the same answer on each attempt, so it could never reject a single
    # draw and would loop forever if it ever failed.
    y_train = data[train_inds, -1]
    if y_train.sum() < min_positive_labels:
      print("labels not balanced, skipping this set")
      continue

    trial += 1
    training_sets.append(train_inds)
    testing_sets.append(test_inds)

  # One row of indices per accepted trial.
  train_inds_full = np.array(training_sets)
  test_inds_full = np.array(testing_sets)
  mat_file['train_inds_{}'.format(name)] = train_inds_full
  mat_file['test_inds_{}'.format(name)] = test_inds_full

sio.savemat('single_datasets.mat', mat_file)
# Use a context manager so the pickle is flushed and the handle closed.
with open('single_datasets.p', 'wb') as dsets:
  pickle.dump(mat_file, dsets)


trial 0
trial 1
trial 2
trial 3
trial 4
trial 5
trial 6
trial 7
trial 8
trial 9
trial 10
trial 11
trial 12
trial 13
trial 14
trial 15
trial 16
trial 17
trial 18
trial 19
trial 20
trial 21
trial 22
trial 23
trial 24
trial 25
trial 26
trial 27
trial 28
trial 29
trial 30
trial 31
trial 32
trial 33
trial 34
trial 35
trial 36
trial 37
trial 38
trial 39
trial 40
trial 41
trial 42
trial 43
trial 44
trial 45
trial 46
trial 47
trial 48
trial 49
trial 50
trial 51
trial 52
trial 53
trial 54
trial 55
trial 56
trial 57
trial 58
trial 59
trial 60
trial 61
trial 62
trial 63
trial 64
trial 65
trial 66
trial 67
trial 68
trial 69
trial 70
trial 71
trial 72
trial 73
trial 74
trial 75
trial 76
trial 77
trial 78
trial 79
trial 80
trial 81
trial 82
trial 83
trial 84
trial 85
trial 86
trial 87
trial 88
trial 89
trial 90
trial 91
trial 92
trial 93
trial 94
trial 95
trial 96
trial 97
trial 98
trial 99
trial 0
trial 1
trial 2
trial 3
trial 4
trial 5
trial 6
trial 7
trial 8
trial 9
trial 10
trial 11
trial 12
tri