In [12]:
import numpy as np
from sklearn import svm
from dprl.ldp import NumericAndCategorical, SGD

from sklearn import metrics
import dprl
import tensorflow as tf
from copy import deepcopy

from dprl.utils import process_train_data
from examples.data_utils import *
from sklearn.datasets import make_blobs

# DPRL Example

## 1 Discrete part

In [12]:
# Per-trial test accuracies, one list per allocation strategy compared in the
# discrete experiment.
avg_l, rl_l, random_l, greedy_l = [], [], [], []

for i in range(100):
    # Every trial starts from an empty TensorFlow graph.
    tf.reset_default_graph()

    # Settings handed to the DPRL estimator during training.
    # NOTE(review): the comment that used to sit on epsilon_level read
    # "24 * 10 + 36 * 5" (= 420), which matches neither this budget nor these
    # levels -- confirm the intended training values.
    total_epsilon = 10400
    num = 2000
    epsilon_level = [10, 4]

    # Load data.
    x_train, y_train, x_test, y_test, col = preprocess_data(
        'adult_train.csv', 'adult_test.csv')

    # Defines problem.
    problem = 'classification'

    # Network parameters.
    parameters = {
        'activation': tf.nn.relu,
        'hidden_dim': 100,
        'comb_dim': 10,
        'iterations': 2000,
        'layer_number': 5,
        'batch_size': 2000,
        'learning_rate': 0.01,
        'epsilon_budget': total_epsilon,
        'epsilon_level': epsilon_level,
    }

    # Predictive model whose accuracy drives the training signal.
    pred_model = svm.SVC(decision_function_shape='ovo')

    # Flags for using stochastic gradient descent / pre-trained model.
    flags = {'sgd': False, 'pretrain': False}

    # Build and train the DPRL estimator.
    dprl_class = dprl.Dprl(x_train, y_train, x_test, y_test,
                           problem, pred_model, parameters, 'None', flags)
    dprl_class.train_dprl('accuracy')
    print('Finished dprl training.')

    # The valuator is queried with halved labels; they are printed together
    # with the features before the call.
    y_train = y_train / 2
    print(y_train)
    print(x_train)

    # Estimate one value per training sample.
    dve_out = dprl_class.data_valuator(x_train, y_train)
    print(dve_out)
    print('Finished data valuation.')

    # Persist the values; the evaluation code reads them back from disk.
    save_npy('value.npy', np.asarray(dve_out))


    # Evaluation settings: 24 samples at level 10 plus 36 at level 5 add up
    # to the 420 budget.
    total_epsilon = 420
    num = 60
    epsilon_level = [10, 5]
    model = svm.SVC(decision_function_shape='ovo')

    # Read back the per-sample values persisted after valuation.
    dve_out = load_npy('value.npy')

    # NOTE(review): argsort is ascending, so high_idx holds the 24 samples
    # with the SMALLEST values and they receive epsilon_level[0] -- confirm
    # this is the intended direction.
    sorted_idx = np.argsort(dve_out)
    high_idx, low_idx = sorted_idx[:24], sorted_idx[24:]

    # Reload the data so the labels are back in their original encoding.
    x_train, y_train, x_test, y_test, col = preprocess_data(
        'adult_train.csv', 'adult_test.csv')

    # Per-sample budget: first level for high_idx, second level for the rest.
    epsilon_list = np.zeros(len(y_train))
    epsilon_list[high_idx] = epsilon_level[0]
    epsilon_list[low_idx] = epsilon_level[1]

    # Process the training data under the RL-derived allocation.
    x_train_s, y_train_s = process_train_data(x_train, y_train, epsilon_list)

    # Fit on the processed data (labels halved) and score on the test set.
    model.fit(x_train_s, y_train_s/2)
    y_pred = model.predict(x_test)
    acc = metrics.accuracy_score(y_test, y_pred, normalize=True)

    print('acc rl: \t', acc)
    rl_l.append(acc)

    # Uniform baseline: every sample gets the same share of the total budget.
    epsilon_list_n = [total_epsilon/len(y_train)] * len(y_train)

    # Process the training data with the UNIFORM allocation. Passing the
    # RL-derived `epsilon_list` here would make this baseline measure the RL
    # allocation a second time instead of the uniform one.
    x_train_n, y_train_n = process_train_data(x_train,
                                              y_train,
                                              epsilon_list_n)

    # Fit on the processed data (labels halved) and score on the test set.
    model.fit(x_train_n, y_train_n/2)
    y_pred = model.predict(x_test)
    acc = metrics.accuracy_score(y_test, y_pred, normalize=True)

    print('acc baseline: \t', acc)
    avg_l.append(acc)


    # Load data.
    # NOTE(review): this reads 'train.csv' / 'test.csv' while the earlier
    # steps read 'adult_train.csv' / 'adult_test.csv' -- confirm the
    # different files are intended.
    x_train, y_train, x_test, y_test, col = preprocess_data(
        'train.csv', 'test.csv')

    # "Random" allocation: positions 0-23 get the first level, positions
    # 24-59 the second; any position beyond 59 keeps a zero budget.
    # NOTE(review): the positions are fixed, not drawn at random --
    # presumably this relies on the row order being arbitrary; confirm.
    epsilon_list_random = np.zeros(len(y_train))
    epsilon_list_random[np.arange(24)] = epsilon_level[0]
    epsilon_list_random[np.arange(24, 60)] = epsilon_level[1]

    # Process the training data under that allocation.
    x_train_random, y_train_random = process_train_data(
        x_train, y_train, epsilon_list_random)

    # Fit on the processed data (labels halved) and score on the test set.
    model.fit(x_train_random, y_train_random/2)
    y_pred = model.predict(x_test)
    acc = metrics.accuracy_score(y_test, y_pred, normalize=True)

    print('acc random: \t', acc)
    random_l.append(acc)

    def test():
        """Greedy baseline for choosing which samples get the higher level.

        All 60 entries start at ``epsilon_level[1]``. In each of 24 rounds,
        every entry still at that level is tentatively raised to
        ``epsilon_level[0]``, the data is passed through
        ``process_train_data``, ``model`` is refitted and scored on the test
        set; the first candidate with the highest accuracy is kept raised.

        Reads ``epsilon_level``, ``x_train``, ``y_train``, ``x_test``,
        ``y_test`` and ``model`` from the enclosing scope and refits ``model``
        in place.

        Returns:
            Test accuracy of ``model`` fitted with the final greedy allocation.
        """
        high_eps, low_eps = epsilon_level[0], epsilon_level[1]
        epsilon_list_g = [low_eps] * 60

        def _accuracy_with(allocation):
            # Process the data under `allocation`, refit and score the model.
            proc_x, proc_y = process_train_data(x_train, y_train, allocation)
            model.fit(proc_x, proc_y / 2)
            y_pred = model.predict(x_test)
            return metrics.accuracy_score(y_test, y_pred, normalize=True)

        for kk in range(24):
            print('kk', kk)
            comp_list, index_list = [], []

            # Try raising each entry that is still at the lower level.
            for gi, current_eps in enumerate(epsilon_list_g):
                if current_eps != low_eps:
                    continue
                temp_el = deepcopy(epsilon_list_g)
                temp_el[gi] = high_eps
                comp_list.append(_accuracy_with(temp_el))
                print('comp_list', comp_list)
                index_list.append(gi)

            # First strictly-best candidate wins; falls back to candidate 0
            # when no accuracy exceeds zero.
            best_acc, select_index = 0, 0
            for walk, candidate_acc in enumerate(comp_list):
                print('acc', candidate_acc)
                if candidate_acc > best_acc:
                    best_acc, select_index = candidate_acc, walk

            epsilon_list_g[index_list[select_index]] = high_eps
            print(epsilon_list_g)

        # Score the final greedy allocation.
        return _accuracy_with(epsilon_list_g)

    greedy_l.append(test())







    # def eval_utility(x_train, y_train, x_test, y_test, model) -> float:
    #     """Evaluate the coalition utility.
    #     """
    #
    #     single_pred_label = (True if len(np.unique(y_train)) == 1
    #                          else False)
    #
    #     if single_pred_label:
    #         y_pred = [y_train[0]] * len(y_test)
    #     else:
    #         model.fit(x_train, y_train)
    #         y_pred = model.predict(x_test)
    #
    #     return metrics.accuracy_score(y_test, y_pred, normalize=True)

# Persist the per-trial accuracies of every strategy in the discrete experiment.
for result_file, accuracies in (('rl2.npy', rl_l),
                                ('random2.npy', random_l),
                                ('avg2.npy', avg_l),
                                ('greedy2.npy', greedy_l)):
    save_npy(result_file, accuracies)

## 2 Continuous part

In [13]:
# load adult data

In [14]:
# Per-trial test accuracies, one list per allocation strategy compared in the
# continuous experiment.
avg_l, rl_l, g_random_l = [], [], []

for _ in range(100):
    # Every trial starts from an empty TensorFlow graph.
    tf.reset_default_graph()

    # Total budget, training-set size and per-sample epsilon bounds.
    total_epsilon = 20000
    num = 2000
    epsilon_l = 1
    epsilon_h = 100

    # Synthetic two-cluster data.
    # NOTE(review): train and test share random_state=0, so the test blobs
    # are not an independently seeded draw -- confirm this is intended.
    X_train, Y_train = make_blobs(n_samples=num, centers=2,
                                  random_state=0, cluster_std=0.4)
    X_test, Y_test = make_blobs(n_samples=100, centers=2,
                                random_state=0, cluster_std=0.4)

    # Defines problem.
    problem = 'classification'

    # Network parameters.
    parameters = {
        'activation': tf.nn.relu,
        'hidden_dim': 100,
        'comb_dim': 10,
        'iterations': 2000,
        'layer_number': 5,
        'batch_size': 60,
        'learning_rate': 0.01,
        'epsilon_budget': total_epsilon,
        'epsilon_level': [epsilon_l, epsilon_h],
    }

    # Predictive model whose accuracy drives the training signal.
    pred_model = svm.SVC(decision_function_shape='ovo')

    # Flags for using stochastic gradient descent / pre-trained model.
    flags = {'sgd': False, 'pretrain': False}

    # Build and train the DPRL estimator.
    dprl_class = dprl.Dprl(X_train, Y_train, X_test, Y_test,
                           problem, pred_model, parameters, 'None', flags)
    dprl_class.train_dprl('accuracy')
    print('Finished dprl training.')

    # The valuator is queried with halved labels.
    Y_train = Y_train / 2

    # Estimate one value per training sample.
    dve_out = dprl_class.data_valuator(X_train, Y_train)
    print(dve_out)
    print('Finished data valuation.')

    # Persist the values; the evaluation code reads them back from disk.
    save_npy('value.npy', np.asarray(dve_out))


    model = svm.SVC(decision_function_shape='ovo')

    # Read back the per-sample values persisted after valuation.
    dve_out = load_npy('value.npy')

    # Lower-case aliases used by the evaluation code.
    x_train, y_train = X_train, Y_train
    x_test, y_test = X_test, Y_test

    # First pass: share the whole budget in proportion to each sample's value.
    proportional = dve_out * (total_epsilon / np.sum(dve_out))

    # Samples whose proportional share reaches a bound are pinned to it.
    # The comparisons are inclusive so that a share exactly equal to a bound
    # is pinned too, rather than being left at the zero initial value.
    at_floor = proportional <= epsilon_l
    at_ceiling = proportional >= epsilon_h
    unbounded = ~(at_floor | at_ceiling)

    epsilon_list = np.zeros(len(y_train))
    epsilon_list[at_floor] = epsilon_l
    epsilon_list[at_ceiling] = epsilon_h

    # Budget already consumed by the pinned samples. Deliberately not called
    # `sum`: rebinding that name would hide the builtin for the rest of the
    # kernel session.
    pinned_budget = (epsilon_l * np.count_nonzero(at_floor)
                     + epsilon_h * np.count_nonzero(at_ceiling))

    # Second pass: spread what is left over the remaining samples, again in
    # proportion to their values. Skipped when every sample is pinned, which
    # would otherwise divide by an empty sum. The rescaled shares are not
    # re-checked against the bounds.
    if unbounded.any():
        rescale = (total_epsilon - pinned_budget) / np.sum(dve_out[unbounded])
        epsilon_list[unbounded] = dve_out[unbounded] * rescale

    # Process the training data under the RL-derived allocation.
    x_train_rl, y_train_rl = process_train_data(x_train,
                                                y_train,
                                                epsilon_list)

    # Fit on the processed data (labels halved) and score on the test set.
    model.fit(x_train_rl, y_train_rl/2)
    y_pred = model.predict(x_test)
    acc = metrics.accuracy_score(y_test, y_pred, normalize=True)
    print('acc rl: \t', acc)
    rl_l.append(acc)

    # Uniform baseline: every sample gets the same share of the total budget
    # (20000 / 2000 with the settings of this experiment).
    uniform_epsilon = [total_epsilon / len(y_train)] * len(y_train)
    x_train_a, y_train_a = process_train_data(x_train,
                                              y_train,
                                              uniform_epsilon)

    # Fit on the UNIFORMLY processed data. Fitting on x_train_rl / y_train_rl
    # here would make "acc avg" measure the RL allocation a second time.
    model.fit(x_train_a, y_train_a/2)
    y_pred = model.predict(x_test)
    acc = metrics.accuracy_score(y_test, y_pred, normalize=True)
    print('acc avg: \t', acc)
    avg_l.append(acc)


    # epsilon_list_n = [total_epsilon/len(y_train)] * len(y_train)
    #
    # x_train_n, y_train_n = process_train_data(x_train,
    #                                           y_train,
    #                                           epsilon_list)
    # model.fit(x_train_n, y_train_n/2)
    # y_pred = model.predict(x_test)
    # acc = metrics.accuracy_score(y_test, y_pred, normalize=True)
    #
    # print('acc baseline: \t', acc)
    # avg_l.append(acc)

# g_random

    # Gaussian random baseline: zero-mean noise with standard deviation 99/6,
    # shifted by the uniform share 20000 / 2000, then limited to [1, 100].
    gaussian_noise = np.random.normal(0, 99/6, 2000)
    shifted_eps = gaussian_noise + 20000 / 2000
    epsilon_list = np.clip(shifted_eps, 1, 100)

    # Process the training data under the random allocation.
    x_train_g, y_train_g = process_train_data(x_train, y_train, epsilon_list)

    # Fit on the processed data (labels halved) and score on the test set.
    model.fit(x_train_g, y_train_g/2)
    y_pred = model.predict(x_test)
    acc = metrics.accuracy_score(y_test, y_pred, normalize=True)

    print('g baseline: \t', acc)
    g_random_l.append(acc)

# Persist the per-trial accuracies of every strategy in the continuous experiment.
for result_file, accuracies in (('rl2c.npy', rl_l),
                                ('g_random2c.npy', g_random_l),
                                ('avg2c.npy', avg_l)):
    save_npy(result_file, accuracies)

Finished dprl training.
y_train_valid_pred [0. 1. 1. ... 0. 1. 0.]
INFO:tensorflow:Restoring parameters from None
[0.51849663 0.50377846 0.5473702  ... 0.50038725 0.49718928 0.4800536 ]
Finished data valuation.
acc rl: 	 1.0
acc avg: 	 0.88
g baseline: 	 0.5
Finished dprl training.
y_train_valid_pred [0. 1. 1. ... 0. 1. 0.]
INFO:tensorflow:Restoring parameters from None
[0.43499115 0.4307492  0.43423223 ... 0.42710742 0.4418071  0.4518956 ]
Finished data valuation.
acc rl: 	 1.0
acc avg: 	 0.54
g baseline: 	 0.5
Finished dprl training.
y_train_valid_pred [0. 1. 1. ... 0. 1. 0.]
INFO:tensorflow:Restoring parameters from None
[0.40229324 0.40229324 0.40229324 ... 0.40229324 0.40229324 0.40229324]
Finished data valuation.
acc rl: 	 1.0
acc avg: 	 0.54
g baseline: 	 0.5
Finished dprl training.
y_train_valid_pred [0. 1. 1. ... 0. 1. 0.]
INFO:tensorflow:Restoring parameters from None
[0.49929094 0.5182932  0.4993602  ... 0.5169887  0.5140263  0.5120352 ]
Finished data valuation.
acc rl: 	 1.

100%|██████████| 2000/2000 [00:17<00:00, 114.01it/s]
100%|██████████| 2000/2000 [00:16<00:00, 120.52it/s]
100%|██████████| 2000/2000 [00:16<00:00, 119.09it/s]
100%|██████████| 2000/2000 [00:17<00:00, 117.05it/s]
100%|██████████| 2000/2000 [00:17<00:00, 114.63it/s]
100%|██████████| 2000/2000 [00:17<00:00, 113.07it/s]
100%|██████████| 2000/2000 [00:17<00:00, 113.03it/s]
100%|██████████| 2000/2000 [00:18<00:00, 105.95it/s]
100%|██████████| 2000/2000 [00:19<00:00, 103.26it/s]
100%|██████████| 2000/2000 [00:18<00:00, 107.19it/s]
100%|██████████| 2000/2000 [00:16<00:00, 120.83it/s]
100%|██████████| 2000/2000 [00:17<00:00, 116.19it/s]
100%|██████████| 2000/2000 [00:17<00:00, 113.56it/s]
100%|██████████| 2000/2000 [00:16<00:00, 121.66it/s]
100%|██████████| 2000/2000 [00:18<00:00, 110.97it/s]
100%|██████████| 2000/2000 [00:17<00:00, 116.42it/s]
100%|██████████| 2000/2000 [00:15<00:00, 125.94it/s]
100%|██████████| 2000/2000 [00:16<00:00, 123.21it/s]
100%|██████████| 2000/2000 [00:16<00:00, 118.3