In [1]:
import DPrivacy as dp
from ChoiceMaker import DTChoice
from sklearn import model_selection, feature_selection
%matplotlib inline

In [2]:
class DB:
    def __init__(self, X, y, X_test, y_test, epsilon=1, depth=0, max_depth=0):
        self.epsilon = epsilon
        self.ncol = X.shape[1]
        self.X = X
        self.y = y
        self.X_test = X_test
        self.y_test = y_test
        self.depth = depth
        self.max_depth = max_depth

In [3]:
#alg_list
class TreePart:
    def __init__(self, splits=5):
        self.splits=splits
        self.kf = model_selection.KFold(splits)
        self.lo = model_selection.LeaveOneOut()
    def get_expected_correct(self, y, epsilon):
        tot_correct = 0
        if len(y) < self.splits:
            gen = self.lo.split(y)
        else:
            gen = self.kf.split(y)
        for train_idx, test_idx in gen:
            hist = pd.value_counts(y.iloc[train_idx])
            noisy_hist = dp.hist_noiser(hist, epsilon)
            pred = noisy_hist.idxmax()
            tot_correct += (y.iloc[test_idx] == pred).sum()
        return tot_correct / len(y)
    
class Leaf(TreePart):
    def error(self, db):
        return 1.0-self.get_expected_correct(db.y, db.epsilon)
    def run(self, db):
        frequencies = pd.value_counts(db.y)
        noisy_freqs = dp.hist_noiser(frequencies, db.epsilon)
        return np.repeat(noisy_freqs.idxmax(), db.y_test.size)
    
#This error is a little optimistic
class Split(TreePart):
    def error(self, db):
        max_correct = 0
        for col in db.X.columns:
            x = db.X[col]
            cats = x.cat.categories
            correct = 0
            for cat in cats:
                leaf_correct = self.get_expected_correct(db.y[x == cat], db.epsilon)
                correct += leaf_correct * (x == cat).sum()
            correct /= len(db.y)
            max_correct = max(max_correct, correct)
        return 1-max_correct
    def run(self, db):
        return None

In [4]:
#counts are noised by epsilon. 

In [31]:
nurs = pd.read_csv('../datasets/nursery.data', header=None)
nurs = nurs.apply(lambda x: x.astype('category'))
nurs_train = nurs.sample(3000, replace=True)
nurs_test = nurs.sample(3000, replace=True)
X_cols = nurs_train.columns[:-1]
y_col = nurs_train.columns[-1]
data = DB(nurs_train[X_cols], nurs_train[y_col], \
          nurs_test[X_cols], nurs_test[y_col], epsilon=1, max_depth=2)

In [32]:
data.epsilon=0.1

In [33]:
leaf = Leaf()
split = Split()

In [34]:
leaf.error(data)

0.67333333333333334

In [35]:
split.error(data)

0.30400000000000005

In [36]:
class PDTree:
    def __init__(self):
        pass

    def entropy(self, y):
        arr = pd.value_counts(y)
        arr = arr[arr > 0]
        arr = arr/arr.sum()
        return (-np.log2(arr) * arr).sum()
    def decision_helper(self, db, cm):
        action = cm.choose(db)
        if action is not None:
            return action
        utils = []
        for col in db.X.columns:
            x = db.X[col]
            cats = x.cat.categories
            cur_ent = 0
            for cat in cats:
                ent = self.entropy(db.y[x == cat])
                cur_ent += ent * (x == cat).sum()
            #cur_ent /= len(db.y)
            utils.append(-cur_ent)
        #print(utils)
        best_idx = dp.exp_mech(utils, db.epsilon, 1) #Change sensitivity
        col_name = db.X.columns[best_idx]
        new_cols = db.X.columns[db.X.columns != col_name]
        splitX = db.X[col_name]
        preds = np.repeat(db.y.cat.categories[0], len(db.y_test))
        for att in splitX.cat.categories:
            train_split = db.X.loc[splitX == att, new_cols]
            y_split = db.y[splitX == att]
            test_split_loc = db.X_test[col_name] == att 
            test_split = db.X_test.loc[test_split_loc, new_cols]
            test_split_y = db.y_test.loc[test_split_loc]
            db_new = DB(train_split, y_split, test_split, test_split_y, db.epsilon, db.depth+1, db.max_depth)
            preds[test_split_loc] = self.decision_helper(db_new, cm)
        return preds
    
    def fit_and_predict(self, data, cm):
        budget = data.epsilon / data.X.shape[1]
        data.epsilon = budget
        return self.decision_helper(data, cm)

In [37]:
dt = PDTree()
class null_cm:
    def __init__(self):
        pass
    def choose(self, db):
        if db.depth < db.max_depth:
            return split.run(db)
        else:
            return leaf.run(db)

In [38]:
data = DB(nurs_train[X_cols], nurs_train[y_col], \
          nurs_test[X_cols], nurs_test[y_col], epsilon=100, max_depth=2)
P = dt.fit_and_predict(data, null_cm())

In [39]:
P

array(['priority', 'spec_prio', 'priority', ..., 'priority', 'priority',
       'priority'],
      dtype='<U9')

In [40]:
(data.y_test == P).sum() / len(P)

0.58066666666666666

### Experiments

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,usual,proper,complete,1,convenient,convenient,nonprob,recommended,recommend
1,usual,proper,complete,1,convenient,convenient,nonprob,priority,priority
2,usual,proper,complete,1,convenient,convenient,nonprob,not_recom,not_recom
3,usual,proper,complete,1,convenient,convenient,slightly_prob,recommended,recommend
4,usual,proper,complete,1,convenient,convenient,slightly_prob,priority,priority
5,usual,proper,complete,1,convenient,convenient,slightly_prob,not_recom,not_recom
6,usual,proper,complete,1,convenient,convenient,problematic,recommended,priority
7,usual,proper,complete,1,convenient,convenient,problematic,priority,priority
8,usual,proper,complete,1,convenient,convenient,problematic,not_recom,not_recom
9,usual,proper,complete,1,convenient,inconv,nonprob,recommended,very_recom
