In [1]:
import sklearn
from sklearn import model_selection, feature_selection
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from log_regression import test, DPLogisticRegression
from ChoiceMaker import ChoiceMaker, DTChoice
import graphviz
import seaborn as sns
import copy

In [2]:
import pickle
def load_or_run(name, f):
    name = 'log_regression_metadata/' + name
    try:
        r = pickle.load(open(name, 'rb'))
    except:
        r = f()
        pickle.dump(r, open(name, 'wb'))
    return r

### Data Generation Methods

In [3]:
#train_set
#def gen_binary_data(domain, nrow, seed):
ncol = 7
nrow = 1000
seed=12345
y_ratio=0.4
def gen_data(low, high, ncol, nrow, y_ratio, seed=12345, prng=None):
    """Generates an array containing a binary output and inputs drawn 
    from Gaussians. The output is the last column of the array
    and the inputs are the other columns. The inputs are conditionally
    independent of each other given the output. Thus, input at column i
    for output value j are drawn from a normal with mean mu_ij and 
    sigma_ij. Furthermore, we force sigma_ij = sigma_i with no dependence
    on j because this assumption allows us to derive logistic regression
    as the best fitting algorithm. We draw sigma_i and mu_ij randomly.
    We ensure sigma_i is at most one-quarter of high-low to ensure the
    Gaussian will mostly fit in the [low, high] interval.
    
    Parameters
    ----------

    low: lower bound on an input value
    
    high: upper bound on an input value
    
    ncol: number of inputs of the dataset
    
    nrow: number of rows in the dataset
    
    y_ratio: fraction of rows that take value 0 on the output column
    
    seed: seed value to use. Default: 12345
    
    prng: random number generator. One of seed or prng must not be None. 
    """
    if(prng == None):
        prng = np.random.RandomState(seed)
    sigmas = prng.uniform(0, (high-low)/6, ncol)
    s1 = int(y_ratio*nrow)
    s2 = nrow-s1
    def gen_col(sigma, sz):
        mu = prng.uniform(low+3*sigma, high-3*sigma)
        ans = prng.normal(mu, sigma, sz)
        c = (ans < low).sum()
        while(c > 0):
            ans[ans < low] = prng.normal(mu, sigma, c)
            c = (ans < low).sum()
        c = (ans > high).sum()
        while(c > 0):
            ans[ans > high] = prng.normal(mu, sigma, c)
            c = (ans > high).sum()
        return ans
    P1 = np.array([gen_col(s, s1) for s in sigmas] + [np.zeros(s1)]).T
    P2 = np.array([gen_col(s, s2) for s in sigmas] + [np.ones(s2)]).T
    A = np.concatenate((P1, P2))
    return A[np.random.permutation(len(A))]

In [4]:
def reshape_dset(db, ncol, nrow, y_ratio, seed=12345, prng=None):
    """Rescales an input database to the desired parameters.
    Parameters
    ----------
    db: Input array. We assume last row is the output row.
        The output row must be binary.
        
    ncol: Desired number of columns in output.
    
    nrow: Desired number of rows.
    
    y_ratio: Desired percentage of class 2 in the output
    
    seed: seed value to use. Default: 12345
    
    prng: random number generator. One of seed or prng must not be None. 
    """
    if(prng == None):
        prng = np.random.RandomState(seed)
    ys = db[db.columns[-1]]
    v1, v2 = ys.unique()[:2]
    Z1 = db[ys == v1]
    s1 = int(y_ratio*nrow)
    Z2 = db[ys == v2]
    s2 = nrow - s1
    def reshape(Z, nrow):
        db = pd.DataFrame()
        while db.shape[0] + Z.shape[0] < nrow:
            db = pd.concat((db, Z))
        return pd.concat((db, Z.sample(nrow - db.shape[0])), ignore_index=True)
    db = pd.concat((reshape(Z1, s1), reshape(Z2, s2)))
    db_x = db[db.columns[:-1]]
    ys = db[db.columns[-1]]
    db = pd.DataFrame()
    while db.shape[1] + db_x.shape[1] < ncol:
        db = pd.concat((db, db_x), axis=1, ignore_index=True)
    rand_cols = np.random.choice(db_x.columns, ncol-db.shape[1], replace=False)
    return pd.concat((db, db_x[rand_cols], ys), axis=1, ignore_index=True)

### ChoiceMaker Classes

In [5]:
#TODO: Change this to sklearn.model_selection.KFold
def manual_CV(db, parts, clf):
    A = np.linspace(0, len(db.X), parts+1).astype(int)
    arr = []
    for i in range(0, len(A)-1):
        l = A[i]
        r = A[i+1]
        X_test = db.X[l:r]
        y_test = db.y[l:r]
        X_train = np.concatenate((db.X[:l], db.X[r:]))
        y_train = np.concatenate((db.y[:l], db.y[r:]))
        #Have to fix case when only one class exists
        if len(np.unique(y_train)) == 1:
            score = (y_train == y_train[0]).sum() / len(y_train)
            arr.append(score)
        else:
            clf.fit(X_train, y_train)
            arr.append(clf.score(X_test, y_test))
    return np.array(arr)

In [6]:
#alg_list
class DP:
    def __init__(self, C):
        self.name = str(C)
        self.epsilon=0.1
        self.model = DPLogisticRegression(self.epsilon, C=C, K=1.02, fit_intercept=True)
    def error(self, db):
        A = manual_CV(db, 5, self.model)
        return 1.0-A.mean()
    def run(self, db):
        self.model.set_epsilon(db.epsilon)
        return self.model.fit(db.X, db.y)

In [7]:
class DB:
    def __init__(self, X, y, epsilon=1):
        self.epsilon = epsilon
        self.ncol = X.shape[1]
        
        #self.X_train, self.X_test, self.y_train, self.y_test = model_selection.train_test_split(X, y, train_size=ts)
        self.X = pd.get_dummies(X)
        self.y = y

In [8]:
class DBMetafeatures:
    def __init__(self):
        self.sensitivities = {'nrow': 0, 'ncol': 0, 'eps': 0, 'numy': 1}
    
    def __call__(self, dataset):
        return {'nrow': dataset.X.shape[0], 
                'ncol': dataset.ncol,
                'eps': dataset.epsilon, 
                'numy': dataset.y.sum()}

In [9]:
#Generating Tset methods
def gen_tset_size_range(eps_vals, col_vals, row_vals, ratios, reps=3, db=None):
    for c in col_vals:
        for r in row_vals:
            for e in eps_vals:
                for ratio in ratios:
                    for x in range(0, reps):
                        if db is None:
                            G = pd.DataFrame(gen_data(low=0, high=1, ncol=c, nrow=r, y_ratio=ratio, prng=prng))
                        else:
                            G = reshape_dset(db, ncol=c, nrow=r, y_ratio=ratio, prng=prng)
                        last_name = G.columns[-1]
                        yield DB(G.drop([last_name], axis=1), G[last_name], e)
            
#G = gen_tset_size_range(4, [0.2, 0.4, 0.6, 0.8, 1.0], np.arange(20, 200, 20))

### Global Constants

In [10]:
prng = np.random.RandomState(12345)
eps_vals = [0.1, 0.2, 0.4, 0.8, 1.6, 3.2]
col_vals = np.arange(2, 20, 2)
row_vals = [20, 40, 100, 150, 300, 600, 1000, 2000, 3000]
ratios = [0.15, 0.3, 0.5, 0.7, 0.85]
num_reps = 3
alg_list = [DP(C=x) for x in [0.1, 0.5, 1.0, 1.5, 2]]
def split_db(db):
    return gen_tset_size_range(eps_vals, col_vals, row_vals, ratios, num_reps, db)

### Adult Dataset

In [11]:
adult = pd.read_csv('data/adult.data', header=None)
adult = adult.rename(columns = {0: 'age', 1: 'workclass', 2: 'fnlwgt', 3: 'education', 
                                4: 'education_num', 5: 'marital_status', 6: 'occupation',
                                7: 'relationship', 8: 'race', 9: 'sex', 10: 'captial_gain',
                                11: 'captial_loss', 12: 'hours-per-week', 13: 'native_country', 14: 'TARGET'})
adult = adult.drop('education', axis=1)

In [12]:
le = LabelEncoder()
adult['TARGET'] = le.fit_transform(adult['TARGET'])

In [13]:
adult_data = load_or_run('adult.p', lambda: list(split_db(adult)))

In [14]:
adult_cm = load_or_run('adult_cm.p', lambda: DTChoice(adult_data, DBMetafeatures(), {a.name: a for a in alg_list}, reps=1))

### Credit Default Dataset

In [15]:
default = pd.read_csv('data/application_train.csv')
default['TARGET'] = le.fit_transform(default['TARGET'])

In [16]:
#C = default.corr()
#default = default[abs(C['TARGET']).sort_values().index[-30:]].sample(10000)

In [17]:
#For simplicity, get rid of all columns with missing data
default = default[default.columns[default.notnull().all()]].sample(10000)

In [18]:
#Put target at end
new_cols = list(default.columns)
new_cols[1] = new_cols[-1]
new_cols[-1] = 'TARGET'
default = default[new_cols]

In [19]:
default_data = load_or_run('default.p', lambda: list(split_db(default)))

In [20]:
default_cm = load_or_run('default_cm.p', lambda: DTChoice(default_data, DBMetafeatures(), {a.name: a for a in alg_list}, reps=1))

In [21]:
default_cm.get_approximate_regret()

array([ 0.08067337,  0.09602507,  0.11464457,  0.13024859,  0.1305381 ,
        0.0631563 ])

In [22]:
default_cm.get_regret(copy.copy(adult_data[0]))

Unnamed: 0,0.1,0.5,1.0,1.5,2,cm
0,0.6,0.35,0.5,0.4,0.0,0.2


### Lending Dataset Preprocessing

In [23]:
lending = pd.read_csv('data/fam_credit_ss.csv')

In [24]:
#Swap first and last column
L = list(lending.columns)
L[0] = L[-1]
L[-1] = 'credit_card'
lending = lending[L]

In [25]:
#Some columns should be changed to object
lending.select_dtypes(['int64']).apply(pd.Series.nunique)

social_security       4
stud_loan             4
medical_exp           4
income             5963
marital_status        6
auto_insurance      712
children             10
age                  85
gender                2
housing               3
health_status         7
work_hours           93
credit_card           2
dtype: int64

In [26]:
unordered_cols = ['social_security', 'stud_loan', 'medical_exp', 'marital_status', 'housing', 'health_status']
lending[unordered_cols] = lending[unordered_cols].astype('object')

In [27]:
lending_data = load_or_run('lending.p', lambda: list(split_db(lending)))

In [28]:
lending_cm = load_or_run('lending_cm.p', lambda: DTChoice(lending_data, DBMetafeatures(), {a.name: a for a in alg_list}, reps=1))

In [29]:
lending_cm.get_approximate_regret()

array([ 0.08587098,  0.10133844,  0.11961043,  0.13002583,  0.14041655,
        0.06673914])

### Data analysis

In [46]:
"""
def get_cm(db):
    return DTChoice(db, DBMetafeatures(), {a.name: a for a in alg_list}, reps=1)
from multiprocessing import Pool
pool = Pool(processes=3)
cms = pool.map(get_cm, [adult_data[:10], default_data[:10], lending_data[:10]])
"""

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [30]:
#adult_cm, lending_cm, default_cm
cm1 = DTChoice(adult_data + lending_data, DBMetafeatures(), 
               {a.name: a for a in alg_list}, 1,
               pd.concat((adult_cm.y, lending_cm.y)))

In [34]:
L = [cm1.get_regret(x) for x in default_data]

In [42]:
L = pd.concat(L)

In [45]:
L.mean(axis=0)

0.1    0.082945
0.5    0.106328
1.0    0.121096
1.5    0.131782
2      0.136892
cm     0.083012
dtype: float64

In [142]:
dot_data = sklearn.tree.export_graphviz(adult_cm.model, out_file = None, filled=True, rounded=True)
graph = graphviz.Source(dot_data)
graph

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


### Synthetic database

In [23]:
cm = DTChoice(train_set, DBMetafeatures(), {a.name: a for a in alg_list}, reps=1)

In [23]:
#Variance calculation
cm.regrets.std(axis=0)

0.1    0.083701
0.5    0.050298
1.0    0.067889
1.5    0.074992
2      0.082567
dtype: float64

In [29]:
cm.regrets.mean(axis=0)

0.1    0.064082
0.5    0.030071
1.0    0.033723
1.5    0.039092
2      0.046795
dtype: float64

In [55]:
data = pd.concat([cm.X, cm.regrets], axis=1)
D = data.groupby(list(cm.X.columns))

In [63]:
A = np.array(D.mean())

In [64]:
B = np.array(D.std())

In [67]:
D.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,0.1,0.5,1.0,1.5,2
eps,ncol,nrow,numy,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.2,2,40,6.0,0.003333,0.075000,0.155000,0.228333,0.266667
0.2,2,40,12.0,0.006667,0.076667,0.098333,0.125000,0.160000
0.2,2,40,20.0,0.091667,0.031667,0.030000,0.075000,0.058333
0.2,2,40,28.0,0.005000,0.021667,0.103333,0.128333,0.110000
0.2,2,40,34.0,0.000000,0.096667,0.155000,0.166667,0.180000
0.2,2,80,12.0,0.025833,0.024167,0.070000,0.058333,0.117500
0.2,2,80,24.0,0.060000,0.035000,0.083333,0.078333,0.110000
0.2,2,80,40.0,0.080833,0.072500,0.042500,0.073333,0.043333
0.2,2,80,56.0,0.096667,0.066667,0.039167,0.081667,0.070833
0.2,2,80,68.0,0.005833,0.019167,0.072500,0.091667,0.087500


In [68]:
D.std()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,0.1,0.5,1.0,1.5,2
eps,ncol,nrow,numy,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.2,2,40,6.0,0.007454,0.057130,0.098707,0.075599,0.016667
0.2,2,40,12.0,0.014907,0.072982,0.033019,0.106719,0.051841
0.2,2,40,20.0,0.063738,0.023124,0.037546,0.049301,0.059803
0.2,2,40,28.0,0.011180,0.028626,0.081989,0.053229,0.050484
0.2,2,40,34.0,0.000000,0.075599,0.078307,0.080579,0.062805
0.2,2,80,12.0,0.035404,0.031375,0.059102,0.057660,0.072313
0.2,2,80,24.0,0.074396,0.057855,0.062846,0.069310,0.105640
0.2,2,80,40.0,0.063847,0.037800,0.038573,0.047562,0.087876
0.2,2,80,56.0,0.093142,0.050861,0.043760,0.047196,0.060524
0.2,2,80,68.0,0.013044,0.029404,0.042959,0.070465,0.067700
