In [2]:
import os,sys

import utils as ut
import numpy as np
import pandas as pd
from random import seed, shuffle
#SEED = 1122334455
#seed(SEED) # set the random seed so that the random permutations can be reproduced again
#np.random.seed(SEED)

"""
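    The loader in this cell appears to be adapted from code written for the adult dataset, which can be obtained from: http://archive.ics.uci.edu/ml/datasets/Adult
    The code here does not read the adult files (adult.data, adult.test): it looks for "data_final_likes.csv" in the present directory, and if the file is not found it only prints a message -- nothing is downloaded from the UCI archive.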
"""

def check_data_file(fname):
    """Print whether ``fname`` is present in the current working directory.

    This is a status report only: nothing is downloaded or created, and no
    exception is raised when the file is missing.

    Parameters
    ----------
    fname : str
        Bare file name to look for in the listing of the current directory.

    Returns
    -------
    None
    """
    files = os.listdir(".") # get the current directory listing
    print("Looking for file '%s' in the current directory..." % fname)

    if fname in files:
        print("File found in current directory..")
    else:
        # The previous message announced a download from the UCI archive,
        # but no download code exists; report only what actually happened.
        print("'%s' not found in the current directory!" % fname)

    print('')
    return

def load_user_data(load_data_size=None):
    """Load ``data_final_likes.csv`` and build shuffled feature arrays.

    Each data row is split on commas, its last four fields are dropped, and
    the remaining fields are matched positionally against ``attrs``.
    Non-numeric attributes are integer-coded; ``country`` (the only attribute
    that is neither numeric nor in the binary list) is expanded through
    ``ut.get_one_hot_encoding``. All rows are then shuffled with
    ``random.shuffle``; no seed is set here, so the permutation differs
    between calls unless the caller seeds ``random``.

    Parameters
    ----------
    load_data_size : int or None
        If None, every row is returned. Otherwise only the first
        ``load_data_size`` rows of the shuffled ``X``, ``y`` and
        ``x_control`` arrays are kept.

    Returns
    -------
    df : pandas.DataFrame
        The CSV as read by ``pd.read_csv``, reordered with the same
        permutation as ``X``. It is NOT truncated to ``load_data_size``.
    X : numpy.ndarray
        Float feature matrix, one row per example. Columns follow the order
        of ``attrs`` (with ``gender`` left out).
    y : numpy.ndarray
        Float vector of ones (the data has no class label).
    x_control : dict
        Maps ``'gender'`` to a float array of integer-coded gender values.
    """

    # column names, matched by position against the fields of every CSV row
    attrs = ['country', 'socialNbFollowers', 'socialNbFollows', 'socialProductsLiked',\
             'productsListed', 'productsSold', 'productsPassRate', 'productsWished', 'productsBought',\
             'gender', 'hasAnyApp', 'hasAndroidApp', 'hasIosApp', 'hasProfilePicture',\
             'daysSinceLastLogin', 'seniority']
    int_attrs = ['socialNbFollowers', 'socialNbFollows', 'socialProductsLiked', 'productsListed',\
                 'productsSold', 'productsPassRate', 'productsWished', 'productsBought',\
                 'daysSinceLastLogin', 'seniority'] # attributes with numeric values -- the rest are categorical
    # categorical attributes kept as one integer-coded column instead of being
    # one-hot encoded (the original author treats them as binary)
    single_column_attrs = ['hasAnyApp', 'hasAndroidApp', 'hasIosApp', 'hasProfilePicture']
    sensitive_attrs = ['gender'] # the fairness constraints will be used for this feature
    attrs_to_ignore = ['gender'] # gender is the sensitive feature, so it is not used as a classification feature
    attrs_for_classification = set(attrs) - set(attrs_to_ignore)

    # single input file, expected in the current working directory
    data_files = ["data_final_likes.csv"]

    X = []
    y = []
    x_control = {}

    attrs_to_vals = {} # will store the values for each attribute for all users
    for k in attrs:
        if k in sensitive_attrs:
            x_control[k] = []
        elif k in attrs_to_ignore:
            pass
        else:
            attrs_to_vals[k] = []

    for f in data_files:
        check_data_file(f)

        # context manager so the file handle is closed even if parsing fails
        with open(f) as data_file:
            for line in data_file:
                line = line.strip()
                if line == "": continue # skip empty lines
                if line.startswith("country"): # ignore header
                    continue
                fields = line.split(",")
                # drop the last four fields -- presumably columns that are not
                # listed in `attrs`; TODO confirm against the CSV header
                fields = fields[:-4]

                y.append(1) # fixed class label because there isn't any

                for i, attr_val in enumerate(fields):
                    # positional lookup: raises IndexError if a row has more
                    # fields than `attrs` has names
                    attr_name = attrs[i]

                    if attr_name in sensitive_attrs:
                        x_control[attr_name].append(attr_val)
                    elif attr_name in attrs_to_ignore:
                        pass
                    else:
                        attrs_to_vals[attr_name].append(attr_val)

    def convert_attrs_to_ints(d): # discretize the string attributes
        """Replace, in place, every non-numeric attribute's values by integer codes.

        Codes are the positions of the values in the sorted list of unique values.
        """
        for attr_name, attr_vals in d.items():
            if attr_name in int_attrs: continue
            uniq_vals = sorted(set(attr_vals)) # get unique values

            # compute integer codes for the unique values
            val_dict = {val: code for code, val in enumerate(uniq_vals)}
            # replace the values with their integer encoding (same list object)
            attr_vals[:] = [val_dict[val] for val in attr_vals]
            d[attr_name] = attr_vals

    # convert the discrete values to their integer representations
    convert_attrs_to_ints(x_control)
    convert_attrs_to_ints(attrs_to_vals)

    # if the integer vals are not binary, we need to get one-hot encoding for them.
    # Walk the attributes in the fixed `attrs` order: iterating the set itself
    # would make the column order of X depend on string hashing, which is
    # randomized per interpreter run.
    for attr_name in attrs:
        if attr_name not in attrs_for_classification:
            continue
        attr_vals = attrs_to_vals[attr_name]
        if attr_name in int_attrs or attr_name in single_column_attrs:
            # every numeric or single-column attribute must be caught here,
            # otherwise it would be one-hot encoded below
            X.append(attr_vals)
        else:
            attr_vals, index_dict = ut.get_one_hot_encoding(attr_vals)
            for inner_col in attr_vals.T:
                X.append(inner_col)

    # convert to numpy arrays for easy handling
    X = np.array(X, dtype=float).T
    y = np.array(y, dtype = float)
    for k, v in x_control.items(): x_control[k] = np.array(v, dtype=float)

    # shuffle the data; the same permutation is applied to the DataFrame so
    # that row i of X and row i of df describe the same record
    perm = list(range(0,len(y)))
    shuffle(perm)
    df = pd.read_csv(data_files[0])
    df = df.iloc[perm, :]
    X = X[perm]
    y = y[perm]
    for k in x_control.keys():
        x_control[k] = x_control[k][perm]

    # see if we need to subsample the data (df is intentionally left as-is,
    # matching the previous behaviour)
    if load_data_size is not None:
        print("Loading only %d examples from the data" % load_data_size)
        X = X[:load_data_size]
        y = y[:load_data_size]
        for k in x_control.keys():
            x_control[k] = x_control[k][:load_data_size]

    return df, X, y, x_control

In [3]:
import sampling_methods
import numpy as np
import numpy.linalg as la
import sys
import sampling_methods
from misc import *

# Run configuration (only DATA_SIZE is referenced in this cell).
DATA_SIZE, SAMPLE_SIZE, NO_SAMPLES = 5000, 2500, 2500

# Load a shuffled subsample and hand the feature rows to the `misc` helpers
# as nested Python lists.
df, nrecords, labels, classes = load_user_data(DATA_SIZE)
nrecords = remove_zero_cols(nrecords.tolist())
normalize(nrecords)  # return value unused -- presumably normalizes in place; TODO confirm

# Collapse the integer gender codes to a 0/1 list via parity.
gender = [int(code) for code in classes['gender']]
for i, code in enumerate(gender):
    gender[i] = code % 2

# Build the enriched rows with add_features (pairwise product features, per
# the original comment), starting from a copy of the plain rows.
reg_nrecords = np.array(nrecords).copy().tolist()
for e, record in enumerate(nrecords):
    reg_nrecords[e] = add_features(record)
reg_nrecords = remove_zero_cols(reg_nrecords)

# X: plain features, Y: enriched features used for sampling.
X = np.asarray(nrecords)
Y = np.array(reg_nrecords)




Looking for file 'data_final_likes.csv' in the current directory...
File found in current directory..

Loading only 5000 examples from the data


In [4]:
# Select 1250 row positions from the enriched feature matrix Y with the
# project's greedy k-DPP sampler.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt.
S=sampling_methods.kDPPGreedySample(Y,1250)

print(len(S))
# Positions in S index the shuffled frame returned by load_user_data, whose
# rows are in the same order as Y.
df13 = df.iloc[S, :]
df13.to_csv("dpp_1250_1.csv")

[0.00011122 0.00012601 0.00010676 ... 0.00030558 0.00011456 0.00016506] 84947.10357753215
[0.00015779 0.00017802 0.00014883 ... 0.00042724 0.00016315 0.0002021 ] 54356.53891800997
[0.00018594 0.00021164 0.00011638 ... 0.00050007 0.0001943  0.00024948] 44034.25460938486
[1.61039677e-04 1.89039338e-04 8.12838152e-05 ... 5.54740510e-04
 1.77398053e-04 1.30366947e-04] 38809.08328303028
[1.79450352e-04 2.09910020e-04 7.32620904e-05 ... 6.23501308e-04
 1.97767341e-04 1.46389269e-04] 34492.176917260396
[2.05380966e-04 2.37034335e-04 8.51900119e-05 ... 3.90254588e-04
 2.23798329e-04 1.46592944e-04] 29613.606206804372
[2.22756116e-04 2.55767275e-04 9.37149245e-05 ... 3.04220485e-04
 2.42106464e-04 1.62545675e-04] 26634.293701544913
[2.40059254e-04 2.75656016e-04 9.94034340e-05 ... 2.32520043e-04
 2.60944309e-04 1.66695002e-04] 24709.101998890153
[3.06013304e-04 1.35599381e-06 1.26121047e-04 ... 3.09606997e-04
 3.97999546e-06 2.22474387e-04] 18428.39204493413
[3.25701181e-04 1.45075052e-06 1.335

KeyboardInterrupt: 

In [34]:
import sampling_methods
import numpy as np
import numpy.linalg as la
import sys
import sampling_methods
from misc import *

# Run configuration (only DATA_SIZE is referenced in this cell).
DATA_SIZE, SAMPLE_SIZE, NO_SAMPLES = 5000, 2500, 2500

# Load a fresh shuffled subsample and hand the feature rows to the `misc`
# helpers as nested Python lists.
df1, nrecords, labels, classes = load_user_data(DATA_SIZE)
nrecords = remove_zero_cols(nrecords.tolist())
normalize(nrecords)  # return value unused -- presumably normalizes in place; TODO confirm

# Collapse the integer gender codes to a 0/1 list via parity.
gender = [int(code) for code in classes['gender']]
for i, code in enumerate(gender):
    gender[i] = code % 2

# Build the enriched rows with add_features (pairwise product features, per
# the original comment), starting from a copy of the plain rows.
reg_nrecords = np.array(nrecords).copy().tolist()
for e, record in enumerate(nrecords):
    reg_nrecords[e] = add_features(record)
reg_nrecords = remove_zero_cols(reg_nrecords)

# X: plain features, Y: enriched features used for sampling.
X = np.asarray(nrecords)
Y = np.array(reg_nrecords)

# Greedy k-DPP selection of 1250 row positions from Y.
S1 = sampling_methods.kDPPGreedySample(Y, 1250)



Looking for file 'data_final.csv' in the current directory...
File found in current directory..

Loading only 5000 examples from the data


In [59]:
# Export the rows of df1 selected by S1.
# NOTE(review): "dpp_1250_1.csv" is also the output path of the cell that
# samples from `df`; whichever cell runs last overwrites the file.
S = S1
print(len(S))
df13 = df1.iloc[S, :]
df13.to_csv("dpp_1250_1.csv")

1250


In [37]:
import sampling_methods
import numpy as np
import numpy.linalg as la
import sys
import sampling_methods
from misc import *

# Run configuration (only DATA_SIZE is referenced in this cell).
DATA_SIZE, SAMPLE_SIZE, NO_SAMPLES = 5000, 2500, 2500

# Load a fresh shuffled subsample and hand the feature rows to the `misc`
# helpers as nested Python lists.
df2, nrecords, labels, classes = load_user_data(DATA_SIZE)
nrecords = remove_zero_cols(nrecords.tolist())
normalize(nrecords)  # return value unused -- presumably normalizes in place; TODO confirm

# Collapse the integer gender codes to a 0/1 list via parity.
gender = [int(code) for code in classes['gender']]
for i, code in enumerate(gender):
    gender[i] = code % 2

# Build the enriched rows with add_features (pairwise product features, per
# the original comment), starting from a copy of the plain rows.
reg_nrecords = np.array(nrecords).copy().tolist()
for e, record in enumerate(nrecords):
    reg_nrecords[e] = add_features(record)
reg_nrecords = remove_zero_cols(reg_nrecords)

# X: plain features, Y: enriched features used for sampling.
X = np.asarray(nrecords)
Y = np.array(reg_nrecords)

# Greedy k-DPP selection of 1250 row positions from Y.
S2 = sampling_methods.kDPPGreedySample(Y, 1250)



Looking for file 'data_final.csv' in the current directory...
File found in current directory..

Loading only 5000 examples from the data


In [60]:
# Export the rows of df2 selected by S2 as the second 1250-point sample.
S = S2
print(len(S))
# Positions in S index the shuffled frame df2, whose rows are in the same
# order as the Y matrix that S2 was drawn from.
df13 = df2.iloc[S, :]
df13.to_csv("dpp_1250_2.csv")

1250


In [44]:
import sampling_methods
import numpy as np
import numpy.linalg as la
import sys
import sampling_methods
from misc import *

# Run configuration (only DATA_SIZE is referenced in this cell).
DATA_SIZE, SAMPLE_SIZE, NO_SAMPLES = 5000, 2500, 2500

# Load a fresh shuffled subsample and hand the feature rows to the `misc`
# helpers as nested Python lists.
df11, nrecords, labels, classes = load_user_data(DATA_SIZE)
nrecords = remove_zero_cols(nrecords.tolist())
normalize(nrecords)  # return value unused -- presumably normalizes in place; TODO confirm

# Collapse the integer gender codes to a 0/1 list via parity.
gender = [int(code) for code in classes['gender']]
for i, code in enumerate(gender):
    gender[i] = code % 2

# Build the enriched rows with add_features (pairwise product features, per
# the original comment), starting from a copy of the plain rows.
reg_nrecords = np.array(nrecords).copy().tolist()
for e, record in enumerate(nrecords):
    reg_nrecords[e] = add_features(record)
reg_nrecords = remove_zero_cols(reg_nrecords)

# X: plain features, Y: enriched features used for sampling.
X = np.asarray(nrecords)
Y = np.array(reg_nrecords)

# Two greedy k-DPP selections from the same Y: 2000 points, then 500 points.
S11 = sampling_methods.kDPPGreedySample(Y, 2000)
S111 = sampling_methods.kDPPGreedySample(Y, 500)

Looking for file 'data_final.csv' in the current directory...
File found in current directory..

Loading only 5000 examples from the data


In [56]:
# Union of the 2000-point and 500-point selections: the two results are
# concatenated and duplicates removed through a set.
# NOTE(review): the export below is commented out, so this cell only rebinds S.
S = list(set(S11 + S111))
#print(S)
#df11 = df11.iloc[S, :]
#df11.to_csv("dpp_2500_1.csv")

In [40]:
import sampling_methods
import numpy as np
import numpy.linalg as la
import sys
import sampling_methods
from misc import *

# Run configuration (only DATA_SIZE is referenced in this cell).
DATA_SIZE, SAMPLE_SIZE, NO_SAMPLES = 5000, 2500, 2500

# Load a fresh shuffled subsample and hand the feature rows to the `misc`
# helpers as nested Python lists.
df12, nrecords, labels, classes = load_user_data(DATA_SIZE)
nrecords = remove_zero_cols(nrecords.tolist())
normalize(nrecords)  # return value unused -- presumably normalizes in place; TODO confirm

# Collapse the integer gender codes to a 0/1 list via parity.
gender = [int(code) for code in classes['gender']]
for i, code in enumerate(gender):
    gender[i] = code % 2

# Build the enriched rows with add_features (pairwise product features, per
# the original comment), starting from a copy of the plain rows.
reg_nrecords = np.array(nrecords).copy().tolist()
for e, record in enumerate(nrecords):
    reg_nrecords[e] = add_features(record)
reg_nrecords = remove_zero_cols(reg_nrecords)

# X: plain features, Y: enriched features used for sampling.
X = np.asarray(nrecords)
Y = np.array(reg_nrecords)

# Greedy k-DPP selection of 2000 row positions from Y.
S12 = sampling_methods.kDPPGreedySample(Y, 2000)



Looking for file 'data_final.csv' in the current directory...
File found in current directory..

Loading only 5000 examples from the data


In [41]:
# Second greedy k-DPP selection (500 positions) from whatever Y is currently
# bound in the kernel -- NOTE(review): this relies on Y still being the matrix
# built alongside df12; confirm the cells were run in that order.
S122=sampling_methods.kDPPGreedySample(Y,500)

In [57]:
# Union of the 2000-point and 500-point selections (duplicates removed).
# NOTE(review): the recorded run printed 2028, so the exported file holds far
# fewer than the 2500 rows its name suggests.
S = list(set(S12 + S122))
print(len(S))

# Keep the loaded frame `df12` intact and store the selection under its own
# name. Rebinding `df12` to the subset made this cell non-idempotent: on a
# second run the positions in S (which refer to the full shuffled frame)
# would be applied to the already-reduced frame.
df12_sample = df12.iloc[S, :]
df12_sample.to_csv("dpp_2500_2.csv")

2028


In [42]:
import sampling_methods
import numpy as np
import numpy.linalg as la
import sys
import sampling_methods
from misc import *

# Run configuration (only DATA_SIZE is referenced in this cell).
DATA_SIZE, SAMPLE_SIZE, NO_SAMPLES = 5000, 2500, 2500

# Load a fresh shuffled subsample and hand the feature rows to the `misc`
# helpers as nested Python lists.
df13, nrecords, labels, classes = load_user_data(DATA_SIZE)
nrecords = remove_zero_cols(nrecords.tolist())
normalize(nrecords)  # return value unused -- presumably normalizes in place; TODO confirm

# Collapse the integer gender codes to a 0/1 list via parity.
gender = [int(code) for code in classes['gender']]
for i, code in enumerate(gender):
    gender[i] = code % 2

# Build the enriched rows with add_features (pairwise product features, per
# the original comment), starting from a copy of the plain rows.
reg_nrecords = np.array(nrecords).copy().tolist()
for e, record in enumerate(nrecords):
    reg_nrecords[e] = add_features(record)
reg_nrecords = remove_zero_cols(reg_nrecords)

# X: plain features, Y: enriched features used for sampling.
X = np.asarray(nrecords)
Y = np.array(reg_nrecords)

# Greedy k-DPP selection of 2000 row positions from Y.
S13 = sampling_methods.kDPPGreedySample(Y, 2000)



Looking for file 'data_final.csv' in the current directory...
File found in current directory..

Loading only 5000 examples from the data


In [43]:


# Second greedy k-DPP selection (500 positions) from whatever Y is currently
# bound in the kernel -- NOTE(review): this relies on Y still being the matrix
# built alongside df13; confirm the cells were run in that order.
S133=sampling_methods.kDPPGreedySample(Y,500)

In [58]:
# Union of the 2000-point and 500-point selections (duplicates removed).
# NOTE(review): the recorded run printed 2017, so the exported file holds far
# fewer than the 2500 rows its name suggests.
S = list(set(S13 + S133))
print(len(S))

# Keep the loaded frame `df13` intact and store the selection under its own
# name. Rebinding `df13` to the subset made this cell non-idempotent: on a
# second run the positions in S (which refer to the full shuffled frame)
# would be applied to the already-reduced frame.
df13_sample = df13.iloc[S, :]
df13_sample.to_csv("dpp_2500_3.csv")

2017
