In [None]:
from numpy import mean
from numpy import std
from pandas import read_csv
from matplotlib import pyplot
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
 
# load the dataset
def load_dataset(full_path):
    """Read a header-less CSV and split it into features and an encoded target.

    Entries equal to '?' are read as missing values, and every row that
    contains a missing value is dropped. The last column is the target;
    all remaining columns are features.

    Args:
        full_path: Location of the CSV file, handed to ``read_csv``.

    Returns:
        A tuple ``(X, y, cat_ix, num_ix)``:
        the feature values as an array, the target after ``LabelEncoder``
        encoding, the labels of the object/bool feature columns, and the
        labels of the int64/float64 feature columns.
    """
    # read the file and discard incomplete rows in one go
    frame = read_csv(full_path, header=None, na_values='?').dropna()
    # the target lives in the final column
    target_col = len(frame.columns) - 1
    features = frame.drop(target_col, axis=1)
    # turn the raw target values into integer class labels
    target = LabelEncoder().fit_transform(frame[target_col])
    # column labels grouped by dtype, for use by a column-wise transformer
    categorical_cols = features.select_dtypes(include=['object', 'bool']).columns
    numerical_cols = features.select_dtypes(include=['int64', 'float64']).columns
    return features.values, target, categorical_cols, numerical_cols
def evaluate_model(X, y, model):
    """Cross-validate ``model`` on ``(X, y)`` and return its accuracy scores.

    The evaluation uses repeated stratified k-fold splitting with 10 splits
    and 3 repeats (fixed ``random_state=1``), scored by accuracy.

    Args:
        X: Feature matrix passed through to ``cross_val_score``.
        y: Target labels passed through to ``cross_val_score``.
        model: Estimator (or pipeline) to evaluate.

    Returns:
        The array of scores returned by ``cross_val_score``.
    """
    splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(
        model, X, y, scoring='accuracy', cv=splitter, n_jobs=-1
    )
import time

# start the wall-clock timer for this run
start = time.time()

# model under evaluation: gradient boosting with 100 estimators
names = 'GBM'
grad_boos = GradientBoostingClassifier(n_estimators=100)

# define the location of the dataset, then load it
full_path = '../GeneralDatasets/Csv/0a_no1_e20.csv'
X, y, cat_ix, num_ix = load_dataset(full_path)

# container for results (not filled by this cell)
results = []

# tqdm is only needed by the progress-bar loop that is commented out below
import tqdm
# for i in tqdm.tqdm(range(len(models))):

# one hot encode categorical columns, min-max scale numerical columns
steps = [
    ('c', OneHotEncoder(handle_unknown='ignore'), cat_ix),
    ('n', MinMaxScaler(), num_ix),
]
ct = ColumnTransformer(steps)

# wrap the preprocessing and the model in a pipeline
pipeline = Pipeline(steps=[('t', ct), ('m', grad_boos)])

# evaluate the pipeline and keep the cross-validation scores
scores = evaluate_model(X, y, pipeline)
print(f"Time to run: {time.time()-start} seconds")

In [43]:
# Print the mean of every entry in `scores`, labelled with the text that
# precedes the first 'a' in the `input_file` name at the same position.
# NOTE(review): this expects `scores` to be a list with one score array per
# entry of `input_file`; both names must already exist in the session.
# The loop variables `i` and `score` stay bound after the loop finishes.
for i, score in enumerate(scores):
    print(f"Accuracy with data sanitized with alpha = {input_file[i].split('a')[0]}: {mean(score)}")

Accuracy with data sanitized with alpha = 9875: nan
Accuracy with data sanitized with alpha = 80: nan
Accuracy with data sanitized with alpha = 25: nan
Accuracy with data sanitized with alpha = 0: nan


In [39]:
# Display whatever `score` is currently bound to in the session.
# NOTE(review): presumably the last per-file score array left over from a
# loop over `scores` -- confirm which cell assigned it.
score


array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan])

In [None]:
# /home/jc/Desktop/udem_H20/thesis_research/reconstructor/data/disp_impact_remover_1.0.csv
# NOTE(review): the path above is presumably the dataset this report refers
# to -- confirm. The label in the message is hard-coded; the value printed is
# the mean of whatever `scores` currently holds.
print(f"Accuracy with data sanitized with disparate impact (a=1.0): {mean(scores)}")

In [20]:
# Print the mean of the current `scores` under a hard-coded alpha = 0.9875 label.
# NOTE(review): nothing here ties `scores` to that dataset -- confirm the
# evaluation was run on the matching file before trusting the number.
print(f"Accuracy with data sanitized with alpha = 0.9875: {mean(scores)}")

Accuracy with data sanitized with alpha = 0.9875: 0.8460150242300162


In [18]:
# Print the mean of the current `scores` under a hard-coded alpha = 0.80 label.
# NOTE(review): nothing here ties `scores` to that dataset -- confirm the
# evaluation was run on the matching file before trusting the number.
print(f"Accuracy with data sanitized with alpha = 0.80: {mean(scores)}")

alpha = 0.80: 0.8463982621103877


In [15]:
# Print the mean of the current `scores` under a hard-coded alpha = 0.25 label.
# NOTE(review): nothing here ties `scores` to that dataset -- confirm the
# evaluation was run on the matching file before trusting the number.
print(f"Accuracy with data sanitized with alpha = 0.25: {mean(scores)}")

alpha = 0.25: 0.8361822462673703


In [None]:
# Print the mean of the current `scores` under a hard-coded alpha = 0 label.
# NOTE(review): nothing here ties `scores` to that dataset -- confirm the
# evaluation was run on the matching file before trusting the number.
print(f"Accuracy with data sanitized with alpha = 0: {mean(scores)}")

In [52]:
# from numpy import mean
# from numpy import std
# from pandas import read_csv
# from matplotlib import pyplot
# from sklearn.preprocessing import LabelEncoder
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.preprocessing import MinMaxScaler
# from sklearn.pipeline import Pipeline
# from sklearn.compose import ColumnTransformer
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import RepeatedStratifiedKFold
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.svm import SVC
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.ensemble import BaggingClassifier
 

# # load the dataset
# def load_dataset(full_path):
# 	# load the dataset as a numpy array
# 	dataframe = read_csv(full_path, header=None, na_values='?')
# 	# drop rows with missing
# 	dataframe = dataframe.dropna()
# 	# split into inputs and outputs
# 	last_ix = len(dataframe.columns) - 1
# 	import pdb;pdb.set_trace()    
# 	X, y = dataframe.drop(9, axis=1), dataframe[9]
# 	# select categorical and numerical features
# 	cat_ix = X.select_dtypes(include=['object', 'bool']).columns
# 	num_ix = X.select_dtypes(include=['int64', 'float64']).columns
# 	# label encode the target variable to have the classes 0 and 1
# 	y = LabelEncoder().fit_transform(y)
# 	return X.values, y, cat_ix, num_ix
 
# # evaluate a model
# def evaluate_model(X, y, model):
# 	# define evaluation procedure
# 	cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# 	# evaluate model
# 	scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# 	return scores

# scores = []
# input_file = ['9875a_no1_e20','80a_no1_e20', '25a_no1_e20', '0a_no1_e20']
# for file in input_file:
#     start = time.time()
#     # de
#     grad_boos = GradientBoostingClassifier(n_estimators=100)
#     names = 'GBM'


#     # define the location of the dataset
#     full_path = f'../GeneralDatasets/Csv/{file}.csv'

#     X, y, cat_ix, num_ix = load_dataset(full_path)
#     # define models
# #     import pdb;pdb.set_trace()

#     steps = [('c',OneHotEncoder(handle_unknown='ignore'),cat_ix), ('n',MinMaxScaler(),num_ix)]
#     # one hot encode categorical, normalize numerical
#     ct = ColumnTransformer(steps)
#     pipeline = Pipeline(steps=[('t',ct),('m',grad_boos)])
#     score = evaluate_model(X, y, pipeline)
#     scores.append(score)
#     print(f"Run completed for {file} in {(time.time()-start)/60} minutes")
# # summarize performance