# This notebook contains the experiments on Adult Census dataset with Altruist

Load few libraries we will need

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [24]:
from sklearn.base import TransformerMixin
from sklearn.preprocessing import MinMaxScaler,StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from altruist import Altruist
from fi_techniques import FeatureImportance
import pandas as pd 
import numpy as np
import urllib
import networkx as nx
import matplotlib.pyplot as plt
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

Using TensorFlow backend.


## Data Loading

Firstly, we load the dataset and we set the feature and class names

In [3]:
feature_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','salary']
class_names=['<=50K','>50K'] #0: <=50K and 1: >50K
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', names=feature_names, delimiter=', ')
data_test = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test', names=feature_names, delimiter=', ')
data_test = data_test.drop(data_test.index[[0]])

We are doing the following preprocessing influenced by a github notebook

In [4]:
data = data[(data != '?').all(axis=1)]
data_test = data_test[(data_test != '?').all(axis=1)]
data_test['salary'] = data_test['salary'].map({'<=50K.': '<=50K', '>50K.': '>50K'})
frames = [data, data_test]
data = pd.concat(frames)

Feature Engineering from:
https://github.com/pooja2512/Adult-Census-Income/blob/master/Adult%20Census%20Income.ipynb. So run and skip the next code block

In [5]:
hs_grad = ['HS-grad','11th','10th','9th','12th']
elementary = ['1st-4th','5th-6th','7th-8th']
# replace elements in list.
for i in hs_grad:
    data['education'].replace(i , 'HS-grad', regex=True , inplace=True)
for e in elementary:
    data['education'].replace(e , 'elementary-school', regex=True, inplace = True)

married= ['Married-spouse-absent','Married-civ-spouse','Married-AF-spouse']
separated = ['Separated','Divorced']
#replace elements in list.
for m in married:
    data['marital-status'].replace(m ,'Married', regex=True, inplace = True)
for s in separated:
    data['marital-status'].replace(s ,'Separated', regex=True, inplace = True)

self_employed = ['Self-emp-not-inc','Self-emp-inc']
govt_employees = ['Local-gov','State-gov','Federal-gov']
for se in self_employed:
    data['workclass'].replace(se , 'Self_employed', regex=True, inplace = True)
for ge in govt_employees:
    data['workclass'].replace(ge , 'Govt_employees', regex=True, inplace = True)

del_cols = ['relationship','education-num']
data.drop(labels = del_cols, axis = 1, inplace = True)

index_age = data[data['age'] == 90].index
data.drop(labels = index_age, axis = 0, inplace =True)
num_col_new = ['age','capital-gain', 'capital-loss',
       'hours-per-week','fnlwgt']
cat_col_new = ['workclass', 'education', 'marital-status', 'occupation',
               'race', 'sex','salary','native-country']#add native-country label
scaler = MinMaxScaler()
#pd.DataFrame(scaler.fit_transform(data[num_col_new]),columns = num_col_new)
class DataFrameSelector(TransformerMixin):
    def __init__(self,attribute_names):
        self.attribute_names = attribute_names
    def fit(self,X,y = None):
        return self
    def transform(self,X):
        return X[self.attribute_names]
class num_trans(TransformerMixin):
    def __init__(self):
        pass
    def fit(self,X,y=None):
        return self
    def transform(self,X):
        df = pd.DataFrame(X)
        df.columns = num_col_new 
        return df
pipeline = Pipeline([('selector',DataFrameSelector(num_col_new)),  
                     ('scaler',MinMaxScaler()),('transform',num_trans())])#('scaler',MinMaxScaler()),        
num_df = pipeline.fit_transform(data)
num_df.shape
# columns which I don't need after creating dummy variables dataframe
cols = ['workclass_Govt_employess','education_Some-college',
        'marital-status_Never-married','occupation_Other-service',
        'race_Black','sex_Male','salary_>50K']
class dummies(TransformerMixin):
    def __init__(self,cols):
        self.cols = cols
    
    def fit(self,X,y = None):
        return self
    
    def transform(self,X):
        df = pd.get_dummies(X)
        df_new = df[df.columns.difference(cols)] 
        return df_new
pipeline_cat=Pipeline([('selector',DataFrameSelector(cat_col_new)),
                      ('dummies',dummies(cols))])
cat_df = pipeline_cat.fit_transform(data)
cat_df['id'] = pd.Series(range(cat_df.shape[0]))
num_df['id'] = pd.Series(range(num_df.shape[0]))
final_df = pd.merge(cat_df,num_df,how = 'inner', on = 'id')
print(f"Number of observations in final dataset: {final_df.shape}")

Number of observations in final dataset: (45167, 82)


We extract the train and target data from the dataframe

In [6]:
y = final_df['salary_<=50K'].values
final_df.drop(labels = ['id','salary_<=50K'],axis = 1,inplace = True)
X = final_df.values

In [7]:
feature_names = list(final_df.columns.values)
categorical_features = ['workclass', 'education', 'marital-status', 'occupation', 'race', 'sex','native-country']

In [8]:
len(X)

45167

In [9]:
from imblearn.under_sampling import TomekLinks, NeighbourhoodCleaningRule, NearMiss,RandomUnderSampler
from collections import Counter

print('Original dataset shape %s' % Counter(y))

tl = TomekLinks()
X_res, y_res = tl.fit_resample(X, y)
print('TomekLinks: Resampled dataset shape %s' % Counter(y_res))
ncr = NeighbourhoodCleaningRule()
X_res, y_res = ncr.fit_resample(X_res, y_res)
print('NC: Resampled dataset shape %s' % Counter(y_res))
nm = NearMiss(version=3)
X_res, y_res = nm.fit_resample(X_res, y_res)
print('NM: Resampled dataset shape %s' % Counter(y_res))
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X_res, y_res)
print('Random: Resampled dataset shape %s' % Counter(y_res))

Original dataset shape Counter({1: 33970, 0: 11197})
TomekLinks: Resampled dataset shape Counter({1: 31155, 0: 11197})
NC: Resampled dataset shape Counter({1: 25878, 0: 11197})
NM: Resampled dataset shape Counter({0: 11197, 1: 7017})
Random: Resampled dataset shape Counter({0: 7017, 1: 7017})


In [14]:
rus = RandomUnderSampler(sampling_strategy='all', random_state=42)
X_res2, y_res2 = rus.fit_resample(X_res[:9000], y_res[:9000])
print('Random: Resampled dataset shape %s' % Counter(y_res2))

Random: Resampled dataset shape Counter({0: 1983, 1: 1983})


In [10]:
rus = RandomUnderSampler(sampling_strategy='all', random_state=42)
X_res2, y_res2 = rus.fit_resample(X_res[:7517], y_res[:7517])
print('Random: Resampled dataset shape %s' % Counter(y_res2))

Random: Resampled dataset shape Counter({0: 500, 1: 500})


We have 1372 instances. We are going to use the build-in GridSearch of LionForests to find and train the best classifier for this dataset

## Machine Learning models training step

We will use a MinMax scaler to normalize the input


In [11]:
scaler = MinMaxScaler(feature_range=(-1,1))

In [12]:
classifiers = {}
scalers = {}

In [13]:
pipe = Pipeline(steps=[('scaler', scaler), ('rf', RandomForestClassifier(random_state=77))])
parameters =[{
    'rf__max_depth': [7], #1, 5, 7, 10
    'rf__max_features': ['sqrt'], #'sqrt', 'log2', 0.75, None
    'rf__bootstrap': [False], #True, False
    'rf__min_samples_leaf' : [2], #1, 2, 5, 10, 0.10
    'rf__n_estimators': [10] #10, 100, 500, 1000
}]
clf = GridSearchCV(pipe, parameters, scoring='f1', cv=10, n_jobs=-1, verbose=1)
clf.fit(X_res2, y_res2)
scaler_rf = clf.best_estimator_.steps[0][1]
rf = clf.best_estimator_.steps[1][1]
classifiers[1] = [rf, str("Random Forests: "+ str(clf.best_score_))]
scalers[1] = scaler_rf

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    1.3s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.4s finished


In [14]:
pipe = Pipeline(steps=[('scaler', scaler), ('svm', SVC(probability=True,random_state=77))])
parameters = [
  {'svm__C': [-3, 1, 3, 10, 100, 1000], 'svm__kernel': ['linear']},
  {'svm__C': [-3, 1, 3, 10, 100, 1000], 'svm__gamma': [0.1, 0.01, 0.001, 0.0001], 'svm__kernel': ['rbf']},
]
parameters = {'svm__C': [3], 'svm__gamma': [0.1], 'svm__kernel': ['rbf']} #best
clf = GridSearchCV(pipe, parameters, scoring='f1', cv=10, n_jobs=-1, verbose=1)
clf.fit(X_res2, y_res2)
scaler_svm = clf.best_estimator_.steps[0][1]
svm = clf.best_estimator_.steps[1][1]
classifiers[2] = [svm, str("SVM: "+ str(clf.best_score_))]
scalers[2] = scaler_svm

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.3s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.5s finished


In [15]:
pipe = Pipeline(steps=[('scaler', scaler), ('lr', LogisticRegression(random_state=77))])
parameters = [
  {'lr__C': [-3, 1, 3, 10, 100, 1000], 'lr__penalty': ['l1'], 'lr__solver': ['liblinear', 'saga']},
  {'lr__C': [-3, 1, 3, 10, 100, 1000], 'lr__penalty': ['l2'], 'lr__solver': ['newton-cg', 'lbfgs', 'sag','saga']}
]
parameters = {'lr__C': [3], 'lr__penalty': ['l2'], 'lr__solver': ['sag']}#best
clf = GridSearchCV(pipe, parameters, scoring='f1', cv=10, n_jobs=-1, verbose=1)
clf.fit(X_res2, y_res2)
scaler_lr = clf.best_estimator_.steps[0][1]
lr = clf.best_estimator_.steps[1][1]
classifiers[3] = [lr, str("Logistic Regression: "+ str(clf.best_score_))]
scalers[3] = scaler_lr

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.3s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.3s finished


In [20]:
pipe = Pipeline(steps=[('scaler', scaler), ('nn', MLPClassifier(early_stopping=True, random_state=77))])
parameters = {
    'nn__hidden_layer_sizes': [(2,10),(5,10),(10,100),(20,200),(50,500),(100,1000)], 
    'nn__activation': ['logistic', 'tanh', 'relu'],
    'nn__solver': ['sgd', 'adam'],
    'nn__alpha': [0.000001,0.0001,0.001, 0.01, 0.1],
    'nn__learning_rate': ['constant', 'invscaling', 'adaptive']}
parameters = {
    'nn__hidden_layer_sizes': [(50,500)], 
    'nn__activation': ['tanh'],
    'nn__solver': ['adam'],
    'nn__alpha': [0.000001],
    'nn__learning_rate': ['constant']}
clf = GridSearchCV(pipe, parameters, scoring='f1', cv=10, n_jobs=-1, verbose=1)
clf.fit(X_res2, y_res2)

scaler_nn = clf.best_estimator_.steps[0][1]
nn = clf.best_estimator_.steps[1][1]
classifiers[4] = [nn, str("Neural Network: "+ str(clf.best_score_))]
scalers[4] = scaler_nn

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    1.1s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.7s finished


In [21]:
classifiers

{1: [RandomForestClassifier(bootstrap=False, max_depth=7, max_features='sqrt',
                         min_samples_leaf=2, n_estimators=10, random_state=77),
  'Random Forests: 0.9431755994216005'],
 2: [SVC(C=3, gamma=0.1, probability=True, random_state=77),
  'SVM: 0.9593078770419098'],
 3: [LogisticRegression(C=3, random_state=77, solver='sag'),
  'Logistic Regression: 0.9467018455060752'],
 4: [MLPClassifier(activation='tanh', alpha=1e-06, early_stopping=True,
                hidden_layer_sizes=(50, 500), random_state=77),
  'Neural Network: 0.9442092900222173']}

In [45]:
@interact(eli_5=False, lime=True, shap=True, perm_importance=True, intristic=True, cl=(1,4))
def g(eli_5, lime, shap, perm_importance, intristic, cl=2):
    print(classifiers[cl][1])
    X_t = scalers[cl].transform(X_res2)
    fi = FeatureImportance(X_t[900:1000], y_res2[900:1000], feature_names, class_names)
    fi_names = {fi.fi_lime:'Lime',fi.fi_shap:'Shap',fi.fi_eli:'Eli5',fi.fi_perm_imp:'Permuation Importance',fi.fi_rf:'Pseudo-Intristic RFs', fi.fi_coef_lr:'Intristic LR'}
    fis = []
    if (eli_5 and not cl == 2 and not cl == 3):
        fis.append(fi.fi_eli)
    if lime:
        fis.append(fi.fi_lime)
    if shap:
        fis.append(fi.fi_shap)
    if perm_importance:
        fis.append(fi.fi_perm_imp)
    if intristic and cl == 1:
        fis.append(fi.fi_rf)
    if intristic and cl == 3:
        fis.append(fi.fi_coef_lr)
    fis_scores = []
    for i in fis:
        fis_scores.append([])
    count = 0;
    altruistino = Altruist(classifiers[cl][0], X_t, fis, feature_names)
    for instance in X_t[900:1000]:
        if (count + 1) % 10 == 0:
            print(count+1,"/",len(X_t),"..",end=", ")
        count = count + 1
        untruthful_features = altruistino.find_untruthful_features(instance)
        for i in range(len(untruthful_features[0])):
            fis_scores[i].append(len(untruthful_features[0][i]))
    print(len(X_t),"/",len(X_t))
    count = 0
    for fis_score in fis_scores:
        fi = fis[count]
        count = count + 1
        print(fi_names[fi],np.array(fis_score).mean())
        print(fi_names[fi],np.array(fis_score).sum())
        print(count)

interactive(children=(Checkbox(value=False, description='eli_5'), Checkbox(value=True, description='lime'), Ch…