# Predict the income of an adult on the census data

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

df = pd.read_csv('adultincome+trunc.csv')

data_prep = pd.get_dummies(df, drop_first = True)

X = data_prep.iloc[:, :-1]
y = data_prep.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1234, stratify = y)

rfc = RandomForestClassifier(random_state = 1234)
trained_model = rfc.fit(X_train, y_train)

y_predict = rfc.predict(X_test)

cm = confusion_matrix(y_test, y_predict)
score = rfc.score(X_test, y_test)

print('score: ', score)
print('Confusion matrix: \n', cm)

score:  0.7983333333333333
Confusion matrix: 
 [[385  55]
 [ 66  94]]


## Create explanations for the model

In [5]:
from interpret.ext.blackbox import TabularExplainer

In [10]:
classes = ['Not Greater than 50k', 'Greater than 50k']
features = list(X.columns)

tab_explainer = TabularExplainer(
    trained_model,
    X_train,
    features = features,
    classes = classes
)

print(tab_explainer, "ready!")

TabularExplainer ready!


## Get the global explanations

In [11]:
global_explanation = tab_explainer.explain_global(X_train)

In [12]:
global_tab_feature_importance = global_explanation.get_feature_importance_dict()
for feature, importance in global_tab_feature_importance.items():
    print(feature,":", importance)

marital status_Married : 0.134457897553098
age : 0.08737999026945
hours per week : 0.054862878214208904
education_ HS-grad : 0.05459228226812601
marital status_ Never-married : 0.04201492036510847
education_ Some-college : 0.03902094175894924
gender_ Male : 0.026954924512638787
education_ Masters : 0.013130510698434426
race_ White : 0.013018742998495862
wc_ Private : 0.008260686989919535
wc_ Local-gov : 0.006514623134033063
education_ Prof-school : 0.006411507490632027
race_ Black : 0.00431141694330449
education_ Doctorate : 0.003320328208831908
race_ Asian-Pac-Islander : 0.0021244991478086886
marital status_ Widowed : 0.0016788032177812158
race_ Other : 0.0003622585777075659
education_ Preschool : 3.30258572723802e-05


## Local feature importances

In [17]:
X_explain = X_test[0:5]

local_explanation = tab_explainer.explain_local(X_explain)
predictions = rfc.predict(X_explain)

local_features = local_explanation.get_ranked_local_names()
local_importance = local_explanation.get_ranked_local_values()

for l in range(len(local_features)):
    print('Support for', classes[l])
    label = local_features[l]
    for o in range(len(label)):
        print("\tObservation", o + 1)
        feature_list = label[o]
        total_support = 0
        for f in range(len(feature_list)):
            print("\t\t", feature_list[f], ':', local_importance[l][o][f])
            total_support += local_importance[l][o][f]
        print("\t\t ----------\n\t\t Total:", total_support, "Prediction:", classes[predictions[o]])

Support for Not Greater than 50k
	Observation 1
		 education_ HS-grad : 0.15753368669907972
		 hours per week : 0.06271450180562069
		 wc_ Private : 0.017900096565616773
		 education_ Masters : 0.012777593697927742
		 education_ Prof-school : 0.0041315933997611845
		 race_ Black : 0.0033106710860027495
		 education_ Doctorate : 0.0018746417015337237
		 education_ Preschool : -1.1182089001845643e-05
		 race_ Other : -0.0003252113353642356
		 wc_ Local-gov : -0.0007829336472287147
		 race_ Asian-Pac-Islander : -0.0018563469401808323
		 marital status_ Widowed : -0.002628879486429275
		 gender_ Male : -0.006227308637173458
		 race_ White : -0.013401018685351852
		 education_ Some-college : -0.022706441348944298
		 marital status_ Never-married : -0.04377002336767857
		 age : -0.06463253768495818
		 marital status_Married : -0.14991280649513541
		 ----------
		 Total: -0.04601190476190413 Prediction: Not Greater than 50k
	Observation 2
		 marital status_Married : 0.11368978878763233
		 hou

In [18]:
for i in range(0, len(local_features)):
    labels = local_features[i]
    print('\n Feature support values for : ', classes[i])
    
    for j in range(0, len(labels)):
        if y_predict[j] == i:
            print('\n\tObservation number: ', j + i)
            feature_name = labels[j]
            
            print('\t\t', 'Feature Name'.ljust(30), ' Value')
            print('\t\t', '-'*30, '-'*10)
            
            for k in range(0, len(feature_name)):
                print('\t\t', feature_name[k].ljust(30), round(local_importance[i][j][k], 6))


 Feature support values for :  Not Greater than 50k

	Observation number:  0
		 Feature Name                    Value
		 ------------------------------ ----------
		 education_ HS-grad             0.157534
		 hours per week                 0.062715
		 wc_ Private                    0.0179
		 education_ Masters             0.012778
		 education_ Prof-school         0.004132
		 race_ Black                    0.003311
		 education_ Doctorate           0.001875
		 education_ Preschool           -1.1e-05
		 race_ Other                    -0.000325
		 wc_ Local-gov                  -0.000783
		 race_ Asian-Pac-Islander       -0.001856
		 marital status_ Widowed        -0.002629
		 gender_ Male                   -0.006227
		 race_ White                    -0.013401
		 education_ Some-college        -0.022706
		 marital status_ Never-married  -0.04377
		 age                            -0.064633
		 marital status_Married         -0.149913

	Observation number:  1
		 Feature Name               

# Run Interpret Model Script in Azure Workspace

In [20]:
import azureml.core
from azureml.core import Workspace

ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

Ready to use Azure ML 1.38.0 to work with Azureml-SDK-WS02


### Prepare Datase


In [27]:
from azureml.core import Dataset

default_ds = ws.get_default_datastore()

# Get the input dataset

if 'AdultIncome' not in ws.datasets:
    default_ds.upload_files( files = ['adultincome+trunc.csv'],
                           target_path = 'adult_incomes/',
                           overwrite = True,
                           show_progress = True)
    
    tab_data_set = Dataset.Tabular.from_delimited_files(path = (default_ds, 'adult_incomes/*.csv'))
    
    try:
        tab_data_set = tab_data_set.register(
            workspace = ws,
            name = 'AdultIncome',
            description = 'Adult incomes data',
            tags = {'format': 'CSV'},
            create_new_version = True
        )
        print('Dataset registered.')
    except Exception as ex:
        print(ex)
else:
    print('Dataset already registered')

Dataset already registered


In [28]:
import os, shutil
from azureml.core import Experiment

experiment_folder = 'incomes'
os.makedirs(experiment_folder, exist_ok = True)

# shutil.copy('adultincome+trunc.csv', os.path.join(experiment_folder, 'adultincome+trunc.csv'))

### Prepare Data

In [37]:
%%writefile $experiment_folder/adult_income.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

import argparse, os, joblib
from azureml.core.run import Run
from azureml.interpret import ExplanationClient
from interpret.ext.blackbox import TabularExplainer

run = Run.get_context()

parser = argparse.ArgumentParser()
parser.add_argument('--input-data', type = str)

args = parser.parse_args()

print('Loading Data...')
df = run.input_datasets['raw_data'].to_pandas_dataframe()

data_prep = pd.get_dummies(df, drop_first = True)

X = data_prep.iloc[:, :-1]
y = data_prep.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1234, stratify = y)

rfc = RandomForestClassifier(random_state = 1234)
trained_model = rfc.fit(X_train, y_train)

y_predict = rfc.predict(X_test)

cm = confusion_matrix(y_test, y_predict)
score = rfc.score(X_test, y_test)

run.log('accuracy', score)

os.makedirs('outputs', exist_ok = True)
joblib.dump(value = rfc, filename = 'outputs/income.pkl')

# Get the explanation
features = list(X.columns)
classes = ['notGreater', 'Greater']

tab_explainer = TabularExplainer(trained_model,
                                X_train,
                                features = features,
                                classes = classes)

explanation = tab_explainer.explain_global(X_train)

# Get an Explanation Client and upload the explanation
explain_client = ExplanationClient.from_run(run)
explain_client.upload_model_explanation(explanation, comment = 'Tabular Explanation')

run.complete()

Overwriting incomes/adult_income.py


### Create computer cluster

In [38]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = 'my-cluster-001'

try:
    training_cluster = ComputeTarget(workspace = ws, name = cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size = 'STANDARD_DS11_V2', max_nodes = 2)
        training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        training_cluster.wait_for_completion(show_output = True)
    except Exception as ex:
        print(ex)

Found existing cluster, use it.


### Script Configuration for custom environment

In [39]:
%%writefile $experiment_folder/interpret_env.yml
name: batch_environment
dependencies:
- python=3.6.2
- scikit-learn
- pandas
- pip
- pip:
  - azureml-defaults
  - azureml-interpret

Overwriting incomes/interpret_env.yml


In [40]:
from azureml.core import ScriptRunConfig
from azureml.core import Environment
from azureml.core.environment import CondaDependencies

explain_env = Environment.from_conda_specification('explain_env', experiment_folder + '/interpret_env.yml')

input_ds = ws.datasets.get('AdultIncome')

script_config = ScriptRunConfig(source_directory = experiment_folder,
                               script = 'adult_income.py',
                               arguments = ['--input-data', input_ds.as_named_input('raw_data')],
                               environment = explain_env,
                               compute_target = training_cluster)

### Experiment

In [41]:
from azureml.core import Experiment
from azureml.widgets import RunDetails

experiment_name = 'incomes-explain'
experiment = Experiment(workspace = ws, name = experiment_name)
run = experiment.submit(config = script_config)
RunDetails(run).show()
run.wait_for_completion()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

{'runId': 'incomes-explain_1648790664_7ab928ce',
 'target': 'my-cluster-001',
 'status': 'Finalizing',
 'startTimeUtc': '2022-04-01T05:24:33.225696Z',
 'services': {},
 'properties': {'_azureml.ComputeTargetType': 'amlctrain',
  'ContentSnapshotId': '217a399b-f341-475d-b1c0-e2c0daf9eb32',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [{'dataset': {'id': '2abb5732-71c1-4b95-a455-9e0b0bfc7bc0'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'raw_data', 'mechanism': 'Direct'}}],
 'outputDatasets': [],
 'runDefinition': {'script': 'adult_income.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': ['--input-data', 'DatasetConsumptionConfig:raw_data'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'my-cluster-001',
  'dataReferences': {},
  'data': {'raw_data': {'dataLocation': {'dataset': {'id': '2abb5732-71c1-4b95-a455-9e0b0bfc7bc

### Retrieve the feature importance values

In [42]:
from azureml.interpret import ExplanationClient

client = ExplanationClient.from_run(run)
engineered_explanations = client.download_model_explanation()
feature_importances = engineered_explanations.get_feature_importance_dict()

print('Feature\tImportance')
for key, value in feature_importances.items():
    print(key, '\t', value)

Feature	Importance
marital status_Married 	 0.134457897553098
age 	 0.08737999026945
hours per week 	 0.054862878214208904
education_ HS-grad 	 0.05459228226812601
marital status_ Never-married 	 0.04201492036510847
education_ Some-college 	 0.03902094175894924
gender_ Male 	 0.026954924512638787
education_ Masters 	 0.013130510698434426
race_ White 	 0.013018742998495862
wc_ Private 	 0.008260686989919535
wc_ Local-gov 	 0.006514623134033063
education_ Prof-school 	 0.006411507490632027
race_ Black 	 0.00431141694330449
education_ Doctorate 	 0.003320328208831908
race_ Asian-Pac-Islander 	 0.0021244991478086886
marital status_ Widowed 	 0.0016788032177812158
race_ Other 	 0.0003622585777075659
education_ Preschool 	 3.30258572723802e-05
