# Simulate app form prediction process

## Load pipeline artifact

In [1]:
import pickle

# Relative location of the serialized pipeline artifact.
path = '../artifacts/pipeline.pkl'

# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load artifacts that come from a trusted source.
with open(path, 'rb') as file:
    pipeline = pickle.load(file)

In [2]:
# Show the input feature names stored on the loaded pipeline.
pipeline.feature_names_in_

array(['Industry', 'Ethnicity', 'Gender', 'Age', 'CivilStatus',
       'YearsEmployed', 'Income'], dtype=object)

## Structure categorical data options

### Feature names and categories

In [3]:
# Display the loaded pipeline so its steps can be inspected.
pipeline

In [4]:
# Object held by the first pipeline step (index [1] of the step entry).
processor = pipeline.steps[0][1]
# Object held by the first entry of `transformers_`.
# NOTE(review): presumably the categorical encoder, since the feature names it
# reports are all categorical columns — confirm against the pipeline definition.
encoder = processor.transformers_[0][1]

# Column names seen by the encoder and the categories it stores; the two are
# zipped together further down, so they are expected to be aligned by position.
feature_names = encoder.feature_names_in_
feature_categories = encoder.categories_

In [5]:
# Inspect the column names reported by the encoder.
feature_names

array(['Industry', 'Ethnicity', 'Gender', 'CivilStatus'], dtype=object)

In [6]:
# Inspect the category values stored by the encoder, one array per column.
feature_categories

[array(['CommunicationServices', 'ConsumerDiscretionary',
        'ConsumerStaples', 'Education', 'Energy', 'Financials',
        'Healthcare', 'Industrials', 'InformationTechnology', 'Materials',
        'Real Estate', 'Research', 'Transport', 'Utilities'], dtype=object),
 array(['Asian', 'Black', 'Latino', 'Other', 'White'], dtype=object),
 array(['Female', 'Male'], dtype=object),
 array(['Married', 'Other'], dtype=object)]

In [7]:
# Map each categorical feature to a plain Python list of its categories
# (`.tolist()` converts the NumPy arrays so the dict can be dumped as JSON).
options = {
    name: values.tolist()
    for name, values in zip(feature_names, feature_categories)
}

options

{'Industry': ['CommunicationServices',
  'ConsumerDiscretionary',
  'ConsumerStaples',
  'Education',
  'Energy',
  'Financials',
  'Healthcare',
  'Industrials',
  'InformationTechnology',
  'Materials',
  'Real Estate',
  'Research',
  'Transport',
  'Utilities'],
 'Ethnicity': ['Asian', 'Black', 'Latino', 'Other', 'White'],
 'Gender': ['Female', 'Male'],
 'CivilStatus': ['Married', 'Other']}

### Export options to file

In [8]:
import json

# Persist the categorical options as JSON indented by four spaces.
with open('../src/options_categorical.json', mode='w') as file:
    file.write(json.dumps(options, indent=4))

## Structure numerical data options

### Feature names

In [9]:
# Object held by the first pipeline step. Note that this cell rebinds
# `processor`, `encoder` and `feature_names`, replacing the values assigned
# in the categorical section.
processor = pipeline.steps[0][1]
# Object held by the second entry of `transformers_`.
# NOTE(review): presumably the numerical transformer, since the names it
# reports are Age, YearsEmployed and Income — confirm against the pipeline.
encoder = processor.transformers_[1][1]

# Plain list of column names; it is used later to select DataFrame columns.
feature_names = encoder.feature_names_in_.tolist()
feature_names

['Age', 'YearsEmployed', 'Income']

### Average value as the default

In [10]:
import pandas as pd

# Load the credit-approval dataset from its CSV file.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which
# looks like an index saved into the CSV. It is not among the columns selected
# later, so it appears harmless here — confirm, or consider `index_col=0`.
df = pd.read_csv('../../../data/credit-approval.csv')
df

Unnamed: 0,Industry,Ethnicity,Gender,Age,CivilStatus,YearsEmployed,Income,Approved
0,Industrials,White,Male,30,Married,1.25,0,1
1,Materials,Black,Female,58,Married,3.04,560,1
...,...,...,...,...,...,...,...,...
688,ConsumerStaples,White,Male,17,Married,0.04,750,0
689,Energy,Black,Male,35,Married,8.29,0,0


In [16]:
# Arithmetic mean of every numerical feature column.
numerical_means = df[feature_names].mean()
# `astype(int)` drops the fractional part (truncation, not rounding) before
# the per-column values are collected into a {column: value} dict.
options = numerical_means.astype(int).to_dict()

### Export options to file

In [17]:
import json

# Persist the numerical options as JSON indented by four spaces.
with open('../src/options_numerical.json', mode='w') as file:
    file.write(json.dumps(options, indent=4))