# Install Packages

In [3]:
!pip install tpot mljar-supervised

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tpot
  Downloading TPOT-0.11.7-py3-none-any.whl (87 kB)
[K     |████████████████████████████████| 87 kB 3.3 MB/s 
[?25hCollecting mljar-supervised
  Downloading mljar-supervised-0.11.3.tar.gz (112 kB)
[K     |████████████████████████████████| 112 kB 37.5 MB/s 
[?25hCollecting xgboost>=1.1.0
  Downloading xgboost-1.6.2-py3-none-manylinux2014_x86_64.whl (255.9 MB)
[K     |████████████████████████████████| 255.9 MB 43 kB/s 
[?25hCollecting update-checker>=0.16
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Collecting deap>=1.2
  Downloading deap-1.3.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (139 kB)
[K     |████████████████████████████████| 139 kB 49.9 MB/s 
Collecting stopit>=1.1.1
  Downloading stopit-1.1.2.tar.gz (18 kB)
Collecting lightgbm>=3.0.0
  Downloading lightgbm-3.3.3-py3-none-manylinux1_x86_

In [4]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from supervised.automl import AutoML


# Options Available

- mode — the package ships with four built-in models. 
  - The Explain mode is ideal for explaining and understanding the data. It results in visualizations of feature importance as well as tree visualizations.
  - The Perform is used when building ML models for production. 
  - The Compete is meant to build models used in machine learning competitions. 
  - The Optuna mode is used to search for highly-tuned ML models.
- algorithms — specifies the algorithms you would like to use. They are usually passed in as a list.
- results_path — the path where the results will be stored
- total_time_limit — the total time in seconds for training the model
- train_ensemble — dictates if an ensemble will be created at the end of the training process
- stack_models — determines if a models stack will be created
- eval_metric — the metric that will be optimized. If auto the logloss is used for classification problems while the rmse is used for regression problems

In [None]:
#automl = AutoML(
    # mode="Explain"
    # algorithms=""
    # results_path="AutoML_22",
    # total_time_limit=30 * 60,
    # train_ensemble=True,
    # stack_models="",
    # eval_metric=""
#)

# Healthcare Dataset - SPARCS

## Load in dataset

In [5]:
import pandas as pd
sparcs = pd.read_csv('https://raw.githubusercontent.com/hantswilliams/HHA-507-2022/main/autoML/datasets/data_sparcs.csv')
sparcs

Unnamed: 0,Health Service Area,Hospital County,Operating Certificate Number,Facility Id,Facility Name,Age Group,Zip Code - 3 digits,Gender,Race,Ethnicity,...,APR Risk of Mortality,APR Medical Surgical Description,Payment Typology 1,Payment Typology 2,Payment Typology 3,Birth Weight,Abortion Edit Indicator,Emergency Department Indicator,Total Charges,Total Costs
0,Western NY,Allegany,226700.0,37.0,Cuba Memorial Hospital Inc,30 to 49,147,M,White,Not Span/Hispanic,...,Minor,Medical,Private Health Insurance,,,0,N,Y,4757.01,4747.83
1,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,18 to 29,148,F,White,Not Span/Hispanic,...,Minor,Medical,Blue Cross/Blue Shield,Self-Pay,Self-Pay,0,N,N,5090.25,2985.64
2,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,0 to 17,147,M,White,Not Span/Hispanic,...,Minor,Medical,Self-Pay,Self-Pay,Self-Pay,2900,N,N,4948.50,2129.67
3,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,70 or Older,148,F,White,Not Span/Hispanic,...,Moderate,Medical,Medicare,Medicare,Self-Pay,0,N,Y,4719.75,8454.41
4,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,50 to 69,148,M,White,Not Span/Hispanic,...,Major,Medical,Blue Cross/Blue Shield,Medicare,Self-Pay,0,N,Y,50384.75,34565.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23578,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,70 or Older,117,F,White,Not Span/Hispanic,...,Moderate,Medical,Medicare,Private Health Insurance,,0,N,Y,50833.00,8961.40
23579,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,0 to 17,117,F,Other Race,Spanish/Hispanic,...,Minor,Medical,Private Health Insurance,,,3200,N,N,10948.00,2214.06
23580,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,30 to 49,117,M,White,Not Span/Hispanic,...,Minor,Medical,Medicaid,,,0,N,N,46421.00,11083.24
23581,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,70 or Older,117,M,White,Not Span/Hispanic,...,Major,Medical,Medicare,Medicare,,0,N,Y,46122.00,7951.26


In [None]:
sparcs.columns

Index(['Health Service Area', 'Hospital County',
       'Operating Certificate Number', 'Facility Id', 'Facility Name',
       'Age Group', 'Zip Code - 3 digits', 'Gender', 'Race', 'Ethnicity',
       'Length of Stay', 'Type of Admission', 'Patient Disposition',
       'Discharge Year', 'CCS Diagnosis Code', 'CCS Diagnosis Description',
       'CCS Procedure Code', 'CCS Procedure Description', 'APR DRG Code',
       'APR DRG Description', 'APR MDC Code', 'APR MDC Description',
       'APR Severity of Illness Code', 'APR Severity of Illness Description',
       'APR Risk of Mortality', 'APR Medical Surgical Description',
       'Payment Typology 1', 'Payment Typology 2', 'Payment Typology 3',
       'Birth Weight', 'Abortion Edit Indicator',
       'Emergency Department Indicator', 'Total Charges', 'Total Costs'],
      dtype='object')

## Potential variables of interest

- APR Risk of Mortality (categorical) 
- Total costs (continuous) 
- Length of Stay

In [None]:
sparcs['Length of Stay'].describe()

count     23583
unique       97
top           2
freq       5378
Name: Length of Stay, dtype: object

In [None]:
sparcs['Total Costs'].describe()

count    2.358300e+04
mean     1.472282e+04
std      2.718098e+04
min      6.700000e-01
25%      4.471700e+03
50%      8.320120e+03
75%      1.590874e+04
max      1.591541e+06
Name: Total Costs, dtype: float64

In [None]:
sparcs['APR Risk of Mortality'].value_counts()

Minor       13990
Moderate     4952
Major        3452
Extreme      1187
Name: APR Risk of Mortality, dtype: int64

## Create some simplified binary versions

In [None]:
sparcs['Length of Stay'] = pd.to_numeric(sparcs['Length of Stay'], errors='coerce')
sparcs['sparcs_los'] = sparcs['Length of Stay'].apply(lambda x: 'long' if x > 3 else 'short')
sparcs.drop('Length of Stay', axis=1, inplace=True)
sparcs['sparcs_los'].value_counts()


short    13008
long     10575
Name: sparcs_los, dtype: int64

# MLJar Examples

## Binary Classifier Example 1 - SPARCS

### **Create new model**

In [6]:
X = sparcs.drop(columns=['Ethnicity'])

In [7]:
y = sparcs["Ethnicity"]

In [8]:
X

Unnamed: 0,Health Service Area,Hospital County,Operating Certificate Number,Facility Id,Facility Name,Age Group,Zip Code - 3 digits,Gender,Race,Length of Stay,...,APR Risk of Mortality,APR Medical Surgical Description,Payment Typology 1,Payment Typology 2,Payment Typology 3,Birth Weight,Abortion Edit Indicator,Emergency Department Indicator,Total Charges,Total Costs
0,Western NY,Allegany,226700.0,37.0,Cuba Memorial Hospital Inc,30 to 49,147,M,White,3,...,Minor,Medical,Private Health Insurance,,,0,N,Y,4757.01,4747.83
1,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,18 to 29,148,F,White,2,...,Minor,Medical,Blue Cross/Blue Shield,Self-Pay,Self-Pay,0,N,N,5090.25,2985.64
2,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,0 to 17,147,M,White,3,...,Minor,Medical,Self-Pay,Self-Pay,Self-Pay,2900,N,N,4948.50,2129.67
3,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,70 or Older,148,F,White,1,...,Moderate,Medical,Medicare,Medicare,Self-Pay,0,N,Y,4719.75,8454.41
4,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,50 to 69,148,M,White,14,...,Major,Medical,Blue Cross/Blue Shield,Medicare,Self-Pay,0,N,Y,50384.75,34565.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23578,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,70 or Older,117,F,White,6,...,Moderate,Medical,Medicare,Private Health Insurance,,0,N,Y,50833.00,8961.40
23579,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,0 to 17,117,F,Other Race,3,...,Minor,Medical,Private Health Insurance,,,3200,N,N,10948.00,2214.06
23580,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,30 to 49,117,M,White,12,...,Minor,Medical,Medicaid,,,0,N,N,46421.00,11083.24
23581,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,70 or Older,117,M,White,5,...,Major,Medical,Medicare,Medicare,,0,N,Y,46122.00,7951.26


In [9]:
y

0        Not Span/Hispanic
1        Not Span/Hispanic
2        Not Span/Hispanic
3        Not Span/Hispanic
4        Not Span/Hispanic
               ...        
23578    Not Span/Hispanic
23579     Spanish/Hispanic
23580    Not Span/Hispanic
23581    Not Span/Hispanic
23582    Not Span/Hispanic
Name: Ethnicity, Length: 23583, dtype: object

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25)

In [11]:
X_test

Unnamed: 0,Health Service Area,Hospital County,Operating Certificate Number,Facility Id,Facility Name,Age Group,Zip Code - 3 digits,Gender,Race,Length of Stay,...,APR Risk of Mortality,APR Medical Surgical Description,Payment Typology 1,Payment Typology 2,Payment Typology 3,Birth Weight,Abortion Edit Indicator,Emergency Department Indicator,Total Charges,Total Costs
16125,New York City,Manhattan,7002054.0,1458.0,New York Presbyterian Hospital - New York Weil...,50 to 69,112,M,White,27,...,Extreme,Medical,Medicare,Private Health Insurance,Self-Pay,0,N,Y,275680.27,81822.63
21518,Long Island,Nassau,2950002.0,528.0,Nassau University Medical Center,50 to 69,115,M,Other Race,2,...,Minor,Medical,Medicaid,Self-Pay,,0,N,Y,13803.09,13052.82
19396,New York City,Queens,7002024.0,1639.0,Mount Sinai Hospital - Mount Sinai Hospital of...,70 or Older,113,F,White,6,...,Moderate,Surgical,Medicare,,,0,N,Y,24611.38,12221.53
3422,Southern Tier,Broome,303001.0,58.0,United Health Services Hospitals Inc. - Wilson...,70 or Older,137,F,White,4,...,Major,Medical,Medicare,Medicare,Private Health Insurance,0,N,Y,9272.60,4534.10
8313,Hudson Valley,Westchester,7002054.0,1047.0,New York Presbyterian Hospital - Westchester D...,50 to 69,100,M,Other Race,120 +,...,Minor,Medical,Medicaid,Self-Pay,,0,N,N,850199.47,358964.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20273,Long Island,Suffolk,5151001.0,245.0,University Hospital,50 to 69,117,F,White,3,...,Extreme,Medical,Blue Cross/Blue Shield,,,0,N,Y,29006.90,8882.86
546,Western NY,Erie,1401014.0,208.0,Women And Children's Hospital Of Buffalo,18 to 29,142,F,Black/African American,1,...,Minor,Medical,Medicaid,Self-Pay,,0,N,N,3056.99,1128.31
23246,Long Island,Suffolk,5154001.0,925.0,Good Samaritan Hospital Medical Center,0 to 17,117,M,Other Race,1,...,Minor,Medical,Medicaid,Medicaid,,0,N,Y,11473.00,2536.28
14513,New York City,Manhattan,7002053.0,1446.0,NYU Hospital for Joint Diseases,50 to 69,OOS,M,White,9,...,Minor,Medical,Blue Cross/Blue Shield,,,0,N,N,66703.07,20099.86


In [12]:
automl = AutoML(results_path="Ethnicity", mode="Explain")

In [13]:
automl.fit(X_train, y_train)

Linear algorithm was disabled.
AutoML directory: Ethnicity
The task is multiclass_classification with evaluation metric logloss
AutoML will use algorithms: ['Baseline', 'Decision Tree', 'Random Forest', 'Xgboost', 'Neural Network']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'ensemble']
* Step simple_algorithms will try to check up to 2 models
1_Baseline logloss 0.567213 trained in 0.67 seconds




Exception while producing SHAP explanations. [Errno 2] No such file or directory: 'Ethnicity/2_DecisionTree/learner_fold_0_shap_dependence_class_Not Span/Hispanic.png'
Continuing ...




2_DecisionTree logloss 0.469249 trained in 16.42 seconds
* Step default_algorithms will try to check up to 3 models
Exception while producing SHAP explanations. [Errno 2] No such file or directory: 'Ethnicity/3_Default_Xgboost/learner_fold_0_shap_dependence_class_Not Span/Hispanic.png'
Continuing ...
3_Default_Xgboost logloss 0.330843 trained in 37.27 seconds
4_Default_NeuralNetwork logloss 0.52575 trained in 10.42 seconds
Exception while producing SHAP explanations. [Errno 2] No such file or directory: 'Ethnicity/5_Default_RandomForest/learner_fold_0_shap_dependence_class_Not Span/Hispanic.png'
Continuing ...
5_Default_RandomForest logloss 0.42787 trained in 19.24 seconds
* Step ensemble will try to check up to 1 model
Ensemble logloss 0.330843 trained in 0.5 seconds
AutoML fit time: 94.79 seconds
AutoML best model: 3_Default_Xgboost


AutoML(results_path='Ethnicity')

In [14]:
pred = automl.predict(X_test)
pred

array(['Not Span/Hispanic', 'Spanish/Hispanic', 'Not Span/Hispanic', ...,
       'Spanish/Hispanic', 'Not Span/Hispanic', 'Not Span/Hispanic'],
      dtype=object)

In [15]:
automl.report()

Best model,name,model_type,metric_type,metric_value,train_time
,1_Baseline,Baseline,logloss,0.567213,1.41
,2_DecisionTree,Decision Tree,logloss,0.469249,17.6
the best,3_Default_Xgboost,Xgboost,logloss,0.330843,38.57
,4_Default_NeuralNetwork,Neural Network,logloss,0.52575,11.38
,5_Default_RandomForest,Random Forest,logloss,0.42787,20.48
,Ensemble,Ensemble,logloss,0.330843,0.5

Model,Weight
3_Default_Xgboost,1

Unnamed: 0,Multi-ethnic,Not Span/Hispanic,Spanish/Hispanic,Unknown,accuracy,macro avg,weighted avg,logloss
precision,0.333333,0.895276,0.66323,0.692913,0.873813,0.646188,0.856611,0.330843
recall,0.0588235,0.972577,0.374031,0.427184,0.873813,0.458154,0.873813,0.330843
f1-score,0.1,0.932327,0.478315,0.528529,0.873813,0.509793,0.857338,0.330843
support,17.0,3683.0,516.0,206.0,0.873813,4422.0,4422.0,0.330843

Unnamed: 0,Predicted as Multi-ethnic,Predicted as Not Span/Hispanic,Predicted as Spanish/Hispanic,Predicted as Unknown
Labeled as Multi-ethnic,1,15,1,0
Labeled as Not Span/Hispanic,2,3582,75,24
Labeled as Spanish/Hispanic,0,308,193,15
Labeled as Unknown,0,96,22,88

Unnamed: 0,Multi-ethnic,Not Span/Hispanic,Spanish/Hispanic,Unknown,accuracy,macro avg,weighted avg,logloss
precision,0.333333,0.895276,0.66323,0.692913,0.873813,0.646188,0.856611,0.330843
recall,0.0588235,0.972577,0.374031,0.427184,0.873813,0.458154,0.873813,0.330843
f1-score,0.1,0.932327,0.478315,0.528529,0.873813,0.509793,0.857338,0.330843
support,17.0,3683.0,516.0,206.0,0.873813,4422.0,4422.0,0.330843

Unnamed: 0,Predicted as Multi-ethnic,Predicted as Not Span/Hispanic,Predicted as Spanish/Hispanic,Predicted as Unknown
Labeled as Multi-ethnic,1,15,1,0
Labeled as Not Span/Hispanic,2,3582,75,24
Labeled as Spanish/Hispanic,0,308,193,15
Labeled as Unknown,0,96,22,88

Unnamed: 0,Multi-ethnic,Not Span/Hispanic,Spanish/Hispanic,Unknown,accuracy,macro avg,weighted avg,logloss
precision,0,0.860952,0.478022,0.567568,0.842153,0.476635,0.799291,0.52575
recall,0,0.981808,0.168605,0.101942,0.842153,0.313089,0.842153,0.52575
f1-score,0,0.917417,0.249284,0.17284,0.842153,0.334885,0.80124,0.52575
support,17,3683.0,516.0,206.0,0.842153,4422.0,4422.0,0.52575

Unnamed: 0,Predicted as Multi-ethnic,Predicted as Not Span/Hispanic,Predicted as Spanish/Hispanic,Predicted as Unknown
Labeled as Multi-ethnic,0,17,0,0
Labeled as Not Span/Hispanic,2,3616,59,6
Labeled as Spanish/Hispanic,1,418,87,10
Labeled as Unknown,0,149,36,21

Unnamed: 0,Multi-ethnic,Not Span/Hispanic,Spanish/Hispanic,Unknown,accuracy,macro avg,weighted avg,logloss
precision,0,0.832881,0,0,0.832881,0.20822,0.693691,0.567213
recall,0,1.0,0,0,0.832881,0.25,0.832881,0.567213
f1-score,0,0.908822,0,0,0.832881,0.227205,0.75694,0.567213
support,17,3683.0,516,206,0.832881,4422.0,4422.0,0.567213

Unnamed: 0,Predicted as Multi-ethnic,Predicted as Not Span/Hispanic,Predicted as Spanish/Hispanic,Predicted as Unknown
Labeled as Multi-ethnic,0,17,0,0
Labeled as Not Span/Hispanic,0,3683,0,0
Labeled as Spanish/Hispanic,0,516,0,0
Labeled as Unknown,0,206,0,0

Unnamed: 0,Multi-ethnic,Not Span/Hispanic,Spanish/Hispanic,Unknown,accuracy,macro avg,weighted avg,logloss
precision,0,0.832881,0,0,0.832881,0.20822,0.693691,0.469249
recall,0,1.0,0,0,0.832881,0.25,0.832881,0.469249
f1-score,0,0.908822,0,0,0.832881,0.227205,0.75694,0.469249
support,17,3683.0,516,206,0.832881,4422.0,4422.0,0.469249

Unnamed: 0,Predicted as Multi-ethnic,Predicted as Not Span/Hispanic,Predicted as Spanish/Hispanic,Predicted as Unknown
Labeled as Multi-ethnic,0,17,0,0
Labeled as Not Span/Hispanic,0,3683,0,0
Labeled as Spanish/Hispanic,0,516,0,0
Labeled as Unknown,0,206,0,0

Unnamed: 0,Multi-ethnic,Not Span/Hispanic,Spanish/Hispanic,Unknown,accuracy,macro avg,weighted avg,logloss
precision,0,0.855908,0.628571,0,0.848711,0.37112,0.786217,0.42787
recall,0,0.995113,0.170543,0,0.848711,0.291414,0.848711,0.42787
f1-score,0,0.920276,0.268293,0,0.848711,0.297142,0.797787,0.42787
support,17,3683.0,516.0,206,0.848711,4422.0,4422.0,0.42787

Unnamed: 0,Predicted as Multi-ethnic,Predicted as Not Span/Hispanic,Predicted as Spanish/Hispanic,Predicted as Unknown
Labeled as Multi-ethnic,0,17,0,0
Labeled as Not Span/Hispanic,0,3665,18,0
Labeled as Spanish/Hispanic,0,428,88,0
Labeled as Unknown,0,172,34,0


### **Test new (not really) data**

In [16]:
# load in the data model 

automl_sparcs_los = AutoML(results_path="Ethnicity")

In [17]:
# create a new dataset that follows the same data structure as the training set
X_withlos = sparcs.sample(25)
X_withoutlos = X_withlos.drop(columns=['Ethnicity'])

In [18]:
X_withlos

Unnamed: 0,Health Service Area,Hospital County,Operating Certificate Number,Facility Id,Facility Name,Age Group,Zip Code - 3 digits,Gender,Race,Ethnicity,...,APR Risk of Mortality,APR Medical Surgical Description,Payment Typology 1,Payment Typology 2,Payment Typology 3,Birth Weight,Abortion Edit Indicator,Emergency Department Indicator,Total Charges,Total Costs
4337,Central NY,Onondaga,3301003.0,630.0,St Josephs Hospital Health Center,50 to 69,132,F,White,Not Span/Hispanic,...,Minor,Surgical,Medicare,,,0,N,Y,35052.1,12784.5
14093,New York City,Manhattan,7002002.0,1439.0,Mount Sinai Beth Israel,70 or Older,100,F,White,Not Span/Hispanic,...,Moderate,Medical,Blue Cross/Blue Shield,Miscellaneous/Other,Self-Pay,0,N,Y,37336.75,9747.1
3751,Central NY,Oneida,3201002.0,589.0,"Rome Memorial Hospital, Inc",0 to 17,134,M,White,Not Span/Hispanic,...,Moderate,Medical,Blue Cross/Blue Shield,Medicaid,,0,N,Y,5585.5,1428.17
22154,Long Island,Nassau,2952005.0,552.0,Plainview Hospital,70 or Older,118,F,White,Not Span/Hispanic,...,Major,Medical,Medicare,Private Health Insurance,,0,N,Y,35699.0,13374.17
18694,New York City,Queens,7003007.0,1633.0,Queens Hospital Center,30 to 49,115,F,Black/African American,Not Span/Hispanic,...,Minor,Medical,Self-Pay,,,0,N,Y,31702.27,15635.15
14906,New York City,Manhattan,7002017.0,1450.0,Lenox Hill Hospital,50 to 69,105,M,Other Race,Not Span/Hispanic,...,Minor,Surgical,Medicare,Medicare,,0,N,N,47946.53,12363.64
10221,New York City,Bronx,7000008.0,1172.0,Lincoln Medical & Mental Health Center,30 to 49,104,F,Black/African American,Not Span/Hispanic,...,Minor,Medical,Medicaid,Medicaid,,0,N,Y,20698.17,10779.67
10122,New York City,Bronx,7000008.0,1172.0,Lincoln Medical & Mental Health Center,30 to 49,104,F,White,Not Span/Hispanic,...,Minor,Medical,Medicaid,,,0,N,Y,7265.73,3784.01
1427,Western NY,Niagara,3102000.0,574.0,Niagara Falls Memorial Medical Center,50 to 69,143,F,Black/African American,Not Span/Hispanic,...,Moderate,Medical,Medicaid,,,0,N,Y,13031.94,4936.25
6648,Capital/Adirond,Warren,5601000.0,1005.0,Glens Falls Hospital,0 to 17,128,F,White,Not Span/Hispanic,...,Minor,Surgical,Medicaid,,,0,N,Y,16036.34,5475.15


In [19]:
X_withoutlos

Unnamed: 0,Health Service Area,Hospital County,Operating Certificate Number,Facility Id,Facility Name,Age Group,Zip Code - 3 digits,Gender,Race,Length of Stay,...,APR Risk of Mortality,APR Medical Surgical Description,Payment Typology 1,Payment Typology 2,Payment Typology 3,Birth Weight,Abortion Edit Indicator,Emergency Department Indicator,Total Charges,Total Costs
4337,Central NY,Onondaga,3301003.0,630.0,St Josephs Hospital Health Center,50 to 69,132,F,White,4,...,Minor,Surgical,Medicare,,,0,N,Y,35052.1,12784.5
14093,New York City,Manhattan,7002002.0,1439.0,Mount Sinai Beth Israel,70 or Older,100,F,White,4,...,Moderate,Medical,Blue Cross/Blue Shield,Miscellaneous/Other,Self-Pay,0,N,Y,37336.75,9747.1
3751,Central NY,Oneida,3201002.0,589.0,"Rome Memorial Hospital, Inc",0 to 17,134,M,White,1,...,Moderate,Medical,Blue Cross/Blue Shield,Medicaid,,0,N,Y,5585.5,1428.17
22154,Long Island,Nassau,2952005.0,552.0,Plainview Hospital,70 or Older,118,F,White,4,...,Major,Medical,Medicare,Private Health Insurance,,0,N,Y,35699.0,13374.17
18694,New York City,Queens,7003007.0,1633.0,Queens Hospital Center,30 to 49,115,F,Black/African American,4,...,Minor,Medical,Self-Pay,,,0,N,Y,31702.27,15635.15
14906,New York City,Manhattan,7002017.0,1450.0,Lenox Hill Hospital,50 to 69,105,M,Other Race,2,...,Minor,Surgical,Medicare,Medicare,,0,N,N,47946.53,12363.64
10221,New York City,Bronx,7000008.0,1172.0,Lincoln Medical & Mental Health Center,30 to 49,104,F,Black/African American,3,...,Minor,Medical,Medicaid,Medicaid,,0,N,Y,20698.17,10779.67
10122,New York City,Bronx,7000008.0,1172.0,Lincoln Medical & Mental Health Center,30 to 49,104,F,White,1,...,Minor,Medical,Medicaid,,,0,N,Y,7265.73,3784.01
1427,Western NY,Niagara,3102000.0,574.0,Niagara Falls Memorial Medical Center,50 to 69,143,F,Black/African American,5,...,Moderate,Medical,Medicaid,,,0,N,Y,13031.94,4936.25
6648,Capital/Adirond,Warren,5601000.0,1005.0,Glens Falls Hospital,0 to 17,128,F,White,1,...,Minor,Surgical,Medicaid,,,0,N,Y,16036.34,5475.15


In [20]:
predict = automl.predict(X_withoutlos)
predict

array(['Not Span/Hispanic', 'Not Span/Hispanic', 'Not Span/Hispanic',
       'Not Span/Hispanic', 'Not Span/Hispanic', 'Not Span/Hispanic',
       'Not Span/Hispanic', 'Not Span/Hispanic', 'Not Span/Hispanic',
       'Not Span/Hispanic', 'Not Span/Hispanic', 'Not Span/Hispanic',
       'Not Span/Hispanic', 'Not Span/Hispanic', 'Not Span/Hispanic',
       'Not Span/Hispanic', 'Not Span/Hispanic', 'Not Span/Hispanic',
       'Not Span/Hispanic', 'Not Span/Hispanic', 'Not Span/Hispanic',
       'Not Span/Hispanic', 'Not Span/Hispanic', 'Not Span/Hispanic',
       'Not Span/Hispanic'], dtype=object)

In [21]:
# actual values from X_withlos
values_actual = X_withlos['Ethnicity'].values.tolist()
values_predicted = predict.tolist()
output = pd.DataFrame({'actual': values_actual, 'predicted': values_predicted})
output

Unnamed: 0,actual,predicted
0,Not Span/Hispanic,Not Span/Hispanic
1,Not Span/Hispanic,Not Span/Hispanic
2,Not Span/Hispanic,Not Span/Hispanic
3,Not Span/Hispanic,Not Span/Hispanic
4,Not Span/Hispanic,Not Span/Hispanic
5,Not Span/Hispanic,Not Span/Hispanic
6,Not Span/Hispanic,Not Span/Hispanic
7,Not Span/Hispanic,Not Span/Hispanic
8,Not Span/Hispanic,Not Span/Hispanic
9,Not Span/Hispanic,Not Span/Hispanic


## Binary Classifier Example 2 - GENERIC

In [None]:
import pandas as pd
from supervised.automl import AutoML
import os

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

df = pd.read_csv("https://raw.githubusercontent.com/hantswilliams/HHA-507-2022/main/autoML/datasets/data_binary_bank.csv")

X = df[df.columns[:-1]]
y = df["y"]

In [None]:
X

In [None]:
y

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25)


In [None]:
automl = AutoML(
    # results_path="AutoML_22",
    # total_time_limit=30 * 60,
    # start_random_models=10,
    # hill_climbing_steps=3,
    # top_models_to_improve=3,
    # train_ensemble=True,
    mode="Explain"
)

automl.fit(X_train, y_train)

In [None]:
pred = automl.predict(X_test)
pred
# print("Test accuracy", accuracy_score(y_test, pred["label"]))

In [None]:
automl.report()

## Regression - Example - GENERIC

In [22]:
import numpy as np
import pandas as pd
from supervised.automl import AutoML

df = pd.read_csv('https://raw.githubusercontent.com/hantswilliams/HHA-507-2022/main/autoML/datasets/data_sparcs.csv')
x_cols = [c for c in df.columns if c != "Total Costs"]
X = df[x_cols]
y = df["Total Costs"]

In [23]:
df

Unnamed: 0,Health Service Area,Hospital County,Operating Certificate Number,Facility Id,Facility Name,Age Group,Zip Code - 3 digits,Gender,Race,Ethnicity,...,APR Risk of Mortality,APR Medical Surgical Description,Payment Typology 1,Payment Typology 2,Payment Typology 3,Birth Weight,Abortion Edit Indicator,Emergency Department Indicator,Total Charges,Total Costs
0,Western NY,Allegany,226700.0,37.0,Cuba Memorial Hospital Inc,30 to 49,147,M,White,Not Span/Hispanic,...,Minor,Medical,Private Health Insurance,,,0,N,Y,4757.01,4747.83
1,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,18 to 29,148,F,White,Not Span/Hispanic,...,Minor,Medical,Blue Cross/Blue Shield,Self-Pay,Self-Pay,0,N,N,5090.25,2985.64
2,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,0 to 17,147,M,White,Not Span/Hispanic,...,Minor,Medical,Self-Pay,Self-Pay,Self-Pay,2900,N,N,4948.50,2129.67
3,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,70 or Older,148,F,White,Not Span/Hispanic,...,Moderate,Medical,Medicare,Medicare,Self-Pay,0,N,Y,4719.75,8454.41
4,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,50 to 69,148,M,White,Not Span/Hispanic,...,Major,Medical,Blue Cross/Blue Shield,Medicare,Self-Pay,0,N,Y,50384.75,34565.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23578,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,70 or Older,117,F,White,Not Span/Hispanic,...,Moderate,Medical,Medicare,Private Health Insurance,,0,N,Y,50833.00,8961.40
23579,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,0 to 17,117,F,Other Race,Spanish/Hispanic,...,Minor,Medical,Private Health Insurance,,,3200,N,N,10948.00,2214.06
23580,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,30 to 49,117,M,White,Not Span/Hispanic,...,Minor,Medical,Medicaid,,,0,N,N,46421.00,11083.24
23581,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,70 or Older,117,M,White,Not Span/Hispanic,...,Major,Medical,Medicare,Medicare,,0,N,Y,46122.00,7951.26


In [24]:
x_cols

['Health Service Area',
 'Hospital County',
 'Operating Certificate Number',
 'Facility Id',
 'Facility Name',
 'Age Group',
 'Zip Code - 3 digits',
 'Gender',
 'Race',
 'Ethnicity',
 'Length of Stay',
 'Type of Admission',
 'Patient Disposition',
 'Discharge Year',
 'CCS Diagnosis Code',
 'CCS Diagnosis Description',
 'CCS Procedure Code',
 'CCS Procedure Description',
 'APR DRG Code',
 'APR DRG Description',
 'APR MDC Code',
 'APR MDC Description',
 'APR Severity of Illness Code',
 'APR Severity of Illness Description',
 'APR Risk of Mortality',
 'APR Medical Surgical Description',
 'Payment Typology 1',
 'Payment Typology 2',
 'Payment Typology 3',
 'Birth Weight',
 'Abortion Edit Indicator',
 'Emergency Department Indicator',
 'Total Charges']

In [25]:
X

Unnamed: 0,Health Service Area,Hospital County,Operating Certificate Number,Facility Id,Facility Name,Age Group,Zip Code - 3 digits,Gender,Race,Ethnicity,...,APR Severity of Illness Description,APR Risk of Mortality,APR Medical Surgical Description,Payment Typology 1,Payment Typology 2,Payment Typology 3,Birth Weight,Abortion Edit Indicator,Emergency Department Indicator,Total Charges
0,Western NY,Allegany,226700.0,37.0,Cuba Memorial Hospital Inc,30 to 49,147,M,White,Not Span/Hispanic,...,Minor,Minor,Medical,Private Health Insurance,,,0,N,Y,4757.01
1,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,18 to 29,148,F,White,Not Span/Hispanic,...,Minor,Minor,Medical,Blue Cross/Blue Shield,Self-Pay,Self-Pay,0,N,N,5090.25
2,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,0 to 17,147,M,White,Not Span/Hispanic,...,Minor,Minor,Medical,Self-Pay,Self-Pay,Self-Pay,2900,N,N,4948.50
3,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,70 or Older,148,F,White,Not Span/Hispanic,...,Moderate,Moderate,Medical,Medicare,Medicare,Self-Pay,0,N,Y,4719.75
4,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,50 to 69,148,M,White,Not Span/Hispanic,...,Extreme,Major,Medical,Blue Cross/Blue Shield,Medicare,Self-Pay,0,N,Y,50384.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23578,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,70 or Older,117,F,White,Not Span/Hispanic,...,Moderate,Moderate,Medical,Medicare,Private Health Insurance,,0,N,Y,50833.00
23579,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,0 to 17,117,F,Other Race,Spanish/Hispanic,...,Minor,Minor,Medical,Private Health Insurance,,,3200,N,N,10948.00
23580,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,30 to 49,117,M,White,Not Span/Hispanic,...,Moderate,Minor,Medical,Medicaid,,,0,N,N,46421.00
23581,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,70 or Older,117,M,White,Not Span/Hispanic,...,Major,Major,Medical,Medicare,Medicare,,0,N,Y,46122.00


In [26]:
y

0         4747.83
1         2985.64
2         2129.67
3         8454.41
4        34565.03
           ...   
23578     8961.40
23579     2214.06
23580    11083.24
23581     7951.26
23582     6212.95
Name: Total Costs, Length: 23583, dtype: float64

In [27]:
automl = AutoML()
automl.fit(X, y)

Linear algorithm was disabled.
AutoML directory: AutoML_1
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['Baseline', 'Decision Tree', 'Random Forest', 'Xgboost', 'Neural Network']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'ensemble']
* Step simple_algorithms will try to check up to 2 models
1_Baseline rmse 23541.268908 trained in 1.05 seconds
2_DecisionTree rmse 10272.122412 trained in 5.35 seconds
* Step default_algorithms will try to check up to 3 models
3_Default_Xgboost rmse 7090.031708 trained in 147.12 seconds
4_Default_NeuralNetwork rmse 9920.620876 trained in 7.12 seconds
5_Default_RandomForest rmse 10972.440085 trained in 18.01 seconds
* Step ensemble will try to check up to 1 model
Ensemble rmse 6451.441549 trained in 0.29 seconds
AutoML fit time: 187.65 seconds
AutoML best model: Ensemble


AutoML()

In [28]:
df["predictions"] = automl.predict(X)


In [29]:
print("Predictions")
print(df[["Total Costs", "predictions"]].head())

Predictions
   Total Costs   predictions
0      4747.83   5141.080363
1      2985.64   3823.516642
2      2129.67   3170.726115
3      8454.41   8015.218058
4     34565.03  30751.272529


## Multiclass Classifier - GENERIC

In [None]:
import pandas as pd
import numpy as np
from supervised.automl import AutoML
import supervised


import warnings

from sklearn import datasets
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA

from supervised import AutoML
from supervised.exceptions import AutoMLException

# warnings.filterwarnings('error')
warnings.filterwarnings(
    "error", category=pd.core.common.SettingWithCopyWarning
)  # message="*ndarray*")

df = pd.read_csv("https://raw.githubusercontent.com/hantswilliams/HHA-507-2022/main/autoML/datasets/data_classes_iris.csv")
X = df[["feature_1", "feature_2", "feature_3", "feature_4"]]
y = df["class"]



In [None]:
df

In [None]:
X

In [None]:
y.value_counts()

In [None]:
automl = AutoML()


In [None]:
automl.fit(X, y)

In [None]:
predictions = automl.predict_all(X)


In [None]:
print(predictions.head())
print(predictions.tail())

print(X.shape)
print(predictions.shape)

# Download outputs

## Regression Download

In [30]:
# get current working directory
import os
os.getcwd()

'/content'

In [31]:
folders = os.listdir()
foldersML = [x for x in folders if x.startswith('AutoML_1')]
print(foldersML)

['AutoML_1']


In [32]:
!zip -r /content/AutoML_1.zip /content/AutoML_1/

  adding: content/AutoML_1/ (stored 0%)
  adding: content/AutoML_1/leaderboard.csv (deflated 38%)
  adding: content/AutoML_1/split_train_indices.npy (deflated 66%)
  adding: content/AutoML_1/Ensemble/ (stored 0%)
  adding: content/AutoML_1/Ensemble/predictions_ensemble.csv (deflated 55%)
  adding: content/AutoML_1/Ensemble/ensemble.json (deflated 49%)
  adding: content/AutoML_1/Ensemble/learning_curves.png (deflated 12%)
  adding: content/AutoML_1/Ensemble/predicted_vs_residuals.png (deflated 12%)
  adding: content/AutoML_1/Ensemble/README.md (deflated 51%)
  adding: content/AutoML_1/Ensemble/status.txt (stored 0%)
  adding: content/AutoML_1/Ensemble/true_vs_predicted.png (deflated 13%)
  adding: content/AutoML_1/3_Default_Xgboost/ (stored 0%)
  adding: content/AutoML_1/3_Default_Xgboost/shap_importance.png (deflated 15%)
  adding: content/AutoML_1/3_Default_Xgboost/learner_fold_0.xgboost (deflated 60%)
  adding: content/AutoML_1/3_Default_Xgboost/learning_curves.png (deflated 9%)
  ad

## Binary Download

In [33]:
# get current working directory
import os
os.getcwd()

'/content'

In [34]:
folders = os.listdir()
foldersML = [x for x in folders if x.startswith('Ethnicity')]
print(foldersML)

['Ethnicity']


In [35]:
!zip -r /content/Ethnicity.zip /content/Ethnicity

  adding: content/Ethnicity/ (stored 0%)
  adding: content/Ethnicity/leaderboard.csv (deflated 44%)
  adding: content/Ethnicity/split_train_indices.npy (deflated 67%)
  adding: content/Ethnicity/Ensemble/ (stored 0%)
  adding: content/Ethnicity/Ensemble/predictions_ensemble.csv (deflated 60%)
  adding: content/Ethnicity/Ensemble/ensemble.json (deflated 44%)
  adding: content/Ethnicity/Ensemble/roc_curve.png (deflated 8%)
  adding: content/Ethnicity/Ensemble/learning_curves.png (deflated 9%)
  adding: content/Ethnicity/Ensemble/confusion_matrix_normalized.png (deflated 12%)
  adding: content/Ethnicity/Ensemble/confusion_matrix.png (deflated 13%)
  adding: content/Ethnicity/Ensemble/README.md (deflated 72%)
  adding: content/Ethnicity/Ensemble/status.txt (stored 0%)
  adding: content/Ethnicity/Ensemble/precision_recall_curve.png (deflated 6%)
  adding: content/Ethnicity/3_Default_Xgboost/ (stored 0%)
  adding: content/Ethnicity/3_Default_Xgboost/shap_importance.png (deflated 16%)
  addin