This notebook was built and tested on Google Colab. We need to prepare the runtime environment and install auto-sklearn and its dependencies.

In [None]:
!sudo apt-get install build-essential swig
!python -m pip install --upgrade pip
!pip install scikit-learn==0.24.1

In [None]:
!curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip install
!pip install auto-sklearn

The runtime needs to be restarted before running the rest of the cells.

In [None]:
# Standard library
import math
from collections import defaultdict

# Third-party
import sklearn
import autosklearn
import pandas as pd
import numpy as np
from autosklearn.classification import AutoSklearnClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Record the library versions this run used.
print('The scikit-learn version is {}.'.format(sklearn.__version__))
print('autosklearn: %s' % autosklearn.__version__)
print('pandas: %s' % pd.__version__)


The scikit-learn version is 0.24.1.
autosklearn: 0.12.6
pandas: 1.1.5


Mount Google Drive onto this runtime instance.

In [None]:
from google.colab import drive

# get_data() reads from the relative path 'drive/MyDrive/...', so the mount
# must be reachable from the working directory — presumably /content on Colab; confirm.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [1]:
def a1c_level(x):
  """
    Bucket a numerical A1C measurement into a categorical A1C level.

    Attributes:
        x (float):  A1C measurement

    Returns:
        str:  'unknown' when x is NaN, 'low' for x <= 5.9,
              'med' for 5.9 < x <= 6.2, 'high' for x > 6.2
  """
  # NaN compares False against everything, so it has to be handled first.
  if math.isnan(x):
    return 'unknown'
  if x > 6.2:
    return 'high'
  if x > 5.9:
    return 'med'
  return 'low'


def get_data(run_name, data_dir='drive/MyDrive/2021 HHS Data Challenge/Data'):
  """
    Load the patient table of one generation run and set its column dtypes.

    Attributes:
        run_name (str):   the name of the generation run (sub-folder of data_dir)
        data_dir (str):   folder holding one sub-folder per run; defaults to the
                          project's Google Drive data folder

    Returns:
        DataFrame:  the selected columns of <data_dir>/<run_name>/all_patients.csv,
                    with the categorical columns cast to 'category' and the flag
                    columns cast to pandas' nullable 'boolean' dtype

    Raises:
        KeyError:  if one of the selected columns is missing from the CSV
  """
  columns = ['RACE', 'ETHNICITY', 'GENDER', 'FIRST_LANGUAGE', 'SOCIOECONOMIC_CATEGORY', 'SOCIOECONOMIC_SCORE', 'INCOME', 'INCOME_LEVEL', 'EDUCATION', 'EDUCATION_LEVEL', 'ACTIVE_WEIGHT_MANAGEMENT', 'BMI_PERCENTILE', 'SMOKER', 'ALCOHOLIC', 'HEALTHCARE_COVERAGE', 'FOOD_INSECURITY', 'SEVERE_HOUSING_COST_BURDEN', 'UNEMPLOYED', 'NO_VEHICLE_ACCESS', 'UNINSURED', 'AGE', 'Prediabetes', 'Diabetes', 'Obesity', 'Severely_Obesity', 'A1c_max']
  category_columns = ['RACE', 'ETHNICITY', 'SOCIOECONOMIC_CATEGORY', 'GENDER', 'EDUCATION']
  boolean_columns = ['FOOD_INSECURITY', 'SEVERE_HOUSING_COST_BURDEN', 'UNEMPLOYED', 'NO_VEHICLE_ACCESS', 'UNINSURED', 'Obesity', 'Severely_Obesity']

  dtypes = {name: 'category' for name in category_columns}
  dtypes.update({name: 'boolean' for name in boolean_columns})

  data = pd.read_csv(data_dir + '/' + run_name + '/all_patients.csv')
  # One astype() call returns a new frame, so no column is assigned onto the
  # column-subset selection (which is what raises SettingWithCopyWarning).
  return data[columns].astype(dtypes)

def prepare_dataset(data, sdoh=True):
  """
    Build the label-encoded modelling table for rows flagged as Prediabetes.

    Attributes:
        data (DataFrame):  input dataframe
        sdoh (bool):  whether to include the SDOH features

    Returns:
        DataFrame:  rows of `data` where Prediabetes is True and no selected
                    column is missing; the selected columns are label-encoded
                    (A1c_max is first bucketed with a1c_level), and the
                    un-encoded INCOME_LEVEL (only when sdoh), AGE and
                    BMI_PERCENTILE columns are appended. Rows with a missing
                    value in an appended column are dropped as well.
  """
  demographic_columns = ['RACE', 'ETHNICITY', 'GENDER']
  sdoh_columns = ['EDUCATION', 'FOOD_INSECURITY', 'SEVERE_HOUSING_COST_BURDEN', 'UNEMPLOYED', 'NO_VEHICLE_ACCESS', 'UNINSURED']
  clinical_columns = ['Obesity', 'Prediabetes', 'Diabetes', 'A1c_max']
  # Column order is kept exactly as before: demographics, optional SDOH, clinical.
  selected_columns = demographic_columns + (sdoh_columns if sdoh else []) + clinical_columns

  subset_data = data[selected_columns]
  # .copy() gives an independent frame, so the A1c_max assignment below does
  # not write onto a slice of `data` (avoids pandas' SettingWithCopyWarning).
  subset_data = subset_data[subset_data['Prediabetes']==True].copy()
  # a1c_level maps NaN to 'unknown', so missing A1c values survive the dropna below.
  subset_data['A1c_max'] = subset_data['A1c_max'].apply(a1c_level).astype('category')
  subset_data = subset_data.dropna()

  # Each column gets its own encoder; only the encoded values are needed here.
  encoded_data = subset_data.apply(lambda column: LabelEncoder().fit_transform(column))

  # These assignments align on the index, so only the kept rows receive values.
  if sdoh:
    encoded_data['INCOME_LEVEL'] = data['INCOME_LEVEL']
  encoded_data['AGE'] = data['AGE']
  encoded_data['BMI_PERCENTILE'] = data['BMI_PERCENTILE']

  return encoded_data.dropna()

def train_and_evaluate(run_name, sdoh=True):
  """
    Fit an AutoSklearnClassifier on one generated dataset and print
    hold-out metrics for predicting the 'Diabetes' column.

    Attributes:
        run_name (str):   the name of the generation run
        sdoh (bool):  whether to include the SDOH features
  """
  dataset = prepare_dataset(get_data(run_name), sdoh)

  # 'Prediabetes' is dropped together with the target 'Diabetes'.
  y = dataset['Diabetes']
  X = dataset.drop(columns=['Prediabetes', 'Diabetes'])
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=0.3, stratify=y, random_state=1234)

  automl = AutoSklearnClassifier(n_jobs=-1)
  automl.fit(X_train, y_train)
  y_pred = automl.predict(X_test)

  # roc_auc_score receives hard class predictions here (not probabilities) and
  # is reported under the 'Balanced accuracy score:' label.
  # NOTE(review): for binary labels the two should coincide — confirm.
  metrics = (
      ("Accuracy:", accuracy_score),
      ("F1:", f1_score),
      ("Precision:", precision_score),
      ("Recall:", recall_score),
      ('Balanced accuracy score:', roc_auc_score),
  )
  for label, scorer in metrics:
    print(label, scorer(y_test, y_pred))

In [None]:
%%time
# Train and evaluate on the 'sample_10k_diabetes_education' generation run, SDOH features included.
train_and_evaluate('sample_10k_diabetes_education', sdoh=True)

  out = eval(code, glob, local_ns)


Accuracy: 0.8311812261396812
F1: 0.640311004784689
Precision: 0.8202574318112167
Recall: 0.5251128114577203
Balanced accuracy score: 0.739493094678919
CPU times: user 7min 29s, sys: 32 s, total: 8min 1s
Wall time: 1h 1min


In [None]:
%%time
# Train and evaluate on the 'sample_10k_diabetes_education' generation run, SDOH features excluded.
train_and_evaluate('sample_10k_diabetes_education', sdoh=False)

  out = eval(code, glob, local_ns)


Accuracy: 0.8276312135517911
F1: 0.6299409765694867
Precision: 0.8162855377008653
Recall: 0.5128628288515678
Balanced accuracy score: 0.7333072166740238
CPU times: user 12min 29s, sys: 44.4 s, total: 13min 13s
Wall time: 1h 25s


In [None]:
%%time
# Train and evaluate on the 'sample_10k_gender_age_run2' generation run, SDOH features included.
train_and_evaluate('sample_10k_gender_age_run2', sdoh=True)

  out = eval(code, glob, local_ns)


Accuracy: 0.8540561343706188
F1: 0.6162076194607099
Precision: 0.7948526669153301
Recall: 0.5031283201511038
Balanced accuracy score: 0.7318551369264082
CPU times: user 8min 13s, sys: 34.5 s, total: 8min 48s
Wall time: 1h 1min 15s


In [None]:
%%time
# Train and evaluate on the 'sample_10k_gender_age_run2' generation run, SDOH features excluded.
train_and_evaluate('sample_10k_gender_age_run2', sdoh=False)

  out = eval(code, glob, local_ns)


Accuracy: 0.854244846902703
F1: 0.6153846153846154
Precision: 0.7982129560685034
Recall: 0.5007006071929005
Balanced accuracy score: 0.7311372585769537
CPU times: user 12min 58s, sys: 51.6 s, total: 13min 50s
Wall time: 1h 40s


In [None]:
%%time
# Train and evaluate on the 'sample_10k_diabetes_age' generation run, SDOH features included.
train_and_evaluate('sample_10k_diabetes_age', sdoh=True)

  out = eval(code, glob, local_ns)


Accuracy: 0.8768417300380228
F1: 0.5800243111831442
Precision: 0.7554089709762533
Recall: 0.47073331141072017
Balanced accuracy score: 0.7185622180286143
CPU times: user 8min 20s, sys: 30.8 s, total: 8min 51s
Wall time: 1h 44s


In [None]:
%%time
# Train and evaluate on the 'sample_10k_diabetes_age' generation run, SDOH features excluded.
train_and_evaluate('sample_10k_diabetes_age', sdoh=False)

  out = eval(code, glob, local_ns)


Accuracy: 0.8782509036410121
F1: 0.5833249522276979
Precision: 0.764163372859025
Recall: 0.4716981132075472
Balanced accuracy score: 0.7197986997299168
CPU times: user 13min 30s, sys: 51.1 s, total: 14min 22s
Wall time: 1h 50s


In [None]:
%%time
# Train and evaluate on the 'sample_10k_run3' generation run, SDOH features included.
train_and_evaluate('sample_10k_run3', sdoh=True)

  out = eval(code, glob, local_ns)


Accuracy: 0.9182572614107883
F1: 0.5660230659582739
Precision: 0.8498054474708171
Recall: 0.4243248494268506
Balanced accuracy score: 0.7067748245682671
CPU times: user 9min 39s, sys: 34 s, total: 10min 13s
Wall time: 1h 43s


In [None]:
%%time
# Train and evaluate on the 'sample_10k_run3' generation run, SDOH features excluded.
train_and_evaluate('sample_10k_run3', sdoh=False)

  out = eval(code, glob, local_ns)
INFO:numexpr.utils:NumExpr defaulting to 2 threads.


Accuracy: 0.9169505036596854
F1: 0.5573010558846253
Precision: 0.8416958381952547
Recall: 0.4165543792107796
Balanced accuracy score: 0.7026559532096105
CPU times: user 8min 34s, sys: 30.2 s, total: 9min 4s
Wall time: 1h 30s
