# DataWig Example Demo
- Data source: https://archive.ics.uci.edu/ml/datasets/heart+disease
___

## (1) Initial Setup

In [31]:
# Import dependencies
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, classification_report, matthews_corrcoef, mean_squared_error

from datawig import SimpleImputer, Imputer
from datawig.utils import random_split
from datawig.column_encoders import *
from datawig.mxnet_input_symbols import *

In [2]:
# Read input data
df = pd.read_csv('../data/Heart.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
1,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
2,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
3,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
4,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No


#### Data Dictionary
- Age: The person’s age in years  
- Sex: The person’s sex (1 = male, 0 = female)  
- ChestPain: chest pain type  
    - Value 0: asymptomatic
    - Value 1: atypical angina
    - Value 2: non-anginal pain  
    - Value 3: typical angina  
- RestBP: The person’s resting blood pressure (mmHg on admission)  
- Chol: The person’s cholesterol measurement in mg/dl  
- Fbs: The person’s fasting blood sugar (> 120 mg/dl, 1 = true; 0 = false)  
- RestECG: resting electrocardiographic results  
    - Value 0: showing probable or definite left ventricular hypertrophy by Estes’ criteria  
    - Value 1: normal  
    - Value 2: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
- MaxHR: The person’s maximum heart rate achieved
- ExAng: Exercise induced angina (1 = yes; 0 = no)
- OldPeak: ST depression induced by exercise relative to rest (‘ST’ relates to positions on the ECG plot)
- Slope: the slope of the peak exercise ST segment 
    - 0: downsloping; 
    - 1: flat; 
    - 2: upsloping
- Ca: The number of major vessels (0–3)
- Thal: A blood disorder called thalassemia Value 0: NULL (dropped from the dataset previously
    - Value 1: fixed defect (no blood flow in some part of the heart)
    - Value 2: normal blood flow
    - Value 3: reversible defect (a blood flow is observed but it is not normal)
- AHD: Heart disease (1 = no, 0= yes) # Target variable

In [3]:
df.dtypes

Age            int64
Sex            int64
ChestPain     object
RestBP         int64
Chol           int64
Fbs            int64
RestECG        int64
MaxHR          int64
ExAng          int64
Oldpeak      float64
Slope          int64
Ca           float64
Thal          object
AHD           object
dtype: object

In [4]:
df.shape

(303, 14)

___
## (2) Data Processing
- Train test split and data masking (to simulate missing values)

In [5]:
# Perform train-test split (Default is 80/20 split)
df_train, df_test = random_split(df, split_ratios=[0.8, 0.2])
df_train.shape

(242, 14)

In [6]:
# Randomly hide a x% of cells in test dataframe
hide_proportion = 0.25 # 25% hidden
df_test_missing = df_test.mask(np.random.rand(*df_test.shape) > (1 - hide_proportion))

In [22]:
df_test_missing.sample(5)

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
141,,1.0,typical,170.0,,,,159.0,0.0,0.2,2.0,,,
125,45.0,0.0,nontypical,130.0,234.0,0.0,2.0,175.0,0.0,0.6,,0.0,normal,No
108,,,asymptomatic,120.0,260.0,0.0,0.0,140.0,,3.6,2.0,1.0,reversable,
13,44.0,1.0,nontypical,,263.0,,0.0,173.0,,0.0,1.0,0.0,,No
140,59.0,1.0,,140.0,221.0,0.0,0.0,164.0,1.0,,1.0,0.0,,


___
## (3) Simple Imputer with Hyperparameter Optimization (HPO)
- Using SimpleImputer is the easiest way to deploy an imputation model on your dataset with DataWig. As the name suggests, the SimpleImputer is straightforward to call from a python script and uses default encoders and featurizers that usually yield good results on a variety of datasets.
- Objectives: 
    - Numerical imputation: Predict missing values in `MaxHR` column
    - Categorical imputation: Predict missing values in `ChestPain` column
- DataWig also enables hyperparameter optimization to find the best model on a particular dataset.

### (i) Numerical Imputation

In [8]:
# Define columns with useful info for to-be-imputed column
input_cols = ['Age', 'Sex', 'RestBP', 'Chol', 'Fbs', 'ExAng', 'RestECG']

# Define column to be imputed
output_col_num = 'MaxHR' 

In [9]:
# Initialize SimpleImputer model for numerical imputation
imputer_num = SimpleImputer(
            input_columns=input_cols,
            output_column=output_col_num,  # Column to be imputed
            output_path='../artifacts/imputer_model_num'  # Store model data and metrics
            )

In [10]:
# Fit and imputer model with default basic hyperparameter random grid
imputer_num.fit_hpo(train_df=df_train)

2022-08-21 22:47:26,658 [INFO]  
2022-08-21 22:47:26,675 [INFO]  Epoch[0] Batch [0-7]	Speed: 12442.05 samples/sec	cross-entropy=16.275757	MaxHR-accuracy=0.000000
2022-08-21 22:47:26,682 [INFO]  Epoch[0] Train-cross-entropy=16.985389
2022-08-21 22:47:26,683 [INFO]  Epoch[0] Train-MaxHR-accuracy=0.000000
2022-08-21 22:47:26,683 [INFO]  Epoch[0] Time cost=0.020
2022-08-21 22:47:26,688 [INFO]  Saved checkpoint to "../artifacts/imputer_model_num0\model-0000.params"
2022-08-21 22:47:26,691 [INFO]  Epoch[0] Validation-cross-entropy=10.823458
2022-08-21 22:47:26,691 [INFO]  Epoch[0] Validation-MaxHR-accuracy=0.000000
2022-08-21 22:47:26,700 [INFO]  Epoch[1] Batch [0-7]	Speed: 15998.98 samples/sec	cross-entropy=13.016702	MaxHR-accuracy=0.000000
2022-08-21 22:47:26,707 [INFO]  Epoch[1] Train-cross-entropy=13.192817
2022-08-21 22:47:26,707 [INFO]  Epoch[1] Train-MaxHR-accuracy=0.000000
2022-08-21 22:47:26,708 [INFO]  Epoch[1] Time cost=0.016
2022-08-21 22:47:26,712 [INFO]  Saved checkpoint to "..

<datawig.simple_imputer.SimpleImputer at 0x25fb6c41608>

In [11]:
# Impute missing values and return original dataframe with predictions
predictions_num = imputer_num.predict(df_test_missing)
predictions_num.head()

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD,MaxHR_imputed
85,44.0,,nonanginal,140.0,235.0,0.0,2.0,,0.0,0.0,1.0,,,No,164.152612
134,43.0,0.0,nonanginal,122.0,213.0,0.0,0.0,165.0,0.0,0.2,2.0,,normal,No,167.516661
143,64.0,1.0,nonanginal,125.0,309.0,,,,1.0,1.8,2.0,0.0,reversable,Yes,128.506749
253,51.0,,nonanginal,120.0,295.0,0.0,2.0,157.0,,,,,normal,,152.829996
205,45.0,1.0,asymptomatic,142.0,309.0,0.0,,147.0,1.0,0.0,2.0,,reversable,,144.024414


In [12]:
# Evaluate performance (compare actual and predicted) with MSE
cols_num = pd.concat([df_test[[output_col_num]], 
                      predictions_num[[f'{output_col_num}_imputed']]], axis=1)
cols_num.head()

Unnamed: 0,MaxHR,MaxHR_imputed
85,180,164.152612
134,165,167.516661
143,131,128.506749
253,157,152.829996
205,147,144.024414


In [13]:
# Calculate RMSE (test set)
rmse_datawig = np.sqrt(mean_squared_error(cols_num[output_col_num],
                        cols_num[f'{output_col_num}_imputed']))
rmse_datawig

18.703566871101927

___
### (ii) Categorical Imputation

In [14]:
# Define column to be imputed
output_col_cat = 'ChestPain' 

In [15]:
# Initialize SimpleImputer model for categorical imputation
imputer_cat = SimpleImputer(
                input_columns=input_cols,
                output_column=output_col_cat,  # Column to be imputed
                output_path='../artifacts/imputer_model_cat'  # Store model data and metrics
                )

In [16]:
# Fit and imputer model with default basic hyperparameter (random search)
imputer_cat.fit_hpo(train_df=df_train)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


<datawig.simple_imputer.SimpleImputer at 0x25fb8d08748>

In [17]:
# Impute missing values and return original dataframe with predictions
predictions_cat = imputer_cat.predict(df_test_missing)
predictions_cat.head()

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD,ChestPain_imputed,ChestPain_imputed_proba
85,44.0,,nonanginal,140.0,235.0,0.0,2.0,,0.0,0.0,1.0,,,No,nonanginal,0.322175
134,43.0,0.0,nonanginal,122.0,213.0,0.0,0.0,165.0,0.0,0.2,2.0,,normal,No,nonanginal,0.451878
143,64.0,1.0,nonanginal,125.0,309.0,,,,1.0,1.8,2.0,0.0,reversable,Yes,asymptomatic,0.753769
253,51.0,,nonanginal,120.0,295.0,0.0,2.0,157.0,,,,,normal,,asymptomatic,0.343994
205,45.0,1.0,asymptomatic,142.0,309.0,0.0,,147.0,1.0,0.0,2.0,,reversable,,asymptomatic,0.698469


In [18]:
# Evaluate performance (compare actual and predicted) with MSE
cols_cat = pd.concat([df_test[[output_col_cat]], 
                      predictions_cat[[f'{output_col_cat}_imputed']]], axis=1)
cols_cat.head()

Unnamed: 0,ChestPain,ChestPain_imputed
85,nonanginal,nonanginal
134,nonanginal,nonanginal
143,nonanginal,asymptomatic
253,nonanginal,asymptomatic
205,asymptomatic,asymptomatic


In [19]:
# Calculate F1 score (test set)
f1_datawig = f1_score(cols_cat[output_col_cat],
                     cols_cat[f'{output_col_cat}_imputed'],
                     average='macro')
f1_datawig

0.3733552631578948

In [20]:
# Calculate MCC - classification metric (test set)
mcc_datawig = matthews_corrcoef(cols_cat[output_col_cat],
                                cols_cat[f'{output_col_cat}_imputed'])
mcc_datawig

0.2621317809974712

___
## (4) Imputer - Flexible Specifications
Imputer is the backbone of the SimpleImputer and is responsible for running the preprocessing code, creating the model, executing training, and making predictions. Using the Imputer enables more flexibility with specifying model parameters, such as using particular encoders and featurizers rather than the default ones that SimpleImputer uses.

In [23]:
input_cols

['Age', 'Sex', 'RestBP', 'Chol', 'Fbs', 'ExAng', 'RestECG']

In [46]:
data_encoder_cols = [NumericalEncoder('Age'),
                     CategoricalEncoder('Sex'),
                     NumericalEncoder('RestBP'),
                     NumericalEncoder('Chol'),
                     CategoricalEncoder('Fbs'),
                     CategoricalEncoder('ExAng'),
                     CategoricalEncoder('RestECG')]

# To-be-imputed column label
label_encoder_cols = [NumericalEncoder('MaxHR')]

data_featurizer_cols = [NumericalFeaturizer('Age'),
                        EmbeddingFeaturizer('Sex'),
                        NumericalFeaturizer('RestBP'),
                        NumericalFeaturizer('Chol'),
                        EmbeddingFeaturizer('Fbs'),
                        EmbeddingFeaturizer('ExAng'),
                        EmbeddingFeaturizer('RestECG')]

imputer = Imputer(
            data_featurizers = data_featurizer_cols,
            data_encoders = data_encoder_cols,
            label_encoders = label_encoder_cols,
            output_path = 'imputer_model'
)

In [47]:
imputer.fit(train_df=df_train)

<datawig.imputer.Imputer at 0x25fb8f06888>

In [48]:
predictions, metrics = imputer.transform_and_compute_metrics(df_test_missing)

In [50]:
metrics

{'MaxHR': 22386.304899628114}

In [49]:
predictions

{'MaxHR': array([[165.52394531],
        [163.16403826],
        [134.42359848],
        [149.1670025 ],
        [157.30364576],
        [143.33626401],
        [148.90711075],
        [153.07634522],
        [147.56038902],
        [143.89642766],
        [145.49931581],
        [156.6602911 ],
        [157.45461505],
        [150.09164103],
        [167.22070499],
        [148.73248139],
        [147.29664533],
        [141.55057221],
        [156.34972041],
        [155.61489691],
        [145.93607281],
        [139.41628029],
        [146.5495422 ],
        [148.68578137],
        [139.20250272],
        [146.87761836],
        [152.17691458],
        [147.57417206],
        [157.08767442],
        [147.79161175],
        [142.49096727],
        [163.86739371],
        [162.87696398],
        [133.03343982],
        [139.29469731],
        [151.46181197],
        [141.38064594],
        [153.86526207],
        [147.42739151],
        [159.51802533],
        [156.07503555],
       

In [58]:
prob_dict_topk = imputer.predict_proba_top_k(df_test_missing, top_k=5)
prob_dict_topk

{'MaxHR': array([[165.52394531],
        [163.16403826],
        [134.42359848],
        [149.1670025 ],
        [157.30364576],
        [143.33626401],
        [148.90711075],
        [153.07634522],
        [147.56038902],
        [143.89642766],
        [145.49931581],
        [156.6602911 ],
        [157.45461505],
        [150.09164103],
        [167.22070499],
        [148.73248139],
        [147.29664533],
        [141.55057221],
        [156.34972041],
        [155.61489691],
        [145.93607281],
        [139.41628029],
        [146.5495422 ],
        [148.68578137],
        [139.20250272],
        [146.87761836],
        [152.17691458],
        [147.57417206],
        [157.08767442],
        [147.79161175],
        [142.49096727],
        [163.86739371],
        [162.87696398],
        [133.03343982],
        [139.29469731],
        [151.46181197],
        [141.38064594],
        [153.86526207],
        [147.42739151],
        [159.51802533],
        [156.07503555],
       