In [1]:
def load_data():
    """
    Fetch scikit-learn's bundled digits dataset as a single table.

    Returns:
        A Pandas DataFrame holding the feature columns supplied by
        `sklearn.datasets.load_digits` plus a trailing 'label' column with the
        target digit of each row.
    """
    from sklearn.datasets import load_digits

    pixels, digits = load_digits(return_X_y=True, as_frame=True)
    # assign() appends the target as the last column
    return pixels.assign(label=digits)


def preprocess_data(data, omit_digits=[], train_size=0.8, delete_data=0.1, random_state=42):
    """
    Filter out unwanted digits, split into train/test sets and inject missing values.

    Args:
        data: the input DataFrame to be preprocessed; it must contain a 'label' column.
        omit_digits: an iterable of digit labels whose rows are dropped. Duplicates are ignored.
        train_size: forwarded to scikit-learn's train_test_split. The special value 0
                    skips the split entirely (see Returns).
        delete_data: fraction of the non-label cells replaced with NaN in each split;
                     forwarded to create_missing_data.
        random_state: seed used for both the split and the NaN injection, so repeated
                      calls produce the same frames. Pass None for unseeded behaviour.

    Returns:
        A (train, test) tuple of DataFrames that may have some digits omitted.
        When train_size == 0 the result is (None, filtered_data): no split is made
        and no values are replaced with NaN.
    """
    import random

    # A set removes duplicate digits; isin() only needs membership
    omit_digits = set(omit_digits)
    kept_rows = data[~data['label'].isin(omit_digits)]

    if train_size == 0:
        return (None, kept_rows)

    # Imported here because the train_size == 0 path above does not need scikit-learn
    from sklearn import model_selection

    train, test = model_selection.train_test_split(
        kept_rows, train_size=train_size, random_state=random_state)

    # create_missing_data samples through the global `random` module. Seed it for the
    # duration of the injection, then hand the caller's generator state back.
    saved_state = random.getstate()
    random.seed(random_state)
    try:
        train = create_missing_data(train, delete_data, skip_cols=['label'])
        test = create_missing_data(test, delete_data, skip_cols=['label'])
    finally:
        random.setstate(saved_state)

    return (train, test)

def create_features(data, is_train, feature_generator=None):
    """
    Append generated features to a digits DataFrame.

    For a training set a new scikit-learn Pipeline (impute -> scale -> decompose) is
    fitted on the data and applied to it. For any other set the supplied, already
    fitted Pipeline is applied as is. The pipeline output is placed to the right of
    the original feature columns.

    Args:
        data: a DataFrame with a 'label' column; every other column is used as a feature.
        is_train: True to build and fit a fresh feature_generator on `data` (a
                  feature_generator passed in is then replaced); False to reuse the
                  one provided.
        feature_generator: a fitted scikit-learn Pipeline of feature transformations.
                           Required when is_train is False.

    Returns:
        augmented_data: a Pandas DataFrame with the original feature values followed by
                        the generated ones. Column names are the stringified column
                        positions and 'label' is the last column.
        feature_generator: the Pipeline that produced the generated columns. It is
                           always returned, for both training and non-training sets.

    Raises:
        ValueError: if is_train is False and no feature_generator is given.
    """
    import numpy as np
    import pandas as pd
    from sklearn.pipeline import Pipeline, FeatureUnion
    from sklearn.preprocessing import StandardScaler, RobustScaler
    from sklearn.decomposition import PCA, TruncatedSVD
    from sklearn.experimental import enable_iterative_imputer  # noqa
    from sklearn.impute import IterativeImputer, SimpleImputer, KNNImputer
    from sklearn.linear_model import LassoLars

    if feature_generator is None and not is_train:
        raise ValueError('A test set should have a feature_generator provided.')

    # Everything except the label is fed to the pipeline
    pixel_frame = data.drop('label', axis=1)

    if is_train:
        feature_generator = Pipeline([
            # Each FeatureUnion places the outputs of its members side by side
            ('impute', FeatureUnion([
                ('Mean Imputer', SimpleImputer(strategy='mean')),
                ('LassoLars Regression Imputer', IterativeImputer(
                    LassoLars(random_state=42),
                    random_state=42)),
                ('KNN Imputer', KNNImputer()),
            ])),
            ('scaler', FeatureUnion([
                ('standard_scaler', StandardScaler()),
                ('robust_scaler', RobustScaler()),
            ])),
            ('decomposer', FeatureUnion([
                ('PCA', PCA(n_components=10, random_state=42)),
                ('SVD', TruncatedSVD(random_state=42)),
            ])),
        ])
        generated = feature_generator.fit_transform(pixel_frame)
    else:
        generated = feature_generator.transform(pixel_frame)

    # Original feature values first, generated features after them
    combined = np.concatenate([pixel_frame.values, generated], axis=1)

    augmented_data = pd.DataFrame(combined)
    augmented_data.columns = augmented_data.columns.astype(str)
    augmented_data['label'] = data['label'].values

    return (augmented_data, feature_generator)


def generate_model(data):
    """
    Run H2O AutoML on the provided training data.

    The last column of `data` is used as the target and every column before it as a
    predictor. `h2o.init()` is called on every invocation.

    Args:
        data: a Pandas DataFrame whose final column holds the label.

    Returns:
        The trained H2OAutoML object.
    """

    import h2o
    from h2o.automl import H2OAutoML

    h2o.init()

    column_names = data.columns.tolist()
    predictors, target = column_names[:-1], column_names[-1]

    # H2O trains on its own frame type, so convert the Pandas DataFrame first
    frame = h2o.H2OFrame(data, column_names=data.columns.tolist())
    # asfactor() marks the target as categorical
    frame[target] = frame[target].asfactor()

    automl = H2OAutoML(max_models=3, balance_classes=True, seed=42)
    automl.train(predictors, target, frame)

    return automl

def generate_prediction(data, model):
    """
    Generate predictions from a provided model and dataset.

    Args:
        data: a Pandas DataFrame to generate predictions from. Its last column is
              treated as the label and converted to a categorical column before
              the frame is handed to the model.
        model: a trained H2O object exposing `predict` (e.g. the value returned by
               generate_model).

    Returns:
        The values of the 'predict' output column as a NumPy array. This is the
        `.values` of the DataFrame returned by `as_data_frame()`, so the expected
        shape is (n_rows, 1) rather than a flat vector.
    """
    import h2o

    label = data.columns.tolist()[-1]
    # Need to convert to H2O-compatible DataFrame
    frame = h2o.H2OFrame(data, column_names=data.columns.tolist())
    frame[label] = frame[label].asfactor()

    return model.predict(frame)['predict'].as_data_frame().values

def produce_reports(real, preds):
    """
    Print classification quality metrics for a set of predictions.

    Two things are written to stdout: the Matthews correlation coefficient and
    scikit-learn's classification report, formatted with four decimal places.

    Args:
        real: the reference (true) labels.
        preds: the predicted labels, in the same order as `real`.
    Returns:
        None
    """
    from sklearn.metrics import classification_report, matthews_corrcoef

    print(f'Matthews Correlation Coefficient: {matthews_corrcoef(real, preds)}')

    per_class_report = classification_report(real, preds, digits=4)
    print(per_class_report)
    
def create_missing_data(df, delete_data, skip_cols=[], seed=None):
    """
    Return a copy of `df` in which a random fraction of the cells is replaced by NaN.

    Args:
        df: the input DataFrame; it is not modified.
        delete_data: fraction of the eligible cells (every row of every column that is
                     not in skip_cols) to replace. The resulting cell count is rounded
                     to the nearest integer.
        skip_cols: names of columns that must stay untouched. They are placed at the
                   end of the returned frame, in the order given.
        seed: optional seed for a private random generator. When None, the global
              `random` module is used, so a `random.seed(...)` call made by the
              caller still controls the outcome.

    Returns:
        A new DataFrame with the same rows and index as `df`: the eligible columns
        (with NaNs injected) followed by the skip_cols columns.

    Raises:
        KeyError: if a name in skip_cols is not a column of `df`.
        ValueError: raised by `random.sample` when more cells are requested than
                    exist (e.g. delete_data > 1) or the requested count is negative.
    """
    import random
    import numpy as np

    skip_cols = list(skip_cols)
    protected = df[skip_cols]
    eligible = df.loc[:, ~df.columns.isin(skip_cols)].copy()

    n_rows, n_cols = eligible.shape
    n_cells = n_rows * n_cols
    n_blank = int(round(delete_data * n_cells))

    rng = random if seed is None else random.Random(seed)
    # Sample flat, row-major cell positions from a range instead of building a list
    # of every (row, col) pair.
    chosen = rng.sample(range(n_cells), n_blank)

    if chosen:
        blank = np.zeros(n_cells, dtype=bool)
        blank[chosen] = True
        # mask() builds a new frame, so nothing is written into a slice of `df`
        eligible = eligible.mask(blank.reshape(n_rows, n_cols))

    # Re-attach the protected columns by position. An index-based merge would
    # multiply rows whenever the index contains repeated labels.
    for col in skip_cols:
        eligible[col] = protected[col].values

    return eligible

## Model Exploration

In [2]:
# Load the full digits table: the feature columns plus a 'label' column.
data_raw = load_data()

In [3]:
# Drop every row labelled 2 or 8, split the rest 80/20, and replace 10% of the
# non-label cells in each split with NaN.
train, test = preprocess_data(data_raw, omit_digits=[2, 8], train_size=0.8, delete_data=0.1)

In [4]:
# is_train=True builds and fits a new feature pipeline on the training split.
train_augmented, feature_generator = create_features(train, is_train=True)

In [5]:
# Apply the pipeline fitted on the training split; nothing is re-fitted on the test split.
test_augmented, _ = create_features(test, is_train=False, feature_generator=feature_generator)

In [6]:
# generate_model calls h2o.init() and trains an H2OAutoML run capped at max_models=3.
model = generate_model(train_augmented)

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_152-release"; OpenJDK Runtime Environment (build 1.8.0_152-release-1056-b12); OpenJDK 64-Bit Server VM (build 25.152-b12, mixed mode)
  Starting server from /home/hadrian/anaconda3/envs/py36/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpl10728sg
  JVM stdout: /tmp/tmpl10728sg/h2o_hadrian_started_from_python.out
  JVM stderr: /tmp/tmpl10728sg/h2o_hadrian_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,01 secs
H2O cluster timezone:,Asia/Manila
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.1.2
H2O cluster version age:,4 months and 18 days !!!
H2O cluster name:,H2O_from_python_hadrian_e699uf
H2O cluster total nodes:,1
H2O cluster free memory:,1.672 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


Parse progress: |█████████████████████████████████████████████████████████| 100%
AutoML progress: |████████████████████████████████████████████████████████| 100%


In [7]:
# Predict on the held-out test split.
preds = generate_prediction(test_augmented, model)

Parse progress: |█████████████████████████████████████████████████████████| 100%
xgboost prediction progress: |████████████████████████████████████████████| 100%


In [8]:
# Score against the true labels of the test split; digits 2 and 8 were omitted, so they do not appear.
produce_reports(test['label'], preds)

Matthews Correlation Coefficient: 0.9411075284537671
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        45
           1     0.8684    1.0000    0.9296        33
           3     0.9500    0.9048    0.9268        42
           4     0.9722    0.9459    0.9589        37
           5     0.9722    0.9459    0.9589        37
           6     1.0000    0.8966    0.9455        29
           7     0.8750    1.0000    0.9333        28
           9     0.9459    0.8974    0.9211        39

    accuracy                         0.9483       290
   macro avg     0.9480    0.9488    0.9468       290
weighted avg     0.9514    0.9483    0.9484       290



### Finalizing for Prototype
This can be easily parallelized and executed in batches or on-the-fly.

In [9]:
# Reload the full dataset; this time no digits are omitted.
data_raw = load_data()

In [10]:
# train_size=0 returns (None, data): no split and no NaN injection. omit_digits
# defaults to empty, so all digits are kept.
_, data_preprocessed = preprocess_data(data_raw, train_size=0)

In [11]:
# Reuses the already-fitted feature_generator; is_train=False means it is only applied, not re-fitted.
data_augmented, _ = create_features(data_preprocessed, is_train=False, feature_generator=feature_generator)

In [12]:
# Predict on the complete dataset with the model trained above.
preds = generate_prediction(data_augmented, model)

Parse progress: |█████████████████████████████████████████████████████████| 100%
xgboost prediction progress: |████████████████████████████████████████████| 100%


In [13]:
# Digits 2 and 8 were excluded from training, so the report shows zero precision and recall for them.
produce_reports(data_augmented['label'], preds)

Matthews Correlation Coefficient: 0.78434739370246


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0     0.9888    0.9944    0.9916       178
           1     0.7358    0.9945    0.8458       182
           2     0.0000    0.0000    0.0000       177
           3     0.5144    0.9781    0.6742       183
           4     0.9521    0.9890    0.9702       181
           5     0.8249    0.9835    0.8972       182
           6     0.9418    0.9834    0.9622       181
           7     0.9227    1.0000    0.9598       179
           8     0.0000    0.0000    0.0000       174
           9     0.7415    0.9722    0.8413       180

    accuracy                         0.7941      1797
   macro avg     0.6622    0.7895    0.7142      1797
weighted avg     0.6653    0.7941    0.7179      1797

