In [1]:
def load_data():
    """
    Fetch the handwritten digits dataset bundled with scikit-learn.

    Returns:
        A Pandas DataFrame made of the feature columns returned by
        ``datasets.load_digits`` plus a ``label`` column holding the target.
    """
    from sklearn import datasets

    features, target = datasets.load_digits(return_X_y=True, as_frame=True)
    # Keep features and target together so later steps can filter rows by label.
    return features.assign(label=target)


def preprocess_data(data, omit_digits=(), train_size=0.8):
    """
    Filters out the requested digits and optionally splits the remaining rows
    into training and testing sets.

    Args:
        data: the input DataFrame to be preprocessed; must have a 'label' column.
        omit_digits: an iterable of digits to be omitted from the dataset.
                     Duplicates are ignored; None or an empty iterable keeps every digit.
        train_size: forwarded to scikit-learn's train_test_split as `train_size`.
                    The special value 0 skips the split entirely.

    Returns:
        A tuple (train, test) of DataFrames with the omitted digits removed.
        When train_size is 0, the tuple is (None, filtered_data).
    """
    # A set removes duplicates; None is treated the same as "omit nothing".
    digits_to_drop = set() if omit_digits is None else set(omit_digits)

    # Filter once and reuse the result for both return paths.
    kept = data[~data['label'].isin(digits_to_drop)]

    if train_size == 0:
        return (None, kept)

    # Imported only when a split is actually requested.
    from sklearn import model_selection

    train, test = model_selection.train_test_split(
        kept, train_size=train_size, random_state=42
    )
    return (train, test)

def create_features(data, is_train, feature_generator=None):
    """
    Appends generated features to the provided dataset.

    For a training set a new feature generator is built and fitted: a union of
    StandardScaler and RobustScaler, followed by a union of PCA (10 components)
    and TruncatedSVD. For any other set the supplied, already fitted generator
    is applied without refitting.

    Args:
        data: a DataFrame with the input columns and a 'label' column.
        is_train: a boolean; True fits a fresh feature generator on `data`,
                  False applies the one passed in.
        feature_generator: a fitted scikit-learn Pipeline of feature transformations.
                           Required when is_train is False; replaced when is_train is True.

    Returns:
        A tuple (augmented_data, feature_generator):
        augmented_data: a DataFrame with the original input columns followed by the
                        generated features (columns named by position, as strings)
                        and the 'label' column last.
        feature_generator: the Pipeline that produced the generated features.

    Raises:
        ValueError: if is_train is False and no feature_generator is given.
    """
    import numpy as np
    import pandas as pd
    from sklearn.pipeline import Pipeline, FeatureUnion
    from sklearn.preprocessing import StandardScaler, RobustScaler
    from sklearn.decomposition import PCA, TruncatedSVD

    if not is_train and feature_generator is None:
        raise ValueError('A test set should have a feature_generator provided.')

    # Everything except the label is fed to the generator and kept in the output.
    inputs = data.drop('label', axis=1)

    if not is_train:
        generated = feature_generator.transform(inputs)
    else:
        scaling_step = FeatureUnion([
            ('standard_scaler', StandardScaler()),
            ('robust_scaler', RobustScaler()),
        ])
        decomposition_step = FeatureUnion([
            ('PCA', PCA(n_components=10, random_state=42)),
            ('SVD', TruncatedSVD(random_state=42))
        ])
        feature_generator = Pipeline([
            ('scaler', scaling_step),
            ('decomposer', decomposition_step)
        ])
        generated = feature_generator.fit_transform(inputs)

    stacked = np.concatenate([inputs.values, generated], axis=1)

    augmented_data = pd.DataFrame(stacked)
    # Positional integer column labels are turned into strings.
    augmented_data.columns = augmented_data.columns.astype(str)
    augmented_data['label'] = data['label'].values

    return (augmented_data, feature_generator)


def generate_model(data):
    """
    Fit an H2O AutoML run on the provided data.

    The last column of `data` is used as the label (converted to a factor)
    and every other column as a feature.

    Args:
        data: a Pandas DataFrame whose last column is the label.

    Returns:
        The trained H2OAutoML object.
    """

    import h2o
    from h2o.automl import H2OAutoML

    h2o.init()

    column_names = data.columns.tolist()
    feature_names, label_name = column_names[:-1], column_names[-1]

    # Need to convert to H2O-compatible DataFrame
    frame = h2o.H2OFrame(data, column_names=column_names)
    frame[label_name] = frame[label_name].asfactor()

    automl = H2OAutoML(max_models=3, balance_classes=True, seed=42)
    automl.train(feature_names, label_name, frame)

    return automl

def generate_prediction(data, model):
    """
    Generate predictions from a provided model and dataset.

    The last column of `data` is treated as the label and converted to a
    factor before the frame is handed to the model.

    Args:
        data: a DataFrame where predictions shall be generated from; its last
              column is the label.
        model: an H2O model to generate predictions with.

    Returns:
        The `.values` of the model's 'predict' column after `as_data_frame()`.
        NOTE(review): presumably a NumPy array with one column (shape (n, 1))
        when H2O returns a pandas DataFrame; confirm for the installed H2O version.
    """
    import h2o

    column_names = data.columns.tolist()
    label = column_names[-1]
    # Need to convert to H2O-compatible DataFrame
    frame = h2o.H2OFrame(data, column_names=column_names)
    frame[label] = frame[label].asfactor()
    preds = model.predict(frame)['predict'].as_data_frame().values

    return preds

def produce_reports(real, preds):
    """
    Print classification performance reports for a set of predictions.

    The Matthews Correlation Coefficient is printed first, followed by
    scikit-learn's classification report with four decimal digits.

    Args:
        real: the reference labels.
        preds: the predicted labels.
    Returns:
        None
    """
    from sklearn.metrics import classification_report, matthews_corrcoef

    mcc = matthews_corrcoef(real, preds)
    print(f'Matthews Correlation Coefficient: {mcc}')

    report = classification_report(real, preds, digits=4)
    print(report)

## Model Exploration

In [2]:
# Load the scikit-learn digits data with the target attached as a 'label' column.
data_raw = load_data()

In [3]:
# Drop digits 8 and 9, then split the remaining rows 80/20 (random_state is fixed inside).
train, test = preprocess_data(data_raw, omit_digits=[8, 9], train_size=0.8)

In [4]:
# Fit the feature pipeline on the training split only; keep it for reuse on other sets.
train_augmented, feature_generator = create_features(train, is_train=True)

In [5]:
# Apply the pipeline fitted on the training split; it is not refit on the test rows.
test_augmented, _ = create_features(test, is_train=False, feature_generator=feature_generator)

In [6]:
# Calls h2o.init() and trains H2O AutoML on the augmented training set.
model = generate_model(train_augmented)

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.1" 2018-10-16 LTS; OpenJDK Runtime Environment Zulu11.2+3 (build 11.0.1+13-LTS); OpenJDK 64-Bit Server VM Zulu11.2+3 (build 11.0.1+13-LTS, mixed mode)
  Starting server from /home/hadrian/anaconda3/envs/py36/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpcrg01srl
  JVM stdout: /tmp/tmpcrg01srl/h2o_hadrian_started_from_python.out
  JVM stderr: /tmp/tmpcrg01srl/h2o_hadrian_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,01 secs
H2O cluster timezone:,Asia/Manila
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.1.2
H2O cluster version age:,4 months and 10 days !!!
H2O cluster name:,H2O_from_python_hadrian_omgcbv
H2O cluster total nodes:,1
H2O cluster free memory:,1.881 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


Parse progress: |█████████████████████████████████████████████████████████| 100%
AutoML progress: |████████████████████████████████████████████████████████| 100%


In [7]:
# Score the held-out split with the trained AutoML model.
preds = generate_prediction(test_augmented, model)

Parse progress: |█████████████████████████████████████████████████████████| 100%
xgboost prediction progress: |████████████████████████████████████████████| 100%


In [8]:
# test_augmented keeps the row order of `test`, so test['label'] lines up with preds.
produce_reports(test['label'], preds)

Matthews Correlation Coefficient: 0.9684231781600441
              precision    recall  f1-score   support

           0     0.9767    0.9767    0.9767        43
           1     0.9737    1.0000    0.9867        37
           2     1.0000    1.0000    1.0000        23
           3     1.0000    0.9474    0.9730        38
           4     0.8913    0.9762    0.9318        42
           5     1.0000    1.0000    1.0000        30
           6     1.0000    0.9189    0.9577        37
           7     0.9744    0.9744    0.9744        39

    accuracy                         0.9723       289
   macro avg     0.9770    0.9742    0.9750       289
weighted avg     0.9739    0.9723    0.9725       289



### Finalizing for Prototype
This can be easily parallelized and executed in batches or on-the-fly.

In [9]:
# Reload the full dataset for the end-to-end run (rebinds data_raw from In [2]).
data_raw = load_data()

In [10]:
# train_size=0 skips the split: all rows come back as the second element, no digits omitted.
_, data_preprocessed = preprocess_data(data_raw, train_size=0)

In [11]:
# Relies on `feature_generator` fitted in In [4] still being defined in the kernel.
data_augmented, _ = create_features(data_preprocessed, is_train=False, feature_generator=feature_generator)

In [12]:
# Relies on `model` trained in In [6] still being defined in the kernel.
preds = generate_prediction(data_augmented, model)

Parse progress: |█████████████████████████████████████████████████████████| 100%
xgboost prediction progress: |████████████████████████████████████████████| 100%


In [13]:
# NOTE(review): this set contains digits 8 and 9, which were excluded from training
# in In [3], and it also contains the training rows. The zero scores for 8/9 are
# therefore expected, and the remaining scores are not a clean hold-out estimate.
produce_reports(data_augmented['label'], preds)

Matthews Correlation Coefficient: 0.7894216635256807


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0     0.9077    0.9944    0.9491       178
           1     0.7879    1.0000    0.8814       182
           2     0.9267    1.0000    0.9620       177
           3     0.5171    0.9891    0.6792       183
           4     0.8911    0.9945    0.9399       181
           5     0.7712    1.0000    0.8708       182
           6     0.9622    0.9834    0.9727       181
           7     0.8599    0.9944    0.9223       179
           8     0.0000    0.0000    0.0000       174
           9     0.0000    0.0000    0.0000       180

    accuracy                         0.7986      1797
   macro avg     0.6624    0.7956    0.7177      1797
weighted avg     0.6641    0.7986    0.7199      1797

