## Define pipeline to train a model

In [1]:
# fairing:include-cell
from label_microservice.repo_config import RepoConfig
from label_microservice.mlp import MLPWrapper
from sklearn.neural_network import MLPClassifier
import dill as dpickle
import os
import yaml
from google.cloud import storage
import requests
import json
import numpy as np
from passlib.apps import custom_app_context as pwd_context
from collections import Counter
import logging

In [2]:
# fairing:include-cell
class RepoMLP(object):
    """Train a repo-specific multi-label MLP and publish it to GCS.

    The workflow driven by `train` is:

    1. download pickled issue embeddings and their labels from GCS,
    2. drop labels that occur fewer than `min_freq` times and multi-hot
       encode the remaining ones,
    3. compute per-label probability thresholds and fit the classifier
       through `MLPWrapper`,
    4. write the model and a YAML file (label names + thresholds) locally
       and upload both to GCS.

    All bucket names and local/remote paths come from `RepoConfig`.
    """

    def __init__(self,
                 yaml_path=None,
                 owner=None,
                 repo=None,
                 min_freq=25,
                 activation='relu',
                 alpha=0.0001,
                 early_stopping=True,
                 epsilon=1e-08,
                 hidden_layer_sizes=(100,),
                 learning_rate='constant',
                 learning_rate_init=0.001,
                 max_iter=500,
                 momentum=0.9,
                 n_iter_no_change=5,
                 random_state=1234,
                 solver='adam',
                 validation_fraction=0.1):
        """Build the (unfitted) classifier and load the repo configuration.

        Args:
          yaml_path: path of the config file handed to `RepoConfig`. When
            falsy, the `YAML_PATH` environment variable is used if it is
            set, otherwise 'issue_label_bot.yaml'.
          owner: repo owner, passed through to `RepoConfig`.
          repo: repo name, passed through to `RepoConfig`.
          min_freq: minimum number of occurrences a label needs in the
            training data to be kept as a prediction target.

        Every remaining keyword argument is forwarded unchanged to
        `sklearn.neural_network.MLPClassifier`.
        """
        if not yaml_path:
            if 'YAML_PATH' in os.environ:
                logging.info('yaml_path not supplied; check environment variable')
                yaml_path = os.getenv('YAML_PATH')
            else:
                logging.info('yaml_path not supplied; using the default')
                yaml_path = 'issue_label_bot.yaml'
        self.yaml_path = yaml_path
        self.min_freq = min_freq  # for filtering labels
        self.mlp_wrapper = None
        self.clf = MLPClassifier(activation=activation,
                                 alpha=alpha,
                                 early_stopping=early_stopping,
                                 epsilon=epsilon,
                                 hidden_layer_sizes=hidden_layer_sizes,
                                 learning_rate=learning_rate,
                                 learning_rate_init=learning_rate_init,
                                 max_iter=max_iter,
                                 momentum=momentum,
                                 n_iter_no_change=n_iter_no_change,
                                 random_state=random_state,
                                 solver=solver,
                                 validation_fraction=validation_fraction)
        # Both are filled in while training (see `load_training_data` and
        # `train`).
        self.all_labels = None
        self.probability_thresholds = None
        self.load_yaml(owner, repo)

    def load_yaml(self, owner, repo):
        """Copy repo identity, bucket names and file paths from `RepoConfig`."""
        config = RepoConfig(self.yaml_path, owner, repo)
        self.repo_owner = config.repo_owner
        self.repo_name = config.repo_name

        self.model_bucket_name = config.model_bucket_name
        self.model_file = config.model_local_path
        self.model_dest = config.model_gcs_path

        self.labels_file = config.labels_local_path
        self.labels_dest = config.labels_gcs_path

        self.embeddings_bucket_name = config.embeddings_bucket_name
        self.embeddings_file = config.embeddings_local_path
        self.embeddings_dest = config.embeddings_gcs_path

    def download_embeddings_from_gcs(self):
        """Download the embeddings blob from GCS to `self.embeddings_file`.

        Raises:
          FileNotFoundError: if the blob does not exist in the bucket.
        """
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(self.embeddings_bucket_name)
        blob = bucket.get_blob(self.embeddings_dest)
        # `get_blob` returns None for a missing object. Check before opening
        # the local file so a previous download is not truncated and the
        # failure is reported clearly.
        if blob is None:
            raise FileNotFoundError(
                'embeddings not found at gs://{}/{}'.format(
                    self.embeddings_bucket_name, self.embeddings_dest))
        with open(self.embeddings_file, 'wb') as f:
            blob.download_to_file(f)

    def load_training_data(self):
        """Download the embeddings and return them as `(X, y)`.

        Returns:
          A tuple `(X, y)` of equal-length lists: `X` holds the feature
          entries and `y` the matching multi-hot label vectors. See
          `_encode_training_data` for the filtering rules.
        """
        self.download_embeddings_from_gcs()
        # NOTE(security): unpickling executes arbitrary code, so the
        # embeddings bucket must only contain trusted data.
        with open(self.embeddings_file, 'rb') as f:
            data = dpickle.load(f)
        return self._encode_training_data(data)

    def _encode_training_data(self, data):
        """Filter rare labels and multi-hot encode the remaining ones.

        Sets `self.all_labels` to the labels that occur at least
        `self.min_freq` times, in order of first appearance.

        Args:
          data: mapping with parallel sequences under 'features' and
            'labels'; each 'labels' entry is an iterable of label names.

        Returns:
          `(X, y)` where `y[i]` is a 1-D numpy array with a 1 in the column
          of every kept label of example `i`. Examples without any kept
          label are dropped from both lists.
        """
        label_counts = Counter()
        for lbls in data['labels']:
            label_counts.update(lbls)
        self.all_labels = [label for label, count in label_counts.items()
                           if count >= self.min_freq]
        # Label -> column lookup built once, instead of a linear
        # `list.index` scan for every label of every example.
        column_of = {label: col for col, label in enumerate(self.all_labels)}

        X = []
        y = []
        for emb, lbls in zip(data['features'], data['labels']):
            mask = [column_of[label] for label in lbls if label in column_of]
            if not mask:
                continue
            target = np.zeros(len(self.all_labels))
            target[mask] = 1
            y.append(target)
            X.append(emb)
        return X, y

    def train(self):
        """Load the data, compute thresholds, fit the model and publish it."""
        X, y = self.load_training_data()
        self.mlp_wrapper = MLPWrapper(clf=self.clf)
        # get probability thresholds before `fit` because it overwrites classifier
        self.mlp_wrapper.find_probability_thresholds(X, y)
        self.probability_thresholds = self.mlp_wrapper.probability_thresholds
        # train model using the whole data
        self.mlp_wrapper.fit(X, y)
        self.save_model()

    def save_model(self):
        """Write the model and the labels YAML locally, then upload both."""
        self.mlp_wrapper.save_model(model_file=self.model_file)
        # dump label columns for prediction
        label_dict = {
            'labels': self.all_labels,
            'probability_thresholds': self._serializable_thresholds(),
        }
        with open(self.labels_file, 'w') as f:
            yaml.dump(label_dict, f)

        self.upload_model_to_gcs()

    def _serializable_thresholds(self):
        """Return `self.probability_thresholds` with plain-Python values.

        Each value is converted with `float()` so it can be dumped to YAML;
        `None` entries stay `None`. The check is `is not None` rather than
        truthiness so that a legitimate threshold of 0.0 is kept instead of
        being silently turned into `None`.
        """
        thresholds = {}
        for key, value in self.probability_thresholds.items():
            thresholds[key] = float(value) if value is not None else None
        return thresholds

    def upload_model_to_gcs(self):
        """Upload the local model file and labels file to the model bucket."""
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(self.model_bucket_name)
        # One client and bucket handle serve both uploads: model first,
        # then the label columns.
        uploads = (
            (self.model_dest, self.model_file),
            (self.labels_dest, self.labels_file),
        )
        for gcs_path, local_path in uploads:
            bucket.blob(gcs_path).upload_from_filename(local_path)

## Run locally to test the code

In [3]:
# Build a trainer for the kubeflow/examples repo. No yaml_path is passed, so
# the constructor falls back to the YAML_PATH environment variable or, if that
# is unset, to 'issue_label_bot.yaml'.
r = RepoMLP(owner='kubeflow', repo='examples')

In [4]:
# Run the whole pipeline: download the embeddings from GCS, fit the
# classifier, then write the model and label files locally and upload them to
# GCS.
r.train()

## Create entry point using fairing
Kubeflow [Fairing](https://www.kubeflow.org/docs/fairing/) is a Python package that makes training and deploying machine learning models on Kubeflow easier.

Here, we use a Kubeflow Fairing preprocessor to convert this notebook into a Python script and to create an entry point for that script. After preprocessing the notebook, training can be started from the command line as follows:
```
$ python repo_mlp.py train
```

In [5]:
from fairing.preprocessors.converted_notebook import ConvertNotebookPreprocessorWithFire

In [6]:
# Convert this notebook into a Python script with a command-line entry point;
# 'RepoMLP' is the name handed to the preprocessor for that entry point.
preprocessor = ConvertNotebookPreprocessorWithFire('RepoMLP')

# Local modules to ship alongside the converted script. `input_files` is
# assigned outright, so no prior initialisation of the attribute is needed.
input_files = ['mlp.py', 'repo_config.py']
preprocessor.input_files = {os.path.normpath(f) for f in input_files}
preprocessor.preprocess()

[PosixPath('repo_mlp.py'), 'repo_config.py', 'mlp.py']