## Define the pipeline to train a model

In [1]:
# fairing:include-cell
from config import Config
from mlp import MLP
import dill as dpickle
import os
import yaml
from google.cloud import storage
import requests
import json
import numpy as np
from passlib.apps import custom_app_context as pwd_context
from collections import Counter

In [2]:
# fairing:include-cell
class RepoMLP(object):
    """Train a multi-label MLP for one repository and publish it to GCS.

    The pipeline downloads pickled training data (``features`` and ``labels``)
    from GCS, keeps the labels that occur at least ``min_freq`` times, fits an
    ``MLP`` on the resulting multi-hot targets, and uploads both the fitted
    model and the ordered label list back to GCS. Bucket names and file paths
    are read from the ``Config`` built from the YAML file.
    """

    def __init__(self, yaml_path=None, owner=None, repo=None, min_freq=25):
        """Resolve the YAML config path and load the repository settings.

        Args:
            yaml_path: Path to the YAML config. When falsy, the ``YAML_PATH``
                environment variable is used if it is set, otherwise
                ``'issue_label_bot.yaml'``.
            owner: Repository owner, passed through to ``Config``.
            repo: Repository name, passed through to ``Config``.
            min_freq: Minimum number of occurrences a label needs in the
                training data to be kept as a prediction target.
        """
        if not yaml_path:
            env_path = os.environ.get('YAML_PATH')
            if env_path is not None:
                print('yaml_path not supplied; check environment variable')
                yaml_path = env_path
            else:
                print('yaml_path not supplied; using the default')
                yaml_path = 'issue_label_bot.yaml'
        self.yaml_path = yaml_path
        self.min_freq = min_freq  # for filtering labels
        self.clf = None  # set by train()
        self.all_labels = None  # set by load_training_data()
        self.load_yaml(owner, repo)

    def load_yaml(self, owner, repo):
        """Copy the bucket names and local/GCS paths from ``Config`` onto self."""
        config = Config(self.yaml_path, owner, repo)
        self.repo_owner = config.repo_owner
        self.repo_name = config.repo_name

        self.model_bucket_name = config.model_bucket_name
        self.model_file = config.model_local_path
        self.model_dest = config.model_gcs_path

        self.labels_file = config.labels_local_path
        self.labels_dest = config.labels_gcs_path

        self.emb_bucket_name = config.emb_bucket_name
        self.emb_file = config.emb_local_path
        self.emb_dest = config.emb_gcs_path

    def download_embeddings_from_gcs(self):
        """Download the embeddings object from GCS to ``self.emb_file``.

        Raises:
            FileNotFoundError: If the object does not exist in the bucket.
        """
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(self.emb_bucket_name)
        blob = bucket.get_blob(self.emb_dest)
        # get_blob() returns None for a missing object; fail here, before the
        # local file is opened for writing and truncated.
        if blob is None:
            raise FileNotFoundError(
                'embeddings not found at gs://{}/{}'.format(
                    self.emb_bucket_name, self.emb_dest))
        with open(self.emb_file, 'wb') as f:
            blob.download_to_file(f)

    def load_training_data(self):
        """Download the training data and return it as ``(X, y)``.

        Returns:
            A pair of lists: the feature entries of the kept examples, and one
            multi-hot numpy vector per kept example whose columns follow
            ``self.all_labels``.
        """
        self.download_embeddings_from_gcs()
        # NOTE(security): unpickling runs arbitrary code from the file; only
        # load embeddings from a bucket you trust.
        with open(self.emb_file, 'rb') as f:
            data = dpickle.load(f)
        return self._encode_examples(data['features'], data['labels'])

    def _encode_examples(self, features, labels):
        """Filter rare labels and build multi-hot targets for the examples.

        Sets ``self.all_labels`` to the labels seen at least ``self.min_freq``
        times, in first-seen order. Examples left with no kept label are
        dropped.
        """
        counts = Counter()
        for lbls in labels:
            counts.update(lbls)
        self.all_labels = [lbl for lbl, freq in counts.items()
                           if freq >= self.min_freq]

        # One dict lookup per label instead of a list.index() scan.
        column_of = {lbl: col for col, lbl in enumerate(self.all_labels)}

        X = []
        y = []
        for emb, lbls in zip(features, labels):
            columns = [column_of[lbl] for lbl in lbls if lbl in column_of]
            if not columns:
                continue
            target = np.zeros(len(self.all_labels))
            target[columns] = 1
            y.append(target)
            X.append(emb)
        return X, y

    def train(self):
        """Load the training data, fit a fresh ``MLP``, then save and upload it."""
        X, y = self.load_training_data()
        self.clf = MLP()
        self.clf.fit(X, y)
        self.save_model()

    def save_model(self):
        """Write the model and label list to local files and upload both.

        Raises:
            RuntimeError: If no classifier has been trained yet.
        """
        if self.clf is None:
            raise RuntimeError(
                'save_model() called before train(): no classifier to save')
        self.clf.save_model(model_file=self.model_file)
        # dump label columns for prediction
        with open(self.labels_file, 'wb') as f:
            dpickle.dump(self.all_labels, f)

        self.upload_model_to_gcs()

    def upload_model_to_gcs(self):
        """Upload the local model file and label file to the model bucket."""
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(self.model_bucket_name)
        # The model goes first, then the label columns; both share one bucket.
        uploads = (
            (self.model_dest, self.model_file),
            (self.labels_dest, self.labels_file),
        )
        for dest, local_path in uploads:
            bucket.blob(dest).upload_from_filename(local_path)

## Run locally to test the code

In [3]:
# Build the pipeline with default arguments. With no yaml_path given, the
# constructor uses the YAML_PATH environment variable when it is set and
# otherwise falls back to 'issue_label_bot.yaml'.
r = RepoMLP()

yaml_path not supplied; using the default


In [4]:
# Download the training data, fit the MLP, then save and upload the model
# and its label list (train() calls save_model() itself).
r.train()

In [5]:
# NOTE(review): train() already calls save_model(), so this writes and uploads
# the same model and label files a second time; confirm it is still needed.
r.save_model()

## Create an entry point using Fairing

In [6]:
from fairing.preprocessors.converted_notebook import ConvertNotebookPreprocessorWithFire

In [7]:
# Convert this notebook for Fairing, naming RepoMLP as the class to expose
# (presumably as the command-line entry point; confirm against Fairing docs).
preprocessor = ConvertNotebookPreprocessorWithFire('RepoMLP')

# Local modules to ship alongside the converted notebook. The assignment
# replaces whatever input_files held before, so no prior initialisation of
# the attribute is required.
input_files = ['mlp.py', 'config.py']
preprocessor.input_files = {os.path.normpath(f) for f in input_files}
preprocessor.preprocess()

[PosixPath('Repo_MLP.py'), 'mlp.py', 'config.py']