## Define a pipeline that uploads issue embeddings to GCS

In [1]:
# fairing:include-cell
from config import Config
from embeddings import pass_through
from embeddings import load_model_artifact
from embeddings import get_all_issue_text
import dill as dpickle
import os
import yaml
from pprint import pprint
from google.cloud import storage

In [2]:
# fairing:include-cell
class IssuesLoader(object):
    """Build the issue embeddings of one repository and store them in GCS.

    All settings (repository, bucket, local and remote paths) are read from a
    YAML file through ``Config`` when the instance is created.
    """

    def __init__(self, yaml_path=None):
        """Resolve the YAML config path and load the settings from it.

        Lookup order: the ``yaml_path`` argument if truthy, then the
        ``YAML_PATH`` environment variable if it is set, and finally
        ``issue_label_bot.yaml`` relative to the working directory.
        """
        if yaml_path:
            self.yaml_path = yaml_path
        elif 'YAML_PATH' in os.environ:
            print('yaml_path not supplied; check environment variable')
            self.yaml_path = os.getenv('YAML_PATH')
        else:
            print('yaml_path not supplied; using the default')
            self.yaml_path = 'issue_label_bot.yaml'
        self.load_yaml()

    def load_yaml(self):
        """Copy the settings this class needs from the YAML config onto self."""
        settings = Config(self.yaml_path)

        # Repository whose issues are embedded.
        self.repo_owner = settings.repo_owner
        self.repo_name = settings.repo_name

        # Where the embeddings live: bucket, local file, object name in GCS.
        self.bucket_name = settings.emb_bucket_name
        self.emb_file = settings.emb_local_path
        self.emb_dest = settings.emb_gcs_path

    def load_lang_model(self):
        """Return whatever ``load_model_artifact()`` produces."""
        model = load_model_artifact()
        return model

    def save_issue_embeddings(self):
        """Compute, pickle and upload the embeddings unless GCS already has them.

        Returns ``None`` in both cases.
        """
        # Nothing to do when the destination object is already in the bucket.
        if not self.check_embeddings_in_gcs():
            wrapper = self.load_lang_model()
            issue_data = get_all_issue_text(
                owner=self.repo_owner,
                repo=self.repo_name,
                inf_wrapper=wrapper,
            )

            # Serialise to the local file first; the upload reads it from disk.
            with open(self.emb_file, 'wb') as out:
                dpickle.dump(issue_data, out)

            self.upload_embeddings_to_gcs()

    def check_embeddings_in_gcs(self):
        """Return whether the object ``emb_dest`` exists in the bucket."""
        client = storage.Client()
        target_bucket = client.get_bucket(self.bucket_name)
        candidate = storage.Blob(bucket=target_bucket, name=self.emb_dest)
        return candidate.exists(client)

    def upload_embeddings_to_gcs(self):
        """Upload the local file ``emb_file`` to ``emb_dest`` in the bucket."""
        client = storage.Client()
        target_bucket = client.get_bucket(self.bucket_name)
        target_bucket.blob(self.emb_dest).upload_from_filename(self.emb_file)

## Run locally to test the code

In [3]:
# Use the config file one directory above the notebook. Constructing the
# loader reads that file straight away (``__init__`` calls ``load_yaml``).
yaml_path = '../issue_label_bot.yaml'
ldr = IssuesLoader(yaml_path=yaml_path)

In [4]:
# Computes the embeddings, pickles them locally and uploads them to GCS,
# unless the destination object already exists in the bucket.
ldr.save_issue_embeddings()

  0%|          | 0/207 [00:00<?, ?it/s]

Retrieved 207 issues.


100%|██████████| 207/207 [10:40<00:00,  2.04s/it]


In [5]:
# This call returns early without recomputing anything: the embeddings
# are already in GCS after the previous call.
ldr.save_issue_embeddings()

## Create an entry point using Fairing

In [6]:
from fairing.preprocessors.converted_notebook import ConvertNotebookPreprocessorWithFire

In [7]:
# Convert this notebook into a Python module; 'IssuesLoader' is the class
# name handed to the Fire-based preprocessor (presumably the class it exposes
# as the command-line entry point — confirm against the Fairing docs).
preprocessor = ConvertNotebookPreprocessorWithFire('IssuesLoader')

# Local modules to register as input files next to the converted notebook.
# The assignment replaces whatever value `input_files` held before, so there
# is no need to initialise the attribute first.
input_files = ['embeddings.py', 'inference.py', 'config.py']
preprocessor.input_files = {os.path.normpath(f) for f in input_files}

# Last expression of the cell: its return value is shown as the cell output.
preprocessor.preprocess()

[PosixPath('Issues_Loader.py'), 'config.py', 'inference.py', 'embeddings.py']