Label_Microservice worker should use the new Model classes

* Most of the inference logic in the Worker class can be deleted and we can instead use the newly created Model classes. * Use an in cluster deployment service * Log the model and thresholds. * cli.py is a simple CLI to send pubsub requests * It is ended for development and debugging. * Create a utility to parse issue specs of the form {org}/{repo}#number * We need to initialize IssueLabelPredictor() inside the pubsub thread; otherwise we have threading issues when trying to access the model. * The universal model needs to apply appropriate thresholds to its predictions. * Worker no longer needs to apply thresholding of predictions because that is now handled by the individual model classes. * Add a method to worker apply_repo_config that handles repo specific behavior specified in the YAML file in the repo * Apply aliases to the labels * If the config specifies which labels to predict then filter out all other labels. * When commenting on issues about the predictions, the bot should reate a markdown table with the probabilities for the labels in the comment. * Remove test_worker.py; the test is no longer relevant after the refactor. There is a TODO in worker.py to create an appropriate unittest for the new code. * Change how the GitHub App Private Keys are handled. * The private key should be mounted as a secret and an environment variable should specify the path of the PEM key. * Stop storing the PEM key as an environment variable and then in python code writing it to a file. * Change the name of the environment variables to add the prefix "GITHUB_" to make it more explanatory. * Move some of the helper methods for creating GitHubApp out of github_util and into the GitHubApp class Related to: kubeflow#70 combine universal and repo specific models.
jlewi · Dec 31, 2019 · d3b8009 · d3b8009
1 parent 32f8f30
commit d3b8009
Show file tree

Hide file tree

Showing 8 changed files with 283 additions and 323 deletions.
diff --git a/Label_Microservice/tests/test_worker.py b/Label_Microservice/tests/test_worker.py
diff --git a/py/code_intelligence/github_app.py b/py/code_intelligence/github_app.py
@@ -1,3 +1,6 @@
+import logging
+import os
+
 from collections import namedtuple, Counter
 from github3 import GitHub
 from pathlib import Path
@@ -25,6 +28,13 @@ def __init__(self, pem_path, app_id):
         if not self.path.is_file():
             raise ValueError(f'argument: `pem_path` must be a valid filename. {pem_path} was not found.')
 
+    @staticmethod
+    def create_from_env():
+        """Create a new instance based on environment variables."""
+        app_id = os.getenv('GITHUB_APP_ID')
+        key_file_path = os.getenv("GITHUB_APP_PEM_KEY")
+        return GitHubApp(pem_path=key_file_path, app_id=app_id)
+
     def get_app(self):
         with open(self.path, 'rb') as key_file:
             client = GitHub()
@@ -34,11 +44,13 @@ def get_app(self):
 
     def get_installation(self, installation_id):
         "login as app installation without requesting previously gathered data."
+        logging.info("Logging in as GitHub App")
         with open(self.path, 'rb') as key_file:
             client = GitHub()
             client.login_as_app_installation(private_key_pem=key_file.read(),
                                              app_id=self.app_id,
                                              installation_id=installation_id)
+        logging.info("Successfully logged in as GitHub App")
         return client
 
     def get_test_installation_id(self):
@@ -85,7 +97,9 @@ def get_installation_id(self, owner, repo):
 
         response = requests.get(url=url, headers=headers)
         if response.status_code != 200:
-            raise Exception(f'Status code : {response.status_code}, {response.json()}')
+            raise Exception(f"There was a problem requesting URL={URL} "
+                            f"Status code : {response.status_code}, "
+                            f"Response:{response.json()}")
         return response.json()['id']
 
     def get_installation_access_token(self, installation_id):

diff --git a/py/code_intelligence/github_util.py b/py/code_intelligence/github_util.py
@@ -1,44 +1,26 @@
 import os
 import logging
-from code_intelligence.github_app import GitHubApp
+from code_intelligence import github_app
 import yaml
 
-
-def init():
-    "Load all necessary artifacts to make predictions."
-    #save keyfile
-    pem_string = os.getenv('PRIVATE_KEY')
-    if not pem_string:
-        raise ValueError('Environment variable PRIVATE_KEY was not supplied.')
-
-    with open('private-key.pem', 'wb') as f:
-        f.write(str.encode(pem_string))
-
-# TODO(jlewi): init is taking the PRIVATE_KEY from an environment variable
-# and then writing it to a file. It would probably be better to follow
-# the pattern of GOOGLE_APPLICATION_CREDENTIALS; i.e. mount the K8s secret
-# to a volume and then use an environment variable to specify the path of
-# the key file.
-def get_app():
-    "grab a fresh instance of the app handle."
-    app_id = os.getenv('APP_ID')
-    key_file_path = 'private-key.pem'
-    ghapp = GitHubApp(pem_path=key_file_path, app_id=app_id)
-    return ghapp
-
 def get_issue_handle(installation_id, username, repository, number):
     "get an issue object."
-    ghapp = get_app()
+    ghapp = github_app.GitHubApp.create_from_env()
     install = ghapp.get_installation(installation_id)
     return install.issue(username, repository, number)
 
-def get_yaml(owner, repo):
+def get_yaml(owner, repo, ghapp=None):
     """
     Looks for the yaml file in a /.github directory.
 
     yaml file must be named issue_label_bot.yaml
     """
-    ghapp = get_app()
+
+    if not ghapp:
+        # TODO(jlewi): Should we deprecate this code path and always pass
+        # in the github app?
+        ghapp = github_app.GitHubApp.create_from_env()
+
     try:
         # get the app installation handle
         inst_id = ghapp.get_installation_id(owner=owner, repo=repo)

diff --git a/py/code_intelligence/util.py b/py/code_intelligence/util.py
@@ -1,5 +1,8 @@
 import logging
 import json
+import re
+
+ISSUE_RE = re.compile("([^/]*)/([^#]*)#([0-9]*)")
 
 # TODO(jlewi): Might be better to just write it
 # as a json list
@@ -8,4 +11,18 @@ def write_items_to_json(output_file, results):
     for i in results:
       json.dump(i, hf)
       hf.write("\n")
-  logging.info("Wrote %s items to %s", len(results), output_file)
+  logging.info("Wrote %s items to %s", len(results), output_file)
+
+def parse_issue_spec(issue):
+  """Parse an issue in the form {owner}/{repo}#{number}
+
+  Args:
+    isue: An issue in the form {owner}/{repo}#{number}
+
+  Returns:
+    owner, repo, number
+  """
+  m = ISSUE_RE.match(issue)
+  if not m:
+    return None, None, None
+  return m.group(1), m.group(2), int(m.group(3))
diff --git a/py/code_intelligence/util_test.py b/py/code_intelligence/util_test.py
@@ -0,0 +1,35 @@
+import logging
+import pytest
+
+from code_intelligence import util
+
+def test_parse_issue_spec():
+  """A unittest for parsing issues. """
+
+  test_cases = [
+    {
+      "issue": "kubeflow/tfjob#153",
+      "expected": ("kubeflow", "tfjob", 153)
+    },
+    {
+      "issue": "kubeflow/tfjob/tfjob",
+      "expected": (None, None, None)
+    }
+  ]
+
+  for c in test_cases:
+    owner, repo, number = util.parse_issue_spec(c["issue"])
+    assert owner == c["expected"][0]
+    assert repo == c["expected"][1]
+    assert number == c["expected"][2]
+
+if __name__ == "__main__":
+  logging.basicConfig(
+      level=logging.INFO,
+      format=('%(levelname)s|%(asctime)s'
+              '|%(pathname)s|%(lineno)d| %(message)s'),
+      datefmt='%Y-%m-%dT%H:%M:%S',
+  )
+  logging.getLogger().setLevel(logging.INFO)
+
+  pytest.main()
diff --git a/py/label_microservice/cli.py b/py/label_microservice/cli.py
@@ -0,0 +1,47 @@
+"""A cli for interacting with the models.
+
+The CLI can be used to publish issues to perform inference on to pubsub
+to be picked up by the backends.
+"""
+import logging
+import fire
+from code_intelligence import util
+from google.cloud import pubsub
+
+DEFAULT_TOPIC = "projects/issue-label-bot-dev/topics/TEST_event_queue"
+class Cli:
+
+  @staticmethod
+  def label_issue(issue, pubsub_topic=DEFAULT_TOPIC):
+    """Label a specific issue.
+
+    Args:
+      issue: The issue in the form {owner}/{repo}#{issue}
+      pubsub_topic: (Optional) the pubsub topic to publish to. This should
+        be in the form projects/{project}/topics/{topic_name}
+    """
+    publisher = pubsub.PublisherClient()
+    repo_owner, repo_name, issue_num = util.parse_issue_spec(issue)
+
+    if not repo_owner:
+      raise ValueError(f"issue={issue} didn't match regex "
+                       f"{util.ISSUE_RE.pattern}")
+
+    # all attributes being published to pubsub must be sent as text strings
+    publisher.publish(pubsub_topic,
+                      b'New issue.',
+                      # TODO(jlewi): Does the backend depend on the client
+                      # providing the installation id
+                      installation_id="",
+                      repo_owner=repo_owner,
+                      repo_name=repo_name,
+                      issue_num=str(issue_num))
+
+if __name__ == "__main__":
+  logging.basicConfig(level=logging.INFO,
+                      format=('%(levelname)s|%(asctime)s'
+                              '|%(message)s|%(pathname)s|%(lineno)d|'),
+                      datefmt='%Y-%m-%dT%H:%M:%S',
+                      )
+
+  fire.Fire(Cli)
diff --git a/py/label_microservice/universal_kind_label_model.py b/py/label_microservice/universal_kind_label_model.py
@@ -1,4 +1,4 @@
-
+from collections import defaultdict
 import tensorflow as tf
 from tensorflow.keras import models as keras_models
 from tensorflow.keras import utils as keras_utils
@@ -45,6 +45,13 @@ def __init__(self,  class_names=['bug', 'feature', 'question']):
 
     self.class_names = class_names
 
+    # set the prediction threshold for everything except for the label question
+    # which has a different threshold.
+    # These values were copied from the original code.
+    # https://github.com/machine-learning-apps/Issue-Label-Bot/blob/536e8bf4928b03d522dd021c0464587747e90a87/flask_app/app.py#L43
+    self._prediction_threshold = defaultdict(lambda: .52)
+    self._prediction_threshold["question"] = .60
+
   def predict_issue_labels(self,  title:str, body:str):
     """
     Get probabilities for the each class.
@@ -76,4 +83,11 @@ def predict_issue_labels(self,  title:str, body:str):
     with self._graph.as_default():
       probs = self.model.predict(x=[vec_body, vec_title]).tolist()[0]
 
-    return {k:v for k,v in zip(self.class_names, probs)}
+    results = {}
+
+    for label, p in zip(self.class_names, probs):
+      if p < self._prediction_threshold[label]:
+        continue
+      results[label] = p
+
+    return results