Skip to content

Commit

Permalink
Label_Microservice worker should use the new Model classes
Browse files Browse the repository at this point in the history
* Most of the inference logic in the Worker class can be deleted and
we can instead use the newly created Model classes.

* Use an in cluster deployment service
* Log the model and thresholds.

* cli.py is a simple CLI to send pubsub requests
  * It is ended for development and debugging.

* Create a utility to parse issue specs of the form {org}/{repo}#number

* We need to initialize IssueLabelPredictor() inside the pubsub thread;
otherwise we have threading issues when trying to access the model.

* The universal model needs to apply appropriate thresholds to its predictions.

* Worker no longer needs to apply thresholding of predictions because that is
  now handled by the individual model classes.

* Add a method to worker apply_repo_config that handles repo specific behavior
specified in the YAML file in the repo

  * Apply aliases to the labels
  * If the config specifies which labels to predict then filter out all other
    labels.

* When commenting on issues about the predictions, the bot should reate a markdown table with the probabilities for the labels in the comment.

* Remove test_worker.py; the test is no longer relevant after the refactor.
  There is a TODO in worker.py to create an appropriate unittest for the new
  code.

* Change how the GitHub App Private Keys are handled.

  * The private key should be mounted as a secret and an environment variable
    should specify the path of the PEM key.

  * Stop storing the PEM key as an environment variable and then in python code writing it to a file.

  * Change the name of the environment variables to add the prefix "GITHUB_"
to make it more explanatory.

  * Move some of the helper methods for creating GitHubApp out of github_util
    and into the GitHubApp class

Related to: kubeflow#70 combine universal and repo specific models.
  • Loading branch information
Jeremy Lewi committed Dec 31, 2019
1 parent 32f8f30 commit d3b8009
Show file tree
Hide file tree
Showing 8 changed files with 283 additions and 323 deletions.
93 changes: 0 additions & 93 deletions Label_Microservice/tests/test_worker.py

This file was deleted.

16 changes: 15 additions & 1 deletion py/code_intelligence/github_app.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import logging
import os

from collections import namedtuple, Counter
from github3 import GitHub
from pathlib import Path
Expand Down Expand Up @@ -25,6 +28,13 @@ def __init__(self, pem_path, app_id):
if not self.path.is_file():
raise ValueError(f'argument: `pem_path` must be a valid filename. {pem_path} was not found.')

@staticmethod
def create_from_env():
"""Create a new instance based on environment variables."""
app_id = os.getenv('GITHUB_APP_ID')
key_file_path = os.getenv("GITHUB_APP_PEM_KEY")
return GitHubApp(pem_path=key_file_path, app_id=app_id)

def get_app(self):
with open(self.path, 'rb') as key_file:
client = GitHub()
Expand All @@ -34,11 +44,13 @@ def get_app(self):

def get_installation(self, installation_id):
"login as app installation without requesting previously gathered data."
logging.info("Logging in as GitHub App")
with open(self.path, 'rb') as key_file:
client = GitHub()
client.login_as_app_installation(private_key_pem=key_file.read(),
app_id=self.app_id,
installation_id=installation_id)
logging.info("Successfully logged in as GitHub App")
return client

def get_test_installation_id(self):
Expand Down Expand Up @@ -85,7 +97,9 @@ def get_installation_id(self, owner, repo):

response = requests.get(url=url, headers=headers)
if response.status_code != 200:
raise Exception(f'Status code : {response.status_code}, {response.json()}')
raise Exception(f"There was a problem requesting URL={URL} "
f"Status code : {response.status_code}, "
f"Response:{response.json()}")
return response.json()['id']

def get_installation_access_token(self, installation_id):
Expand Down
36 changes: 9 additions & 27 deletions py/code_intelligence/github_util.py
Original file line number Diff line number Diff line change
@@ -1,44 +1,26 @@
import os
import logging
from code_intelligence.github_app import GitHubApp
from code_intelligence import github_app
import yaml


def init():
"Load all necessary artifacts to make predictions."
#save keyfile
pem_string = os.getenv('PRIVATE_KEY')
if not pem_string:
raise ValueError('Environment variable PRIVATE_KEY was not supplied.')

with open('private-key.pem', 'wb') as f:
f.write(str.encode(pem_string))

# TODO(jlewi): init is taking the PRIVATE_KEY from an environment variable
# and then writing it to a file. It would probably be better to follow
# the pattern of GOOGLE_APPLICATION_CREDENTIALS; i.e. mount the K8s secret
# to a volume and then use an environment variable to specify the path of
# the key file.
def get_app():
"grab a fresh instance of the app handle."
app_id = os.getenv('APP_ID')
key_file_path = 'private-key.pem'
ghapp = GitHubApp(pem_path=key_file_path, app_id=app_id)
return ghapp

def get_issue_handle(installation_id, username, repository, number):
"get an issue object."
ghapp = get_app()
ghapp = github_app.GitHubApp.create_from_env()
install = ghapp.get_installation(installation_id)
return install.issue(username, repository, number)

def get_yaml(owner, repo):
def get_yaml(owner, repo, ghapp=None):
"""
Looks for the yaml file in a /.github directory.
yaml file must be named issue_label_bot.yaml
"""
ghapp = get_app()

if not ghapp:
# TODO(jlewi): Should we deprecate this code path and always pass
# in the github app?
ghapp = github_app.GitHubApp.create_from_env()

try:
# get the app installation handle
inst_id = ghapp.get_installation_id(owner=owner, repo=repo)
Expand Down
19 changes: 18 additions & 1 deletion py/code_intelligence/util.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import logging
import json
import re

ISSUE_RE = re.compile("([^/]*)/([^#]*)#([0-9]*)")

# TODO(jlewi): Might be better to just write it
# as a json list
Expand All @@ -8,4 +11,18 @@ def write_items_to_json(output_file, results):
for i in results:
json.dump(i, hf)
hf.write("\n")
logging.info("Wrote %s items to %s", len(results), output_file)
logging.info("Wrote %s items to %s", len(results), output_file)

def parse_issue_spec(issue):
"""Parse an issue in the form {owner}/{repo}#{number}
Args:
isue: An issue in the form {owner}/{repo}#{number}
Returns:
owner, repo, number
"""
m = ISSUE_RE.match(issue)
if not m:
return None, None, None
return m.group(1), m.group(2), int(m.group(3))
35 changes: 35 additions & 0 deletions py/code_intelligence/util_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import logging
import pytest

from code_intelligence import util

def test_parse_issue_spec():
"""A unittest for parsing issues. """

test_cases = [
{
"issue": "kubeflow/tfjob#153",
"expected": ("kubeflow", "tfjob", 153)
},
{
"issue": "kubeflow/tfjob/tfjob",
"expected": (None, None, None)
}
]

for c in test_cases:
owner, repo, number = util.parse_issue_spec(c["issue"])
assert owner == c["expected"][0]
assert repo == c["expected"][1]
assert number == c["expected"][2]

if __name__ == "__main__":
logging.basicConfig(
level=logging.INFO,
format=('%(levelname)s|%(asctime)s'
'|%(pathname)s|%(lineno)d| %(message)s'),
datefmt='%Y-%m-%dT%H:%M:%S',
)
logging.getLogger().setLevel(logging.INFO)

pytest.main()
47 changes: 47 additions & 0 deletions py/label_microservice/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""A cli for interacting with the models.
The CLI can be used to publish issues to perform inference on to pubsub
to be picked up by the backends.
"""
import logging
import fire
from code_intelligence import util
from google.cloud import pubsub

DEFAULT_TOPIC = "projects/issue-label-bot-dev/topics/TEST_event_queue"
class Cli:

@staticmethod
def label_issue(issue, pubsub_topic=DEFAULT_TOPIC):
"""Label a specific issue.
Args:
issue: The issue in the form {owner}/{repo}#{issue}
pubsub_topic: (Optional) the pubsub topic to publish to. This should
be in the form projects/{project}/topics/{topic_name}
"""
publisher = pubsub.PublisherClient()
repo_owner, repo_name, issue_num = util.parse_issue_spec(issue)

if not repo_owner:
raise ValueError(f"issue={issue} didn't match regex "
f"{util.ISSUE_RE.pattern}")

# all attributes being published to pubsub must be sent as text strings
publisher.publish(pubsub_topic,
b'New issue.',
# TODO(jlewi): Does the backend depend on the client
# providing the installation id
installation_id="",
repo_owner=repo_owner,
repo_name=repo_name,
issue_num=str(issue_num))

if __name__ == "__main__":
logging.basicConfig(level=logging.INFO,
format=('%(levelname)s|%(asctime)s'
'|%(message)s|%(pathname)s|%(lineno)d|'),
datefmt='%Y-%m-%dT%H:%M:%S',
)

fire.Fire(Cli)
18 changes: 16 additions & 2 deletions py/label_microservice/universal_kind_label_model.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@

from collections import defaultdict
import tensorflow as tf
from tensorflow.keras import models as keras_models
from tensorflow.keras import utils as keras_utils
Expand Down Expand Up @@ -45,6 +45,13 @@ def __init__(self, class_names=['bug', 'feature', 'question']):

self.class_names = class_names

# set the prediction threshold for everything except for the label question
# which has a different threshold.
# These values were copied from the original code.
# https://github.com/machine-learning-apps/Issue-Label-Bot/blob/536e8bf4928b03d522dd021c0464587747e90a87/flask_app/app.py#L43
self._prediction_threshold = defaultdict(lambda: .52)
self._prediction_threshold["question"] = .60

def predict_issue_labels(self, title:str, body:str):
"""
Get probabilities for the each class.
Expand Down Expand Up @@ -76,4 +83,11 @@ def predict_issue_labels(self, title:str, body:str):
with self._graph.as_default():
probs = self.model.predict(x=[vec_body, vec_title]).tolist()[0]

return {k:v for k,v in zip(self.class_names, probs)}
results = {}

for label, p in zip(self.class_names, probs):
if p < self._prediction_threshold[label]:
continue
results[label] = p

return results
Loading

0 comments on commit d3b8009

Please sign in to comment.