# Running this Notebook

This notebook is run in the container: [hamelsmu/ml-gpu-issue-lang-model](https://github.com/machine-learning-apps/IssuesLanguageModel/blob/master/gpu.Dockerfile)

This container is publicly available [on Dockerhub](https://cloud.docker.com/u/hamelsmu/repository/docker/hamelsmu/ml-gpu-issue-lang-model)

In [1]:
from collections import Counter

import requests
import torch
from bs4 import BeautifulSoup
from fastai.core import parallel, partial
from tqdm import tqdm_notebook

## Write Functions To Get Data

In [2]:
def find_max_issue_num(owner, repo):
    """
    Find the maximum issue number associated with a repo.

    Scrapes the first page of the repo's issue list on github.com and reads
    the number shown on the first issue card.

    Parameters
    ----------
    owner : str
        GitHub user or organization that owns the repo.
    repo : str
        Repository name.

    Returns
    -------
    int
        the highest issue number associated with this repo.

    Raises
    ------
    requests.HTTPError
        If the issue list page answers with an error status.
    ValueError
        If no issue number can be parsed from the page (for example no
        issue card is present, or the page markup has changed).
    """
    url = f'https://github.com/{owner}/{repo}/issues'
    r = requests.get(url)
    # raise_for_status() does nothing on success, so no `r.ok` guard is needed
    r.raise_for_status()
    # name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed
    soup = BeautifulSoup(r.content, 'html.parser')
    # get grey text under issue preview cards
    issue_meta = soup.find('span', class_="opened-by")
    if issue_meta is None:
        raise ValueError(f'Could not find any issue listed at {url}')
    # parse the first issue number visible, which is also the highest issue number;
    # the text looks like "#1234" followed by more lines, so drop the leading '#'
    issue_num = issue_meta.text.strip().split('\n')[0].strip().lstrip('#')
    return int(issue_num)

In [3]:
def verify_issue(owner, repo, num):
    """
    Check whether owner/repo/issues/num resolves to an issue page.

    Returns
    -------
    bool
        True when a HEAD request to the issue URL answers with status 200,
        False for any other status.

    Pull requests are also issues, but their /issues/<num> URL answers
    with a 302 redirect, so this function reports False for them.
    """
    issue_url = f'https://github.com/{owner}/{repo}/issues/{num}'
    response = requests.head(issue_url)
    return response.status_code == 200

In [4]:
def get_issue_text(num, idx, owner, repo, skip_issue=True):
    """
    Get the raw text of an issue's title, body and labels.

    Parameters
    ----------
    num : int
        Issue number to fetch.
    idx : int
        Unused. Kept so the function accepts a second positional argument;
        presumably the item index passed by `parallel` -- confirm.
    owner : str
        GitHub user or organization that owns the repo.
    repo : str
        Repository name.
    skip_issue : bool
        When True, return None if `num` is not an issue; when False, raise.

    Returns
    -------
    dict or None
        {'title': str, 'url': str, 'body': str, 'labels': list of str},
        or None when the issue is skipped, the page cannot be parsed, or
        the issue has no labels.

    Raises
    ------
    Exception
        If `num` is not an issue and `skip_issue` is False.
    """
    url = f'https://github.com/{owner}/{repo}/issues/{num}'
    if not verify_issue(owner, repo, num):
        if skip_issue:
            return None
        raise Exception(f'{url} is not an issue.')

    # name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    title_find = soup.find("span", class_="js-issue-title")
    body_find = soup.find("td", class_="js-comment-body")
    label_find = soup.find(class_='js-issue-labels')

    # Treat a page missing any of the three sections as unparseable. The
    # label container must be checked too, otherwise .get_text() below
    # fails with AttributeError on None.
    if title_find is None or body_find is None or label_find is None:
        return None

    title = title_find.get_text().strip()
    body = body_find.get_text().strip()
    # one label per line; drop the indentation and blank lines of the markup
    # so they do not end up as (or inside) label names
    labels = [line.strip() for line in label_find.get_text().split('\n')]
    labels = [label for label in labels if label]

    # 'None yet' is the placeholder text shown for an unlabeled issue
    if not labels or labels[0] == 'None yet':
        return None

    return {'title': title,
            'url': url,
            'body': body,
            'labels': labels}

In [60]:
def get_all_issue_text(owner, repo, inf_wrapper, workers=64, min_freq=25):
    """
    Scrape every labeled issue of a repo and embed it with the language model.

    Issue numbers 1..max are fetched in parallel with `get_issue_text`.
    Labels seen on fewer than `min_freq` retrieved issues are dropped, and
    issues left without any label after that are discarded.

    Parameters
    ----------
    owner : str
        GitHub user or organization that owns the repo.
    repo : str
        Repository name.
    inf_wrapper : object
        Must provide `process_dict(issue)` returning a mapping with a
        'text' entry, and `get_pooled_features(text)` returning a tensor.
    workers : int
        Passed to `parallel` as `max_workers`.
    min_freq : int
        Minimum number of retrieved issues a label must appear on to be kept.

    Returns
    -------
    dict
        {'features': numpy array made by concatenating the per-issue
                     tensors (one row per kept issue, assuming each
                     pooled feature has a leading dimension of 1),
         'labels': list of list of str, aligned with 'features'}
    """
    max_num = find_max_issue_num(owner, repo)

    get = partial(get_issue_text, owner=owner, repo=repo, skip_issue=True)
    issues = parallel(get, list(range(1, max_num+1)), max_workers=workers)

    # filter out issues with problems (get_issue_text returned None)
    filtered_issues = [issue for issue in issues if issue]

    label_counts = Counter()
    for issue in filtered_issues:
        label_counts.update(issue['labels'])

    # a set gives constant-time membership tests in the loop below
    frequent_labels = {label for label, count in label_counts.items()
                       if count >= min_freq}

    print(f'Retrieved {len(filtered_issues)} issues.')

    # only retain labels that meet min_freq, and issues that still have one
    features = []
    labels = []
    for issue in tqdm_notebook(filtered_issues):
        lbls = [i for i in issue['labels'] if i in frequent_labels]
        if lbls:
            labels.append(lbls)
            # calculate embedding
            text = inf_wrapper.process_dict(issue)['text']
            # detach and move to CPU so the tensors can be converted to numpy
            feature = inf_wrapper.get_pooled_features(text).detach().cpu()
            features.append(feature)

    print(f'{len(features)} issues remaining after minimum frequency filter of {min_freq}.')

    assert len(features) == len(labels), 'Error you have mismatch b/w number of observations and labels.'

    return {'features':torch.cat(features).numpy(), 
            'labels': labels}

## Get The Data

In [61]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from inference import InferenceWrapper

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load Model Artifacts (Download from Google Cloud Storage if not available locally)

In [62]:
from pathlib import Path
from urllib import request as request_url

def pass_through(x):
    """Return `x` unchanged."""
    # NOTE(review): not called anywhere in this notebook. Presumably the
    # pickled model refers to this function by name, so it has to be defined
    # before the model is loaded -- confirm before removing.
    return x

model_url = 'https://storage.googleapis.com/issue_label_bot/model/lang_model/models_22zkdqlr/trained_model_22zkdqlr.pkl'
path = Path('./model_files')
full_path = path/'model.pkl'

if not full_path.exists():
    print('Loading model.')
    path.mkdir(parents=True, exist_ok=True)
    # Download under a temporary name and rename once complete, so an
    # interrupted transfer never leaves a truncated model.pkl that would
    # pass the exists() check on the next run.
    partial_path = path/'model.pkl.part'
    request_url.urlretrieve(model_url, partial_path)
    partial_path.replace(full_path)
# NOTE(review): the model file is a .pkl fetched over the network;
# presumably InferenceWrapper unpickles it, which executes code from the
# file. Only load it from a source you trust.
inference_wrapper = InferenceWrapper(model_path=path, model_file_name='model.pkl')

In [64]:
%%time
# Scrape and embed the labeled issues of kubeflow/kubeflow; `test` holds the
# {'features', 'labels'} dict returned by get_all_issue_text.
test = get_all_issue_text(owner='kubeflow', repo='kubeflow', inf_wrapper=inference_wrapper)

Retrieved 1541 issues.


HBox(children=(IntProgress(value=0, max=1541), HTML(value='')))


1526 issues remaining after minimum frequency filter of 25.
CPU times: user 2min 58s, sys: 43.3 s, total: 3min 41s
Wall time: 4min 18s


In [65]:
# Shape of the feature matrix: one row per issue kept after the label-frequency filter.
test['features'].shape

(1526, 2400)

In [66]:
# Number of label lists; get_all_issue_text asserts this matches the number of features.
len(test['labels'])

1526

# Notes

It takes about 4 minutes to retrieve embeddings and labels for `kubeflow/kubeflow`. This can likely be brought down to about 1 minute by batching the text instead of feeding issues to the language model one at a time.