In [176]:
import re
import numpy as np
import os
import requests
import time

# Issue - PR - Commits - Files Changed Linking Exploration

In [90]:
issue_url = 'https://github.com/kubernetes/kubernetes/issues/16661'

In [91]:
# Capture exactly the two path segments ("<owner>/<repo>") that follow the host.
# The dot is escaped and the segments exclude '/', so any extra path parts after
# the repository name cannot leak into the captured group.
pattern = re.compile(r'github\.com/([^/]+/[^/]+)', re.I)
repo_name = pattern.search(issue_url).group(1)
repo_name

'kubernetes/kubernetes'

In [92]:
# Capture only the numeric issue id, so a trailing slash, sub-path or
# '#issuecomment-...' fragment does not end up in the search query.
issue_num_pattern = re.compile(r'https://github\.com/.*/issues/(\d+)', re.I)
issue_num = issue_num_pattern.search(issue_url).group(1)
issue_num

'16661'

In [93]:
# Build a GitHub issue-search query restricted to pull requests (`is:pr`) in this
# repository (`repo:`) that also contains the term `issue:<number>`.
# NOTE(review): `issue:` does not look like a documented search qualifier -
# confirm how the search API interprets it.
pr_search_url = 'https://api.github.com/search/issues?q=is:pr issue:{issue_num} repo:{repo_name}'
pr_search_url = pr_search_url.format(issue_num=issue_num, repo_name=repo_name)
pr_search_url

'https://api.github.com/search/issues?q=is:pr issue:16661 repo:kubernetes/kubernetes'

In [95]:
# Call the search API with HTTP basic auth; the token is read from the
# GITHUB_TOKEN environment variable (a KeyError is raised if it is not set).
response = requests.get(pr_search_url, auth=('dipanjanS', os.environ['GITHUB_TOKEN']))

In [112]:
response.status_code

200

In [97]:
# Decode the JSON body and show how many hits the search reported.
content = response.json()
content['total_count']

1

In [99]:
# Every search hit is expected to carry a 'pull_request' sub-record (a missing key
# raises KeyError here); collect those and read each PR's 'html_url'.
pr_details = [record['pull_request'] for record in content['items']]
pr_urls = [record['html_url'] for record in pr_details]
pr_urls

['https://github.com/kubernetes/kubernetes/pull/16668']

In [101]:
# The same 'pull_request' sub-records also provide a 'patch_url' per PR; these
# patch files are parsed below for commit hashes and changed files.
patch_urls = [record['patch_url'] for record in pr_details]
patch_urls

['https://github.com/kubernetes/kubernetes/pull/16668.patch']

In [109]:
repo_url = 'https://github.com/'+repo_name
commits = []
changed_files = []

for patch_url in patch_urls:
    # Download the patch belonging to the PR of this iteration.
    response = requests.get(patch_url)
    data = response.text

    # Lines starting with "from <token>" are treated as commit headers; only
    # purely alphanumeric tokens are kept and turned into commit links.
    commit_hashes = re.findall(r'(?:\n|^)from (.*?)\s', data, re.I)
    if commit_hashes:
        commit_hashes = [item for item in commit_hashes if item.isalnum()]
        commit_links = [repo_url+'/commit/'+item.rstrip('/') for item in commit_hashes]
        commits.extend(commit_links)

    # Every "diff --git a<path> b<path>" header names one changed file; keep the
    # path that follows the "b" prefix.
    changed_file_paths = re.findall(r'(?:\n|^)diff\s--git\sa.*?\sb(.*)', data, re.I)
    if changed_file_paths:
        changed_file_paths = [item.rstrip('/') for item in changed_file_paths]
        changed_files.extend(changed_file_paths)

# De-duplicate (and sort) the links and paths gathered across all patches.
commits = np.unique(commits)
changed_files = np.unique(changed_files)

In [110]:
commits

array(['https://github.com/kubernetes/kubernetes/commit/fd1c8e096a0ff352453376d47408a111f0c36439'],
      dtype='<U88')

In [111]:
changed_files

array(['/api/swagger-spec/v1beta1.json',
       '/docs/api-reference/extensions/v1beta1/definitions.html',
       '/docs/api-reference/extensions/v1beta1/operations.html',
       '/docs/api-reference/v1/definitions.html',
       '/docs/api-reference/v1/operations.html',
       '/pkg/apis/extensions/deep_copy_generated.go',
       '/pkg/apis/extensions/types.generated.go',
       '/pkg/apis/extensions/types.go',
       '/pkg/apis/extensions/v1beta1/conversion_generated.go',
       '/pkg/apis/extensions/v1beta1/deep_copy_generated.go',
       '/pkg/apis/extensions/v1beta1/types.generated.go',
       '/pkg/apis/extensions/v1beta1/types.go',
       '/pkg/apis/extensions/v1beta1/types_swagger_doc_generated.go',
       '/pkg/controller/podautoscaler/horizontal.go',
       '/pkg/controller/podautoscaler/horizontal_test.go',
       '/pkg/kubectl/autoscale.go', '/pkg/kubectl/cmd/autoscale.go',
       '/pkg/kubectl/describe.go', '/pkg/kubectl/resource_printer.go',
       '/test/e2e/horizontal_po

In [84]:
pr_url = 'https://github.com/kubernetes/kubernetes/pull/16668'

In [85]:
# Derive "<owner>/<repo>" from the PR URL defined in this section, rather than
# from the issue URL of the earlier section. Only the two path segments after
# the host are captured, so extra path parts cannot leak into the name.
pattern = re.compile(r'github\.com/([^/]+/[^/]+)', re.I)
repo_name = pattern.search(pr_url).group(1)
repo_name

'kubernetes/kubernetes'

In [117]:
# Capture only the numeric PR id, so suffixes such as '/files' or '.patch'
# do not end up in the search query.
pr_num_pattern = re.compile(r'https://github\.com/.*/pull/(\d+)', re.I)
pr_num = pr_num_pattern.search(pr_url).group(1)
pr_num

'16668'

In [119]:
# Build a GitHub issue-search query restricted to issues (`is:issue`) in this
# repository (`repo:`) that also contains the term `pr:<number>`.
# NOTE(review): `pr:` does not look like a documented search qualifier -
# confirm how the search API interprets it.
issue_search_url = 'https://api.github.com/search/issues?q=is:issue pr:{pr_num} repo:{repo_name}'
issue_search_url = issue_search_url.format(pr_num=pr_num, repo_name=repo_name)
issue_search_url

'https://api.github.com/search/issues?q=is:issue pr:16668 repo:kubernetes/kubernetes'

# Building Generic Function to get GitHub Events Linkage

In [185]:
def link_github_events(url, event_type, github_user, github_auth, request_timeout=30):
    '''
      Links one GitHub event (issue, pull request or commit) to its related
      issues, pull requests, commits and changed .go files.

      Parameters:
        url             - URL of the GitHub issue, pull request or commit.
        event_type      - 'issue', 'pull request' or 'commit' (case-insensitive);
                          any other value produces empty result lists.
        github_user,
        github_auth     - credentials sent as HTTP basic auth to the search API.
        request_timeout - seconds to wait for each HTTP response (default 30).

      Returns a tuple (links, requests_made):
        links         - dict with the keys 'issue_url', 'fixed_url' (the PR URLs),
                        'commit_url' and 'files_changed'; every value is a list.
                        'files_changed' holds one list per patch file: the patch
                        URL first, then the sorted unique .go paths it changes.
        requests_made - number of search API calls issued (patch downloads are
                        not counted), including calls made before a failure.

      Best effort: any exception is printed and the lists come back empty.

      NOTE(review): the search terms `issue:`, `pr:` and `commit:` are sent as-is;
      confirm against the GitHub search documentation that they are interpreted
      as intended.
    '''
    search_api_url = 'https://api.github.com/search/issues?q='
    # Header line that `git format-patch` writes for each commit of a patch:
    # "From <hash> <date>". Requiring a full-length hex hash keeps ordinary
    # commit-message lines that merely start with "from ..." out of the result.
    commit_header_pattern = re.compile(r'^From ([0-9a-f]{40,64}) ', re.M)
    requests_made = 0


    def get_repo_name(url):
        '''
          Returns the "<owner>/<repo>" part of a GitHub URL, which is what
          the search API expects in its `repo:` qualifier.
        '''
        match = re.search(r'github\.com/([^/\s]+/[^/\s?#]+)', url, re.I)
        if match is None:
            raise ValueError('not a GitHub repository URL: {}'.format(url))
        return match.group(1)


    def extract_id(pattern, url, what):
        '''
          Returns the first group of `pattern` found in `url`, or raises a
          ValueError naming the kind of identifier that is missing.
        '''
        match = re.search(pattern, url, re.I)
        if match is None:
            raise ValueError('no {} identifier found in URL: {}'.format(what, url))
        return match.group(1)


    def search_items(query):
        '''
          Runs one authenticated search API call and returns its 'items'
          list ([] on a non-200 status or an empty payload).
        '''
        nonlocal requests_made
        response = requests.get(search_api_url + query,
                                auth=(github_user, github_auth),
                                timeout=request_timeout)
        # Counted in the enclosing scope so the total survives a later exception.
        requests_made += 1
        if response.status_code != 200:
            return []
        content = response.json()
        if not content:
            return []
        return content.get('items') or []


    def get_commits_changed_files(patch_urls, repo_url):
        '''
          Downloads every patch URL and returns (commit links, files changed)
          for a specific repository. Commit links are sorted and unique across
          all patches; files changed has one [patch_url, path, ...] list per
          patch that touches at least one .go file.
        '''
        commits = []
        changed_files = []

        for patch_url in patch_urls:
            response = requests.get(patch_url, timeout=request_timeout)
            # An error page is not a patch; do not try to parse it.
            if response.status_code != 200:
                continue
            data = response.text
            if not data:
                continue

            commits.extend(repo_url + '/commit/' + commit_hash
                           for commit_hash in commit_header_pattern.findall(data))

            changed_file_paths = re.findall(r"(?:\n|^)diff\s--git\sa.*?\.go\sb(.*\.go)", data, re.I)
            if changed_file_paths:
                changed_file_paths = [item.rstrip('/') for item in changed_file_paths]
                changed_files.append([patch_url] + sorted(set(changed_file_paths)))

        return sorted(set(commits)), changed_files


    def get_dependent_links_from_issue(url, repo_name, repo_url):
        '''
          Finds the PRs returned by the search for an issue, then the commits
          and changed files of those PRs, for a specific repository.
        '''
        issue_num = extract_id(r'/issues/(\d+)', url, 'issue')
        query = 'is:pr issue:{issue_num} repo:{repo_name}'
        items = search_items(query.format(issue_num=issue_num, repo_name=repo_name))

        pr_details = list(filter(None, [record.get('pull_request') for record in items]))
        pr_urls = list(filter(None, [record.get('html_url') for record in pr_details]))
        patch_urls = list(filter(None, [record.get('patch_url') for record in pr_details]))
        commit_urls, files_changed = get_commits_changed_files(patch_urls, repo_url)

        return [url], pr_urls, commit_urls, files_changed


    def get_dependent_links_from_pr(url, repo_name, repo_url):
        '''
          Finds the issues returned by the search for a pull request (PR), plus
          the commits and changed files of that PR, for a specific repository.
        '''
        pr_num = extract_id(r'/pull/(\d+)', url, 'pull request')
        query = 'is:issue pr:{pr_num} repo:{repo_name}'
        items = search_items(query.format(pr_num=pr_num, repo_name=repo_name))
        issue_urls = list(filter(None, [record.get('html_url') for record in items]))

        # Build the patch URL from the parsed parts so that input URLs such as
        # ".../pull/5/files" or ".../pull/5/" still resolve to ".../pull/5.patch".
        patch_urls = [repo_url + '/pull/' + pr_num + '.patch']
        commit_urls, files_changed = get_commits_changed_files(patch_urls, repo_url)

        return issue_urls, [url], commit_urls, files_changed


    def get_dependent_links_from_commit(url, repo_name, repo_url):
        '''
          Finds the PRs returned by the search for a commit, the issues returned
          for each of those PRs, and the PRs' commits and changed files, for a
          specific repository.
        '''
        issue_urls = []
        commit_urls = [url]  # kept as-is when the search finds no PR
        files_changed = []

        commit_num = extract_id(r'/commit/([0-9a-f]+)', url, 'commit')
        query = 'is:pr commit:{commit_num} repo:{repo_name}'
        items = search_items(query.format(commit_num=commit_num, repo_name=repo_name))
        pr_urls = list(filter(None, [record.get('html_url') for record in items]))
        if pr_urls:
            patch_urls = [record + '.patch' for record in pr_urls]
            commit_urls, files_changed = get_commits_changed_files(patch_urls, repo_url)

        for pr_url in pr_urls:
            pr_num = extract_id(r'/pull/(\d+)', pr_url, 'pull request')
            query = 'is:issue pr:{pr_num} repo:{repo_name}'
            items = search_items(query.format(pr_num=pr_num, repo_name=repo_name))
            issue_urls.extend(filter(None, [record.get('html_url') for record in items]))

        return issue_urls, pr_urls, commit_urls, files_changed


    handlers = {
        'issue': get_dependent_links_from_issue,
        'pull request': get_dependent_links_from_pr,
        'commit': get_dependent_links_from_commit,
    }
    issue_urls = []
    pr_urls = []
    commit_urls = []
    files_changed = []

    try:
        repo_name = get_repo_name(url)
        repo_url = 'https://github.com/'+repo_name
        handler = handlers.get(event_type.lower())
        if handler is not None:
            issue_urls, pr_urls, commit_urls, files_changed = handler(url, repo_name, repo_url)
    except Exception as e:
        print(repr(e)) # TODO: logging in the future

    return ({
            'issue_url': issue_urls,
            'fixed_url': pr_urls,
            'commit_url': commit_urls,
            'files_changed': files_changed
    }, requests_made)

In [186]:
def generate_github_events_dependency_data(gh_urls, gh_event_types, github_user, github_auth,
                                           max_requests_per_window=25, cooldown_seconds=65):
    '''
      Runs link_github_events(...) for every (URL, event type) pair and
      returns the list of link dicts, in input order.

      Parameters:
        gh_urls                 - iterable of GitHub issue / PR / commit URLs.
        gh_event_types          - iterable of matching event types; pairs are
                                  formed with zip(), so extra items in the
                                  longer iterable are ignored.
        github_user,
        github_auth             - credentials forwarded to link_github_events.
        max_requests_per_window - once this many search requests have been
                                  counted, pause before the next URL (default 25).
        cooldown_seconds        - length of that pause in seconds (default 65).
    '''
    total_requests_made = 0
    events_link_data = []
    # TODO: needs to be handled better
    # what if there is 10+ URLs in one function call itself
    # (though probability is less, still need to be handled)
    for gh_url, gh_event_type in zip(gh_urls, gh_event_types):
        if total_requests_made >= max_requests_per_window:
            time.sleep(cooldown_seconds)
            total_requests_made = 0
        # Pass the loop variables, not whatever `url` / `event_type` happen to
        # exist in the notebook's global namespace.
        data, requests_made = link_github_events(gh_url, gh_event_type, github_user, github_auth)
        events_link_data.append(data)
        total_requests_made += requests_made

    return events_link_data

# Test Linkage Function

In [178]:
# Start from an issue: expect the related PR, its commit(s) and the changed .go files.
url = 'https://github.com/kubernetes/kubernetes/issues/16661'
link_github_events(url=url, event_type='issue', 
                   github_user='dipanjanS', github_auth=os.environ['GITHUB_TOKEN'])

({'issue_url': ['https://github.com/kubernetes/kubernetes/issues/16661'],
  'fixed_url': ['https://github.com/kubernetes/kubernetes/pull/16668'],
  'commit_url': ['https://github.com/kubernetes/kubernetes/commit/fd1c8e096a0ff352453376d47408a111f0c36439'],
  'files_changed': [['https://github.com/kubernetes/kubernetes/pull/16668.patch',
    '/pkg/apis/extensions/deep_copy_generated.go',
    '/pkg/apis/extensions/types.generated.go',
    '/pkg/apis/extensions/types.go',
    '/pkg/apis/extensions/v1beta1/conversion_generated.go',
    '/pkg/apis/extensions/v1beta1/deep_copy_generated.go',
    '/pkg/apis/extensions/v1beta1/types.generated.go',
    '/pkg/apis/extensions/v1beta1/types.go',
    '/pkg/apis/extensions/v1beta1/types_swagger_doc_generated.go',
    '/pkg/controller/podautoscaler/horizontal.go',
    '/pkg/controller/podautoscaler/horizontal_test.go',
    '/pkg/kubectl/autoscale.go',
    '/pkg/kubectl/cmd/autoscale.go',
    '/pkg/kubectl/describe.go',
    '/pkg/kubectl/resource_print

In [179]:
# Start from a pull request: expect the related issue plus the PR's commit(s) and changed .go files.
url = 'https://github.com/kubernetes/kubernetes/pull/16668'
link_github_events(url=url, event_type='pull request', 
                   github_user='dipanjanS', github_auth=os.environ['GITHUB_TOKEN'])

({'issue_url': ['https://github.com/kubernetes/kubernetes/issues/16661'],
  'fixed_url': ['https://github.com/kubernetes/kubernetes/pull/16668'],
  'commit_url': ['https://github.com/kubernetes/kubernetes/commit/fd1c8e096a0ff352453376d47408a111f0c36439'],
  'files_changed': [['https://github.com/kubernetes/kubernetes/pull/16668.patch',
    '/pkg/apis/extensions/deep_copy_generated.go',
    '/pkg/apis/extensions/types.generated.go',
    '/pkg/apis/extensions/types.go',
    '/pkg/apis/extensions/v1beta1/conversion_generated.go',
    '/pkg/apis/extensions/v1beta1/deep_copy_generated.go',
    '/pkg/apis/extensions/v1beta1/types.generated.go',
    '/pkg/apis/extensions/v1beta1/types.go',
    '/pkg/apis/extensions/v1beta1/types_swagger_doc_generated.go',
    '/pkg/controller/podautoscaler/horizontal.go',
    '/pkg/controller/podautoscaler/horizontal_test.go',
    '/pkg/kubectl/autoscale.go',
    '/pkg/kubectl/cmd/autoscale.go',
    '/pkg/kubectl/describe.go',
    '/pkg/kubectl/resource_print

In [180]:
# Start from a commit: expect the PR found for it and the issue(s) found for that PR.
url = 'https://github.com/kubernetes/kubernetes/commit/fd1c8e096a0ff352453376d47408a111f0c36439'
link_github_events(url=url, event_type='commit', 
                   github_user='dipanjanS', github_auth=os.environ['GITHUB_TOKEN'])

({'issue_url': ['https://github.com/kubernetes/kubernetes/issues/16661'],
  'fixed_url': ['https://github.com/kubernetes/kubernetes/pull/16668'],
  'commit_url': ['https://github.com/kubernetes/kubernetes/commit/fd1c8e096a0ff352453376d47408a111f0c36439'],
  'files_changed': [['https://github.com/kubernetes/kubernetes/pull/16668.patch',
    '/pkg/apis/extensions/deep_copy_generated.go',
    '/pkg/apis/extensions/types.generated.go',
    '/pkg/apis/extensions/types.go',
    '/pkg/apis/extensions/v1beta1/conversion_generated.go',
    '/pkg/apis/extensions/v1beta1/deep_copy_generated.go',
    '/pkg/apis/extensions/v1beta1/types.generated.go',
    '/pkg/apis/extensions/v1beta1/types.go',
    '/pkg/apis/extensions/v1beta1/types_swagger_doc_generated.go',
    '/pkg/controller/podautoscaler/horizontal.go',
    '/pkg/controller/podautoscaler/horizontal_test.go',
    '/pkg/kubectl/autoscale.go',
    '/pkg/kubectl/cmd/autoscale.go',
    '/pkg/kubectl/describe.go',
    '/pkg/kubectl/resource_print

In [181]:
# Same pull-request flow against a different repository (openshift/origin).
url = 'https://github.com/openshift/origin/pull/8434'
link_github_events(url=url, event_type='pull request', 
                   github_user='dipanjanS', github_auth=os.environ['GITHUB_TOKEN'])

({'issue_url': ['https://github.com/openshift/origin/issues/9584'],
  'fixed_url': ['https://github.com/openshift/origin/pull/8434'],
  'commit_url': ['https://github.com/openshift/origin/commit/c060bb84c8d0f6304170b429f95614dacf18bc04'],
  'files_changed': [['https://github.com/openshift/origin/pull/8434.patch',
    '/pkg/auth/authenticator/password/basicauthpassword/basicauthpassword.go',
    '/test/integration/oauth_basicauth_test.go']]},
 1)

In [190]:
# Pull request for which the search returns no related issue ('issue_url' comes back empty).
url = 'https://github.com/kubernetes/kubernetes/pull/65751'
link_github_events(url=url, event_type='pull request', 
                   github_user='dipanjanS', github_auth=os.environ['GITHUB_TOKEN'])

({'issue_url': [],
  'fixed_url': ['https://github.com/kubernetes/kubernetes/pull/65751'],
  'commit_url': ['https://github.com/kubernetes/kubernetes/commit/27bc865cc1bffb97d4dff38492aa9f830f859e45'],
  'files_changed': [['https://github.com/kubernetes/kubernetes/pull/65751.patch',
    '/pkg/util/mount/mount_windows.go']]},
 1)

In [193]:
# Issue for which the search returns no PR (PR, commit and file lists come back empty).
url = 'https://github.com/portainer/portainer/issues/2382'
link_github_events(url=url, event_type='issue', 
                   github_user='dipanjanS', github_auth=os.environ['GITHUB_TOKEN'])

https://api.github.com/search/issues?q=is:pr issue:2382 repo:portainer/portainer


({'issue_url': ['https://github.com/portainer/portainer/issues/2382'],
  'fixed_url': [],
  'commit_url': [],
  'files_changed': []},
 1)

In [None]:
# TODOs

# 1. for each patch file, store the patch link and the corresponding files changed in a separate list


# 2. Each list from Point 1 should go into a master list, which becomes the value of the files_changed key,
# e.g., as depicted below
# files_changed: [
#                  [p1, f1, f2, f3],
#                 [p2, f5, f7, f9]
#                ]
# for this you just need to look at the logic for patch url and files changed 
# in the get_commits_changed_files(..) function


# 3. Change regex in the get_commits_changed_files(...) function in the changed_file_paths variable to include
#    only files with .go extension


# 4. The overall function should return the following fields as a dict
#  { 'issue_urls': [list of issue urls even if there is only 1],
#     'pr_urls': [list of PR urls even if there is only 1],
#     'commit_urls': [list of commit urls even if there is only 1],
#     'files_changed': [list of lists where each list has patch_url first 
#                          and then all .go files changed in that patch file]
#  } 


# 5. Test out the changes for existing test links and some new ones from the spreadsheet