In [None]:
import base64
import json
from urllib.error import HTTPError
import pip._vendor.requests as requests
import os
import pprint as pp
import time


def get_data_request(url):
    """Send a GET request to ``url`` and return the decoded JSON body.

    When the ``GITHUB_KEY`` environment variable holds a non-empty value it is
    sent as a ``token`` Authorization header; otherwise the request is made
    without an Authorization header.

    Args:
        url: The URL to request.

    Returns:
        The response body as parsed by ``Response.json()``.

    Raises:
        requests.HTTPError: If ``raise_for_status()`` rejects the response.
    """
    # .get() keeps the token optional; indexing os.environ would raise
    # KeyError before the truthiness check below could ever run.
    github_key = os.environ.get("GITHUB_KEY")
    headers = {}

    if github_key:
        headers["Authorization"] = f"token {github_key}"

    # A timeout stops an unresponsive server from blocking the script forever.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    return response.json()


def get_next_page_link_request(url):
    """Send a GET request to ``url`` and return its ``link`` response header.

    When the ``GITHUB_KEY`` environment variable holds a non-empty value it is
    sent as a ``token`` Authorization header; otherwise the request is made
    without an Authorization header. The response status is not checked.

    Args:
        url: The URL to request.

    Returns:
        The value of the ``link`` response header, or ``None`` when the
        response carries no such header.
    """
    # .get() keeps the token optional; indexing os.environ would raise
    # KeyError before the truthiness check below could ever run.
    github_key = os.environ.get("GITHUB_KEY")
    headers = {}

    if github_key:
        headers["Authorization"] = f"token {github_key}"

    # A timeout stops an unresponsive server from blocking the script forever.
    response = requests.get(url, headers=headers, timeout=30)

    return response.headers.get("link")

def print_rate_limits(url):
    """Send a GET request to ``url`` and print its ``X-RateLimit-*`` headers.

    When the ``GITHUB_KEY`` environment variable holds a non-empty value it is
    sent as a ``token`` Authorization header; otherwise the request is made
    without an Authorization header.

    Args:
        url: The URL to request. Note that this issues a real request.

    Raises:
        KeyError: If the response lacks one of the four rate-limit headers.
    """
    # .get() keeps the token optional; indexing os.environ would raise
    # KeyError before the truthiness check below could ever run.
    github_key = os.environ.get("GITHUB_KEY")
    headers = {}

    if github_key:
        headers["Authorization"] = f"token {github_key}"

    # A timeout stops an unresponsive server from blocking the script forever.
    response = requests.get(url, headers=headers, timeout=30)

    # (response header, label printed before its value), in print order.
    reported_headers = (
        ("X-RateLimit-Limit", "Requests allowed per minute"),
        ("X-RateLimit-Reset", "When rate limit will reset (unix)"),
        ("X-RateLimit-Remaining", "Requests remaining for current minute"),
        ("X-RateLimit-Used", "Request used in current minute"),
    )
    for header_name, label in reported_headers:
        print(f"{label}: {response.headers[header_name]}")


def write_dependency_file(url, absolute_path):
    """Fetch a file description from ``url`` and write its content to disk.

    The JSON returned for ``url`` must contain a ``content`` entry. When its
    ``encoding`` entry equals ``"base64"`` the content is base64-decoded and
    interpreted as UTF-8 text; otherwise it is written unchanged.

    Args:
        url: URL passed to ``get_data_request``.
        absolute_path: Path of the file to create or overwrite.

    Raises:
        KeyError: If the fetched JSON has no ``content`` entry.
    """
    data = get_data_request(url)

    file_content = data["content"]
    if data.get("encoding") == "base64":
        file_content = base64.b64decode(file_content).decode("utf-8")

    # The context manager closes the file even if write() raises. The explicit
    # encoding matches the UTF-8 decode above instead of depending on the
    # platform's locale encoding.
    with open(absolute_path, "w", encoding="utf-8") as dependency_file:
        dependency_file.write(file_content)


def create_dependency_subdirectory(dependency_path, repository_name):
    """Create the local directory that mirrors a dependency file's folder.

    The directory part of ``dependency_path`` (everything before the last
    ``/``) is recreated beneath ``repos/<repository_name>/``. The path is
    relative to the current working directory.

    Args:
        dependency_path: ``/``-separated path of the dependency file inside
            the repository, e.g. ``"src/app/package.json"``.
        repository_name: Name of the repository the file belongs to.

    Returns:
        The directory path as a string, e.g. ``"repos/demo/src/app"``. For a
        path with no ``/`` the result is ``"repos/<repository_name>/"``.
    """
    # Drop the file name, keeping only the folder part ("" when there is none).
    relative_subdir_path = dependency_path.rpartition("/")[0]

    absolute_subdir_path = f"repos/{repository_name}/{relative_subdir_path}"

    # exist_ok=True makes this safe to call repeatedly and removes the race
    # between checking for the directory and creating it.
    os.makedirs(absolute_subdir_path, exist_ok=True)

    return absolute_subdir_path


def search_dependency_files(username, repository_name, dependency_file):
    """Find every ``dependency_file`` in one repository and save each locally.

    A single code-search request lists the matching files. A match whose path
    equals ``dependency_file`` (i.e. it sits in the repository root) is written
    to ``repos/<repository_name>/``; any other match is written into a
    subdirectory that mirrors its folder inside the repository.

    Args:
        username: Owner of the repository.
        repository_name: Repository to search.
        dependency_file: File name to search for, e.g. ``"package.json"``.
    """
    print(f'Searching {repository_name} for {dependency_file}...')

    # One search covers the root and every subdirectory of the repository.
    search_url = (
        "https://api.github.com/search/code"
        f"?q=filename:{dependency_file}+org:{username}"
        f"+repo:{username}/{repository_name}&page=1&per_page=100"
    )

    print_rate_limits(search_url)

    search_results = get_data_request(search_url)

    for match in search_results["items"]:
        match_path = match["path"]
        match_name = match["name"]

        if match_path != dependency_file:
            # Nested file: recreate its folder structure, then save it there.
            target_dir = create_dependency_subdirectory(match_path, repository_name)
            write_dependency_file(match["url"], f"{target_dir}/{match_name}")
            continue

        # File in the repository root: no subdirectories need to be created.
        contents_url = f"https://api.github.com/repos/{username}/{repository_name}/contents/{dependency_file}"
        write_dependency_file(contents_url, f"repos/{repository_name}/{match_path}")


def get_repo_names(username):
    """Collect the names of all repositories listed for ``username``.

    Pages of up to 100 repositories are requested one after another. After
    each page the ``link`` header of the same URL is fetched; paging continues
    while that header contains ``rel="next"``.

    Args:
        username: GitHub username whose repositories are listed.

    Returns:
        A list with the ``name`` of every repository, in the order received.
    """
    all_repo_names = []
    page_number = 1
    per_page = 100
    next_marker = 'rel="next"'

    while True:
        current_url = f"https://api.github.com/users/{username}/repos?page={page_number}&per_page={per_page}"
        print("")
        # The loop counter always trails the 1-based page number by one.
        print(f"while_count - {page_number - 1}")

        page_of_repos = get_data_request(current_url)

        # Pull the name out of every repository object on this page.
        all_repo_names.extend(repo["name"] for repo in page_of_repos)
        print(all_repo_names)

        link = get_next_page_link_request(current_url)

        if not link or next_marker not in link:
            print("Can't find next pattern, must be last page")
            break

        print("Found next pattern in link, continue to next page")
        page_number += 1
        print(current_url)

    pp.pprint(all_repo_names)

    return all_repo_names

def create_project_directories(all_repo_names):
    """Create one directory per repository under ``repos``.

    The parent ``repos`` directory is created when it does not exist yet, and
    directories that already exist are left untouched. Paths are relative to
    the current working directory.

    Args:
        all_repo_names: Iterable of repository names.
    """
    for repo_name in all_repo_names:
        # makedirs also creates the missing "repos" parent on a first run
        # (os.mkdir would raise FileNotFoundError), and exist_ok=True makes
        # repeated runs harmless.
        os.makedirs(os.path.join("repos", repo_name), exist_ok=True)

def extract_repo_dependencies(all_repo_names, dependency_files, username, time_sleep):
    """Search every repository for every dependency file name.

    Before each repository is processed the function sleeps for
    ``time_sleep`` seconds and prints a blank line.

    Args:
        all_repo_names: Names of the repositories to search.
        dependency_files: File names to look for in each repository.
        username: Owner of the repositories.
        time_sleep: Seconds to sleep before each repository is searched.
    """
    for repo_name in all_repo_names:
        time.sleep(time_sleep)
        print("")
        for dependency_file in dependency_files:
            search_dependency_files(username, repo_name, dependency_file)


def main():
    """Save the dependency files of every repository listed for USERNAME.

    Fetches the repository names, creates one directory per repository under
    ``repos``, then searches each repository for the files named in
    DEPENDENCY_FILES.
    """
    USERNAME = "Harrisman05"  # GitHub username whose repositories are scanned
    DEPENDENCY_FILES = ["package.json", "requirements.txt"]  # file names searched for in each repository
    TIME_SLEEP = 15  # seconds slept before each repository is searched

    ALL_REPO_NAMES = get_repo_names(USERNAME)
    create_project_directories(ALL_REPO_NAMES)
    extract_repo_dependencies(ALL_REPO_NAMES, DEPENDENCY_FILES, USERNAME, TIME_SLEEP)

# Run the scan only when this file is executed as a script.
if __name__ == "__main__":
    main()


In [None]:
# Github Repo Dependency Checker

    # Inputs needed from user are:

    # USERNAME: A string containing a github username

    # DEPENDENCY_FILES: A list of strings that are names of the dependency files that need to be scanned within each repository. 'package.json' has been tested (and will also pull relevant package-lock.json) as well as requirements.txt

    # TIME_SLEEP: An integer number of seconds to pause before each repository is searched, which throttles the requests made while pulling data from the repositories. The more DEPENDENCY_FILES and repos there are, the higher TIME_SLEEP needs to be to avoid rate limiting. Start with 5 and, if an HTTP Forbidden error is raised, increase TIME_SLEEP. 30 requests per minute are allowed

    # ALL_REPO_NAMES: A list of strings containing the names of all the visible repos in a user's GitHub account

        # How the code works

            # 1) Obtain all repo names from a user. This code is wrapped in a while loop as the Github API can only load 100 repos per get request at a maximum, so the loop allows for automatic navigation to the next page until no pages remain. For example, a user with 720 repos will have 8 pages worth of repos

            # 2) Create repo directories within 'repos' for each repo name. For example, a user with 82 repos will have 82 subdirectories in repos

            # 3) Retrieve the locations of all dependency files (in the root and in nested subdirectories) of a repo with a single request

            # 4) Loop through all the dependency file locations:

                # If the dependency files are in the project root, then make a get request to extract the contents of the dependency file and then write to file at repos/{repo_name}

                # If the dependency files are nested in subdirectories:

                    # Create the dependency path and then the subdirectories. This will maintain the structure of nested dependency files

                    # Then make the request to extract contents and then write the dependency to file inside the relevant subdirectory. Again, this will maintain project structure

# Note, if a project doesn't have any dependencies, an empty subdirectory with the project name will just be created.