In [None]:
!pip install PyGithub

Collecting PyGithub
  Downloading PyGithub-2.6.1-py3-none-any.whl.metadata (3.9 kB)
Collecting pynacl>=1.4.0 (from PyGithub)
  Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (8.6 kB)
Downloading PyGithub-2.6.1-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (856 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m856.7/856.7 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynacl, PyGithub
Successfully installed PyGithub-2.6.1 pynacl-1.5.0


In [None]:
import concurrent.futures
import logging
import os

import pandas as pd
from github import Github

# Initialize logging
logging.basicConfig(level=logging.INFO)

# Initialize GitHub API client.
# The token is read from the GITHUB_TOKEN environment variable so that it is
# never saved inside the notebook. When the variable is unset or empty the
# client is created without credentials.
g = Github(os.environ.get("GITHUB_TOKEN") or None)

def extract_content(repo, path=""):
    """
    Recursively extracts the content of all files in a repository.

    :param repo: The GitHub repository object.
    :param path: The path within the repository to start extracting from.
        May name a directory or a single file.
    :return: A list of tuples containing file paths and their contents.
    """
    contents = repo.get_contents(path)
    # get_contents() gives a list for a directory but a single object when
    # `path` names a file; normalise so both shapes are walked the same way.
    if not isinstance(contents, (list, tuple)):
        contents = [contents]

    extracted_content = []

    for content in contents:
        if content.type == "file":
            try:
                # errors='ignore' drops bytes that are not valid UTF-8 instead
                # of failing on them.
                decoded_content = content.decoded_content.decode('utf-8', errors='ignore')
            except Exception as e:
                # Best effort: a file whose content cannot be obtained is
                # logged and left out rather than aborting the whole walk.
                logging.warning(f"Skipping file {content.path}: {e}")
            else:
                extracted_content.append((content.path, decoded_content))
        elif content.type == "dir":
            extracted_content.extend(extract_content(repo, content.path))
        # Entries of any other type are ignored.

    return extracted_content

def count_lines_and_files(extracted_content):
    """
    Tallies how many files were extracted and how many lines they hold in total.

    :param extracted_content: A list of (file path, file content) tuples.
    :return: A (number of files, number of lines) tuple.
    """
    file_total = len(extracted_content)

    # Accumulate the per-file line counts; the path half of each pair is unused.
    line_total = 0
    for _path, text in extracted_content:
        line_total += len(text.splitlines())

    return file_total, line_total

def get_repo_data(repo_name):
    """
    Collects summary statistics for a single GitHub repository.

    :param repo_name: The repository name passed to the GitHub client,
        e.g. "nodejs/node".
    :return: A dictionary of repository metrics, or None if the repository
        could not be fetched.
    """
    try:
        repo = g.get_repo(repo_name)
    except Exception as e:
        logging.error(f"Failed to fetch repository {repo_name}: {e}")
        return None

    # Extract content
    extracted_content = extract_content(repo)

    # Count lines and files
    total_files, total_loc = count_lines_and_files(extracted_content)

    # Ask for per-state totals instead of iterating over every issue: paging
    # through the complete issue list of a large repository just to count it
    # costs many requests.
    # NOTE(review): GitHub's issues endpoint is documented as also returning
    # pull requests, so these counts likely include PRs - confirm that this is
    # the intended definition of "issue".
    open_issues = repo.get_issues(state='open').totalCount
    closed_issues = repo.get_issues(state='closed').totalCount
    total_issues = open_issues + closed_issues
    # Guard against division by zero for repositories without any issues.
    issue_resolution_ratio = (closed_issues / total_issues) * 100 if total_issues > 0 else 0

    # Fetch other repository data
    total_pull_requests = repo.get_pulls(state='all').totalCount
    stars = repo.stargazers_count
    forks = repo.forks_count
    contributors = repo.get_contributors().totalCount

    repo_data = {
        "Repository Name": repo_name,
        "Stars": stars,
        "Forks": forks,
        "Total Issues": total_issues,
        "Total Pull Requests": total_pull_requests,
        "Total Files": total_files,
        "Lines of Code": total_loc,
        "Contributors": contributors,
        "Open Issues": open_issues,
        "Closed Issues": closed_issues,
        "Issue Resolution Ratio (%)": issue_resolution_ratio
    }

    return repo_data




In [None]:

repo_names = ['nodejs/node']
data = []

# Fan the repositories out over a small thread pool and gather the results
# in whatever order the workers finish.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
    # Map each submitted job back to the repository it was started for.
    pending = {
        pool.submit(get_repo_data, name): name
        for name in repo_names
    }

    for finished in concurrent.futures.as_completed(pending):
        repo_name = pending[finished]
        try:
            repo_data = finished.result()
        except Exception as e:
            logging.error(f"Error fetching {repo_name}: {e}")
        else:
            # Falsy results (get_repo_data returns None on a failed fetch)
            # are left out of the collected data.
            if repo_data:
                data.append(repo_data)

print(data)

Request GET /repos/nodejs/node/contents/deps/nghttp2/lib/includes/config.h?ref=main failed with 403: Forbidden
INFO:github.GithubRetry:Request GET /repos/nodejs/node/contents/deps/nghttp2/lib/includes/config.h?ref=main failed with 403: Forbidden
Setting next backoff to 586.770213s
INFO:github.GithubRetry:Setting next backoff to 586.770213s
Request GET /repos/nodejs/node/contents/deps/openssl/config/archs/darwin-i386-cc/no-asm/providers/common/include/prov failed with 403: Forbidden
INFO:github.GithubRetry:Request GET /repos/nodejs/node/contents/deps/openssl/config/archs/darwin-i386-cc/no-asm/providers/common/include/prov failed with 403: Forbidden
Setting next backoff to 1301.758133s
INFO:github.GithubRetry:Setting next backoff to 1301.758133s
Request GET /repos/nodejs/node/contents/deps/openssl/openssl/doc/man3/CRYPTO_memcmp.pod?ref=main failed with 403: Forbidden
INFO:github.GithubRetry:Request GET /repos/nodejs/node/contents/deps/openssl/openssl/doc/man3/CRYPTO_memcmp.pod?ref=main f

In [None]:
# Define the CSV file name
csv_file = '/content/drive/MyDrive/ECS 260 Term Project/repositories_info.csv'

if data:
    # Build the table from the records collected in `data`, one row per
    # repository, so this cell does not depend on a leftover `df` variable.
    df = pd.DataFrame(data)

    # The header is written only when the file is missing or still empty;
    # otherwise the rows are appended below the existing ones.
    write_header = not os.path.isfile(csv_file) or os.path.getsize(csv_file) == 0
    df.to_csv(csv_file, mode='a', header=write_header, index=False)
else:
    # Nothing was fetched: leave the file untouched instead of writing an
    # empty table.
    logging.warning("No repository data collected; skipping CSV export")