In [1]:
!pip install requests



In [4]:
import re
import requests

url = "https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1"
response = requests.get(url)
response.json()

[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/6551',
  'repository_url': 'https://api.github.com/repos/huggingface/datasets',
  'labels_url': 'https://api.github.com/repos/huggingface/datasets/issues/6551/labels{/name}',
  'comments_url': 'https://api.github.com/repos/huggingface/datasets/issues/6551/comments',
  'events_url': 'https://api.github.com/repos/huggingface/datasets/issues/6551/events',
  'html_url': 'https://github.com/huggingface/datasets/pull/6551',
  'id': 2062768400,
  'node_id': 'PR_kwDODunzps5jEi1C',
  'number': 6551,
  'title': 'Fix parallel downloads for datasets without scripts',
  'user': {'login': 'lhoestq',
   'id': 42851186,
   'node_id': 'MDQ6VXNlcjQyODUxMTg2',
   'avatar_url': 'https://avatars.githubusercontent.com/u/42851186?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/lhoestq',
   'html_url': 'https://github.com/lhoestq',
   'followers_url': 'https://api.github.com/users/lhoestq/followers',
   'following_url': 'ht

In [5]:
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.0


In [16]:
from dotenv import dotenv_values

config = dotenv_values()
config.keys()

odict_keys(['GITHUB_TOKEN'])

In [17]:
GITHUB_TOKEN = config["GITHUB_TOKEN"]
headers = {"Authorization": f"token {GITHUB_TOKEN}"}

In [18]:
import time
import math
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm


def fetch_issues(
    owner="huggingface",
    repo="datasets",
    num_issues=10_000,
    rate_limit=5_000,
    issues_path=Path("output_dir/git_issues"),
):
    if not issues_path.is_dir():
        issues_path.mkdir(exist_ok=True)

    batch = []
    all_issues = []
    per_page = 100  # Number of issues to return per page
    num_pages = math.ceil(num_issues / per_page)
    base_url = "https://api.github.com/repos"

    for page in tqdm(range(num_pages)):
        # Query with state=all to get both open and closed issues
        query = f"issues?page={page}&per_page={per_page}&state=all"
        issues = requests.get(f"{base_url}/{owner}/{repo}/{query}", headers=headers)
        batch.extend(issues.json())

        if len(batch) > rate_limit and len(all_issues) < num_issues:
            all_issues.extend(batch)
            batch = []  # Flush batch for next time period
            print(f"Reached GitHub rate limit. Sleeping for one hour ...")
            time.sleep(60 * 60 + 1)

    all_issues.extend(batch)
    df = pd.DataFrame.from_records(all_issues)
    df.to_json(f"{issues_path}/{repo}-issues.jsonl", orient="records", lines=True)
    print(
        f"Downloaded all the issues for {repo}! Dataset stored at {issues_path}/{repo}-issues.jsonl"
    )

In [19]:
# Depending on your internet connection, this can take several minutes to run...
fetch_issues()

  0%|          | 0/100 [00:00<?, ?it/s]

Reached GitHub rate limit. Sleeping for one hour ...
