In [1]:
import requests
from dotenv import load_dotenv
from datasets import load_dataset
import os

load_dotenv()
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
headers = {"Authorization": f"Bearer {GITHUB_TOKEN}"}

url = "https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1"
response = requests.get(url, headers=headers)

In [2]:
response.status_code

200

In [3]:
print(response.json())

[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/6827', 'repository_url': 'https://api.github.com/repos/huggingface/datasets', 'labels_url': 'https://api.github.com/repos/huggingface/datasets/issues/6827/labels{/name}', 'comments_url': 'https://api.github.com/repos/huggingface/datasets/issues/6827/comments', 'events_url': 'https://api.github.com/repos/huggingface/datasets/issues/6827/events', 'html_url': 'https://github.com/huggingface/datasets/issues/6827', 'id': 2254011833, 'node_id': 'I_kwDODunzps6GWX25', 'number': 6827, 'title': 'Loading a remote dataset fails in the last release (v2.19.0)', 'user': {'login': 'zrthxn', 'id': 35369637, 'node_id': 'MDQ6VXNlcjM1MzY5NjM3', 'avatar_url': 'https://avatars.githubusercontent.com/u/35369637?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/zrthxn', 'html_url': 'https://github.com/zrthxn', 'followers_url': 'https://api.github.com/users/zrthxn/followers', 'following_url': 'https://api.github.com/users/zrthxn/foll

In [4]:
import time
import math
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm


def fetch_issues(
    owner="huggingface",
    repo="datasets",
    num_issues=4_000,
    rate_limit=5_000,
    issues_path=Path("."),
):
    if not issues_path.is_dir():
        issues_path.mkdir(exist_ok=True)

    batch = []
    all_issues = []
    per_page = 100  # Number of issues to return per page
    num_pages = math.ceil(num_issues / per_page)
    base_url = "https://api.github.com/repos"

    for page in tqdm(range(num_pages)):
        # Query with state=all to get both open and closed issues
        query = f"issues?page={page}&per_page={per_page}&state=all"
        issues = requests.get(f"{base_url}/{owner}/{repo}/{query}", headers=headers)
        batch.extend(issues.json())

        if len(batch) > rate_limit and len(all_issues) < num_issues:
            all_issues.extend(batch)
            batch = []  # Flush batch for next time period
            print(f"Reached GitHub rate limit. Sleeping for one hour ...")
            

    all_issues.extend(batch)
    df = pd.DataFrame.from_records(all_issues)
    df.to_json(f"{issues_path}/{repo}-issues.jsonl", orient="records", lines=True)
    print(
        f"Downloaded all the issues for {repo}! Dataset stored at {issues_path}/{repo}-issues.jsonl"
    )
    return df

In [5]:
df = fetch_issues()

  0%|          | 0/40 [00:00<?, ?it/s]

Downloaded all the issues for datasets! Dataset stored at ./datasets-issues.jsonl


In [6]:
df.head()

Unnamed: 0,url,repository_url,labels_url,comments_url,events_url,html_url,id,node_id,number,title,...,closed_at,author_association,active_lock_reason,body,reactions,timeline_url,performed_via_github_app,state_reason,draft,pull_request
0,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/issues...,2254011833,I_kwDODunzps6GWX25,6827,Loading a remote dataset fails in the last rel...,...,,NONE,,While loading a dataset with multiple splits I...,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,,
1,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/pull/6826,2252445242,PR_kwDODunzps5tJMZh,6826,Set dev version,...,2024-04-19T08:52:14Z,MEMBER,,,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,False,{'url': 'https://api.github.com/repos/huggingf...
2,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/pull/6825,2252404599,PR_kwDODunzps5tJEMw,6825,Release: 2.19.0,...,2024-04-19T08:44:57Z,MEMBER,,,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,False,{'url': 'https://api.github.com/repos/huggingf...
3,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/issues...,2251076197,I_kwDODunzps6GLLJl,6824,Winogrande does not seem to be compatible with...,...,2024-04-19T09:52:33Z,NONE,,### Describe the bug\n\nI get the following er...,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,completed,,
4,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/issues...,2250775569,I_kwDODunzps6GKBwR,6823,Loading problems of Datasets with a single shard,...,,NONE,,### Describe the bug\r\n\r\nWhen saving a data...,{'url': 'https://api.github.com/repos/huggingf...,https://api.github.com/repos/huggingface/datas...,,,,


In [7]:
df.columns

Index(['url', 'repository_url', 'labels_url', 'comments_url', 'events_url',
       'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels',
       'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments',
       'created_at', 'updated_at', 'closed_at', 'author_association',
       'active_lock_reason', 'body', 'reactions', 'timeline_url',
       'performed_via_github_app', 'state_reason', 'draft', 'pull_request'],
      dtype='object')

In [12]:
df = df.dropna(axis="columns")

In [15]:
repo="datasets",
df.to_json(f"./{repo}-issues.jsonl", orient="records", lines=True)

In [16]:
issues_dataset = load_dataset("json", data_files="datasets-issues.jsonl", split="train")
issues_dataset

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignees', 'comments', 'created_at', 'updated_at', 'author_association', 'reactions', 'timeline_url'],
    num_rows: 4000
})

In [19]:
sample = issues_dataset.shuffle(seed=666).select(range(3))

# Print out the URL and pull request entries
for url  in zip(sample["html_url"]):
    print(f">> URL: {url}")

>> URL: ('https://github.com/huggingface/datasets/pull/5860',)
>> URL: ('https://github.com/huggingface/datasets/issues/4728',)
>> URL: ('https://github.com/huggingface/datasets/pull/4406',)
