In [1]:
import io
import zipfile
import requests
import frontmatter

In [2]:
import io
import zipfile
from typing import List, Dict, Optional
import requests
import frontmatter


class GitHubRepoError(Exception):
    """Signals that a repository-related operation failed."""


def read_repo_data(
    repo_owner: str,
    repo_name: str,
    branch: str = 'main',
    timeout: float = 30,
) -> List[Dict]:
    """
    Download and parse all markdown files from a GitHub repository.

    The branch is fetched as a single ZIP archive and every ``.md`` / ``.mdx``
    entry (case-insensitive) is parsed with ``frontmatter``. Files that fail
    to parse are reported on stdout and skipped, so one bad file does not
    abort the whole run.

    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
        branch: Branch name (default: 'main')
        timeout: Request timeout in seconds, passed to ``requests.get``
            (default: 30)

    Returns:
        List of dictionaries, one per markdown file, as produced by
        ``frontmatter.Post.to_dict()`` (front-matter keys plus 'content').

    Raises:
        GitHubRepoError: If the download times out or otherwise fails, or if
            the downloaded payload is not a valid ZIP archive. The underlying
            exception is attached as ``__cause__``.
    """

    # URL of the branch's ZIP archive on codeload.github.com
    prefix = 'https://codeload.github.com'
    url = f'{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/{branch}'

    try:
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()
    except requests.exceptions.Timeout as e:
        # Must come before RequestException so timeouts keep their own message.
        raise GitHubRepoError(f"Request timed out after {timeout} seconds") from e
    except requests.exceptions.RequestException as e:
        raise GitHubRepoError(f"Failed to download repository: {e}") from e

    # one parsed dict per markdown file found in the archive
    repository_data = []

    try:
        # `with` guarantees the archive is closed even if the loop is
        # interrupted by an exception that is not handled below.
        with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
            for file_info in zf.infolist():
                filename = file_info.filename

                if not filename.lower().endswith(('.md', '.mdx')):
                    continue

                try:
                    with zf.open(file_info) as f_in:
                        # undecodable bytes are dropped rather than failing the file
                        content = f_in.read().decode('utf-8', errors='ignore')
                    post = frontmatter.loads(content)
                    repository_data.append(post.to_dict())
                except Exception as e:
                    # best effort: report the file and keep going
                    print(f"Error processing {filename}: {e}")
                    continue
    except zipfile.BadZipFile as e:
        raise GitHubRepoError("Downloaded file is not a valid ZIP archive") from e

    print(f"\nSuccessfully processed {len(repository_data)} markdown file(s)")

    return repository_data

In [3]:
# fetch and parse the markdown files of fastapi/fastapi from its 'master' branch
fastapi_docs = read_repo_data(repo_owner='fastapi', repo_name='fastapi', branch='master')


Successfully processed 1060 markdown file(s)


In [4]:
# print the second parsed doc (index 1; index 0 would be the first)
print(fastapi_docs[1])

{'content': '<p align="center">\n  <a href="https://fastapi.tiangolo.com"><img src="https://fastapi.tiangolo.com/img/logo-margin/logo-teal.png" alt="FastAPI"></a>\n</p>\n<p align="center">\n    <em>FastAPI framework, high performance, easy to learn, fast to code, ready for production</em>\n</p>\n<p align="center">\n<a href="https://github.com/fastapi/fastapi/actions?query=workflow%3ATest+event%3Apush+branch%3Amaster" target="_blank">\n    <img src="https://github.com/fastapi/fastapi/actions/workflows/test.yml/badge.svg?event=push&branch=master" alt="Test">\n</a>\n<a href="https://coverage-badge.samuelcolvin.workers.dev/redirect/fastapi/fastapi" target="_blank">\n    <img src="https://coverage-badge.samuelcolvin.workers.dev/fastapi/fastapi.svg" alt="Coverage">\n</a>\n<a href="https://pypi.org/project/fastapi" target="_blank">\n    <img src="https://img.shields.io/pypi/v/fastapi?color=%2334D058&label=pypi%20package" alt="Package version">\n</a>\n<a href="https://pypi.org/project/fastapi"

In [5]:
# fetch and parse the markdown files of evidentlyai/docs (default 'main' branch)
evidently_docs = read_repo_data(repo_owner='evidentlyai', repo_name='docs')


Successfully processed 95 markdown file(s)


In [6]:
# print the first parsed document (index 0) of evidently_docs
print(evidently_docs[0])

{'title': 'Create Plant', 'openapi': 'POST /plants', 'content': ''}
