In [1]:
import io
import zipfile
import requests
import frontmatter

def read_repo_data(repo_owner, repo_name, branch='main', timeout=60):
    """
    Download and parse all markdown files from a GitHub repository.

    The repository is fetched as one zip archive from codeload.github.com
    and read entirely in memory; nothing is written to disk.

    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
        branch: Branch whose archive is downloaded (defaults to 'main')
        timeout: Seconds passed to requests.get as its timeout

    Returns:
        List of dictionaries, one per .md/.mdx file (extension matched
        case-insensitively). Each dictionary is the result of
        frontmatter's Post.to_dict() plus a 'filename' key holding the
        file's path inside the archive.

    Raises:
        Exception: If the download does not return HTTP status 200.
    """
    prefix = 'https://codeload.github.com'
    url = f'{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/{branch}'
    # Without a timeout a stalled connection would block the notebook forever
    resp = requests.get(url, timeout=timeout)

    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    repository_data = []

    # The context manager closes the archive even if an error propagates
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        for file_info in zf.infolist():
            filename = file_info.filename

            if not filename.lower().endswith(('.md', '.mdx')):
                continue

            # Best effort: one unreadable file must not abort the whole repo
            try:
                with zf.open(file_info) as f_in:
                    content = f_in.read().decode('utf-8', errors='ignore')
                post = frontmatter.loads(content)
                data = post.to_dict()
                # Overwrites any 'filename' key coming from the front matter
                data['filename'] = filename
                repository_data.append(data)
            except Exception as e:
                print(f"Error processing {filename}: {e}")
                continue

    return repository_data


In [2]:
# Smoke-test the loader on two small repositories
houseprices = read_repo_data('lanre4PF', "House-Price-Prediction-Ames-Housing-Dataset-")
alien_invasion = read_repo_data('lanre4PF', "Alien-invasion-clone-")

# Number of markdown documents parsed from each repository
print(len(houseprices))
print(len(alien_invasion))

1
1


In [9]:
# Inspect the parsed structure: a list of dicts with the markdown body under 'content'
# NOTE(review): this prints the whole document text; consider showing only a slice
print(houseprices)


[{'content': '# 🏠 House Price Prediction (Kaggle - Ames Housing Dataset)\n\nA machine learning project for predicting house prices using the **Ames Housing Dataset**, as featured in the Kaggle competition *House Prices: Advanced Regression Techniques*. This project applies regression models to estimate house values based on property features such as location, size, quality, and condition.\n\n---\n\n## 📌 Project Overview\nThis repository demonstrates the application of machine learning algorithms to structured/tabular data prediction.  \n- **Dataset**: [Kaggle – House Prices: Advanced Regression Techniques](https://www.kaggle.com/c/house-prices-advanced-regression-techniques) (2,930 properties, 80+ features).  \n- **Frameworks**: **Scikit-learn, Pandas, NumPy, Matplotlib/Seaborn, XGBoost**  \n- **Task**: Supervised learning (regression).  \n\nThe goal is to preprocess the dataset, build predictive models, and evaluate their performance in estimating house prices.\n\n---\n\n## 🧠 Features

In [4]:
# Same inspection for the second repository; each dict also carries its archive path under 'filename'
print(alien_invasion)

[{'content': "# 👾 Alien Invasion Clone  \n\nA fun **2D arcade-style game** built with Python’s `pygame` library. Inspired by the classic *Space Invaders*, this project is a clone of the **Alien Invasion** game from Eric Matthes' *Python Crash Course*.  \n\n---\n\n## 🚀 Features  \n- Player-controlled spaceship with smooth movement.  \n- Laser bullets to shoot down alien fleets.  \n- Increasing difficulty with each wave.  \n- Game-over and restart mechanics.  \n- Simple yet engaging retro-style gameplay.  \n\n---\n\n## 🛠️ Tech Stack  \n- **Python 3.x**  \n- **Pygame**  \n\n---\n\n## 📦 Installation  \n\nClone the repository and install dependencies:  \n\n```bash\n# Clone the repo\ngit clone https://github.com/your-username/alien-invasion-clone.git\ncd alien-invasion-clone\n\n# (Optional) Create virtual environment\npython -m venv venv\nsource venv/bin/activate   # On Mac/Linux\nvenv\\Scripts\\activate      # On Windows", 'filename': 'Alien-invasion-clone--main/README.md'}]


# Simple Chunking 

In [12]:
# Load a larger documentation-heavy repository to have real material for chunking
ultralytics = read_repo_data('ultralytics', 'ultralytics')
print(f"ultralytics: {len(ultralytics)}")

ultralytics: 409


In [13]:
# List the archive path of every parsed markdown file to see what was collected
for data in ultralytics: 
    print (data["filename"])

ultralytics-main/CONTRIBUTING.md
ultralytics-main/README.md
ultralytics-main/README.zh-CN.md
ultralytics-main/docs/README.md
ultralytics-main/docs/coming_soon_template.md
ultralytics-main/docs/en/datasets/classify/caltech101.md
ultralytics-main/docs/en/datasets/classify/caltech256.md
ultralytics-main/docs/en/datasets/classify/cifar10.md
ultralytics-main/docs/en/datasets/classify/cifar100.md
ultralytics-main/docs/en/datasets/classify/fashion-mnist.md
ultralytics-main/docs/en/datasets/classify/imagenet.md
ultralytics-main/docs/en/datasets/classify/imagenet10.md
ultralytics-main/docs/en/datasets/classify/imagenette.md
ultralytics-main/docs/en/datasets/classify/imagewoof.md
ultralytics-main/docs/en/datasets/classify/index.md
ultralytics-main/docs/en/datasets/classify/mnist.md
ultralytics-main/docs/en/datasets/detect/african-wildlife.md
ultralytics-main/docs/en/datasets/detect/argoverse.md
ultralytics-main/docs/en/datasets/detect/brain-tumor.md
ultralytics-main/docs/en/datasets/detect/coco.

In [14]:
def sliding_window(seq, size, step):
    """
    Cut a sequence into windows of at most `size` items, advancing by `step`.

    Args:
        seq: Any sliceable sequence with a length (string, list, ...)
        size: Maximum number of items per window; must be positive
        step: Distance between consecutive window starts; must be positive

    Returns:
        List of dicts with 'start' (offset of the window in `seq`) and
        'chunk' (the slice itself). An empty sequence yields an empty list.

    Raises:
        ValueError: If `size` or `step` is not positive.
    """
    if not (size > 0 and step > 0):
        raise ValueError("size and step must be positive")

    total = len(seq)
    windows = []
    offset = 0

    while offset < total:
        end = offset + size
        windows.append({'start': offset, 'chunk': seq[offset:end]})
        # Stop once a window has reached the end of the sequence
        if end >= total:
            break
        offset += step

    return windows

In [19]:
# Apply fixed-size sliding windows to the content of every document
ultralytics_chunks =[]

for doc in ultralytics:
    # Work on a copy so popping 'content' leaves the original document dict intact
    doc_copy = doc.copy()
    doc_content = doc_copy.pop("content")
    # Windows of 2000 items advancing by 1000, so consecutive chunks overlap by half
    chunks = sliding_window(doc_content, 2000, 1000 )
    for chunk in chunks: 
        # Attach the remaining document fields (e.g. 'filename') to each chunk;
        # a document field named 'start' or 'chunk' would overwrite the window's own value
        chunk.update(doc_copy)
    ultralytics_chunks.extend(chunks)

In [22]:
# Total number of sliding-window chunks across all documents
print(len(ultralytics_chunks))

3382


We obtained 3,382 chunks from 409 documents.

# Paragraph chunking

In [23]:
import re
# Use a single document as a worked example (index 45 in this run; the index
# depends on the repository contents at download time)
text = ultralytics[45]['content']
# A paragraph boundary is a newline, optional whitespace, then another newline
paragraphs = re.split(r"\n\s*\n", text.strip())

In [24]:
# Show the resulting paragraphs; headings and code fences each become their own entry
paragraphs 

['# Oriented Bounding Box (OBB) Datasets Overview',
 'Training a precise [object detection](https://www.ultralytics.com/glossary/object-detection) model with oriented bounding boxes (OBB) requires a thorough dataset. This guide explains the various OBB dataset formats compatible with Ultralytics YOLO models, offering insights into their structure, application, and methods for format conversions.',
 '## Supported OBB Dataset Formats',
 '### YOLO OBB Format',
 'The YOLO OBB format designates bounding boxes by their four corner points with coordinates normalized between 0 and 1. It follows this format:',
 '```bash\nclass_index x1 y1 x2 y2 x3 y3 x4 y4\n```',
 "Internally, YOLO processes losses and outputs in the `xywhr` format, which represents the [bounding box](https://www.ultralytics.com/glossary/bounding-box)'s center point (xy), width, height, and rotation.",
 '<p align="center"><img width="800" src="https://github.com/ultralytics/docs/releases/download/0/obb-format-examples.avif" alt

# Chunking by sections and headers

In [25]:
import re

def split_markdown_by_level(text, level=2):
    """
    Break markdown text into sections at headers of one specific level.

    Each section starts with its header line. Text that appears before
    the first matching header is not included in any section, so a
    document without headers of that level yields an empty list.

    :param text: Markdown text as a string
    :param level: Header level to split on (2 means lines starting with "## ")
    :return: List of sections as strings
    """
    # Two capture groups: the "#… " marker and the header title
    header_re = re.compile(r'^(#{' + str(level) + r'} )(.+)$', re.MULTILINE)

    # With two groups, split() returns
    # [preamble, marker, title, body, marker, title, body, ...]
    pieces = header_re.split(text)
    markers = pieces[1::3]
    titles = pieces[2::3]
    bodies = pieces[3::3]

    sections = []
    for marker, title, body in zip(markers, titles, bodies):
        heading = (marker + title).strip()
        body = body.strip()
        # A header with nothing under it stands alone
        sections.append(f'{heading}\n\n{body}' if body else heading)

    return sections


In [27]:
# Build one record per level-2 section of every document
ultralytics_chunks2 = []

for doc in ultralytics:
    # Work on a copy so popping 'content' leaves the original document dict intact
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    sections = split_markdown_by_level(doc_content, level=2)
    for section in sections:
        # Each section gets its own copy of the document fields plus the section text
        section_doc = doc_copy.copy()
        section_doc['section'] = section
        ultralytics_chunks2.append(section_doc)

In [28]:
# Total number of header-based sections across all documents
print(len(ultralytics_chunks2))

2459


# Intelligent chunking 

This process requires time and incurs costs. As mentioned before, use this only when really necessary.

In [31]:
from tqdm.auto import tqdm

In [None]:
from openai import OpenAI

# Shared client used by llm() below.
# NOTE(review): created with no arguments, so credentials presumably come from
# the environment (e.g. OPENAI_API_KEY) — confirm against the openai library docs.
openai_client = OpenAI()


def llm(prompt, model='gpt-4o-mini'):
    """
    Send one user prompt through the shared OpenAI client and return the reply text.

    Args:
        prompt: Text sent as the content of a single user message
        model: Name of the model to query

    Returns:
        The `output_text` attribute of the response
    """
    messages = [
        {"role": "user", "content": prompt}
    ]

    # Pass `model` through so callers can actually override the default
    response = openai_client.responses.create(
        model=model,
        input=messages
    )

    return response.output_text

In [None]:
# Prompt for LLM-based chunking. `{document}` is filled in with str.format by
# intelligent_chunking(), which then splits the model's reply on the `---`
# separators requested in the format section below.
prompt_template = """
Split the provided document into logical sections
that make sense for a Q&A system.

Each section should be self-contained and cover
a specific topic or concept.

<DOCUMENT>
{document}
</DOCUMENT>

Use this format:

## Section Name

Section content with all relevant details

---

## Another Section Name

Another section content

---
""".strip()

In [None]:
def intelligent_chunking(text):
    """
    Ask the LLM to split a document into sections and return them as a list.

    Args:
        text: Document text inserted into `prompt_template`

    Returns:
        List of non-empty section strings with surrounding whitespace removed
    """
    prompt = prompt_template.format(document=text)
    response = llm(prompt)

    # Only a line made up solely of dashes counts as a separator. Splitting on
    # every '---' substring would also cut through markdown table rules such
    # as |---|---| and through inline dashes inside a section.
    sections = re.split(r'^[ \t]*-{3,}[ \t]*$', response, flags=re.MULTILINE)
    return [s.strip() for s in sections if s.strip()]

In [None]:
# Build one record per LLM-proposed section of every document.
# This makes one API call per document, so it is slow and costs money.
ultralytics_chunks3 = []

# Iterate over the source documents, not the (still empty) output list
for doc in tqdm(ultralytics):
    # Work on a copy so popping 'content' leaves the original document dict intact
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')

    sections = intelligent_chunking(doc_content)
    for section in sections:
        # Each section gets its own copy of the document fields plus the section text
        section_doc = doc_copy.copy()
        section_doc['section'] = section
        ultralytics_chunks3.append(section_doc)
