In [12]:
# 1
import io
import zipfile
import requests
import frontmatter

def read_repo_data(repo_owner, repo_name, branch='main', timeout=30):
    """
    Download and parse all markdown files from a GitHub repository.

    The chosen branch is fetched as one zip archive from
    codeload.github.com, and every entry whose name ends in ``.md`` or
    ``.mdx`` (case-insensitive) is parsed with ``frontmatter``.

    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
        branch: Branch whose archive is downloaded (default: 'main')
        timeout: Seconds ``requests`` waits for the server to respond
            before giving up, so a stalled connection cannot hang the
            notebook forever (default: 30)

    Returns:
        List of dictionaries, one per markdown file: the result of
        ``post.to_dict()`` for the parsed file plus a 'filename' key
        holding the entry's path inside the archive.

    Raises:
        Exception: If the download does not answer with HTTP 200.
        requests.exceptions.RequestException: If the request itself fails
            (for example on timeout or connection error).
    """
    prefix = 'https://codeload.github.com'
    url = f'{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/{branch}'
    resp = requests.get(url, timeout=timeout)

    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    repository_data = []

    # The context manager closes the archive even when the loop is
    # interrupted part-way through.
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        for file_info in zf.infolist():
            filename = file_info.filename

            # Only markdown entries are of interest; endswith accepts a tuple.
            if not filename.lower().endswith(('.md', '.mdx')):
                continue

            try:
                with zf.open(file_info) as f_in:
                    # Undecodable bytes are dropped rather than failing the file.
                    content = f_in.read().decode('utf-8', errors='ignore')
                post = frontmatter.loads(content)
                data = post.to_dict()
                data['filename'] = filename
                repository_data.append(data)
            except Exception as e:
                # Best effort: report the broken file and keep going.
                print(f"Error processing {filename}: {e}")
                continue

    return repository_data

# Pull every markdown document out of the workout-recommendation repository.
workout_docs = read_repo_data(
    repo_owner='ilhamksyuriadi',
    repo_name='workout-recommendation',
)
# Other repositories that can be loaded the same way (currently disabled):
# dtc_faq = read_repo_data('DataTalksClub', 'faq')
# evidently_docs = read_repo_data('evidentlyai', 'docs')

# Report how many documents were parsed.
print(f"workout documents: {len(workout_docs)}")
# print(f"FAQ documents: {len(dtc_faq)}")
# print(f"Evidently documents: {len(evidently_docs)}")

# Show the first parsed document in full.
print(workout_docs[0])

workout documents: 1
{'content': '# 🏋️ Workout Type Recommendation System\n\nA machine learning-based system that recommends workout types (Cardio, Strength, Yoga, or HIIT) based on user physical attributes and fitness metrics.\n\n## Table of Contents\n\n- [Problem Description](#problem-description)\n- [Dataset](#dataset)\n- [Project Structure](#project-structure)\n- [Installation](#installation)\n- [Running the Project](#running-the-project)\n- [Model Performance](#model-performance)\n- [API Documentation](#api-documentation)\n- [Deployment](#deployment)\n- [Technologies Used](#technologies-used)\n- [Future Improvements](#future-improvements)\n\n---\n\n## Problem Description\n\n### The Challenge\n\nRecommending appropriate workout types based on user physical characteristics and fitness levels. The goal is to build a machine learning model that can predict which type of workout (Cardio, Strength, Yoga, or HIIT) would be most suitable for a person based on their:\n\n- Physical att

In [13]:
# 2

# Sanity-check the download: how many documents were parsed, which keys the
# first document carries, and how long its 'content' string is.
# NOTE: the last two lines index workout_docs[0] and raise IndexError if the
# list is empty.
print(f"Number of documents: {len(workout_docs)}")
print(f"Sample document keys: {workout_docs[0].keys()}")
print(f"Sample content length: {len(workout_docs[0]['content'])}")

Number of documents: 1
Sample document keys: dict_keys(['content', 'filename'])
Sample content length: 12897


In [15]:
# 2.1 simple chunking
def sliding_window(seq, size, step):
    """
    Cut a sequence into overlapping windows.

    Windows begin at offsets 0, step, 2*step, ... and each spans at most
    ``size`` items. Iteration stops with the first window that reaches the
    end of ``seq``, so the final window may be shorter than ``size``.

    Args:
        seq: Any sliceable sequence with a length (str, list, ...)
        size: Maximum number of items per window; must be positive
        step: Distance between consecutive window starts; must be positive

    Returns:
        List of dictionaries with keys 'start' (offset of the window in
        ``seq``) and 'chunk' (the slice itself). Empty for an empty ``seq``.

    Raises:
        ValueError: If ``size`` or ``step`` is not positive.
    """
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    length = len(seq)
    windows = []

    for offset in range(0, length, step):
        end = offset + size
        windows.append(dict(start=offset, chunk=seq[offset:end]))
        if end >= length:
            # This window already covers the tail of seq; any further window
            # would only repeat part of it.
            break

    return windows

# Split every document's content into 2000-item windows that start every
# 1000 items, and attach the document's remaining fields to each window.
workout_docs_chunks = []

for doc in workout_docs:
    # Everything except the body travels with each chunk as metadata.
    metadata = {key: value for key, value in doc.items() if key != 'content'}
    for window in sliding_window(doc['content'], 2000, 1000):
        # Window keys come first; metadata overrides on a key clash.
        workout_docs_chunks.append({**window, **metadata})

print(len(workout_docs_chunks))

12


In [None]:
# 2.2