## Multiyear Code-to-Prompt Dataset Builder for the 3Blue1Brown Repository

 This notebook traverses a multi-year code repository (such as 3Blue1Brown's "videos-master"), extracts all Python files, and uses an LLM to generate a natural language prompt for each code file. It then saves the results as year-wise and combined datasets, and summarizes key statistics.

 **Main Features:**
 - Processes all years (2015–2025) or a selection
 - Handles nested subdirectories and skips hidden/cache folders
 - Generates robust (prompt, code, metadata) records for each file
 - Outputs CSV datasets and summary statistics for downstream ML/LLM research


### 1. Imports & Setup

In [1]:
import json
import os
import textwrap
from IPython.display import display, Markdown, HTML
import tensorflow as tf
import openai
import re
import pandas as pd 
import numpy as np
from pathlib import Path
import time

2025-05-29 17:29:38.236593: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-29 17:29:38.775976: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-29 17:29:38.932421: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-29 17:29:38.992503: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-29 17:29:39.364588: I tensorflow/core/platform/cpu_feature_guar

### 2. API Key Setup and Utility Imports

In [2]:

# Read the OpenAI API key from a local text file so it is never hard-coded
# in the notebook.
folder_path = "/scratch/h/Hassan.Mo/LLM/"
# Work from the project folder: the key file below is opened with a relative
# path, and the local `utils` module is presumably found via the working
# directory as well (verify against the kernel's sys.path).
os.chdir(folder_path)
# Use a context manager so the file handle is closed deterministically
# instead of being left to the garbage collector.
with open("open_ai_API.txt", "r") as key_file:
    API_KEY = key_file.read().strip()
openai.api_key = API_KEY
# Imported here rather than in the top import cell because it follows the
# os.chdir() call above.
from utils import llm_tools, tools_local
lt = llm_tools(api_key=openai.api_key)  # LLM helper; provides prompt_generator()
tl = tools_local()  # local helper; provides extract_quoted_text()

  backends.update(_get_backends("networkx.backends"))


### 3. Comprehensive Directory Processing Function

In [None]:

def process_directory_comprehensive(
    base_path="/scratch/h/Hassan.Mo/LLM/Scrapping/videos-master",
    years_to_process=None,  # None means all years
    max_files_per_dir=None,  # None means all files
    output_dir="/scratch/h/Hassan.Mo/LLM/datasets/",
    create_separate_datasets=True,  # Create both individual year datasets and combined
    verbose=True
):
    """
    Build (prompt, code, metadata) records for every Python file in the 3B1B repository.

    Each year is expected in ``<base_path>/_<year>``. Every ``.py`` file found
    under a year folder (hidden folders and ``__pycache__`` excluded) is read,
    sent to the LLM to obtain a natural-language prompt, and stored together
    with its source and location metadata. Subdirectories and files are visited
    in sorted order so repeated runs produce rows in the same order.

    This function relies on names defined elsewhere in the notebook: the
    module-level helpers ``lt`` (``prompt_generator``) and ``tl``
    (``extract_quoted_text``), and the functions ``save_dataset`` and
    ``create_dataset_summary``.

    Args:
        base_path: Base path to videos-master directory
        years_to_process: List of years to process (e.g., [2015, 2016]) or None for all
            (2015-2025)
        max_files_per_dir: Maximum number of successfully processed files per
            directory (None for all)
        output_dir: Directory to save output CSV files (created if missing)
        create_separate_datasets: Whether to create individual year datasets
        verbose: Whether to print detailed processing info

    Returns:
        Tuple ``(all_datasets, combined_dataset)``:
        ``all_datasets`` maps each year whose folder exists to its list of
        ``(prompt, code, metadata)`` tuples; ``combined_dataset`` is the same
        entries for all years in one flat list.

    Notes:
        Per-file failures (read errors, API errors, ...) are caught, counted and
        reported when ``verbose`` is set; they never abort the run. Files whose
        stripped content is shorter than 50 characters are skipped and counted
        separately.
    """
    # Files whose stripped content is shorter than this are not worth a prompt.
    min_content_chars = 50

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Determine years to process
    if years_to_process is None:
        years_to_process = list(range(2015, 2026))  # 2015-2025

    # Storage for all data
    all_datasets = {}  # year -> [(prompt, code, metadata), ...]
    combined_dataset = []

    print(f"🚀 Starting comprehensive processing of 3B1B repository")
    print(f"📁 Base path: {base_path}")
    print(f"📅 Years to process: {years_to_process}")
    print(f"💾 Output directory: {output_dir}")
    print("-" * 60)

    for year in years_to_process:
        year_dir = os.path.join(base_path, f"_{year}")

        if not os.path.exists(year_dir):
            if verbose:
                print(f"⚠️  Year {year} directory not found, skipping...")
            continue

        print(f"\n📂 Processing Year: {year}")
        print(f"   Path: {year_dir}")

        year_data = []
        year_stats = {"directories": 0, "files": 0, "success": 0, "skipped": 0, "errors": 0}

        # Process all subdirectories in the year folder
        for root, dirs, files in os.walk(year_dir):
            # Prune hidden directories and __pycache__ in place so os.walk does
            # not descend into them; sorting makes the traversal order (and
            # therefore the dataset row order) deterministic.
            dirs[:] = sorted(
                d for d in dirs if not d.startswith('.') and d != '__pycache__'
            )

            # Filter for Python files (also covers directories with no files)
            python_files = sorted(f for f in files if f.endswith('.py'))
            if not python_files:
                continue

            rel_path = os.path.relpath(root, year_dir)
            year_stats["directories"] += 1

            if verbose:
                print(f"   📁 Processing subdirectory: {rel_path}")
                print(f"      Found {len(python_files)} Python files")

            # Process Python files in this directory
            files_processed = 0  # successful files only; drives max_files_per_dir
            for py_file in python_files:
                if max_files_per_dir and files_processed >= max_files_per_dir:
                    if verbose:
                        print(f"      ⏹️  Reached max files limit ({max_files_per_dir})")
                    break

                file_path = os.path.join(root, py_file)
                year_stats["files"] += 1

                try:
                    # Read the Python file; undecodable bytes are dropped
                    # rather than failing the whole file.
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        file_content = f.read()

                    # Skip empty or near-empty files (purely a length check)
                    if len(file_content.strip()) < min_content_chars:
                        year_stats["skipped"] += 1
                        continue

                    if verbose:
                        print(f"      🔄 Processing: {py_file} ({len(file_content)} chars)")

                    # Generate prompt using GPT
                    response = lt.prompt_generator(file_content)
                    response_text = response.choices[0].message.content

                    # Extract the actual prompt from GPT response
                    extracted_prompts = tl.extract_quoted_text([response_text], verbose=False)

                    if extracted_prompts and extracted_prompts[0].strip():
                        prompt = extracted_prompts[0].strip()
                    else:
                        # Fallback: use the full response if extraction fails
                        prompt = response_text.strip()

                    # Create metadata
                    metadata = {
                        "year": year,
                        "subdirectory": rel_path,
                        "filename": py_file,
                        "file_size": len(file_content),
                        "relative_path": os.path.join(rel_path, py_file)
                    }

                    # Store the data (the same tuple is shared by both lists)
                    data_entry = (prompt, file_content, metadata)
                    year_data.append(data_entry)
                    combined_dataset.append(data_entry)

                    year_stats["success"] += 1
                    files_processed += 1

                    # Small delay to avoid overwhelming the API
                    time.sleep(0.1)

                except Exception as e:
                    # Best effort: one bad file must not stop the whole run.
                    year_stats["errors"] += 1
                    if verbose:
                        print(f"      ❌ Error processing {py_file}: {str(e)[:100]}...")

        # Store year data
        all_datasets[year] = year_data

        # Print year summary
        print(f"   ✅ Year {year} Summary:")
        print(f"      Directories: {year_stats['directories']}")
        print(f"      Files found: {year_stats['files']}")
        print(f"      Successfully processed: {year_stats['success']}")
        print(f"      Skipped (too short): {year_stats['skipped']}")
        print(f"      Errors: {year_stats['errors']}")

        # Save individual year dataset if requested
        if create_separate_datasets and year_data:
            save_dataset(year_data, f"3b1b_{year}_dataset.csv", output_dir, verbose)

    # Save combined dataset
    if combined_dataset:
        save_dataset(combined_dataset, "3b1b_complete_dataset.csv", output_dir, verbose)

        # Create summary statistics
        create_dataset_summary(all_datasets, output_dir, verbose)

    print(f"\n🎉 Processing complete!")
    print(f"📊 Total entries processed: {len(combined_dataset)}")
    print(f"💾 Datasets saved to: {output_dir}")

    return all_datasets, combined_dataset


### 4. Utility Functions for Saving & Summarizing Datasets

In [None]:

def save_dataset(data, filename, output_dir, verbose=True):
    """Save a list of (prompt, code, metadata) entries as a flat CSV file.

    Args:
        data: Sequence of entries where index 0 is the prompt, index 1 the
            source code and index 2 a metadata dict with the keys ``year``,
            ``subdirectory``, ``filename``, ``file_size`` and ``relative_path``.
        filename: Name of the CSV file to write inside ``output_dir``.
        output_dir: Target directory; created if it does not exist yet.
        verbose: Whether to print a confirmation line after saving.

    Returns:
        None. Nothing is written when ``data`` is empty.
    """
    if not data:
        return

    # Column order of the CSV; everything after 'code' is copied from metadata.
    columns = [
        'prompt', 'code', 'year', 'subdirectory',
        'filename', 'file_size', 'relative_path',
    ]
    metadata_fields = columns[2:]

    # Flatten each entry into one row in a single pass over the data.
    rows = [
        [entry[0], entry[1], *(entry[2][field] for field in metadata_fields)]
        for entry in data
    ]
    df = pd.DataFrame(rows, columns=columns)

    # Do not rely on the caller having created the directory already.
    os.makedirs(output_dir, exist_ok=True)

    # Save to CSV
    filepath = os.path.join(output_dir, filename)
    df.to_csv(filepath, index=False)

    if verbose:
        print(f"💾 Saved {len(df)} entries to: {filepath}")

def create_dataset_summary(all_datasets, output_dir, verbose=True):
    """Write per-year summary statistics to ``dataset_summary.csv``.

    Args:
        all_datasets: Mapping of year -> list of (prompt, code, metadata)
            entries. Only the metadata keys ``file_size`` and ``subdirectory``
            are read. Years with no entries are left out of the summary.
        output_dir: Directory for the summary CSV; created if missing.
        verbose: Whether to print the output path and render the table with
            IPython's ``display``.

    Returns:
        None. The CSV always carries the full header row, even when there is
        no year with data, so it stays readable with ``pd.read_csv``.
    """
    # Fixed column order; also guarantees a header when there are no rows.
    columns = [
        'year', 'total_files', 'unique_subdirectories', 'subdirectories',
        'avg_file_size', 'avg_prompt_length',
        'total_code_chars', 'total_prompt_chars',
    ]
    summary_data = []

    for year, data in all_datasets.items():
        if not data:
            continue

        # Calculate statistics (sizes and lengths are character counts)
        file_sizes = [entry[2]['file_size'] for entry in data]
        prompt_lengths = [len(entry[0]) for entry in data]

        # Get unique subdirectories, sorted for a stable listing
        subdirs = sorted({entry[2]['subdirectory'] for entry in data})

        summary_data.append({
            'year': year,
            'total_files': len(data),
            'unique_subdirectories': len(subdirs),
            'subdirectories': ', '.join(subdirs),
            'avg_file_size': np.mean(file_sizes),
            'avg_prompt_length': np.mean(prompt_lengths),
            'total_code_chars': sum(file_sizes),
            'total_prompt_chars': sum(prompt_lengths)
        })

    # Create summary DataFrame
    summary_df = pd.DataFrame(summary_data, columns=columns)

    # Do not rely on the caller having created the directory already.
    os.makedirs(output_dir, exist_ok=True)

    # Save summary
    summary_path = os.path.join(output_dir, "dataset_summary.csv")
    summary_df.to_csv(summary_path, index=False)

    if verbose:
        print(f"📊 Dataset summary saved to: {summary_path}")
        display(HTML(summary_df.to_html(index=False)))




### 5. Example Usage Configurations

In [None]:


def process_all_years():
    """Run the pipeline without any year or file-count restriction (2015-2025)."""
    # None for both limits means "no restriction" to the pipeline.
    options = dict(
        years_to_process=None,
        max_files_per_dir=None,
        create_separate_datasets=True,
        verbose=True,
    )
    return process_directory_comprehensive(**options)

def process_specific_years(years=(2015, 2016, 2017)):
    """Process specific years only.

    Args:
        years: Iterable of years to process; defaults to 2015-2017. ``None`` is
            forwarded unchanged, which the pipeline treats as "all years".

    Returns:
        Whatever ``process_directory_comprehensive`` returns, i.e. the
        ``(all_datasets, combined_dataset)`` tuple.
    """
    # Hand the pipeline a fresh list on every call so neither the default nor
    # the caller's own sequence can be altered downstream.
    selected_years = None if years is None else list(years)
    return process_directory_comprehensive(
        years_to_process=selected_years,
        max_files_per_dir=None,
        create_separate_datasets=True,
        verbose=True
    )

def process_sample_dataset():
    """Build a small test dataset: years 2015-2016, at most 5 files per directory."""
    sample_years = [2015, 2016]  # just a few years for testing
    per_dir_limit = 5  # cap on files taken from each directory
    return process_directory_comprehensive(
        years_to_process=sample_years,
        max_files_per_dir=per_dir_limit,
        create_separate_datasets=True,
        verbose=True,
    )

# Quick execution functions
def run_full_processing():
    """Announce and launch the pipeline over all years; returns its two datasets."""
    banner = "🚀 Starting FULL processing of all 3B1B years..."
    print(banner)
    # Unpack explicitly so a malformed result fails here rather than downstream.
    per_year, merged = process_all_years()
    return per_year, merged

def run_sample_processing():
    """Announce and launch the reduced sample run; returns its two datasets."""
    banner = "🧪 Starting SAMPLE processing for testing..."
    print(banner)
    # Unpack explicitly so a malformed result fails here rather than downstream.
    per_year, merged = process_sample_dataset()
    return per_year, merged


### 6. Run Processing (Choose Full or Sample)

 Keep exactly one of the two lines below uncommented: the full run is active by default; comment it out and uncomment the sample run for a quick test.
 Running the cell triggers the pipeline and produces the datasets and summary statistics.

In [None]:

# Full run: covers every year folder and requests one LLM-generated prompt per
# Python file, so it is presumably slow and API-heavy.
all_datasets, combined_dataset = run_full_processing()
# Quick alternative for testing (years 2015-2016, at most 5 files per directory):
# all_datasets, combined_dataset = run_sample_processing()

🚀 Starting FULL processing of all 3B1B years...
🚀 Starting comprehensive processing of 3B1B repository
📁 Base path: /scratch/h/Hassan.Mo/LLM/Scrapping/videos-master
📅 Years to process: [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]
💾 Output directory: /scratch/h/Hassan.Mo/LLM/datasets/
------------------------------------------------------------

📂 Processing Year: 2015
   Path: /scratch/h/Hassan.Mo/LLM/Scrapping/videos-master/_2015
   📁 Processing subdirectory: .
      Found 14 Python files
      🔄 Processing: complex_multiplication_article.py (8148 chars)
      🔄 Processing: counting_in_binary.py (14636 chars)
      🔄 Processing: eulers_characteristic_formula.py (39442 chars)
      🔄 Processing: generate_logo.py (2376 chars)
      🔄 Processing: inventing_math.py (73398 chars)
      🔄 Processing: inventing_math_images.py (4498 chars)
      🔄 Processing: matrix_as_transform_2d.py (18433 chars)
      🔄 Processing: moser_intro.py (8483 chars)
      🔄 Processing: moser

year,total_files,unique_subdirectories,subdirectories,avg_file_size,avg_prompt_length,total_code_chars,total_prompt_chars
2015,19,2,"., ka_playgrounds",19265.842105,676.105263,366051,12846
2016,36,5,"., brachistochrone, eola, hilbert, triangle_of_power",36456.25,666.277778,1312425,23986
2017,35,4,"., dominos, eoc, nn",75341.171429,655.085714,2636941,22928
2018,66,7,"., basel, eop, eop/chapter0, eop/chapter1, eop/chapter2, eop/reusables",38233.166667,603.090909,2523389,39804
2019,54,10,"., bayes, clacks, clacks/solution2, diffyq, diffyq/part1, diffyq/part2, diffyq/part3, diffyq/part4, diffyq/part5",30908.518519,627.888889,1669060,33906
2020,22,3,"., 18S191, beta",56608.545455,618.772727,1245388,13613
2021,9,1,.,98935.0,630.777778,890415,5677
2022,21,11,"borwein, convolutions, galois, infinity, piano, puzzles, quintic, some2, visual_proofs, wordle, zeta",42042.714286,669.761905,882897,14065
2023,29,9,"SoME3, clt, clt_proof, convolutions2, gauss_int, moser_reboot, numberphile, optics_puzzles, standup_maths",37915.068966,755.965517,1099537,21923
2024,24,7,"antp, holograms, inscribed_rect, linalg, manim_demo, puzzles, transformers",53375.791667,784.083333,1281019,18818



🎉 Processing complete!
📊 Total entries processed: 328
💾 Datasets saved to: /scratch/h/Hassan.Mo/LLM/datasets/


In [None]:
type(combined_dataset) # Expect `list`; each item is a (prompt, code, metadata) tuple (not shown by type())

list

# ---
### Next Steps & Notes
#
 - Inspect the CSV files and summary table produced.
 - For large-scale ML or research use cases, consider inspecting random samples or adding more validation logic.
 - For more efficient large-scale processing (thousands of files), investigate batching or asynchrony.

 **This notebook is now fully documented with comments and markdown.**