# Extract text and code from notebooks
In this notebook we will semi-automatically iterate through Python Jupyter Notebooks and extract a list of text-code pairs that might be useful for LLM fine tuning.

The resulting data is stored in a jsonl file.

In [2]:
import json
import openai
import time
from bia_bob import bob
import os
from bia_bob._utilities import filter_out_blacklist, save_jsonl_file
import json
import ipywidgets as widgets
from IPython.display import display

In [8]:
training_data = []

In [9]:
def parse_notebook(notebook_path:str):

    import nbformat
    
    # Reading the notebook
    with open(notebook_path, 'r', encoding='utf-8') as f:
        notebook = nbformat.read(f, as_version=4)
    
    metadata = notebook['metadata']
    
    last_text = ""
    last_code = ""
    
    first_text = ""
    first_code = ""
    all_code = ""
    
    cells = notebook['cells']
    for cell in cells:
        cell_type = cell['cell_type']
        if cell_type == 'code':
            # Access code cell
            last_code = last_code + "\n\n" + cell['source']
        elif cell_type == 'markdown':
            
            if len(last_code) > 0:
                # remove inital line breaks, tabs and spaces
                while last_code[0] in ["\n", "\t", " "]:
                    last_code = last_code[1:]
                    if len(last_code) == 0:
                        break
                    
                #print("----", last_code[:6])
                if not (last_code.startswith("from") or last_code.startswith("import")):
                    # we skip the first block which contains a lot of introductory text and import statements
                    training_data.append({
                        "imports":first_code,
                        "text":last_text, 
                        "code":last_code
                    })
                else:
                    # but we keep it for later
                    first_text = last_text
                    first_code = last_code
                all_code = all_code + "\n\n" + last_code
                last_code = ""
                last_text = ""

            
            # Access markdown cell
            last_text = last_text + "\n\n" + cell['source']

In [10]:


def parse_notebooks(directory, extension=".ipynb"):
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(extension) and ".ipynb_checkpoints" not in root:
                # print(os.path.join(root, file))

                parse_notebook(os.path.join(root, file))

parse_notebooks(r"C:\structure\code\BioImageAnalysisNotebooks\docs\12_image_analysis_basics")
parse_notebooks(r"C:\structure\code\BioImageAnalysisNotebooks\docs\15_gpu_acceleration")
parse_notebooks(r"C:\structure\code\BioImageAnalysisNotebooks\docs\16_3d_image_visualization")
parse_notebooks(r"C:\structure\code\BioImageAnalysisNotebooks\docs\18_image_filtering")
parse_notebooks(r"C:\structure\code\BioImageAnalysisNotebooks\docs\19_spatial_transforms")
parse_notebooks(r"C:\structure\code\BioImageAnalysisNotebooks\docs\20a_pixel_classification")
parse_notebooks(r"C:\structure\code\BioImageAnalysisNotebooks\docs\20b_deep_learning")
parse_notebooks(r"C:\structure\code\BioImageAnalysisNotebooks\docs\20h_segmentation_post_processing")
parse_notebooks(r"C:\structure\code\BioImageAnalysisNotebooks\docs\20_image_segmentation")
parse_notebooks(r"C:\structure\code\BioImageAnalysisNotebooks\docs\22_feature_extraction")
parse_notebooks(r"C:\structure\code\BioImageAnalysisNotebooks\docs\25_neighborhood_relationships_between_cells")
parse_notebooks(r"C:\structure\code\BioImageAnalysisNotebooks\docs\27_cell_classification")
parse_notebooks(r"C:\structure\code\BioImageAnalysisNotebooks\docs\32_tiled_image_processing")
parse_notebooks(r"C:\structure\code\BioImageAnalysisNotebooks\docs\34_timelapse_analysis")
parse_notebooks(r"C:\structure\code\BioImageAnalysisNotebooks\docs\60_data_visualization")

f"{len(training_data)} conversations extracted"

'592 conversations extracted'

In [11]:

training_data = filter_out_blacklist(training_data, [
    "napari",
    "nbscreenshot",
    "def ",
    "print",
    "openai",
    "https://"
])

f"{len(training_data)} conversations remaining"

'307 conversations remaining'

In [12]:
save_jsonl_file(training_data, "imports_text_code.jsonl")