In [None]:
pip install PyPDF2

In [3]:
# importing required libraries for extracting text from the PDF
import PyPDF2
import os
from google.colab import files

In [9]:
# Uploading pdf
uploaded  = files.upload()
safety_manual = next(iter(uploaded))

Saving ADM_04-00-003.pdf to ADM_04-00-003.pdf


In [11]:
# Extractin text from the PDF
def extract_text_from_pdf(pdf_path):
  text = ""
  with open(pdf_path, 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    num_pages = len(reader.pages)
    for page_num in range(num_pages):
      page = reader.pages[page_num]
      text += page.extract_text()
    return text

pdf_text = extract_text_from_pdf(safety_manual)

print(f'Extracted {len(pdf_text)} characters from the PDF')
print("First 500 Characters!")
print(pdf_text[:500] + "...")

Extracted 582028 characters from the PDF
First 500 Characters!
 
 Abstract - 1 
 
 
OSHA Field  
 Safety and Health  
Manual
 
 
 
 
 
 
 
 
* OSHA ARCHIVE DOCUMENT * 
NOTICE: This is an OSHA ARCHIVE Document, and may no longer represent OSHA policy.
* OSHA ARCHIVE DOCUMENT * 
NOTICE: This document is presented here as historical content, for research and review purposes only. 
 Abstract - 2  
U.S. DDEPARTMENT OF LABOR  Occupational Safety and Health Administration  
 
DIRECTIVE NUMBER: ADM 04-00-003 EFFECTIVE DATE:  5/06/ 2020  
SUBJECT: OSHA Safety and He...


In [13]:
# Cleaning text basically removing long spaces
def clean_text(text):
  cleaned_text = '\n'.join([line.strip() for line in text.split('\n') if line.strip()])

  cleaned_text = ' '.join(cleaned_text.split())

  return cleaned_text
cleaned_text = clean_text(pdf_text)

print("cleaned_text Sample")
print(cleaned_text[:500] + "...")

cleaned_text Sample
Abstract - 1 OSHA Field Safety and Health Manual * OSHA ARCHIVE DOCUMENT * NOTICE: This is an OSHA ARCHIVE Document, and may no longer represent OSHA policy. * OSHA ARCHIVE DOCUMENT * NOTICE: This document is presented here as historical content, for research and review purposes only. Abstract - 2 U.S. DDEPARTMENT OF LABOR Occupational Safety and Health Administration DIRECTIVE NUMBER: ADM 04-00-003 EFFECTIVE DATE: 5/06/ 2020 SUBJECT: OSHA Safety and Health Management System ABSTRACT Purpose: Th...


In [14]:
!pip install -q langchain

In [15]:
# Recursive semantic Chunking
from langchain.text_splitter import(
    RecursiveCharacterTextSplitter,
    MarkdownHeaderTextSplitter,
    Language,
    HTMLHeaderTextSplitter
)
from langchain.docstore.document import Document
from pprint import pprint

In [17]:
# setting up text splitter function
def setup_text_splitter():
  return RecursiveCharacterTextSplitter(
      separators = ['\n\n','\n','. ',' ',''],
      chunk_size = 1000,
      chunk_overlap = 200,
      length_function = len,
      add_start_index = True
  )
text_splitter = setup_text_splitter()

In [19]:
from typing import List

In [22]:
# Chunking with metadata preservation
def langchain_chunk_text(text:str, source:str) -> List[Document]:
  # langchain document
  doc = Document(page_content=text, metadata={'source':source})

  # split the document
  chunks = text_splitter.split_documents([doc])

  # metadata for each chunk
  for i, chunk in enumerate(chunks):
    chunk.metadata.update({
        "chunk_id" : f"{source}_chunk_{i}",
        "total_chunks" : len(chunks),
        "chunk_num" : i,
        "length" : len(chunk.page_content)
    })
  return chunks

langchain_chunks = langchain_chunk_text(cleaned_text, safety_manual)

print(f"Created {len(langchain_chunks)} chunks using langchain splitter")
print("sample chunk structure")
pprint(langchain_chunks[0].metadata)

Created 717 chunks using langchain splitter
sample chunk structure
{'chunk_id': 'ADM_04-00-003.pdf_chunk_0',
 'chunk_num': 0,
 'length': 998,
 'source': 'ADM_04-00-003.pdf',
 'start_index': 0,
 'total_chunks': 717}


In [23]:
# Analyzing Quality of the chunks
def analyze_langchain_chunks(chunks: List[Document]):

    lengths = [len(c.page_content) for c in chunks]
    avg_len = sum(lengths) / len(lengths)

    print("\nChunk Quality Analysis:")
    print(f"Total chunks: {len(chunks)}")
    print(f"Average length: {avg_len:.1f} chars")
    print(f"Min length: {min(lengths)} chars")
    print(f"Max length: {max(lengths)} chars")

    # Check for bad chunks (too short/long)
    bad_chunks = [c for c in chunks if len(c.page_content) < 50 or len(c.page_content) > 1500]
    print(f"\nFound {len(bad_chunks)} potentially problematic chunks")

    if bad_chunks:
        print("\nProblematic chunk examples:")
        for c in bad_chunks[:2]:
            print(f"\nChunk {c.metadata['chunk_num']} (length: {len(c.page_content)}):")
            print(c.page_content[:100] + "...")

analyze_langchain_chunks(langchain_chunks)


Chunk Quality Analysis:
Total chunks: 717
Average length: 905.9 chars
Min length: 128 chars
Max length: 1000 chars

Found 0 potentially problematic chunks


In [24]:
import re

In [29]:
def preprocess_safety_text(text: str) -> str:

    # Preserve important section breaks
    text = text.replace("WARNING:", "\n\nWARNING:\n\n")
    text = text.replace("CAUTION:", "\n\nCAUTION:\n\n")
    text = text.replace("NOTE:", "\n\nNOTE:\n\n")

    # Handle numbered procedures
    text = re.sub(r'(\d+\. )', r'\n\n\1', text)  # Ensure newlines before numbered items

    return text

# Apply specialized preprocessing
preprocessed_text = preprocess_safety_text(cleaned_text)

# Re-chunk with specialized preprocessing
enhanced_chunks = langchain_chunk_text(preprocessed_text, safety_manual)

print("\nAfter specialized preprocessing:")
analyze_langchain_chunks(enhanced_chunks)

# Explanation:
# - Adds extra spacing around critical safety sections
# - Better handles numbered procedures
# - Results in more semantically meaningful chunks


After specialized preprocessing:

Chunk Quality Analysis:
Total chunks: 788
Average length: 779.1 chars
Min length: 5 chars
Max length: 1000 chars

Found 2 potentially problematic chunks

Problematic chunk examples:

Chunk 398 (length: 5):
NOTE:...

Chunk 618 (length: 5):
NOTE:...


In [30]:
# if we get short chunks above we have 2 so we merge with next chunk
def merge_short_chunks(chunks, min_len=20):
    """
    Merge short chunks (like 'NOTE:') with the next chunk instead of deleting them.
    """
    merged_chunks = []
    i = 0
    while i < len(chunks):
        current_chunk = chunks[i]
        current_text = current_chunk.page_content.strip()

        # If the chunk is too short and looks like a header (e.g., "NOTE:")
        if len(current_text) < min_len and current_text.upper() in ["NOTE:", "CAUTION:", "WARNING:"]:
            if i + 1 < len(chunks):
                # Merge with next chunk
                next_chunk = chunks[i + 1]
                merged_text = current_text + " " + next_chunk.page_content

                # Create a new Document with merged text and updated metadata
                merged_doc = Document(
                    page_content=merged_text,
                    metadata={
                        **next_chunk.metadata,
                        "merged_with_previous": True,
                        "chunk_id": f"{next_chunk.metadata['chunk_id']}_merged",
                        "chunk_num": next_chunk.metadata["chunk_num"],
                        "length": len(merged_text)
                    }
                )
                merged_chunks.append(merged_doc)
                i += 2  # Skip next chunk (already merged)
            else:
                # Last chunk, no one to merge with — keep it as-is
                merged_chunks.append(current_chunk)
                i += 1
        else:
            merged_chunks.append(current_chunk)
            i += 1
    return merged_chunks


In [31]:
# Apply merging after enhanced chunking
final_chunks = merge_short_chunks(enhanced_chunks)

# Re-analyze quality
analyze_langchain_chunks(final_chunks)


Chunk Quality Analysis:
Total chunks: 786
Average length: 781.1 chars
Min length: 57 chars
Max length: 1000 chars

Found 0 potentially problematic chunks


In [63]:
import json

def prepare_for_pinecone(chunks: List[Document]) -> List[dict]:
    """
    Converts LangChain Documents to Pinecone-ready format.
    """
    pinecone_records = []
    for chunk in chunks:
        record = {
            "id": chunk.metadata["chunk_id"],
            "text": chunk.page_content,
            "metadata": chunk.metadata
        }
        pinecone_records.append(record)
    return pinecone_records

pinecone_ready = prepare_for_pinecone(final_chunks)

# Save to JSON
with open('pinecone_ready_chunks.json', 'w') as f:
    json.dump(pinecone_ready, f, indent=2)

print(f"\nSaved {len(pinecone_ready)} Pinecone-ready chunks to 'pinecone_ready_chunks.json'")

# Explanation:
# - Converts LangChain's Document format to Pinecone's expected structure
# - Preserves all metadata
# - Each record has: id, text, and metadata fields


Saved 786 Pinecone-ready chunks to 'pinecone_ready_chunks.json'


In [None]:
!pip install sentence-transformers torch

In [42]:
# Testing Embedding
from sentence_transformers import SentenceTransformer
import torch

# Initialize device (use GPU if available)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the open-source model
model = SentenceTransformer('BAAI/bge-small-en', device=device)

# Test the model
sentence = "Sample safety instruction about fire extinguishers"
embedding = model.encode(sentence)

print("📌 Embedded text:")
print(sentence)

print("\n🔢 Embedding (first 10 values):")
print(embedding[:10])  # Show first 10 dimensions

print(f"\n📏 Total dimensions: {len(embedding)}")

# Explanation:
# - BAAI/bge-small-en outperforms many paid models on benchmarks
# - Runs locally (no API costs)
# - Automatically uses GPU if available
# - Fixed 384-dimension output (more efficient storage)

📌 Embedded text:
Sample safety instruction about fire extinguishers

🔢 Embedding (first 10 values):
[-0.07474203  0.02710574  0.03077703 -0.07079734  0.02252393  0.03854874
  0.03483909  0.0447201  -0.03685036 -0.01119724]

📏 Total dimensions: 384


In [64]:
# Generating embeddings in Batches
def generate_embeddings(texts: list, batch_size=64) -> list:
    """Generate embeddings in optimized batches"""
    return model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True  # Important for cosine similarity
    )

# Generate embeddings for all chunks
chunk_texts = [chunk["text"] for chunk in pinecone_ready]
print("Generating embeddings with BGE model...")
chunk_embeddings = generate_embeddings(chunk_texts)

# Add to our records
for i, record in enumerate(pinecone_ready):
    record["values"] = chunk_embeddings[i].tolist()  # Convert numpy to list

# Explanation:
# - Batched processing optimized for GPU
# - normalize_embeddings=True ensures proper cosine similarity
# - Progress bar shows embedding generation status

Generating embeddings with BGE model...


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

In [44]:
import numpy as np

for i in range(3):
    print(f"\n📄 Text Chunk {i}:\n{pinecone_ready[i]['text'][:150]}...")  # Print first 150 chars
    print(f"\n🧠 Embedding {i} (First 10 dims): {np.round(chunk_embeddings[i][:10], 4)}")



📄 Text Chunk 0:
Abstract - 1 OSHA Field Safety and Health Manual * OSHA ARCHIVE DOCUMENT * NOTICE: This is an OSHA ARCHIVE Document, and may no longer represent OSHA ...

🧠 Embedding 0 (First 10 dims): [-0.0171 -0.0041  0.0272 -0.0251  0.0088  0.0061  0.0431  0.0405 -0.0433
  0.0158]

📄 Text Chunk 1:
. It is the intent of this program that all employees will participate in all aspects including reporting hazards, incidents, and injury/illness witho...

🧠 Embedding 1 (First 10 dims): [-0.0288  0.0092  0.0283 -0.0163 -0.0118  0.0475  0.0851  0.0242 -0.0675
 -0.0023]

📄 Text Chunk 2:
. Cancellations: OSHA Instruction ADM 04 -00-002, OSHA Field Safety and Health Manual, October 5, 2016 State Impact: None. For State reference only. A...

🧠 Embedding 2 (First 10 dims): [-0.0152  0.0192  0.0625  0.034   0.0402 -0.0008  0.048   0.04   -0.0458
 -0.0187]


In [68]:
from pinecone import Pinecone, ServerlessSpec

# Initialize Pinecone (free tier)
pc = Pinecone(api_key="pcsk_oRen9_PnQ5DEGXJ5K1Tap43YZ7AGn6p8rk2uweQzcePUjdgM9nsrdDBFpxjs81pTCn7YA")

INDEX_NAME = "safety-manuals-bge"

# Delete if exists
if INDEX_NAME in pc.list_indexes().names():
    pc.delete_index(INDEX_NAME)

# Create index with free-tier compatible settings
pc.create_index(
    name=INDEX_NAME,
    dimension=384,
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",  # Free tier only supports AWS
        region="us-east-1"  # Supported free tier region
    )
)

index = pc.Index(INDEX_NAME)
print("Index created successfully")

Index created successfully


In [69]:
from tqdm.auto import tqdm
import math

def prepare_upsert_data(chunks_with_metadata, embeddings):
    """Prepare data for upsert in Pinecone format"""
    return [
        {
            "id": chunk["id"],
            "values": embedding,
            "metadata": {**chunk["metadata"], "text": chunk["text"]}  # ✅ Include text
        }
        for chunk, embedding in zip(chunks_with_metadata, embeddings)
    ]

# Prepare the upsert data
upsert_data = prepare_upsert_data(pinecone_ready, chunk_embeddings)

# Upload in batches with progress tracking
BATCH_SIZE = 100  # Pinecone’s recommended max
total_batches = math.ceil(len(upsert_data) / BATCH_SIZE)

print(f"Uploading {len(upsert_data)} vectors in {total_batches} batches...")

with tqdm(total=len(upsert_data), desc="Uploading vectors") as pbar:
    for i in range(0, len(upsert_data), BATCH_SIZE):
        batch = upsert_data[i:i + BATCH_SIZE]
        index.upsert(vectors=batch)
        pbar.update(len(batch))

# Verify upload
stats = index.describe_index_stats()
print("\n✅ Upload complete!")
print(f"📌 Total vectors: {stats['total_vector_count']}")
print(f"📏 Index dimension: {stats['dimension']}")
print(f"📊 Index fullness: {stats['index_fullness']:.4f}")


Uploading 786 vectors in 8 batches...


Uploading vectors:   0%|          | 0/786 [00:00<?, ?it/s]


✅ Upload complete!
📌 Total vectors: 0
📏 Index dimension: 384
📊 Index fullness: 0.0000


In [70]:
# Get index description to understand default configuration
index_description = pc.describe_index(INDEX_NAME)
print("Index Configuration:")
print(f"Name: {index_description.name}")
print(f"Dimension: {index_description.dimension}")
print(f"Metric: {index_description.metric}")
print(f"Spec: {index_description.spec}")
print(f"Status: {index_description.status.state}")

# Explanation:
# - Serverless indexes automatically handle HNSW configuration
# - Pinecone optimizes these parameters based on usage patterns
# - We can influence performance through query-time parameters


Index Configuration:
Name: safety-manuals-bge
Dimension: 384
Metric: cosine
Spec: {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}}
Status: Ready


In [71]:
def optimized_query(query: str, top_k: int = 5, filters: dict = None):
    """
    Perform optimized query with serverless index best practices.

    Args:
        query: Natural language search string
        top_k: Number of results to return
        filters: Optional metadata filters (e.g., by source)

    Returns:
        List of top matching vectors with scores and metadata
    """
    # Generate embedding for query
    query_embedding = model.encode(query, normalize_embeddings=True).tolist()

    # Run query
    results = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True,
        filter=filters,
        include_values=False,
        ef_search=100  # Balances recall vs latency (recommended for serverless)
    )

    return results.matches


In [76]:
def search_and_display(query_text: str, top_k: int = 3):
    """
    Performs semantic search on Pinecone and prints top results.

    Args:
        query_text (str): The question or search phrase.
        top_k (int): Number of top results to return.
    """
    results = optimized_query(query_text, top_k=top_k)

    print(f"\n🔍 Query: \"{query_text}\"")
    print(f"{'=' * 60}")
    for idx, match in enumerate(results, 1):
        print(f"\n🔹 Result {idx}")
        print(f"📌 Score: {match.score:.3f}")
        print(f"📄 Source: {match.metadata.get('source', '[No source]')}")
        print(f"📝 Content Preview:\n{match.metadata.get('text', '[No text]')[:300]}...\n{'-'*60}")


In [77]:
search_and_display("What PPE is required for field inspection?")


🔍 Query: "What PPE is required for field inspection?"

🔹 Result 1
📌 Score: 0.880
📄 Source: ADM_04-00-003.pdf
📝 Content Preview:
1. At the start of any inspection/audit or other field activity, the employees will assess the need for PPE, which will include the employer’s PPE assessment. 

2. Employees including temporary, contract and visiting employees will abide by OSHA’s PPE Program (Chapter 8) or the program of the employ...
------------------------------------------------------------

🔹 Result 2
📌 Score: 0.876
📄 Source: ADM_04-00-003.pdf
📝 Content Preview:
1. Wear PPE as necessary; 

2. Attend PPE training sessions; 

3. Care for, clean, maintain and dispose of PPE as necessary; and 

4. Report any damaged or defective PPE to their responsible OSHA Manager(s). IV. Procedure Hazard Assessment 

1. Based on a general assessment of all work sites, it is ...
------------------------------------------------------------

🔹 Result 3
📌 Score: 0.870
📄 Source: ADM_04-00-003.pdf
📝 Content Pr

In [78]:
search_and_display("What are the roles and responsibilities of a Regional Administrator?")


🔍 Query: "What are the roles and responsibilities of a Regional Administrator?"

🔹 Result 1
📌 Score: 0.897
📄 Source: ADM_04-00-003.pdf
📝 Content Preview:
3. Regional Offices The Regional Administrator s bear responsibility for the health and safety of all Regional employees as well as temporary, contract and visiting employees . The Regional Administrat or will demonstrate leadership and commitment to employee safety and health. See Chapter 4 for rol...
------------------------------------------------------------

🔹 Result 2
📌 Score: 0.894
📄 Source: ADM_04-00-003.pdf
📝 Content Preview:
4. REGIONAL OFFICE I. Roles and Responsibilities The Regional Administrator will: 

1. Implement the SHMS and safety and health programs in accordance with this Instruction and existing laws and regulations applicable to all working conditions of employees in the Region. 

2. Serve as a role model t...
------------------------------------------------------------

🔹 Result 3
📌 Score: 0.873
📄 Source: ADM_

In [79]:
search_and_display("How are safety hazards supposed to be reported by employees?")


🔍 Query: "How are safety hazards supposed to be reported by employees?"

🔹 Result 1
📌 Score: 0.900
📄 Source: ADM_04-00-003.pdf
📝 Content Preview:
. d. Identify other site -specific hazards and how to protect Authorized Employees (e .g. noise, electricity). Determine if employees with specialized expertise to mitigate specific hazards are needed, for example, to verify proper lockout/tagout of hazardous energy sources and proper Radio Frequenc...
------------------------------------------------------------

🔹 Result 2
📌 Score: 0.895
📄 Source: ADM_04-00-003.pdf
📝 Content Preview:
1. Hazards and risks to employees' safety and health should be identified and assessed on an ongoing basis at b oth the office and field locations, such as at enforcement inspection and VPP onsite evaluation locations. Implementation of preventive and protective measures should: eliminate the hazard...
------------------------------------------------------------

🔹 Result 3
📌 Score: 0.888
📄 Source: ADM_04-00-00

In [80]:
search_and_display("What training requirements must supervisors follow?")


🔍 Query: "What training requirements must supervisors follow?"

🔹 Result 1
📌 Score: 0.886
📄 Source: ADM_04-00-003.pdf
📝 Content Preview:
. 2-11 Records of training will be maintained for th ree years at the Regional or Office level to ensure that all employees have been appropriately trained. Supervisors (or designees) will make available records of the training conducted to the Regional Administrator, and DTSEM when requested. Super...
------------------------------------------------------------

🔹 Result 2
📌 Score: 0.873
📄 Source: ADM_04-00-003.pdf
📝 Content Preview:
4. Supervisors will periodically evaluate the employee use of PPE to ensure that employees are adequately protected. VI. Safety and Health Training The following procedures apply to supervisors (or designees) in the Regions: The supervisor (or designee) will ensure that all employees are trained ini...
------------------------------------------------------------

🔹 Result 3
📌 Score: 0.863
📄 Source: ADM_04-00-003.pdf
📝 C

In [81]:
search_and_display("Explain the corrective action tracking process after a hazard report.")


🔍 Query: "Explain the corrective action tracking process after a hazard report."

🔹 Result 1
📌 Score: 0.883
📄 Source: ADM_04-00-003.pdf
📝 Content Preview:
. 2-10 VII. SPECIFIC SAFETY AND HEALTH PROGRAMS ................................ ................................ .... 2-11 APPENDIX A: CORRECTIVE ACTIONS LIST ................................ ................................ .............. 2-12 APPENDIX B : HAZARD REPORTING AND INCIDENT INVESTIGATI...
------------------------------------------------------------

🔹 Result 2
📌 Score: 0.882
📄 Source: ADM_04-00-003.pdf
📝 Content Preview:
2. Describe the procedures to oversee the activities of service / nested contractors who perform work in your office or building. B-

3. Accident/Incident I nvestigations * OSHA ARCHIVE DOCUMENT * NOTICE: This is an OSHA ARCHIVE Document, and may no longer represent OSHA policy. * OSHA ARCHIVE DOCUM...
------------------------------------------------------------

🔹 Result 3
📌 Score: 0.880
📄 Source: ADM

In [82]:
search_and_display("What is included in the SHMS self-evaluation process?")


🔍 Query: "What is included in the SHMS self-evaluation process?"

🔹 Result 1
📌 Score: 0.887
📄 Source: ADM_04-00-003.pdf
📝 Content Preview:
15. e. The Regional Office will receive copies of the latest Annual SHMS Self -Evaluation from the Area and District Offices. By February 15 of each year, the RSHM will review the Annual SHMS Self -Evaluations from each Area and D istrict Office, then summarize and brief the Regional Administrator a...
------------------------------------------------------------

🔹 Result 2
📌 Score: 0.880
📄 Source: ADM_04-00-003.pdf
📝 Content Preview:
. h. The RSHM will share general summaries of the Annual SHMS Self -Evaluations with the RSHC. A copy of the Annual SHMS Self -Evaluation must be forwarded to DTSEM. * OSHA ARCHIVE DOCUMENT * NOTICE: This is an OSHA ARCHIVE Document, and may no longer represent OSHA policy. * OSHA ARCHIVE DOCUMENT *...
------------------------------------------------------------

🔹 Result 3
📌 Score: 0.874
📄 Source: ADM_04-00-003.pdf
📝

In [83]:
search_and_display("How is the SHMS changed and who approves it?")


🔍 Query: "How is the SHMS changed and who approves it?"

🔹 Result 1
📌 Score: 0.886
📄 Source: ADM_04-00-003.pdf
📝 Content Preview:
. Changes related to the implementation of SHMS may be made with local SHMS committee approval. Changes to the SHMS or programs that alter SHMS or program policies require National Labor -Management Steering Committee review and approval. The SHMS and its programs will be implemented in phases per t...
------------------------------------------------------------

🔹 Result 2
📌 Score: 0.866
📄 Source: ADM_04-00-003.pdf
📝 Content Preview:
. The SHMS and programs provide baseline guidance to OSHA in order to implement an effective SHMS to prevent employee injuries, illnesses and fatalities. Within established guidelines, Regional Administrators may suppleme nt or augment the SHMS and programs to address the unique needs within the Nat...
------------------------------------------------------------

🔹 Result 3
📌 Score: 0.864
📄 Source: ADM_04-00-003.pdf
📝 Content 

In [84]:
search_and_display("Describe the requirements for PPE during field inspections.")


🔍 Query: "Describe the requirements for PPE during field inspections."

🔹 Result 1
📌 Score: 0.898
📄 Source: ADM_04-00-003.pdf
📝 Content Preview:
1. At the start of any inspection/audit or other field activity, the employees will assess the need for PPE, which will include the employer’s PPE assessment. 

2. Employees including temporary, contract and visiting employees will abide by OSHA’s PPE Program (Chapter 8) or the program of the employ...
------------------------------------------------------------

🔹 Result 2
📌 Score: 0.893
📄 Source: ADM_04-00-003.pdf
📝 Content Preview:
1. Wear PPE as necessary; 

2. Attend PPE training sessions; 

3. Care for, clean, maintain and dispose of PPE as necessary; and 

4. Report any damaged or defective PPE to their responsible OSHA Manager(s). IV. Procedure Hazard Assessment 

1. Based on a general assessment of all work sites, it is ...
------------------------------------------------------------

🔹 Result 3
📌 Score: 0.889
📄 Source: ADM_04-00-003

In [None]:
import json

def clean_widgets_from_notebook(input_path, output_path):
    with open(input_path, 'r', encoding='utf-8') as f:
        notebook = json.load(f)

    if 'widgets' in notebook.get('metadata', {}):
        print("🧹 Removing corrupted widget metadata...")
        del notebook['metadata']['widgets']

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(notebook, f, indent=2)

    print(f"✅ Cleaned notebook saved to: {output_path}")

# Example usage:
clean_widgets_from_notebook("vectorsLearning.ipynb", "vectorsLearning_clean.ipynb")
