### JSON Parsing and Processing


In [2]:
import json 
import os

# Create the directory that the sample files below are written into.
# exist_ok=True keeps this cell re-runnable: no error if the directory already exists.
os.makedirs("data/json_files2", exist_ok=True)


In [3]:
# Sample nested dataset used by the loading examples in this notebook:
#   - "company":     the company name
#   - "employees":   a list of records, each with id, name, role, a list of
#                    skills, and a list of projects (name + status)
#   - "departments": a mapping keyed by department name, each holding head,
#                    budget, and team_size
json_data = {
  "company": "TechCorp",
  "employees": [
    {
      "id": 1,
      "name": "John Doe",
      "role": "Software Engineer",
      "skills": [
        "Python",
        "JavaScript",
        "React"
      ],
      "projects": [
        {
          "name": "RAG System",
          "status": "In Progress"
        },
        {
          "name": "Data Pipeline",
          "status": "Completed"
        }
      ]
    },
    {
      "id": 2,
      "name": "Jane Smith",
      "role": "Data Scientist",
      "skills": [
        "Python",
        "Machine Learning",
        "SQL"
      ],
      "projects": [
        {
          "name": "ML Model",
          "status": "In Progress"
        },
        {
          "name": "Analytics Dashboard",
          "status": "Planning"
        }
      ]
    }
  ],
  "departments": {
    "engineering": {
      "head": "Mike Johnson",
      "budget": 1000000,
      "team_size": 25
    },
    "data_science": {
      "head": "Sarah Williams",
      "budget": 750000,
      "team_size": 15
    }
  }
}

# Persist the sample structure as pretty-printed JSON (2-space indent).
# The encoding is stated explicitly so the file does not depend on the
# platform's default locale encoding.
with open("data/json_files2/company_data.json", "w", encoding="utf-8") as f:
    json.dump(json_data, f, indent=2)

In [4]:
# Sample event log. Every event carries timestamp, event, and user_id; the
# remaining fields depend on the event ("page" for page_view, "amount" for
# purchase).
jsonl_data = [
    {"timestamp": "2024-01-01", "event": "user_login", "user_id": 123},
    {"timestamp": "2024-01-01", "event": "page_view", "user_id": 123, "page": "/home"},
    {"timestamp": "2024-01-01", "event": "purchase", "user_id": 123, "amount": 99.99}
]

# JSON Lines layout: one serialized object per line. The encoding is stated
# explicitly so the file does not depend on the platform's default locale
# encoding.
with open("data/json_files2/events.jsonl", "w", encoding="utf-8") as f:
    for entry in jsonl_data:
        f.write(json.dumps(entry) + "\n")

### JSON Processing Strategies

In [6]:
from langchain_community.document_loaders import JSONLoader
import json

## Method1 - JSONLoader with jq_schema
# The jq expression ".employees[]" iterates the top-level "employees" array,
# so each employee object is emitted as its own Document instead of one
# Document for the whole file.
employee_loader = JSONLoader(
    file_path="data/json_files2/company_data.json",
    jq_schema=".employees[]",
    # The selected values are JSON objects rather than plain strings; the
    # output below shows them serialized as JSON text in page_content.
    text_content=False,
)
employee_docs = employee_loader.load()

# Report how many documents were produced before touching the first one, so
# the count is printed even if the list turns out to be empty.
doc_count = len(employee_docs)
print(f"Loaded {doc_count} employee documents")

# Preview only the first 200 characters of the first document's content.
first_doc = employee_docs[0]
print(f"First employee document: {first_doc.page_content[:200]}")

# Full dump of every Document (metadata included) for inspection.
print(employee_docs)

Loaded 2 employee documents
First employee document: {"id": 1, "name": "John Doe", "role": "Software Engineer", "skills": ["Python", "JavaScript", "React"], "projects": [{"name": "RAG System", "status": "In Progress"}, {"name": "Data Pipeline", "status"
[Document(metadata={'source': '/Users/johnny/dev/ai/ragudemy/0-DataIngestParsing/data/json_files2/company_data.json', 'seq_num': 1}, page_content='{"id": 1, "name": "John Doe", "role": "Software Engineer", "skills": ["Python", "JavaScript", "React"], "projects": [{"name": "RAG System", "status": "In Progress"}, {"name": "Data Pipeline", "status": "Completed"}]}'), Document(metadata={'source': '/Users/johnny/dev/ai/ragudemy/0-DataIngestParsing/data/json_files2/company_data.json', 'seq_num': 2}, page_content='{"id": 2, "name": "Jane Smith", "role": "Data Scientist", "skills": ["Python", "Machine Learning", "SQL"], "projects": [{"name": "ML Model", "status": "In Progress"}, {"name": "Analytics Dashboard", "status": "Planning"}]}')]


In [None]:
from langchain_core.documents import Document
## Method2 - Custom parsing function (for complex structures)
# Unlike Method1, this approach does not use JSONLoader: the function below
# reads the file with json.load and builds the Document objects by hand.
print("2️⃣ Custom JSON processing")
def process_json_intelligently(filepath: str) -> list[Document]:
    """Convert each employee record in a JSON file into a readable Document.

    Reads ``filepath`` as JSON, walks the top-level ``employees`` list, and
    renders every employee as a short plain-text summary (name, role, skills,
    then one bullet per project). Identifying fields are copied into the
    Document metadata.

    Args:
        filepath: Path of the JSON file to read. It is also stored verbatim
            as the ``source`` metadata of every returned Document.

    Returns:
        One Document per employee, in file order. The list is empty when the
        file has no ``employees`` key.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If an employee lacks ``id``, ``name`` or ``role``, or a
            project lacks ``name`` or ``status``.
    """
    # State the encoding explicitly so reading does not depend on the
    # platform's default locale encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        company = json.load(f)

    documents = []
    for emp in company.get('employees', []):
        # Assemble the text line by line. An indented triple-quoted f-string
        # would bake the source code's leading whitespace into every line of
        # page_content, which is noise for anything that consumes the text.
        lines = [
            "Employee Information:",
            f"Name: {emp['name']}",
            f"Role: {emp['role']}",
            # Skills are treated as optional, the same way projects are.
            f"Skills: {', '.join(emp.get('skills', []))}",
            "Projects:",
        ]
        lines.extend(
            f"- {project['name']} ({project['status']})"
            for project in emp.get('projects', [])
        )

        documents.append(
            Document(
                page_content="\n".join(lines),
                metadata={
                    'source': filepath,
                    'employee_id': emp['id'],
                    'employee_name': emp['name'],
                    'role': emp['role'],
                    'data_type': 'employee_info'
                }
            )
        )
    return documents

# Run the custom processor on the company file written earlier. As the last
# expression in the cell, its return value is what the notebook displays.
process_json_intelligently("data/json_files2/company_data.json")

2️⃣ Custom JSON processing
