#### JSON splitter

In [1]:
import json
import requests
from langchain_text_splitters import RecursiveJsonSplitter

In [2]:
# Download the LangSmith OpenAPI spec that the rest of the notebook splits.
# A timeout keeps a stalled connection from hanging the kernel indefinitely,
# and raise_for_status() surfaces HTTP errors (4xx/5xx) immediately instead of
# letting an error body be parsed as if it were the spec.
response = requests.get("https://api.smith.langchain.com/openapi.json", timeout=30)
response.raise_for_status()
json_data = response.json()
json_data

{'openapi': '3.1.0',
 'info': {'title': 'LangSmith', 'version': '0.1.0'},
 'paths': {'/api/v1/sessions/{session_id}': {'get': {'tags': ['tracer-sessions'],
    'summary': 'Read Tracer Session',
    'description': 'Get a specific session.',
    'operationId': 'read_tracer_session_api_v1_sessions__session_id__get',
    'security': [{'API Key': []}, {'Tenant ID': []}, {'Bearer Auth': []}],
    'parameters': [{'name': 'session_id',
      'in': 'path',
      'required': True,
      'schema': {'type': 'string', 'format': 'uuid', 'title': 'Session Id'}},
     {'name': 'include_stats',
      'in': 'query',
      'required': False,
      'schema': {'type': 'boolean',
       'default': False,
       'title': 'Include Stats'}},
     {'name': 'accept',
      'in': 'header',
      'required': False,
      'schema': {'anyOf': [{'type': 'string'}, {'type': 'null'}],
       'title': 'Accept'}}],
    'responses': {'200': {'description': 'Successful Response',
      'content': {'application/json': {'sch

In [3]:
# Build the splitter with explicit chunk-size bounds.
# NOTE(review): the bounds are presumably measured in characters of the
# serialized JSON -- confirm against the library documentation.
splitter = RecursiveJsonSplitter(
    max_chunk_size=512,
    min_chunk_size=16,
)

# split_json yields a list of dict chunks (see the output below).
chunked_json = splitter.split_json(json_data)
chunked_json

[{'openapi': '3.1.0', 'info': {'title': 'LangSmith', 'version': '0.1.0'}},
 {'paths': {'/api/v1/sessions/{session_id}': {'get': {'tags': ['tracer-sessions'],
     'summary': 'Read Tracer Session',
     'description': 'Get a specific session.',
     'operationId': 'read_tracer_session_api_v1_sessions__session_id__get',
     'security': [{'API Key': []}, {'Tenant ID': []}, {'Bearer Auth': []}]}}}},
 {'paths': {'/api/v1/sessions/{session_id}': {'get': {'parameters': [{'name': 'session_id',
       'in': 'path',
       'required': True,
       'schema': {'type': 'string', 'format': 'uuid', 'title': 'Session Id'}},
      {'name': 'include_stats',
       'in': 'query',
       'required': False,
       'schema': {'type': 'boolean',
        'default': False,
        'title': 'Include Stats'}},
      {'name': 'accept',
       'in': 'header',
       'required': False,
       'schema': {'anyOf': [{'type': 'string'}, {'type': 'null'}],
        'title': 'Accept'}}]}}}},
 {'paths': {'/api/v1/sessions

In [4]:
# Preview only the first five chunks, one per line.
leading_chunks = chunked_json[:5]
for chunk in leading_chunks:
    print(chunk)

{'openapi': '3.1.0', 'info': {'title': 'LangSmith', 'version': '0.1.0'}}
{'paths': {'/api/v1/sessions/{session_id}': {'get': {'tags': ['tracer-sessions'], 'summary': 'Read Tracer Session', 'description': 'Get a specific session.', 'operationId': 'read_tracer_session_api_v1_sessions__session_id__get', 'security': [{'API Key': []}, {'Tenant ID': []}, {'Bearer Auth': []}]}}}}
{'paths': {'/api/v1/sessions/{session_id}': {'get': {'parameters': [{'name': 'session_id', 'in': 'path', 'required': True, 'schema': {'type': 'string', 'format': 'uuid', 'title': 'Session Id'}}, {'name': 'include_stats', 'in': 'query', 'required': False, 'schema': {'type': 'boolean', 'default': False, 'title': 'Include Stats'}}, {'name': 'accept', 'in': 'header', 'required': False, 'schema': {'anyOf': [{'type': 'string'}, {'type': 'null'}], 'title': 'Accept'}}]}}}}
{'paths': {'/api/v1/sessions/{session_id}': {'get': {'responses': {'200': {'description': 'Successful Response', 'content': {'application/json': {'schema'

#### Create Document objects

In [5]:
# create_documents takes a list of JSON objects; here it is the single spec.
# The result is a list of Document objects (see the output below).
json_inputs = [json_data]
doc = splitter.create_documents(texts=json_inputs)
doc

[Document(metadata={}, page_content='{"openapi": "3.1.0", "info": {"title": "LangSmith", "version": "0.1.0"}}'),
 Document(metadata={}, page_content='{"paths": {"/api/v1/sessions/{session_id}": {"get": {"tags": ["tracer-sessions"], "summary": "Read Tracer Session", "description": "Get a specific session.", "operationId": "read_tracer_session_api_v1_sessions__session_id__get", "security": [{"API Key": []}, {"Tenant ID": []}, {"Bearer Auth": []}]}}}}'),
 Document(metadata={}, page_content='{"paths": {"/api/v1/sessions/{session_id}": {"get": {"parameters": [{"name": "session_id", "in": "path", "required": true, "schema": {"type": "string", "format": "uuid", "title": "Session Id"}}, {"name": "include_stats", "in": "query", "required": false, "schema": {"type": "boolean", "default": false, "title": "Include Stats"}}, {"name": "accept", "in": "header", "required": false, "schema": {"anyOf": [{"type": "string"}, {"type": "null"}], "title": "Accept"}}]}}}}'),
 Document(metadata={}, page_conten

In [6]:
# Print the first five Documents; print() shows each one's str() form.
leading_docs = doc[:5]
for d in leading_docs:
    print(d)

page_content='{"openapi": "3.1.0", "info": {"title": "LangSmith", "version": "0.1.0"}}'
page_content='{"paths": {"/api/v1/sessions/{session_id}": {"get": {"tags": ["tracer-sessions"], "summary": "Read Tracer Session", "description": "Get a specific session.", "operationId": "read_tracer_session_api_v1_sessions__session_id__get", "security": [{"API Key": []}, {"Tenant ID": []}, {"Bearer Auth": []}]}}}}'
page_content='{"paths": {"/api/v1/sessions/{session_id}": {"get": {"parameters": [{"name": "session_id", "in": "path", "required": true, "schema": {"type": "string", "format": "uuid", "title": "Session Id"}}, {"name": "include_stats", "in": "query", "required": false, "schema": {"type": "boolean", "default": false, "title": "Include Stats"}}, {"name": "accept", "in": "header", "required": false, "schema": {"anyOf": [{"type": "string"}, {"type": "null"}], "title": "Accept"}}]}}}}'
page_content='{"paths": {"/api/v1/sessions/{session_id}": {"get": {"responses": {"200": {"description": "Succ

In [7]:
# split_text returns the chunks as strings; show only the first one.
txt = splitter.split_text(json_data)
first_text_chunk = txt[0]
print(first_text_chunk)

{"openapi": "3.1.0", "info": {"title": "LangSmith", "version": "0.1.0"}}
