In [1]:
# %pip install jq   (JSONLoader needs the jq package; uncomment to install it into the kernel's environment)

In [2]:
import json
from pathlib import Path
from pprint import pprint

file_path = "./data/facebook_chat.json"
# Read the file with an explicit UTF-8 encoding so decoding does not depend on
# the platform's default locale encoding, then parse the text as JSON.
data = json.loads(Path(file_path).read_text(encoding="utf-8"))
pprint(data)

# Equivalent alternative using the built-in open():
# with open(file_path, "r", encoding="utf-8") as f:
#     data = json.load(f)
#
# print(data)


{'image': {'creation_timestamp': 1675549016, 'uri': 'image_of_the_chat.jpg'},
 'is_still_participant': True,
 'joinable_mode': {'link': '', 'mode': 1},
 'magic_words': [],
 'messages': [{'content': 'Bye!',
               'sender_name': 'User 2',
               'timestamp_ms': 1675597571851},
              {'content': 'Oh no worries! Bye',
               'sender_name': 'User 1',
               'timestamp_ms': 1675597435669},
              {'content': 'No Im sorry it was my mistake, the blue one is not '
                          'for sale',
               'sender_name': 'User 2',
               'timestamp_ms': 1675596277579},
              {'content': 'I thought you were selling the blue one!',
               'sender_name': 'User 1',
               'timestamp_ms': 1675595140251},
              {'content': 'Im not interested in this bag. Im interested in the '
                          'blue one!',
               'sender_name': 'User 1',
               'timestamp_ms': 1675595109305},
   

In [3]:
from langchain_community.document_loaders import JSONLoader
from pprint import pprint

file_path = "./data/facebook_chat.json"

# Each entry holds the JSONLoader keyword arguments for one experiment.
# Every experiment reads the same file with text_content=False; only the jq
# schema and the content-key settings change.
loader_variants = [
    # Method 1: put the value of .messages[].content into each document.
    {"jq_schema": ".messages[].content"},
    # Method 2: same goal as method 1, but select each message record with
    # jq_schema and pick its text with a jq-parsable content_key.
    {
        "jq_schema": ".messages[]",
        "content_key": ".content",
        "is_content_key_jq_parsable": True,
    },
    # A form that does NOT give the expected result: selecting the whole file
    # and addressing the message contents through content_key.
    {
        "jq_schema": ".",
        "content_key": ".messages[].content",
        "is_content_key_jq_parsable": True,
    },
    # Write all the data of each message, in JSON form, into the document.
    {"jq_schema": ".messages[]"},
    # The whole messages list becomes a single record - not what we want.
    {"jq_schema": ".messages"},
    # Brute force: use all of the data as the content of one document.
    {"jq_schema": "."},
]

# Run the experiments in order, printing the document count and the documents
# for each one. `loader` and `documents` stay bound to the last experiment.
for loader_options in loader_variants:
    loader = JSONLoader(file_path=file_path, text_content=False, **loader_options)
    documents = loader.load()
    print(len(documents))
    pprint(documents)

11
[Document(metadata={'source': '/Users/gevin/projects/pycharm/laithw/notebook/04-langchain-rag/data/facebook_chat.json', 'seq_num': 1}, page_content='Bye!'),
 Document(metadata={'source': '/Users/gevin/projects/pycharm/laithw/notebook/04-langchain-rag/data/facebook_chat.json', 'seq_num': 2}, page_content='Oh no worries! Bye'),
 Document(metadata={'source': '/Users/gevin/projects/pycharm/laithw/notebook/04-langchain-rag/data/facebook_chat.json', 'seq_num': 3}, page_content='No Im sorry it was my mistake, the blue one is not for sale'),
 Document(metadata={'source': '/Users/gevin/projects/pycharm/laithw/notebook/04-langchain-rag/data/facebook_chat.json', 'seq_num': 4}, page_content='I thought you were selling the blue one!'),
 Document(metadata={'source': '/Users/gevin/projects/pycharm/laithw/notebook/04-langchain-rag/data/facebook_chat.json', 'seq_num': 5}, page_content='Im not interested in this bag. Im interested in the blue one!'),
 Document(metadata={'source': '/Users/gevin/projec

In [4]:
# Collect metadata
# Building on method 2 above: copy more fields from each record into the metadata, and show how to modify the default metadata entries.

from langchain_community.document_loaders import JSONLoader
from pprint import pprint


def metadata_func(record: dict, metadata: dict) -> dict:
    """Add sender details to a document's metadata and shorten its source path.

    Args:
        record: One message record. ``sender_name`` and ``timestamp_ms`` are
            read with ``.get``, so a missing key is stored as ``None``.
        metadata: The metadata dict passed in for the document. It is updated
            in place.

    Returns:
        The same ``metadata`` dict, with ``sender_name`` and ``timestamp_ms``
        added and, when a ``source`` entry is present, that entry reduced to
        the file name only.
    """
    import os  # local import so this cell does not rely on imports from other cells

    metadata["sender_name"] = record.get("sender_name")
    metadata["timestamp_ms"] = record.get("timestamp_ms")

    if "source" in metadata:
        # Keep only the last path component. os.path.basename uses the
        # platform's own separator rules instead of assuming "/".
        # (To keep the path from the "data" directory onwards instead, slice
        # the path components starting at "data".)
        metadata["source"] = os.path.basename(metadata["source"])

    return metadata


file_path = "./data/facebook_chat.json"

# One document per message: the text comes from each record's .content field,
# and metadata_func gets the chance to adjust each document's metadata.
loader_options = dict(
    file_path=file_path,
    jq_schema='.messages[]',
    text_content=False,
    content_key=".content",
    is_content_key_jq_parsable=True,
    metadata_func=metadata_func,
)
loader = JSONLoader(**loader_options)
documents = loader.load()

# Show how many documents were produced, then the documents themselves.
print(len(documents))
pprint(documents)


11
[Document(metadata={'source': 'facebook_chat.json', 'seq_num': 1, 'sender_name': 'User 2', 'timestamp_ms': 1675597571851}, page_content='Bye!'),
 Document(metadata={'source': 'facebook_chat.json', 'seq_num': 2, 'sender_name': 'User 1', 'timestamp_ms': 1675597435669}, page_content='Oh no worries! Bye'),
 Document(metadata={'source': 'facebook_chat.json', 'seq_num': 3, 'sender_name': 'User 2', 'timestamp_ms': 1675596277579}, page_content='No Im sorry it was my mistake, the blue one is not for sale'),
 Document(metadata={'source': 'facebook_chat.json', 'seq_num': 4, 'sender_name': 'User 1', 'timestamp_ms': 1675595140251}, page_content='I thought you were selling the blue one!'),
 Document(metadata={'source': 'facebook_chat.json', 'seq_num': 5, 'sender_name': 'User 1', 'timestamp_ms': 1675595109305}, page_content='Im not interested in this bag. Im interested in the blue one!'),
 Document(metadata={'source': 'facebook_chat.json', 'seq_num': 6, 'sender_name': 'User 2', 'timestamp_ms': 167