# Find JSON keys and associated values

I have a text file containing a list of deeply nested JSON objects. The objects' keys map to nested objects as well as lists, including nested lists. Using Python, I want a function that can search each object recursively for a specific key. If the key is found, the function should return the full path to the nested key along with its value. The function must correctly traverse nested lists.
There should be no duplicates in the results.
I also want to be able to provide a substring and have the function find every key that contains that substring.

In [166]:
# def find_key_paths(data, key=None, substring=None):
#     """
#     Recursively search a nested JSON-like structure (dicts + lists)
#     for keys that match exactly (key=...) or contain a substring (substring=...).

#     Returns:
#         List of dicts:
#             [{"path": "...", "value": <object>}, ...]

#     Ensures no duplicates and does NOT use a set, so unhashable values are allowed.
#     """

#     if key is None and substring is None:
#         raise ValueError("You must supply either key= or substring=")

#     results = []   # list of {"path": str, "value": object}

#     def already_recorded(path, value):
#         """Prevent duplicates without using sets."""
#         for entry in results:
#             if entry["path"] == path:
#                 # path uniquely identifies the match
#                 return True
#         return False

#     def key_matches(k):
#         """Check exact match or substring match."""
#         if key is not None and k == key:
#             return True
#         if substring is not None and substring in k:
#             return True
#         return False

#     def _search(obj, path):
#         if isinstance(obj, dict):
#             for k, v in obj.items():
#                 new_path = f"{path}.{k}" if path else k

#                 if key_matches(k) and not already_recorded(new_path, v):
#                     results.append({"path": new_path, "value": v})

#                 _search(v, new_path)

#         elif isinstance(obj, list):
#             for i, item in enumerate(obj):
#                 new_path = f"{path}[{i}]"
#                 _search(item, new_path)

#         # primitives -> no recursion needed
#         else:
#             return

#     _search(data, "")
#     return results


In [1]:
# def find_keys(data, key=None, substring=None, path=""):
#     """
#     Recursively search for keys in nested JSON-like structures.

#     Args:
#         data: The object to search (dict, list, scalar).
#         key: Exact key to match (optional).
#         substring: Substring to match inside keys (optional).
#         path: Automatically used during recursion to build the full key path.

#     Returns:
#         List of {"path": <string>, "value": <object>} dictionaries.
#     """

#     results = []
#     seen_paths = set()   # Only store path strings (hashable) to avoid duplicates

#     def _search(obj, current_path):
#         # If obj is a dict, iterate over its items
#         if isinstance(obj, dict):
#             for k, v in obj.items():

#                 # Build path to this key
#                 new_path = f"{current_path}.{k}" if current_path else k

#                 # Check match conditions
#                 matched = False
#                 if key is not None and k == key:
#                     matched = True
#                 if substring is not None and substring in k:
#                     matched = True

#                 if matched and new_path not in seen_paths:
#                     seen_paths.add(new_path)
#                     results.append({"path": new_path, "value": v})

#                 # Recurse into values
#                 _search(v, new_path)

#         # If obj is a list, recurse into each element with index
#         elif isinstance(obj, list):
#             for idx, item in enumerate(obj):
#                 new_path = f"{current_path}[{idx}]"
#                 _search(item, new_path)

#         # Scalars (str, int, float, bool, None) do not recurse further

#     _search(data, path)
#     return results


In [1]:
from typing import Any, Dict, List, Optional


def find_keys(
    data: Any,
    key: Optional[str] = None,
    substring: Optional[str] = None,
    path: str = ""
) -> List[Dict[str, Any]]:
    """
    Recursively search a nested JSON-like structure for matching keys.

    Dicts are searched key by key and lists are walked element by element,
    at any depth. A dict key matches when it equals ``key`` or, if it is a
    string, contains ``substring``. Matched values are still descended
    into, so a match nested inside another match is reported as well.

    Args:
        data: The object to search (dict, list, or scalar).
        key: Exact key to match (optional).
        substring: Substring to look for inside string keys (optional).
        path: Prefix prepended to every reported path. Leave it empty when
            ``data`` is the root of the document.

    Returns:
        List of {"path": <string>, "value": <object>} dictionaries in
        traversal order. Dict keys are joined with "." and list indices
        appear as "[i]", e.g. "a.b[0].c". The list is empty when nothing
        matches, including when neither ``key`` nor ``substring`` is given.

        Paths are not escaped, so a key that itself contains "." or "["
        can produce the same path as a differently nested key; only the
        first match for a given path is kept.
    """

    results: List[Dict[str, Any]] = []
    seen_paths: set[str] = set()

    def _matches(candidate: Any) -> bool:
        """Return True when a dict key satisfies the exact or substring test."""
        if key is not None and candidate == key:
            return True
        # Parsed JSON only has string keys, but plain Python dicts may use
        # ints, tuples, etc.; `substring in <int>` would raise TypeError.
        return (
            substring is not None
            and isinstance(candidate, str)
            and substring in candidate
        )

    def _search(obj: Any, current_path: str) -> None:
        """Walk ``obj`` depth-first, recording every matching dict key."""

        if isinstance(obj, dict):
            for k, v in obj.items():
                # str(k) keeps the path a string even when a non-string key
                # sits at the root (where there is no prefix to format into).
                child_path = f"{current_path}.{k}" if current_path else str(k)

                if _matches(k) and child_path not in seen_paths:
                    seen_paths.add(child_path)
                    results.append({"path": child_path, "value": v})

                # Keep descending: matches may be nested inside this value.
                _search(v, child_path)

        elif isinstance(obj, list):
            for idx, item in enumerate(obj):
                _search(item, f"{current_path}[{idx}]")

        # Scalars have no keys and nothing to descend into.

    _search(data, path)
    return results


In [2]:
# Get fake visa data from file
import json  # needed by json.load below; no other cell imports it
from pprint import pprint

# Decode explicitly as UTF-8 rather than relying on the platform default
# encoding, which differs between operating systems.
with open("ds160_100.json", encoding="utf-8") as file:
    visa_data = json.load(file)

# pprint(visa_data[5])

## Find all keys containing "date"

In [3]:
# Report how many records were loaded, then list the date-like fields of
# the first two records only, to keep the output short.
print(len(visa_data))
for index, item in enumerate(visa_data[:2]):
    print(f"{index=}")
    for result in find_keys(item, substring="date"):
        print(f"{result['path']=} -> {result['value']=}")


100
index=0
result['path']='submission_date' -> result['value']='2025-06-08'
result['path']='personal_information.date_of_birth' -> result['value']='1993-04-06'
result['path']='passport_information.issue_date' -> result['value']='2019-04-08'
result['path']='passport_information.expiration_date' -> result['value']='2029-04-08'
result['path']='travel_information.intended_date_of_arrival' -> result['value']='2026-06-03'
result['path']='work_education.present_employer.start_date' -> result['value']='2022-11-23'
result['path']='work_education.employment_history[0].start_date' -> result['value']='2008-11-24'
result['path']='work_education.employment_history[0].end_date' -> result['value']='2013-11-23'
result['path']='previous_us_travel.visits[0].arrival_date' -> result['value']='2024-01-18'
result['path']='previous_us_travel.visits[1].arrival_date' -> result['value']='2022-10-09'
result['path']='testing_nested.level1.level2.level3.documents[0].meta.issued_date' -> result['value']='2025-02-18'

In [5]:
# Sample document with dicts inside a doubly nested list, used to check
# that list indices show up correctly in the reported paths.
data1 = {
    "user": {
        "profile": {
            "name": "Alice",
            "details": {
                "bo": [[
                    {"age": 30, "location": {"city": "Boston"}},
                    {"projects": [{"title": "X"}, {"title": "Y", "city": "Miami"}]}
                ]]}
        }
    }
}
# Search once and reuse the list instead of walking the structure twice.
results = find_keys(data1, "city")
for result in results:
    print(f"{result['path']=} -> {result['value']=}")
# results = find_keys(data1, "city")
# print(f"{result['path']=} -> {result['value']=}")
# for result in results:
#     print(f"{result['path']=} -> {result['value']=}")
#     # print(f"{result['path']=}")
#     # print(f"{result['value']=}")

result['path']='user.profile.details.bo[0][0].location.city' -> result['value']='Boston'
result['path']='user.profile.details.bo[0][1].projects[1].city' -> result['value']='Miami'


In [6]:
# Exact-key search: "city" appears at two different depths of the sample.
data1 = {
    "user": {
        "profile": {
            "name": "Alice",
            "details": {
                "bo": [
                    [
                        {"age": 30, "location": {"city": "Boston"}},
                        {
                            "projects": [
                                {"title": "X"},
                                {"title": "Y", "city": "Miami"},
                            ]
                        },
                    ]
                ]
            },
        }
    }
}
results = find_keys(data1, key="city")
for result in results:
    print(f"{result['path']=} -> {result['value']=}")

result['path']='user.profile.details.bo[0][0].location.city' -> result['value']='Boston'
result['path']='user.profile.details.bo[0][1].projects[1].city' -> result['value']='Miami'


In [7]:
# Reuses data1 from the previous cell; "age" occurs once, inside the
# first dict of the inner list.
results = find_keys(data1, key="age")
for result in results:
    print(f"{result['path']=} -> {result['value']=}")

    # print(f"{result['path']=}")
    # print(f"{result['value']=}")

result['path']='user.profile.details.bo[0][0].age' -> result['value']=30


In [8]:
from pprint import pprint

# The value under "bo" is itself a nested list, so show it twice: once on
# a single line and once pretty-printed for readability.
results = find_keys(data1, key="bo")
for result in results:
    print(f"{result['path']=} -> {result['value']=}")
    pprint(result["value"])

result['path']='user.profile.details.bo' -> result['value']=[[{'age': 30, 'location': {'city': 'Boston'}}, {'projects': [{'title': 'X'}, {'title': 'Y', 'city': 'Miami'}]}]]
[[{'age': 30, 'location': {'city': 'Boston'}},
  {'projects': [{'title': 'X'}, {'city': 'Miami', 'title': 'Y'}]}]]


In [9]:
# Substring search: both "address_location" and "projects_location"
# contain "location", so both keys are reported with their whole values.
data2 = {
    "user": {
        "profile": {
            "name": "Alice",
            "details": {
                "bo": [
                    [
                        {"age": 30, "address_location": {"city": "Boston"}},
                        {
                            "projects_location": [
                                {"title": "X"},
                                {"title": "Y", "city": "Miami"},
                            ]
                        },
                    ]
                ]
            },
        }
    }
}
results = find_keys(data2, substring="location")
for result in results:
    print(f"{result['path']=} -> {result['value']=}")

    # print(f"{result['path']=}")
    # print(f"{result['value']=}")

result['path']='user.profile.details.bo[0][0].address_location' -> result['value']={'city': 'Boston'}
result['path']='user.profile.details.bo[0][1].projects_location' -> result['value']=[{'title': 'X'}, {'title': 'Y', 'city': 'Miami'}]


In [10]:
# Small structure where the key "c" occurs exactly once, so a single
# result is expected.
nested = {
    "a": {"b": 10, "c": [10, {"d": 20, "e": 10}]},
    "f": [{"g": 10}, {"h": 20}],
}
print(f"{nested=}")
results = find_keys(nested, key="c")
print(len(results))
for result in results:
    print(f"{result['path']=} -> {result['value']=}")

    # print(f"{result['path']=}")
    # print(f"{result['value']=}")

nested={'a': {'b': 10, 'c': [10, {'d': 20, 'e': 10}]}, 'f': [{'g': 10}, {'h': 20}]}
1
result['path']='a.c' -> result['value']=[10, {'d': 20, 'e': 10}]


In [12]:
# Same sample document as before, rebuilt so this cell runs on its own.
data1 = {
    "user": {
        "profile": {
            "name": "Alice",
            "details": {
                "bo": [
                    [
                        {"age": 30, "location": {"city": "Boston"}},
                        {
                            "projects": [
                                {"title": "X"},
                                {"title": "Y", "city": "Miami"},
                            ]
                        },
                    ]
                ]
            },
        }
    }
}
for result in find_keys(data1, key="city"):
    print(f"{result['path']=} -> {result['value']=}")

# "X" only ever appears as a value, never as a key, so nothing matches.
print(find_keys(data1, key="X"))

result['path']='user.profile.details.bo[0][0].location.city' -> result['value']='Boston'
result['path']='user.profile.details.bo[0][1].projects[1].city' -> result['value']='Miami'
[]
