In [28]:
import requests
import json
import time
from requests.exceptions import HTTPError
import pandas as pd
from pprint import pprint
import pickle
import ast
import numpy as np

# Load your API keys and other configurations
from config import prismClientID, prismClientSecret, prismPersonal

### Use if needed to create a table from Prism of all available communities uuids and slugs

In [29]:
## Request a list of communities from Prism

# Initialize a session
session = requests.Session()

## Possible API endpoint URLs querying Prism API
##url = f'https://vtfsmghslrepo02.fsm.northwestern.edu/api/communities/{slug}'
# url = f'https://vtfsmghslrepo02.fsm.northwestern.edu/api/communities?'
# url = f'https://prism-staging.fsm.northwestern.edu/api/communities?'
url = 'https://prism.northwestern.edu/api/communities?'

# List to store retrieved data
community_list = []

# Headers for API requests
headers = {
    'Authorization': 'Bearer ' + prismPersonal,
    'client-id': prismClientID,
    'Accept': 'application/json'
}

# Function to make API requests with rate limiting
def make_request(url):
    try:
        response = session.get(url, headers=headers)
        response.raise_for_status()
        return response.json()
    except HTTPError as http_error:
        print(f"HTTP error occurred: {http_error}")
        return None

# Make initial request
while url:
    # Make the API request
    community_json = make_request(url)
    
    # If request successful, process the data
    if community_json:
        community_list.extend(community_json["hits"]["hits"])
             
        # Check if there is a next URL
        next_url = community_json.get("links", {}).get("next")
        print(next_url)
        
        if next_url:
            url = next_url
            # Implement a delay between requests (e.g., 1 second)
            time.sleep(1)
        else:
            # If there is no next URL, exit the loop
            break
    else:
        # If request failed, exit the loop
        break

# Print the total number of retrieved items
print(len(community_list))

https://prism.northwestern.edu/api/communities?page=2&size=25&sort=newest
None
45


In [30]:
## Create a list of slugs and UUIDs to use for API calls

prism_uuids_list = []
prism_slug_list = []

## Create an id list
for community in community_list: 
    prism_uuids_list.append(community['id'])

## Create a slug list
for community in community_list: 
    prism_slug_list.append(community['slug'])
    
## Create an id list
# for community in community_dict["hits"]["hits"]:
#     prism_uuids_list.append(community['id'])

# ## Create a slug list
# for community in community_dict["hits"]["hits"]:
#     prism_slug_list.append(community['slug'])
    
print(len(prism_uuids_list))
print(prism_uuids_list)
print(len(prism_slug_list))
print(prism_slug_list)

45
['05df4a0a-80aa-47eb-b986-db454f4fbf7e', '0e0da8db-4528-426b-b7f5-63742f58cc80', 'c0e96cf3-8ad1-4f92-9174-6a45b6ca5c18', '7acb6bae-33da-46b4-89b9-d20b19151c0a', '0ea82171-47e4-492a-b569-6a949b4ca891', 'ed7d8416-14ca-47b5-a6d1-cbde3e786f0f', '5d46b99b-4925-42ff-9897-1d48d09e5c6f', 'dfd646ad-2502-4eeb-9db1-2d28ed0784e0', '61994f8d-0209-499d-b36b-65995afc2ca4', '155ff694-ae68-4b46-bab9-8b9fdff45cec', 'a2bf4fa0-66eb-4ca2-af0c-f78b5776404c', 'f9beab0b-6243-456f-ad69-6b6a0e8eb51e', 'a4fdff00-f419-4828-8a49-8e7ca7a12154', '988b4543-10be-4dc0-88cc-1fb961e54389', '61108e04-0091-421f-b146-a26e4957213d', '6c1c06b0-e48d-42d5-9572-82abfe0453bd', '3b15f7d4-d584-46af-81b7-41be026bfb9e', '67fbda69-7c71-45c0-b6db-ddd9740ed399', '56c8ccbc-8b1c-4457-a825-d0488288466d', '6937b8b2-19a6-4eca-83b2-773aebef1d6b', '62fe3cf0-b001-404b-be92-05bc209d60be', '8d5694dc-65a3-4361-b103-01781bcc4d06', '29596bf2-bfaf-463c-82dd-842b95f77d4e', 'b1416acf-847f-4169-96bc-82075639b670', '54170f02-4122-4ebe-b2f3-c12b2ac472b

In [31]:
## Create dataframe with slug and uuid as columns

# get the list of tuples from two lists.
# and merge them by using zip().
list_of_tuples = list(zip(prism_slug_list, prism_uuids_list))
 
# Converting lists of tuples into
# pandas Dataframe.
identifier_df = pd.DataFrame(list_of_tuples, columns = ['prism_slug', 'prism_uuid'])
identifier_df.head()  


Unnamed: 0,prism_slug,prism_uuid
0,hauser-lab-data,05df4a0a-80aa-47eb-b986-db454f4fbf7e
1,xiaopei,0e0da8db-4528-426b-b7f5-63742f58cc80
2,ladnerlab-cost-paper-hepatology,c0e96cf3-8ad1-4f92-9174-6a45b6ca5c18
3,daquilalab,7acb6bae-33da-46b4-89b9-d20b19151c0a
4,bcc,0ea82171-47e4-492a-b569-6a949b4ca891


In [32]:
## Save the dataframe to a CSV

with open(r"output/community_identifier_slug_df.csv", 'w', encoding='utf-8') as file:
    identifier_df.to_csv(file, lineterminator='\n', index=True)
    file.close()

### Begin here once you know the uuid or slug for the community(ies) you want to request records from

In [33]:
## API request all records from a list of communities or one community in Prism
## This script makes API requests to retrieve all records for known communities in Prism.
## It takes a list of community identifiers (UUIDs) and, for each identifier, sends a GET request
## to the Prism API endpoint. The script also handles pagination for responses that include a
## 'next' link and logs the API response metadata in a list for further analysis.
    
## Note:
## - Ensure that the variables 'prismPersonal' and 'prismClientID' are defined and hold the proper authentication credentials for accessing the Prism API.
## - Adjust the API endpoint URL as needed (alternative URLs are provided in the commented code).
## - A short delay is introduced between each request to help avoid rate limiting issues.

## List of community UUIDs for which API records are requested.
prism_uuids_list=['ff968ff7-5cb4-4ad7-aff3-f62cf1707920']

## Global lists to store the API responses and response metadata.
multiple_identifier_list=[]
api_response_list=[]


def get_record_request(prism_identifier_list):
    """
    Sends API requests for each community identifier in the provided list.
    
    For each identifier:
    - Constructs the API endpoint URL.
    - Sends a GET request with the necessary headers.
    - Processes the JSON response and appends it to the global list.
    - Handles potential pagination using the 'next' link in the response.
    - Logs the response status (success or error) in a separate list.
    
    Returns:
        tuple: (list of all API response dictionaries, list of API response metadata)
    """
    # Iterate over each community identifier to create URL for request for each item in the prism_identifier_list
    for identifier in prism_identifier_list:
        try:
            # Print the identifier for debugging.
            #print(identifier)

            # Define the headers required for the API request.
            headers = {
                'Authorization': 'Bearer ' + prismPersonal,
                'client-id': prismClientID,
                'Accept': 'application/json'
            }
            
            # Construct the API endpoint URL.
            # Uncomment one of the following URLs if needed:
            # url = f'https://prism-staging.fsm.northwestern.edu/api/communities/{identifier}/records'
            # url = f'https://vtfsmghslrepo02.fsm.northwestern.edu/api/communities/{identifier}/records'
            url = f'https://prism.northwestern.edu/api/communities/{identifier}/records'

            # Make the API request.
            single_identifier_response = requests.get(url, headers=headers)

            # Convert single_identifier_response to dict using .json()
            single_identifier_dict = single_identifier_response.json()
            
            # Create API response dict
            api_response_dict = {}
            api_response_dict['identifier'] = identifier
            api_response_dict['Response'] = "Success"
            api_response_list.append(api_response_dict.copy())  # Append a copy of the dict
            
            # Append each single_identifier_response to multiple_identifier_list
            multiple_identifier_list.append(single_identifier_dict.copy())

        except requests.HTTPError as err:
            if err.response.status_code == 404 and "the persistent identifier does not exist" in err.response.text:
                print("The persistent identifier does not exist:", identifier)
                continue  # Skip to the next identifier
            print(f'HTTP error occurred: {err}')
            api_response_dict['Response'] = str(err)
            api_response_list.append(api_response_dict.copy())

        except requests.Timeout:
            print('The request timed out')
            api_response_dict['Response'] = "Request timed out"
            api_response_list.append(api_response_dict.copy())

        except Exception as err:
            print(f'Other error occurred: {err}. ')
            api_response_dict['Response'] = str(err)
            api_response_list.append(api_response_dict.copy())

        finally:
            time.sleep(0.3)  # Delay to help avoid rate limiting

        # Check if the response includes a 'links' key to handle pagination.
        if 'links' in single_identifier_dict:
            next_url = single_identifier_dict['links'].get('next')
        else:
            next_url = None

        # Handle pagination if a 'next' URL is provided.
        while next_url:
            try:
                headers = {
                    'Authorization': 'Bearer ' + prismPersonal,
                    'client-id': prismClientID,
                    'Accept': 'application/json'
                }
                params = {'all_versions': 'false'}
                single_identifier_response = requests.get(next_url, headers=headers, params=params)
                single_identifier_dict = single_identifier_response.json()

                api_response_dict = {}
                api_response_dict['identifier'] = identifier
                api_response_dict['Response'] = "Success"
                api_response_list.append(api_response_dict.copy())
                multiple_identifier_list.append(single_identifier_dict.copy())

                if 'next' in single_identifier_dict['links']:
                    next_url = single_identifier_dict['links']['next']
                else:
                    break

            except requests.HTTPError as err:
                if err.response.status_code == 404 and "the persistent identifier does not exist" in err.response.text:
                    print("The persistent identifier does not exist:", identifier)
                    break
                print(f'HTTP error occurred: {err}')
                api_response_dict = {'identifier': identifier, 'Response': str(err)}
                api_response_list.append(api_response_dict.copy())

            except requests.Timeout:
                print('The request timed out')
                api_response_dict = {'identifier': identifier, 'Response': "Request timed out"}
                api_response_list.append(api_response_dict.copy())

            except Exception as err:
                print(f'Other error occurred: {err}. ')
                api_response_dict = {'identifier': identifier, 'Response': str(err)}
                api_response_list.append(api_response_dict.copy())

            finally:
                time.sleep(0.3)

    return multiple_identifier_list, api_response_list

# Execute the API request function with the list of Prism UUIDs.
get_record_request(prism_uuids_list)



## Resources
## https://inveniordm.docs.cern.ch/reference/rest_api_communities/
## https://realpython.com/python-api/#request-and-response
## http://www.compciv.org/guides/python/how-tos/creating-proper-url-query-strings/
## https://stackoverflow.com/questions/17788445/constructing-requests-with-url-query-string-in-python
## Using paginated APIs (4 ways): https://www.youtube.com/watch?v=4Fdyft-ky0w
## https://vtfsmghslrepo02.fsm.northwestern.edu/api/communities/<SLUG OF THAT COMMUNITY>/records?access_token=<access token>

([{'hits': {'hits': [{'id': 'ccsqm-da936',
      'created': '2023-03-31T16:35:23.989111+00:00',
      'updated': '2025-01-30T19:15:44.834165+00:00',
      'links': {'self': 'https://prism.northwestern.edu/api/records/ccsqm-da936',
       'self_html': 'https://prism.northwestern.edu/records/ccsqm-da936',
       'self_doi': 'https://prism.northwestern.edu/doi/10.18131/g3-fczs-gg86',
       'doi': 'https://doi.org/10.18131/g3-fczs-gg86',
       'parent': 'https://prism.northwestern.edu/api/records/qshxd-j8y31',
       'parent_html': 'https://prism.northwestern.edu/records/qshxd-j8y31',
       'parent_doi': 'https://prism.northwestern.edu/doi/10.18131/qshxd-j8y31',
       'self_iiif_manifest': 'https://prism.northwestern.edu/api/iiif/record:ccsqm-da936/manifest',
       'self_iiif_sequence': 'https://prism.northwestern.edu/api/iiif/record:ccsqm-da936/sequence/default',
       'files': 'https://prism.northwestern.edu/api/records/ccsqm-da936/files',
       'media_files': 'https://prism.north

In [34]:
## A quick check to see whether you're accumulating responses across pages correctly.
print(len(multiple_identifier_list))
#pprint(multiple_identifier_list[0])

30


In [35]:
## Save the results of the Prism API query

with open("output/multiple_identifier_list", "wb") as fp:   #Pickling
    pickle.dump(multiple_identifier_list, fp)

with open("output/api_response_list", "wb") as fp:   #Pickling
    pickle.dump(api_response_list, fp)  
    


In [36]:
## Open the pickle files to continue work

with open("output/multiple_identifier_list", "rb") as fp:   # Unpickling
    multiple_identifier_list_2 = pickle.load(fp)
    
# with open("output/api_response_list", "rb") as fp:   # Unpickling
#     api_response_list_2 = pickle.load(fp)

In [37]:
## Process a list of API responses contained in 'multiple_identifier_list_2'
## where each page response includes records under the 'hits' -> 'hits' keys. The script 
## extracts these record dictionaries from each page and creates a consolidated list 
## of all records called 'final_community_list'. Finally, it prints the resulting list using pprint.

# Initialize an empty list to hold all record dictionaries across all pages.
final_community_list = []

# Iterate over each page of results in the list containing API response data.
for page in multiple_identifier_list_2:
  
    # For the current page, initialize an empty list to store its individual record dictionaries.
    final_record_list = []

    # Access the nested list of records:
    # 'page' is expected to be a dictionary containing a key 'hits', which itself contains a key 'hits'
    # where the actual records are stored.
    for record in page['hits']['hits']:

        # Append the current record (a dictionary) from the page's records to the temporary record list.
        final_record_list.append(record)
        
    # After iterating through all records in the current page,
    # extend the final_community_list with all records from final_record_list.
    final_community_list.extend(final_record_list)

# Use pprint to display the final community list in a human-friendly format.
#pprint(final_community_list)


In [38]:
## Check the first item in the final_community_list
## final_community_list[1]

In [39]:
##### SAMPLE DATA FOR THE NEXT PART ################
##### USE AS NEEDED ###############################
# data = [{'access': {'embargo': {'active': False, 'reason': None},
#             'files': 'public',
#             'record': 'public',
#             'status': 'open'},
#  'created': '2024-03-21T21:45:01.174142+00:00',
#  'custom_fields': {},
#  'files': {'enabled': True, 'order': []},
#  'id': 'vfjy1-jhr35',
#  'is_draft': False,
#  'is_published': True,
#  'links': {'access_links': 'https://prism-staging.fsm.northwestern.edu/api/records/vfjy1-jhr35/access/links',
#            'archive': 'https://prism-staging.fsm.northwestern.edu/api/records/vfjy1-jhr35/files-archive',
#            'doi': 'https://doi.org/10.24418/vfjy1-jhr35',
#            'draft': 'https://prism-staging.fsm.northwestern.edu/api/records/vfjy1-jhr35/draft',
#            'files': 'https://prism-staging.fsm.northwestern.edu/api/records/vfjy1-jhr35/files',
#            'latest': 'https://prism-staging.fsm.northwestern.edu/api/records/vfjy1-jhr35/versions/latest',
#            'latest_html': 'https://prism-staging.fsm.northwestern.edu/records/vfjy1-jhr35/latest',
#            'reserve_doi': 'https://prism-staging.fsm.northwestern.edu/api/records/vfjy1-jhr35/draft/pids/doi',
#            'self': 'https://prism-staging.fsm.northwestern.edu/api/records/vfjy1-jhr35',
#            'self_doi': 'https://prism-staging.fsm.northwestern.edu/doi/10.24418/vfjy1-jhr35',
#            'self_html': 'https://prism-staging.fsm.northwestern.edu/records/vfjy1-jhr35',
#            'self_iiif_manifest': 'https://prism-staging.fsm.northwestern.edu/api/iiif/record:vfjy1-jhr35/manifest',
#            'self_iiif_sequence': 'https://prism-staging.fsm.northwestern.edu/api/iiif/record:vfjy1-jhr35/sequence/default',
#            'versions': 'https://prism-staging.fsm.northwestern.edu/api/records/vfjy1-jhr35/versions'},
#  'metadata': {'additional_descriptions': [{'description': '<p>Etiam dignissim '
#                                                           'diam quis enim '
#                                                           'lobortis '
#                                                           'scelerisque '
#                                                           'fermentum. Risus '
#                                                           'nullam eget felis '
#                                                           'eget. Malesuada '
#                                                           'nunc vel risus '
#                                                           'commodo. Id aliquet '
#                                                           'lectus proin nibh '
#                                                           'nisl condimentum id '
#                                                           'venenatis. Augue ut '
#                                                           'lectus arcu '
#                                                           'bibendum at. Sed id '
#                                                           'semper risus in '
#                                                           'hendrerit gravida. '
#                                                           'Mauris vitae '
#                                                           'ultricies leo '
#                                                           'integer malesuada '
#                                                           'nunc vel risus '
#                                                           'commodo. Cursus in '
#                                                           'hac habitasse '
#                                                           'platea dictumst. '
#                                                           'Sit amet risus '
#                                                           'nullam eget. '
#                                                           'Bibendum enim '
#                                                           'facilisis gravida '
#                                                           'neque convallis a '
#                                                           'cras semper. '
#                                                           'Malesuada '
#                                                           'pellentesque elit '
#                                                           'eget gravida cum '
#                                                           'sociis natoque. '
#                                                           'Vulputate dignissim '
#                                                           'suspendisse in est. '
#                                                           'Luctus venenatis '
#                                                           'lectus magna '
#                                                           'fringilla urna. '
#                                                           'Arcu cursus euismod '
#                                                           'quis viverra nibh '
#                                                           'cras pulvinar.</p>',
#                                            'lang': {'id': 'asw',
#                                                     'title': {'en': 'Australian '
#                                                                     'Aborigines '
#                                                                     'Sign '
#                                                                     'Language'}},
#                                            'type': {'id': 'abstract',
#                                                     'title': {'en': 'Abstract'}}},
#                                           {'description': '<p>Accumsan sit '
#                                                           'amet nulla facilisi '
#                                                           'morbi tempus '
#                                                           'iaculis urna. '
#                                                           'Egestas sed sed '
#                                                           'risus pretium quam '
#                                                           'vulputate. Aenean '
#                                                           'sed adipiscing diam '
#                                                           'donec adipiscing. '
#                                                           'Eget egestas purus '
#                                                           'viverra '
#                                                           'accumsan.&nbsp;</p>',
#                                            'lang': {'id': 'arz',
#                                                     'title': {'en': 'Egyptian '
#                                                                     'Arabic'}},
#                                            'type': {'id': 'acknowledgements',
#                                                     'title': {'en': 'Acknowledgements'}}},
#                                           {'description': '<p>Nisl rhoncus '
#                                                           'mattis rhoncus '
#                                                           'urna. Scelerisque '
#                                                           'mauris pellentesque '
#                                                           'pulvinar '
#                                                           'pellentesque '
#                                                           'habitant morbi '
#                                                           'tristique senectus '
#                                                           'et. Molestie at '
#                                                           'elementum eu '
#                                                           'facilisis sed '
#                                                           'odio.</p>',
#                                            'lang': {'id': 'lat',
#                                                     'title': {'en': 'Latin'}},
#                                            'type': {'id': 'methods',
#                                                     'title': {'en': 'Methods'}}},
#                                           {'description': '<p>Morbi tincidunt '
#                                                           'augue interdum '
#                                                           'velit euismod in '
#                                                           'pellentesque massa. '
#                                                           'Ultrices vitae '
#                                                           'auctor eu augue ut '
#                                                           'lectus arcu '
#                                                           'bibendum. Nec '
#                                                           'sagittis aliquam '
#                                                           'malesuada bibendum '
#                                                           'arcu vitae '
#                                                           'elementum.</p>',
#                                            'lang': {'id': 'ell',
#                                                     'title': {'en': 'Modern '
#                                                                     'Greek '
#                                                                     '(1453-)'}},
#                                            'type': {'id': 'other',
#                                                     'title': {'en': 'Other'}}},
#                                           {'description': '<p>Lacinia quis vel '
#                                                           'eros donec ac odio '
#                                                           'tempor. Volutpat '
#                                                           'odio facilisis '
#                                                           'mauris sit amet '
#                                                           'massa vitae tortor '
#                                                           'condimentum.</p>',
#                                            'lang': {'id': 'eng',
#                                                     'title': {'en': 'English'}},
#                                            'type': {'id': 'series-information',
#                                                     'title': {'en': 'Series '
#                                                                     'information'}}},
#                                           {'description': '<p>Etiam dignissim '
#                                                           'diam quis enim '
#                                                           'lobortis '
#                                                           'scelerisque '
#                                                           'fermentum. Risus '
#                                                           'nullam eget felis '
#                                                           'eget. Malesuada '
#                                                           'nunc vel risus '
#                                                           'commodo. Id aliquet '
#                                                           'lectus proin nibh '
#                                                           'nisl condimentum id '
#                                                           'venenatis. Augue ut '
#                                                           'lectus arcu '
#                                                           'bibendum at. Sed id '
#                                                           'semper risus in '
#                                                           'hendrerit '
#                                                           'gravida.&nbsp;</p>',
#                                            'lang': {'id': 'aib',
#                                                     'title': {'en': 'Ainu '
#                                                                     '(China)'}},
#                                            'type': {'id': 'table-of-contents',
#                                                     'title': {'en': 'Table of '
#                                                                     'contents'}}},
#                                           {'description': '<p>Etiam dignissim '
#                                                           'diam quis enim '
#                                                           'lobortis '
#                                                           'scelerisque '
#                                                           'fermentum. Risus '
#                                                           'nullam eget felis '
#                                                           'eget. Malesuada '
#                                                           'nunc vel risus '
#                                                           'commodo. Id aliquet '
#                                                           'lectus proin nibh '
#                                                           'nisl condimentum id '
#                                                           'venenatis. Augue ut '
#                                                           'lectus arcu '
#                                                           'bibendum at. Sed id '
#                                                           'semper risus in '
#                                                           'hendrerit '
#                                                           'gravida.&nbsp;</p>',
#                                            'lang': {'id': 'eng',
#                                                     'title': {'en': 'English'}},
#                                            'type': {'id': 'technical-info',
#                                                     'title': {'en': 'Technical '
#                                                                     'info'}}},
#                                           {'description': '<p><strong>Another '
#                                                           'abstract</strong>: '
#                                                           'Etiam dignissim '
#                                                           'diam quis enim '
#                                                           'lobortis '
#                                                           'scelerisque '
#                                                           'fermentum. Risus '
#                                                           'nullam eget felis '
#                                                           'eget. Malesuada '
#                                                           'nunc vel risus '
#                                                           'commodo. Id aliquet '
#                                                           'lectus proin nibh '
#                                                           'nisl condimentum id '
#                                                           'venenatis. Augue ut '
#                                                           'lectus arcu '
#                                                           'bibendum at. Sed id '
#                                                           'semper risus in '
#                                                           'hendrerit '
#                                                           'gravida.&nbsp;</p>',
#                                            'lang': {'id': 'fra',
#                                                     'title': {'en': 'French'}},
#                                            'type': {'id': 'abstract',
#                                                     'title': {'en': 'Abstract'}}}],
#               'additional_titles': [{'lang': {'id': 'eng',
#                                               'title': {'en': 'English'}},
#                                      'title': 'An Alternative title for this '
#                                               'record',
#                                      'type': {'id': 'alternative-title',
#                                               'title': {'de': 'Alternativer '
#                                                               'Titel',
#                                                         'en': 'Alternative '
#                                                               'title'}}},
#                                     {'lang': {'id': 'ita',
#                                               'title': {'en': 'Italian'}},
#                                      'title': 'A subtitle for this record',
#                                      'type': {'id': 'subtitle',
#                                               'title': {'de': 'Untertitel',
#                                                         'en': 'Subtitle'}}},
#                                     {'lang': {'id': 'deu',
#                                               'title': {'en': 'German'}},
#                                      'title': 'A translated title for this '
#                                               'record',
#                                      'type': {'id': 'translated-title',
#                                               'title': {'de': 'Übersetzter '
#                                                               'Titel',
#                                                         'en': 'Translated '
#                                                               'title'}}},
#                                     {'lang': {'id': 'fra',
#                                               'title': {'en': 'French'}},
#                                      'title': 'An other title for this record',
#                                      'type': {'id': 'other',
#                                               'title': {'de': 'Sonstiger Titel',
#                                                         'en': 'Other'}}},
#                                     {'lang': {'id': 'spa',
#                                               'title': {'en': 'Spanish'}},
#                                      'title': 'Another other title for this '
#                                               'record',
#                                      'type': {'id': 'other',
#                                               'title': {'de': 'Sonstiger Titel',
#                                                         'en': 'Other'}}}],
#               'contributors': [{'affiliations': [{'id': '015j35893',
#                                                   'name': 'English Heritage'},
#                                                  {'name': 'Totenham'}],
#                                 'person_or_org': {'family_name': 'Abram',
#                                                   'given_name': 'Donald',
#                                                   'identifiers': [{'identifier': '0000-0002-0575-6152',
#                                                                    'scheme': 'orcid'}],
#                                                   'name': 'Abram, Donald',
#                                                   'type': 'personal'},
#                                 'role': {'id': 'role-data-curator',
#                                          'title': {'en': 'Data curator role'}}},
#                                {'affiliations': [{'id': '018sx6595',
#                                                   'name': 'Archéologie des '
#                                                           'Amériques'},
#                                                  {'id': '00t6xza10',
#                                                   'name': 'Swiss Federal '
#                                                           'Archives'}],
#                                 'person_or_org': {'family_name': 'Valens',
#                                                   'given_name': 'Renee',
#                                                   'identifiers': [{'identifier': '0000-0002-7619-0205',
#                                                                    'scheme': 'orcid'}],
#                                                   'name': 'Valens, Renee',
#                                                   'type': 'personal'},
#                                 'role': {'id': 'role-visualization',
#                                          'title': {'en': 'Visualization '
#                                                          'role'}}},
#                                {'affiliations': [{'name': 'United States'}],
#                                 'person_or_org': {'identifiers': [{'identifier': '04h9g9s69',
#                                                                    'scheme': 'ror'}],
#                                                   'name': 'PreventAGE Health '
#                                                           'Care (United '
#                                                           'States)',
#                                                   'type': 'organizational'},
#                                 'role': {'id': 'role-data-curator',
#                                          'title': {'en': 'Data curator role'}}},
#                                {'affiliations': [{'id': '0472wp149',
#                                                   'name': 'United Animal '
#                                                           'Health (United '
#                                                           'States)'},
#                                                  {'name': 'United States'}],
#                                 'person_or_org': {'identifiers': [{'identifier': '05kkrdt60',
#                                                                    'scheme': 'ror'}],
#                                                   'name': 'Hebrew Health Care',
#                                                   'type': 'organizational'},
#                                 'role': {'id': 'role-software',
#                                          'title': {'en': 'Software role'}}}],
#               'creators': [{'affiliations': [{'name': 'Northwestern '
#                                                       'University'},
#                                              {'name': 'Another department'},
#                                              {'id': '01bj3aw27',
#                                               'name': 'United States '
#                                                       'Department of Energy'}],
#                             'person_or_org': {'family_name': 'Doe',
#                                               'given_name': 'Jane A',
#                                               'identifiers': [{'identifier': '0000-0002-6747-0985',
#                                                                'scheme': 'orcid'}],
#                                               'name': 'Doe, Jane A',
#                                               'type': 'personal'},
#                             'role': {'id': 'role-data-curator',
#                                      'title': {'en': 'Data curator role'}}},
#                            {'affiliations': [{'name': 'Northwestern '
#                                                       'University'},
#                                              {'name': 'a department'},
#                                              {'id': '02fh5m162',
#                                               'name': 'Národní Pedagogické '
#                                                       'Muzeum a Knihovna J. A. '
#                                                       'Komenského'}],
#                             'person_or_org': {'family_name': 'Taylor',
#                                               'given_name': 'John',
#                                               'identifiers': [{'identifier': '0000-0002-7619-0205',
#                                                                'scheme': 'orcid'}],
#                                               'name': 'Taylor, John',
#                                               'type': 'personal'},
#                             'role': {'id': 'role-editor',
#                                      'title': {'en': 'Editor role'}}},
#                            {'affiliations': [{'name': 'Company'},
#                                              {'name': 'Norcross'},
#                                              {'id': '000e0be47',
#                                               'name': 'Northwestern '
#                                                       'University'}],
#                             'person_or_org': {'identifiers': [{'identifier': '00by51808',
#                                                                'scheme': 'ror'}],
#                                               'name': 'Mölnlycke Health Care '
#                                                       '(United States)',
#                                               'type': 'organizational'},
#                             'role': {'id': 'role-formal-analysis',
#                                      'title': {'en': 'Formal analysis role'}}},
#                            {'affiliations': [{'name': 'Raleigh'},
#                                              {'id': '000e0be47',
#                                               'name': 'Northwestern '
#                                                       'University'}],
#                             'person_or_org': {'identifiers': [{'identifier': '030bsqk21',
#                                                                'scheme': 'ror'}],
#                                               'name': 'Plant Health Care '
#                                                       '(United States)',
#                                               'type': 'organizational'},
#                             'role': {'id': 'role-education-and-training',
#                                      'title': {'en': 'Education and training '
#                                                      'role'}}}],
#               'dates': [{'date': '2022-04-01',
#                          'description': 'date this was accepted',
#                          'type': {'id': 'accepted',
#                                   'title': {'de': 'Angenommen',
#                                             'en': 'Accepted'}}},
#                         {'date': '1945/1966',
#                          'description': 'date this was available',
#                          'type': {'id': 'available',
#                                   'title': {'de': 'Verfügbar',
#                                             'en': 'Available'}}},
#                         {'date': '1987',
#                          'description': 'date this was collected',
#                          'type': {'id': 'collected',
#                                   'title': {'de': 'Gesammelt',
#                                             'en': 'Collected'}}},
#                         {'date': '1988',
#                          'description': 'another date this was collected',
#                          'type': {'id': 'collected',
#                                   'title': {'de': 'Gesammelt',
#                                             'en': 'Collected'}}},
#                         {'date': '2024-01-01',
#                          'description': 'date this was copyrighted',
#                          'type': {'id': 'copyrighted',
#                                   'title': {'de': 'Mit Copyright versehen',
#                                             'en': 'Copyrighted'}}},
#                         {'date': '2024',
#                          'description': 'date this was created',
#                          'type': {'id': 'created',
#                                   'title': {'de': 'Erstellt',
#                                             'en': 'Created'}}},
#                         {'date': '2010',
#                          'description': 'date this was issued',
#                          'type': {'id': 'issued',
#                                   'title': {'de': 'Veröffentlicht',
#                                             'en': 'Issued'}}},
#                         {'date': '2011',
#                          'description': 'date other',
#                          'type': {'id': 'other',
#                                   'title': {'de': 'Sonstiges', 'en': 'Other'}}},
#                         {'date': '2013',
#                          'description': 'date this was submitted',
#                          'type': {'id': 'submitted',
#                                   'title': {'de': 'Eingereicht',
#                                             'en': 'Submitted'}}},
#                         {'date': '1112',
#                          'description': 'date this was updated',
#                          'type': {'id': 'updated',
#                                   'title': {'de': 'Aktualisiert',
#                                             'en': 'Updated'}}},
#                         {'date': '1498',
#                          'description': 'date this was valid',
#                          'type': {'id': 'valid',
#                                   'title': {'de': 'Gültig', 'en': 'Valid'}}},
#                         {'date': '1889',
#                          'description': 'date this was withdrawn',
#                          'type': {'id': 'withdrawn',
#                                   'title': {'de': 'Zurückgezogen',
#                                             'en': 'Withdrawn'}}},
#                         {'date': '6589',
#                          'description': 'another acceptance date',
#                          'type': {'id': 'accepted',
#                                   'title': {'de': 'Angenommen',
#                                             'en': 'Accepted'}}}],
#               'description': '<p>Lorem ipsum dolor sit amet,<strong> '
#                              'consectetur adipiscing eli</strong>t, sed do '
#                              '<i>eiusmod tempor inc</i>ididunt ut labore et '
#                              'dolore magna aliqua. Ut diam quam nulla '
#                              'porttitor massa. Arcu risus quis varius quam '
#                              'quisque id diam. Consequat nisl vel pretium '
#                              'lectus quam id leo in. Elementum nibh tellus '
#                              'molestie nunc non blandit massa. Tristique nulla '
#                              'aliquet enim tortor at. Tellus elementum '
#                              'sagittis vitae et. Molestie ac feugiat sed '
#                              'lectus vestibulum.&nbsp;</p><blockquote><p>Eget '
#                              'nunc lobortis mattis aliquam faucibus purus. '
#                              'Tempor orci dapibus ultrices in. Vulputate mi '
#                              'sit <a '
#                              'href="https://twitter.com/home">https://twitter.com/home</a>amet '
#                              'mauris commodo quis imperdiet. Nunc sed augue '
#                              'lacus viverra. Placerat duis ultricies lacus sed '
#                              'turpis tincidunt id aliquet. In ante metus '
#                              'dictum at tempor commodo. Ullamcorper morbi '
#                              'tincidunt ornare massa eget egestas purus '
#                              'viverra. Ornare arcu odio ut sem nulla. Vel '
#                              'turpis nunc eget lorem. Sit amet consectetur '
#                              'adipiscing elit ut. Bibendum ut tristique et '
#                              'egestas.</p></blockquote><p>Donec et odio '
#                              'pellentesque diam volutpat commodo sed. Magnis '
#                              'dis parturient montes nascetur ridiculus mus '
#                              'mauris vitae. Pharetra diam sit amet nisl. Nulla '
#                              'facilisi morbi tempus iaculis urna id volutpat '
#                              'lacus laoreet. Eleifend donec pretium vulputate '
#                              'sapien nec sagittis.&nbsp;</p><ul><li>Donec et '
#                              'odio pellentesque diam volutpat commodo sed. '
#                              'Magnis dis parturient montes nascetur ridiculus '
#                              'mus mauris vitae.&nbsp;</li><li>Pharetra diam '
#                              'sit amet nisl. Nulla facilisi morbi tempus '
#                              'iaculis urna id volutpat lacus laoreet. Eleifend '
#                              'donec pretium vulputate sapien nec '
#                              'sagittis.&nbsp;</li></ul><p>Quis auctor elit sed '
#                              'vulputate mi sit amet. Lectus proin nibh nisl '
#                              'condimentum id venenatis a condimentum. Gravida '
#                              'in fermentum et sollicitudin ac. Odio aenean sed '
#                              'adipiscing diam donec.</p><ol><li>Cursus sit '
#                              'amet dictum sit amet justo donec enim '
#                              'diam.&nbsp;</li><li>Vestibulum mattis '
#                              'ullamcorper velit sed ullamcorper morbi '
#                              'tincidunt ornare massa.</li></ol><p>Nec '
#                              'ullamcorper sit amet risus. Fringilla ut morbi '
#                              'tincidunt augue interdum velit euismod in.</p>',
#               'funding': [{'award': {'identifiers': [{'identifier': 'https://reporter.nih.gov/search/T41epJ74wk6O-KXRM6WlQA/project-details/10390349',
#                                                       'scheme': 'url'}],
#                                      'number': '5R01HS028003-02',
#                                      'title': {'en': 'Implementation of '
#                                                      'Digital Mental Health '
#                                                      'Tools in Ambulatory Care '
#                                                      'Coordination'}},
#                            'funder': {'id': '01cwqze88',
#                                       'name': 'National Institutes of Health'}},
#                           {'award': {'identifiers': [{'identifier': 'https://reporter.nih.gov/search/T41epJ74wk6O-KXRM6WlQA/project-details/10390458999999',
#                                                       'scheme': 'url'}],
#                                      'number': '5R01HS028003-0789',
#                                      'title': {'en': 'Canadian Implementation '
#                                                      'of Digital Mental Health '
#                                                      'Tools in Ambulatory Care '
#                                                      'Coordination'}},
#                            'funder': {'id': '01sdtdd95',
#                                       'name': 'Canadian Institute for Advanced '
#                                               'Research'}}],
#               'identifiers': [{'identifier': 'ark:/c8131/g3rp42',
#                                'scheme': 'ark'},
#                               {'identifier': 'arXiv:2109.13768',
#                                'scheme': 'arxiv'},
#                               {'identifier': '1974AJ.....79..819H',
#                                'scheme': 'bibcode'},
#                               {'identifier': '10.1002/acn3.51179',
#                                'scheme': 'doi'},
#                               {'identifier': '1234567890128',
#                                'scheme': 'ean13'},
#                               {'identifier': '1470-7330', 'scheme': 'eissn'},
#                               {'identifier': '20.1000/100', 'scheme': 'handle'},
#                               {'identifier': 'HRV003M16', 'scheme': 'igsn'},
#                               {'identifier': '978-3-16-148410-0',
#                                'scheme': 'isbn'},
#                               {'identifier': '1740-5025', 'scheme': 'issn'},
#                               {'identifier': 'A02-2009-000004BE-A',
#                                'scheme': 'istc'},
#                               {'identifier': '1748-7188', 'scheme': 'lissn'},
#                               {'identifier': 'urn:lsid:zoobank.org:pub:CDC8D258-8F57-41DC-B560-247E17D3DC8C',
#                                'scheme': 'lsid'},
#                               {'identifier': '23193287', 'scheme': 'pmid'},
#                               {'identifier': 'https://purl.fdlp.gov/GPO/gpo53258',
#                                'scheme': 'purl'},
#                               {'identifier': '042100005264', 'scheme': 'upc'},
#                               {'identifier': 'https://www.cnn.com',
#                                'scheme': 'url'},
#                               {'identifier': 'urn:isan:0000-0000-2CEA-0000-1-0000-0000-Y',
#                                'scheme': 'urn'},
#                               {'identifier': 'https://w3id.org/tree',
#                                'scheme': 'w3id'},
#                               {'identifier': '2-s2.0-85178352590',
#                                'scheme': 'other'}],
#               'languages': [{'id': 'pol', 'title': {'en': 'Polish'}}],
#               'publication_date': '2020-09-10',
#               'publisher': 'Prism. Galter Health Sciences Library. '
#                            'Northwestern University',
#               'related_identifiers': [{'identifier': '10.1007/s00216-024-05178-z',
#                                        'relation_type': {'id': 'iscitedby',
#                                                          'title': {'en': 'Is '
#                                                                          'cited '
#                                                                          'by'}},
#                                        'resource_type': {'id': 'article-journal_article',
#                                                          'title': {'en': 'Journal '
#                                                                          'Article'}},
#                                        'scheme': 'doi'},
#                                       {'identifier': '10.1016/j.ncrna.2023.11.005',
#                                        'relation_type': {'id': 'issupplementedby',
#                                                          'title': {'en': 'Is '
#                                                                          'supplemented '
#                                                                          'by'}},
#                                        'resource_type': {'id': 'article-journal_article',
#                                                          'title': {'en': 'Journal '
#                                                                          'Article'}},
#                                        'scheme': 'doi'},
#                                       {'identifier': 'https://www.scopus.com/inward/record.uri?eid=2-s2.0-85184204420&doi=10.1007%2fs00216-024-05178-z&partnerID=40&md5=2f5608ef0460e4e4cadf680800e59668',
#                                        'relation_type': {'id': 'ispartof',
#                                                          'title': {'en': 'Is '
#                                                                          'part '
#                                                                          'of'}},
#                                        'resource_type': {'id': 'article-evaluation_study',
#                                                          'title': {'en': 'Evaluation '
#                                                                          'Study'}},
#                                        'scheme': 'url'},
#                                       {'identifier': '2-s2.0-85184204420',
#                                        'relation_type': {'id': 'isnewversionof',
#                                                          'title': {'en': 'Is '
#                                                                          'new '
#                                                                          'version '
#                                                                          'of'}},
#                                        'resource_type': {'id': 'archival-exhibition',
#                                                          'title': {'en': 'Exhibition'}},
#                                        'scheme': 'other'}],
#               'resource_type': {'id': 'learning-problems_and_exercises',
#                                 'title': {'en': 'Problems and Exercises'}},
#               'rights': [{'description': {'en': 'This license lets others '
#                                                 'distribute, remix, adapt, and '
#                                                 'build upon this work, even '
#                                                 'commercially, and although '
#                                                 'their new works must also '
#                                                 'acknowledge the original '
#                                                 'creator(s), they don’t have '
#                                                 'to license their derivative '
#                                                 'works on the same terms.'},
#                           'icon': 'cc-by-icon',
#                           'id': 'cc-by-4.0',
#                           'props': {'scheme': 'spdx',
#                                     'url': 'https://creativecommons.org/licenses/by/4.0/legalcode'},
#                           'title': {'en': 'Creative Commons Attribution 4.0 '
#                                           'International'}},
#                          {'description': {'en': 'This license lets others '
#                                                 'remix, adapt, and build upon '
#                                                 'this work non-commercially, '
#                                                 'as long as they credit the '
#                                                 'original creator(s) and '
#                                                 'license their new creations '
#                                                 'under the identical terms.'},
#                           'icon': 'cc-by-nc-sa-icon',
#                           'id': 'cc-by-nc-sa-4.0',
#                           'props': {'scheme': 'spdx',
#                                     'url': 'https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode'},
#                           'title': {'en': 'Creative Commons Attribution Non '
#                                           'Commercial Share Alike 4.0 '
#                                           'International'}},
#                          {'description': {'en': 'Etiam dignissim diam quis '
#                                                 'enim lobortis scelerisque '
#                                                 'fermentum. Risus nullam eget '
#                                                 'felis eget. Malesuada nunc '
#                                                 'vel risus commodo. Id aliquet '
#                                                 'lectus proin nibh nisl '
#                                                 'condimentum id venenatis. '
#                                                 'Augue ut lectus arcu bibendum '
#                                                 'at. Sed id semper risus in '
#                                                 'hendrerit gravida. '},
#                           'link': 'http://www.cnn.com',
#                           'title': {'en': 'A new license'}}],
#               'subjects': [{'subject': 'Vulputate dignissim'},
#                            {'subject': 'Accumsan sit amet nulla'},
#                            {'id': 'https://id.loc.gov/authorities/subjects/sh2009006392',
#                             'scheme': 'LCSH',
#                             'subject': 'Ketolide antibiotics'},
#                            {'id': 'https://id.loc.gov/authorities/subjects/sh2010002443',
#                             'scheme': 'LCSH',
#                             'subject': 'Antibiotics--Biotechnology--Congresses'},
#                            {'id': 'https://id.nlm.nih.gov/mesh/D000080908',
#                             'scheme': 'MeSH',
#                             'subject': 'Broadly Neutralizing Antibodies'},
#                            {'id': 'https://id.nlm.nih.gov/mesh/D000907',
#                             'scheme': 'MeSH',
#                             'subject': 'Antibodies, Bacterial'},
#                            {'id': 'https://id.nlm.nih.gov/mesh/D000908',
#                             'scheme': 'MeSH',
#                             'subject': 'Antibodies, Fungal'},
#                            {'id': 'https://id.loc.gov/authorities/names/no2016048752',
#                             'scheme': 'LCNAF',
#                             'subject': 'Thompson, Mary Harris, 1829-1895'},
#                            {'id': 'https://id.loc.gov/authorities/names/n2011081397',
#                             'scheme': 'LCNAF',
#                             'subject': 'United States. Army. Mediterranean '
#                                        'Theater of Operations'}],
#               'title': 'Problems and Exercises for Data Cleaning - EDITED',
#               'version': 'my newest amazing version'},
#  'parent': {'communities': {}, 'id': 'mhckq-3r070'},
#  'pids': {'doi': {'client': 'datacite',
#                   'identifier': '10.24418/vfjy1-jhr35',
#                   'provider': 'datacite'},
#           'oai': {'identifier': 'oai:prism-staging.fsm.northwestern.edu:vfjy1-jhr35',
#                   'provider': 'oai'}},
#  'revision_id': 7,
#  'status': 'published',
#  'updated': '2024-03-21T21:57:40.899801+00:00',
#  'versions': {'index': 1, 'is_latest': True}},
#        {'access': {'embargo': {'active': False, 'reason': None},
#             'files': 'public',
#             'record': 'public',
#             'status': 'open'},
#  'created': '2024-03-21T21:45:01.174142+00:00',
#  'custom_fields': {},
#  'files': {'enabled': True, 'order': []},
#  'id': 'vfjy1-jhr35',
#  'is_draft': False,
#  'is_published': True,
#  'links': {'access_links': 'https://prism-staging.fsm.northwestern.edu/api/records/vfjy1-jhr35/access/links',
#            'archive': 'https://prism-staging.fsm.northwestern.edu/api/records/vfjy1-jhr35/files-archive',
#            'doi': 'https://doi.org/10.24418/vfjy1-jhr35',
#            'draft': 'https://prism-staging.fsm.northwestern.edu/api/records/vfjy1-jhr35/draft',
#            'files': 'https://prism-staging.fsm.northwestern.edu/api/records/vfjy1-jhr35/files',
#            'latest': 'https://prism-staging.fsm.northwestern.edu/api/records/vfjy1-jhr35/versions/latest',
#            'latest_html': 'https://prism-staging.fsm.northwestern.edu/records/vfjy1-jhr35/latest',
#            'reserve_doi': 'https://prism-staging.fsm.northwestern.edu/api/records/vfjy1-jhr35/draft/pids/doi',
#            'self': 'https://prism-staging.fsm.northwestern.edu/api/records/vfjy1-jhr35',
#            'self_doi': 'https://prism-staging.fsm.northwestern.edu/doi/10.24418/vfjy1-jhr35',
#            'self_html': 'https://prism-staging.fsm.northwestern.edu/records/vfjy1-jhr35',
#            'self_iiif_manifest': 'https://prism-staging.fsm.northwestern.edu/api/iiif/record:vfjy1-jhr35/manifest',
#            'self_iiif_sequence': 'https://prism-staging.fsm.northwestern.edu/api/iiif/record:vfjy1-jhr35/sequence/default',
#            'versions': 'https://prism-staging.fsm.northwestern.edu/api/records/vfjy1-jhr35/versions'},
#  'metadata': {'additional_descriptions': [{'description': '<p>Etiam dignissim '
#                                                           'diam quis enim '
#                                                           'lobortis '
#                                                           'scelerisque '
#                                                           'fermentum. Risus '
#                                                           'nullam eget felis '
#                                                           'eget. Malesuada '
#                                                           'nunc vel risus '
#                                                           'commodo. Id aliquet '
#                                                           'lectus proin nibh '
#                                                           'nisl condimentum id '
#                                                           'venenatis. Augue ut '
#                                                           'lectus arcu '
#                                                           'bibendum at. Sed id '
#                                                           'semper risus in '
#                                                           'hendrerit gravida. '
#                                                           'Mauris vitae '
#                                                           'ultricies leo '
#                                                           'integer malesuada '
#                                                           'nunc vel risus '
#                                                           'commodo. Cursus in '
#                                                           'hac habitasse '
#                                                           'platea dictumst. '
#                                                           'Sit amet risus '
#                                                           'nullam eget. '
#                                                           'Bibendum enim '
#                                                           'facilisis gravida '
#                                                           'neque convallis a '
#                                                           'cras semper. '
#                                                           'Malesuada '
#                                                           'pellentesque elit '
#                                                           'eget gravida cum '
#                                                           'sociis natoque. '
#                                                           'Vulputate dignissim '
#                                                           'suspendisse in est. '
#                                                           'Luctus venenatis '
#                                                           'lectus magna '
#                                                           'fringilla urna. '
#                                                           'Arcu cursus euismod '
#                                                           'quis viverra nibh '
#                                                           'cras pulvinar.</p>',
#                                            'lang': {'id': 'asw',
#                                                     'title': {'en': 'Australian '
#                                                                     'Aborigines '
#                                                                     'Sign '
#                                                                     'Language'}},
#                                            'type': {'id': 'abstract',
#                                                     'title': {'en': 'Abstract'}}},
#                                           {'description': '<p>Accumsan sit '
#                                                           'amet nulla facilisi '
#                                                           'morbi tempus '
#                                                           'iaculis urna. '
#                                                           'Egestas sed sed '
#                                                           'risus pretium quam '
#                                                           'vulputate. Aenean '
#                                                           'sed adipiscing diam '
#                                                           'donec adipiscing. '
#                                                           'Eget egestas purus '
#                                                           'viverra '
#                                                           'accumsan.&nbsp;</p>',
#                                            'lang': {'id': 'arz',
#                                                     'title': {'en': 'Egyptian '
#                                                                     'Arabic'}},
#                                            'type': {'id': 'acknowledgements',
#                                                     'title': {'en': 'Acknowledgements'}}},
#                                           {'description': '<p>Nisl rhoncus '
#                                                           'mattis rhoncus '
#                                                           'urna. Scelerisque '
#                                                           'mauris pellentesque '
#                                                           'pulvinar '
#                                                           'pellentesque '
#                                                           'habitant morbi '
#                                                           'tristique senectus '
#                                                           'et. Molestie at '
#                                                           'elementum eu '
#                                                           'facilisis sed '
#                                                           'odio.</p>',
#                                            'lang': {'id': 'lat',
#                                                     'title': {'en': 'Latin'}},
#                                            'type': {'id': 'methods',
#                                                     'title': {'en': 'Methods'}}},
#                                           {'description': '<p>Morbi tincidunt '
#                                                           'augue interdum '
#                                                           'velit euismod in '
#                                                           'pellentesque massa. '
#                                                           'Ultrices vitae '
#                                                           'auctor eu augue ut '
#                                                           'lectus arcu '
#                                                           'bibendum. Nec '
#                                                           'sagittis aliquam '
#                                                           'malesuada bibendum '
#                                                           'arcu vitae '
#                                                           'elementum.</p>',
#                                            'lang': {'id': 'ell',
#                                                     'title': {'en': 'Modern '
#                                                                     'Greek '
#                                                                     '(1453-)'}},
#                                            'type': {'id': 'other',
#                                                     'title': {'en': 'Other'}}},
#                                           {'description': '<p>Lacinia quis vel '
#                                                           'eros donec ac odio '
#                                                           'tempor. Volutpat '
#                                                           'odio facilisis '
#                                                           'mauris sit amet '
#                                                           'massa vitae tortor '
#                                                           'condimentum.</p>',
#                                            'lang': {'id': 'eng',
#                                                     'title': {'en': 'English'}},
#                                            'type': {'id': 'series-information',
#                                                     'title': {'en': 'Series '
#                                                                     'information'}}},
#                                           {'description': '<p>Etiam dignissim '
#                                                           'diam quis enim '
#                                                           'lobortis '
#                                                           'scelerisque '
#                                                           'fermentum. Risus '
#                                                           'nullam eget felis '
#                                                           'eget. Malesuada '
#                                                           'nunc vel risus '
#                                                           'commodo. Id aliquet '
#                                                           'lectus proin nibh '
#                                                           'nisl condimentum id '
#                                                           'venenatis. Augue ut '
#                                                           'lectus arcu '
#                                                           'bibendum at. Sed id '
#                                                           'semper risus in '
#                                                           'hendrerit '
#                                                           'gravida.&nbsp;</p>',
#                                            'lang': {'id': 'aib',
#                                                     'title': {'en': 'Ainu '
#                                                                     '(China)'}},
#                                            'type': {'id': 'table-of-contents',
#                                                     'title': {'en': 'Table of '
#                                                                     'contents'}}},
#                                           {'description': '<p>Etiam dignissim '
#                                                           'diam quis enim '
#                                                           'lobortis '
#                                                           'scelerisque '
#                                                           'fermentum. Risus '
#                                                           'nullam eget felis '
#                                                           'eget. Malesuada '
#                                                           'nunc vel risus '
#                                                           'commodo. Id aliquet '
#                                                           'lectus proin nibh '
#                                                           'nisl condimentum id '
#                                                           'venenatis. Augue ut '
#                                                           'lectus arcu '
#                                                           'bibendum at. Sed id '
#                                                           'semper risus in '
#                                                           'hendrerit '
#                                                           'gravida.&nbsp;</p>',
#                                            'lang': {'id': 'eng',
#                                                     'title': {'en': 'English'}},
#                                            'type': {'id': 'technical-info',
#                                                     'title': {'en': 'Technical '
#                                                                     'info'}}},
#                                           {'description': '<p><strong>Another '
#                                                           'abstract</strong>: '
#                                                           'Etiam dignissim '
#                                                           'diam quis enim '
#                                                           'lobortis '
#                                                           'scelerisque '
#                                                           'fermentum. Risus '
#                                                           'nullam eget felis '
#                                                           'eget. Malesuada '
#                                                           'nunc vel risus '
#                                                           'commodo. Id aliquet '
#                                                           'lectus proin nibh '
#                                                           'nisl condimentum id '
#                                                           'venenatis. Augue ut '
#                                                           'lectus arcu '
#                                                           'bibendum at. Sed id '
#                                                           'semper risus in '
#                                                           'hendrerit '
#                                                           'gravida.&nbsp;</p>',
#                                            'lang': {'id': 'fra',
#                                                     'title': {'en': 'French'}},
#                                            'type': {'id': 'abstract',
#                                                     'title': {'en': 'Abstract'}}}],
#               'additional_titles': [{'lang': {'id': 'eng',
#                                               'title': {'en': 'English'}},
#                                      'title': 'An Alternative title for this '
#                                               'record',
#                                      'type': {'id': 'alternative-title',
#                                               'title': {'de': 'Alternativer '
#                                                               'Titel',
#                                                         'en': 'Alternative '
#                                                               'title'}}},
#                                     {'lang': {'id': 'ita',
#                                               'title': {'en': 'Italian'}},
#                                      'title': 'A subtitle for this record',
#                                      'type': {'id': 'subtitle',
#                                               'title': {'de': 'Untertitel',
#                                                         'en': 'Subtitle'}}},
#                                     {'lang': {'id': 'deu',
#                                               'title': {'en': 'German'}},
#                                      'title': 'A translated title for this '
#                                               'record',
#                                      'type': {'id': 'translated-title',
#                                               'title': {'de': 'Übersetzter '
#                                                               'Titel',
#                                                         'en': 'Translated '
#                                                               'title'}}},
#                                     {'lang': {'id': 'fra',
#                                               'title': {'en': 'French'}},
#                                      'title': 'An other title for this record',
#                                      'type': {'id': 'other',
#                                               'title': {'de': 'Sonstiger Titel',
#                                                         'en': 'Other'}}},
#                                     {'lang': {'id': 'spa',
#                                               'title': {'en': 'Spanish'}},
#                                      'title': 'Another other title for this '
#                                               'record',
#                                      'type': {'id': 'other',
#                                               'title': {'de': 'Sonstiger Titel',
#                                                         'en': 'Other'}}}],
#               'contributors': [{'affiliations': [{'id': '015j35893',
#                                                   'name': 'English Heritage'},
#                                                  {'name': 'Totenham'}],
#                                 'person_or_org': {'family_name': 'Abram',
#                                                   'given_name': 'Donald',
#                                                   'identifiers': [{'identifier': '0000-0002-0575-6152',
#                                                                    'scheme': 'orcid'}],
#                                                   'name': 'Abram, Donald',
#                                                   'type': 'personal'},
#                                 'role': {'id': 'role-data-curator',
#                                          'title': {'en': 'Data curator role'}}},
#                                {'affiliations': [{'id': '018sx6595',
#                                                   'name': 'Archéologie des '
#                                                           'Amériques'},
#                                                  {'id': '00t6xza10',
#                                                   'name': 'Swiss Federal '
#                                                           'Archives'}],
#                                 'person_or_org': {'family_name': 'Valens',
#                                                   'given_name': 'Renee',
#                                                   'identifiers': [{'identifier': '0000-0002-7619-0205',
#                                                                    'scheme': 'orcid'}],
#                                                   'name': 'Valens, Renee',
#                                                   'type': 'personal'},
#                                 'role': {'id': 'role-visualization',
#                                          'title': {'en': 'Visualization '
#                                                          'role'}}},
#                                {'affiliations': [{'name': 'United States'}],
#                                 'person_or_org': {'identifiers': [{'identifier': '04h9g9s69',
#                                                                    'scheme': 'ror'}],
#                                                   'name': 'PreventAGE Health '
#                                                           'Care (United '
#                                                           'States)',
#                                                   'type': 'organizational'},
#                                 'role': {'id': 'role-data-curator',
#                                          'title': {'en': 'Data curator role'}}},
#                                {'affiliations': [{'id': '0472wp149',
#                                                   'name': 'United Animal '
#                                                           'Health (United '
#                                                           'States)'},
#                                                  {'name': 'United States'}],
#                                 'person_or_org': {'identifiers': [{'identifier': '05kkrdt60',
#                                                                    'scheme': 'ror'}],
#                                                   'name': 'Hebrew Health Care',
#                                                   'type': 'organizational'},
#                                 'role': {'id': 'role-software',
#                                          'title': {'en': 'Software role'}}}],
#               'creators': [{'affiliations': [{'name': 'Northwestern '
#                                                       'University'},
#                                              {'name': 'Another department'},
#                                              {'id': '01bj3aw27',
#                                               'name': 'United States '
#                                                       'Department of Energy'}],
#                             'person_or_org': {'family_name': 'Doe',
#                                               'given_name': 'Jane A',
#                                               'identifiers': [{'identifier': '0000-0002-6747-0985',
#                                                                'scheme': 'orcid'}],
#                                               'name': 'Doe, Jane A',
#                                               'type': 'personal'},
#                             'role': {'id': 'role-data-curator',
#                                      'title': {'en': 'Data curator role'}}},
#                            {'affiliations': [{'name': 'Northwestern '
#                                                       'University'},
#                                              {'name': 'a department'},
#                                              {'id': '02fh5m162',
#                                               'name': 'Národní Pedagogické '
#                                                       'Muzeum a Knihovna J. A. '
#                                                       'Komenského'}],
#                             'person_or_org': {'family_name': 'Taylor',
#                                               'given_name': 'John',
#                                               'identifiers': [{'identifier': '0000-0002-7619-0205',
#                                                                'scheme': 'orcid'}],
#                                               'name': 'Taylor, John',
#                                               'type': 'personal'},
#                             'role': {'id': 'role-editor',
#                                      'title': {'en': 'Editor role'}}},
#                            {'affiliations': [{'name': 'Company'},
#                                              {'name': 'Norcross'},
#                                              {'id': '000e0be47',
#                                               'name': 'Northwestern '
#                                                       'University'}],
#                             'person_or_org': {'identifiers': [{'identifier': '00by51808',
#                                                                'scheme': 'ror'}],
#                                               'name': 'Mölnlycke Health Care '
#                                                       '(United States)',
#                                               'type': 'organizational'},
#                             'role': {'id': 'role-formal-analysis',
#                                      'title': {'en': 'Formal analysis role'}}},
#                            {'affiliations': [{'name': 'Raleigh'},
#                                              {'id': '000e0be47',
#                                               'name': 'Northwestern '
#                                                       'University'}],
#                             'person_or_org': {'identifiers': [{'identifier': '030bsqk21',
#                                                                'scheme': 'ror'}],
#                                               'name': 'Plant Health Care '
#                                                       '(United States)',
#                                               'type': 'organizational'},
#                             'role': {'id': 'role-education-and-training',
#                                      'title': {'en': 'Education and training '
#                                                      'role'}}}],
#               'dates': [{'date': '2022-04-01',
#                          'description': 'date this was accepted',
#                          'type': {'id': 'accepted',
#                                   'title': {'de': 'Angenommen',
#                                             'en': 'Accepted'}}},
#                         {'date': '1945/1966',
#                          'description': 'date this was available',
#                          'type': {'id': 'available',
#                                   'title': {'de': 'Verfügbar',
#                                             'en': 'Available'}}},
#                         {'date': '1987',
#                          'description': 'date this was collected',
#                          'type': {'id': 'collected',
#                                   'title': {'de': 'Gesammelt',
#                                             'en': 'Collected'}}},
#                         {'date': '1988',
#                          'description': 'another date this was collected',
#                          'type': {'id': 'collected',
#                                   'title': {'de': 'Gesammelt',
#                                             'en': 'Collected'}}},
#                         {'date': '2024-01-01',
#                          'description': 'date this was copyrighted',
#                          'type': {'id': 'copyrighted',
#                                   'title': {'de': 'Mit Copyright versehen',
#                                             'en': 'Copyrighted'}}},
#                         {'date': '2024',
#                          'description': 'date this was created',
#                          'type': {'id': 'created',
#                                   'title': {'de': 'Erstellt',
#                                             'en': 'Created'}}},
#                         {'date': '2010',
#                          'description': 'date this was issued',
#                          'type': {'id': 'issued',
#                                   'title': {'de': 'Veröffentlicht',
#                                             'en': 'Issued'}}},
#                         {'date': '2011',
#                          'description': 'date other',
#                          'type': {'id': 'other',
#                                   'title': {'de': 'Sonstiges', 'en': 'Other'}}},
#                         {'date': '2013',
#                          'description': 'date this was submitted',
#                          'type': {'id': 'submitted',
#                                   'title': {'de': 'Eingereicht',
#                                             'en': 'Submitted'}}},
#                         {'date': '1112',
#                          'description': 'date this was updated',
#                          'type': {'id': 'updated',
#                                   'title': {'de': 'Aktualisiert',
#                                             'en': 'Updated'}}},
#                         {'date': '1498',
#                          'description': 'date this was valid',
#                          'type': {'id': 'valid',
#                                   'title': {'de': 'Gültig', 'en': 'Valid'}}},
#                         {'date': '1889',
#                          'description': 'date this was withdrawn',
#                          'type': {'id': 'withdrawn',
#                                   'title': {'de': 'Zurückgezogen',
#                                             'en': 'Withdrawn'}}},
#                         {'date': '6589',
#                          'description': 'another acceptance date',
#                          'type': {'id': 'accepted',
#                                   'title': {'de': 'Angenommen',
#                                             'en': 'Accepted'}}}],
#               'description': '<p>Lorem ipsum dolor sit amet,<strong> '
#                              'consectetur adipiscing eli</strong>t, sed do '
#                              '<i>eiusmod tempor inc</i>ididunt ut labore et '
#                              'dolore magna aliqua. Ut diam quam nulla '
#                              'porttitor massa. Arcu risus quis varius quam '
#                              'quisque id diam. Consequat nisl vel pretium '
#                              'lectus quam id leo in. Elementum nibh tellus '
#                              'molestie nunc non blandit massa. Tristique nulla '
#                              'aliquet enim tortor at. Tellus elementum '
#                              'sagittis vitae et. Molestie ac feugiat sed '
#                              'lectus vestibulum.&nbsp;</p><blockquote><p>Eget '
#                              'nunc lobortis mattis aliquam faucibus purus. '
#                              'Tempor orci dapibus ultrices in. Vulputate mi '
#                              'sit <a '
#                              'href="https://twitter.com/home">https://twitter.com/home</a>amet '
#                              'mauris commodo quis imperdiet. Nunc sed augue '
#                              'lacus viverra. Placerat duis ultricies lacus sed '
#                              'turpis tincidunt id aliquet. In ante metus '
#                              'dictum at tempor commodo. Ullamcorper morbi '
#                              'tincidunt ornare massa eget egestas purus '
#                              'viverra. Ornare arcu odio ut sem nulla. Vel '
#                              'turpis nunc eget lorem. Sit amet consectetur '
#                              'adipiscing elit ut. Bibendum ut tristique et '
#                              'egestas.</p></blockquote><p>Donec et odio '
#                              'pellentesque diam volutpat commodo sed. Magnis '
#                              'dis parturient montes nascetur ridiculus mus '
#                              'mauris vitae. Pharetra diam sit amet nisl. Nulla '
#                              'facilisi morbi tempus iaculis urna id volutpat '
#                              'lacus laoreet. Eleifend donec pretium vulputate '
#                              'sapien nec sagittis.&nbsp;</p><ul><li>Donec et '
#                              'odio pellentesque diam volutpat commodo sed. '
#                              'Magnis dis parturient montes nascetur ridiculus '
#                              'mus mauris vitae.&nbsp;</li><li>Pharetra diam '
#                              'sit amet nisl. Nulla facilisi morbi tempus '
#                              'iaculis urna id volutpat lacus laoreet. Eleifend '
#                              'donec pretium vulputate sapien nec '
#                              'sagittis.&nbsp;</li></ul><p>Quis auctor elit sed '
#                              'vulputate mi sit amet. Lectus proin nibh nisl '
#                              'condimentum id venenatis a condimentum. Gravida '
#                              'in fermentum et sollicitudin ac. Odio aenean sed '
#                              'adipiscing diam donec.</p><ol><li>Cursus sit '
#                              'amet dictum sit amet justo donec enim '
#                              'diam.&nbsp;</li><li>Vestibulum mattis '
#                              'ullamcorper velit sed ullamcorper morbi '
#                              'tincidunt ornare massa.</li></ol><p>Nec '
#                              'ullamcorper sit amet risus. Fringilla ut morbi '
#                              'tincidunt augue interdum velit euismod in.</p>',
#               'funding': [{'award': {'identifiers': [{'identifier': 'https://reporter.nih.gov/search/T41epJ74wk6O-KXRM6WlQA/project-details/10390349',
#                                                       'scheme': 'url'}],
#                                      'number': '5R01HS028003-02',
#                                      'title': {'en': 'Implementation of '
#                                                      'Digital Mental Health '
#                                                      'Tools in Ambulatory Care '
#                                                      'Coordination'}},
#                            'funder': {'id': '01cwqze88',
#                                       'name': 'National Institutes of Health'}},
#                           {'award': {'identifiers': [{'identifier': 'https://reporter.nih.gov/search/T41epJ74wk6O-KXRM6WlQA/project-details/10390458999999',
#                                                       'scheme': 'url'}],
#                                      'number': '5R01HS028003-0789',
#                                      'title': {'en': 'Canadian Implementation '
#                                                      'of Digital Mental Health '
#                                                      'Tools in Ambulatory Care '
#                                                      'Coordination'}},
#                            'funder': {'id': '01sdtdd95',
#                                       'name': 'Canadian Institute for Advanced '
#                                               'Research'}}],
#               'identifiers': [{'identifier': 'ark:/c8131/g3rp42',
#                                'scheme': 'ark'},
#                               {'identifier': 'arXiv:2109.13768',
#                                'scheme': 'arxiv'},
#                               {'identifier': 'arXiv:2109.13777',
#                                'scheme': 'arxiv'},
#                               {'identifier': '1974AJ.....79..819H',
#                                'scheme': 'bibcode'},
#                               {'identifier': '10.1002/acn3.51179',
#                                'scheme': 'doi'},
#                               {'identifier': '1234567890128',
#                                'scheme': 'ean13'},
#                               {'identifier': '1470-7330', 'scheme': 'eissn'},
#                               {'identifier': '20.1000/100', 'scheme': 'handle'},
#                               {'identifier': 'HRV003M16', 'scheme': 'igsn'},
#                               {'identifier': '978-3-16-148410-0',
#                                'scheme': 'isbn'},
#                               {'identifier': '1740-5025', 'scheme': 'issn'},
#                               {'identifier': 'A02-2009-000004BE-A',
#                                'scheme': 'istc'},
#                               {'identifier': '1748-7188', 'scheme': 'lissn'},
#                               {'identifier': 'urn:lsid:zoobank.org:pub:CDC8D258-8F57-41DC-B560-247E17D3DC8C',
#                                'scheme': 'lsid'},
#                               {'identifier': '23193287', 'scheme': 'pmid'},
#                               {'identifier': 'https://purl.fdlp.gov/GPO/gpo53258',
#                                'scheme': 'purl'},
#                               {'identifier': '042100005264', 'scheme': 'upc'},
#                               {'identifier': 'https://www.cnn.com',
#                                'scheme': 'url'},
#                               {'identifier': 'urn:isan:0000-0000-2CEA-0000-1-0000-0000-Y',
#                                'scheme': 'urn'},
#                               {'identifier': 'https://w3id.org/tree',
#                                'scheme': 'w3id'},
#                               {'identifier': '2-s2.0-85178352590',
#                                'scheme': 'other'}],
#               'languages': [{'id': 'pol', 'title': {'en': 'Polish'}}],
#               'publication_date': '2020-09-10',
#               'publisher': 'Prism. Galter Health Sciences Library. '
#                            'Northwestern University',
#               'related_identifiers': [{'identifier': '10.1007/s00216-024-05178-z',
#                                        'relation_type': {'id': 'iscitedby',
#                                                          'title': {'en': 'Is '
#                                                                          'cited '
#                                                                          'by'}},
#                                        'resource_type': {'id': 'article-journal_article',
#                                                          'title': {'en': 'Journal '
#                                                                          'Article'}},
#                                        'scheme': 'doi'},
#                                       {'identifier': '10.1016/j.ncrna.2023.11.005',
#                                        'relation_type': {'id': 'issupplementedby',
#                                                          'title': {'en': 'Is '
#                                                                          'supplemented '
#                                                                          'by'}},
#                                        'resource_type': {'id': 'article-journal_article',
#                                                          'title': {'en': 'Journal '
#                                                                          'Article'}},
#                                        'scheme': 'doi'},
#                                       {'identifier': 'https://www.scopus.com/inward/record.uri?eid=2-s2.0-85184204420&doi=10.1007%2fs00216-024-05178-z&partnerID=40&md5=2f5608ef0460e4e4cadf680800e59668',
#                                        'relation_type': {'id': 'ispartof',
#                                                          'title': {'en': 'Is '
#                                                                          'part '
#                                                                          'of'}},
#                                        'resource_type': {'id': 'article-evaluation_study',
#                                                          'title': {'en': 'Evaluation '
#                                                                          'Study'}},
#                                        'scheme': 'url'},
#                                       {'identifier': '2-s2.0-85184204420',
#                                        'relation_type': {'id': 'isnewversionof',
#                                                          'title': {'en': 'Is '
#                                                                          'new '
#                                                                          'version '
#                                                                          'of'}},
#                                        'resource_type': {'id': 'archival-exhibition',
#                                                          'title': {'en': 'Exhibition'}},
#                                        'scheme': 'other'}],
#               'resource_type': {'id': 'learning-problems_and_exercises',
#                                 'title': {'en': 'Problems and Exercises'}},
#               'rights': [{'description': {'en': 'This license lets others '
#                                                 'distribute, remix, adapt, and '
#                                                 'build upon this work, even '
#                                                 'commercially, and although '
#                                                 'their new works must also '
#                                                 'acknowledge the original '
#                                                 'creator(s), they don’t have '
#                                                 'to license their derivative '
#                                                 'works on the same terms.'},
#                           'icon': 'cc-by-icon',
#                           'id': 'cc-by-4.0',
#                           'props': {'scheme': 'spdx',
#                                     'url': 'https://creativecommons.org/licenses/by/4.0/legalcode'},
#                           'title': {'en': 'Creative Commons Attribution 4.0 '
#                                           'International'}},
#                          {'description': {'en': 'This license lets others '
#                                                 'remix, adapt, and build upon '
#                                                 'this work non-commercially, '
#                                                 'as long as they credit the '
#                                                 'original creator(s) and '
#                                                 'license their new creations '
#                                                 'under the identical terms.'},
#                           'icon': 'cc-by-nc-sa-icon',
#                           'id': 'cc-by-nc-sa-4.0',
#                           'props': {'scheme': 'spdx',
#                                     'url': 'https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode'},
#                           'title': {'en': 'Creative Commons Attribution Non '
#                                           'Commercial Share Alike 4.0 '
#                                           'International'}},
#                          {'description': {'en': 'Etiam dignissim diam quis '
#                                                 'enim lobortis scelerisque '
#                                                 'fermentum. Risus nullam eget '
#                                                 'felis eget. Malesuada nunc '
#                                                 'vel risus commodo. Id aliquet '
#                                                 'lectus proin nibh nisl '
#                                                 'condimentum id venenatis. '
#                                                 'Augue ut lectus arcu bibendum '
#                                                 'at. Sed id semper risus in '
#                                                 'hendrerit gravida. '},
#                           'link': 'http://www.cnn.com',
#                           'title': {'en': 'A new license'}}],
#               'subjects': [{'subject': 'Vulputate dignissim'},
#                            {'subject': 'Accumsan sit amet nulla'},
#                            {'id': 'https://id.loc.gov/authorities/subjects/sh2009006392',
#                             'scheme': 'LCSH',
#                             'subject': 'Ketolide antibiotics'},
#                            {'id': 'https://id.loc.gov/authorities/subjects/sh2010002443',
#                             'scheme': 'LCSH',
#                             'subject': 'Antibiotics--Biotechnology--Congresses'},
#                            {'id': 'https://id.nlm.nih.gov/mesh/D000080908',
#                             'scheme': 'MeSH',
#                             'subject': 'Broadly Neutralizing Antibodies'},
#                            {'id': 'https://id.nlm.nih.gov/mesh/D000907',
#                             'scheme': 'MeSH',
#                             'subject': 'Antibodies, Bacterial'},
#                            {'id': 'https://id.nlm.nih.gov/mesh/D000908',
#                             'scheme': 'MeSH',
#                             'subject': 'Antibodies, Fungal'},
#                            {'id': 'https://id.loc.gov/authorities/names/no2016048752',
#                             'scheme': 'LCNAF',
#                             'subject': 'Thompson, Mary Harris, 1829-1895'},
#                            {'id': 'https://id.loc.gov/authorities/names/n2011081397',
#                             'scheme': 'LCNAF',
#                             'subject': 'United States. Army. Mediterranean '
#                                        'Theater of Operations'}],
#               'title': 'Problems and Exercises for Data Cleaning - EDITED',
#               'version': 'my newest amazing version'},
#  'parent': {'communities': {}, 'id': 'mhckq-3r070'},
#  'pids': {'doi': {'client': 'datacite',
#                   'identifier': '10.24418/vfjy1-jhr35',
#                   'provider': 'datacite'},
#           'oai': {'identifier': 'oai:prism-staging.fsm.northwestern.edu:vfjy1-jhr35',
#                   'provider': 'oai'}},
#  'revision_id': 7,
#  'status': 'published',
#  'updated': '2024-03-21T21:57:40.899801+00:00',
#  'versions': {'index': 1, 'is_latest': True}}]




In [40]:
### IF USING THE SAMPLE DATA check if data is a list
### OTHERWISE NOT NEEDED

# if isinstance(data, list):
#     for index, item in enumerate(data):
#         if isinstance(item, dict):
#             print(f"Keys for item {index}: {item.keys()}")
#         else:
#             print(f"Item {index} is not a dictionary.")
# else:
#     print("The data is not a list.")

In [41]:
## Flattend the data for specific keys
## This script provides a function, flatten_data(), which is designed to flatten nested data structures contained in a list of dictionaries.

def flatten_data(data, flatten_keys=None, list_keys=None):
    """
    Flattens data based on specified keys to flatten and list keys to handle.

    Parameters:
    data (list of dicts): The data to flatten.
    flatten_keys (list of str): Keys whose nested dictionaries should be flattened.
    list_keys (list of str): Keys whose lists should be handled differently.

    Returns:
    list of dicts: The flattened data.
    """
    if flatten_keys is None:
        flatten_keys = []
    if list_keys is None:
        list_keys = []
    
    flattened_data = []
    
    for item in data:
        flattened_item = {}
        for key, value in item.items():
            if key in flatten_keys and isinstance(value, dict):
                # Flatten nested dictionaries
                for k, v in value.items():
                    flattened_item[f"{key}.{k}"] = v
            elif key in list_keys and isinstance(value, list):
                # Handle lists differently
                if key not in flattened_item:
                    flattened_item[key] = []
                flattened_item[key].extend(value)
            elif key not in flatten_keys and key not in list_keys:
                # Regular values
                flattened_item[key] = value
        
        # Add NaN or None for missing keys in lists
        max_len = max(len(value) if isinstance(value, list) else 1 for value in flattened_item.values())
        for key, value in flattened_item.items():
            if isinstance(value, list) and len(value) < max_len:
                flattened_item[key] += [np.nan] * (max_len - len(value))
        
        flattened_data.append(flattened_item)
    
    return flattened_data

# Example usage:
# Specify which keys to flatten and which keys to handle as lists
flatten_keys = ['access', 
                'created', 
                'custom_fields', 
                'files', 'id', 
                'is_draft', 
                'is_published', 
                'links', 
                'metadata', 
                'parent', 
                'pids', 
                'revision_id', 
                'status', 
                'updated', 
                'versions']
list_keys = []

# Flatten data
flattened_data = flatten_data(final_community_list, flatten_keys=flatten_keys, list_keys=list_keys)

# Create DataFrame
data_1_df = pd.DataFrame(flattened_data)

# Display DataFrame
data_1_df.head()


Unnamed: 0,links.self,links.self_html,links.self_doi,links.doi,links.parent,links.parent_html,links.parent_doi,links.self_iiif_manifest,links.self_iiif_sequence,links.files,...,files.count,files.total_bytes,files.entries,media_files,deletion_status,stats,metadata.additional_descriptions,metadata.description,metadata.contributors,metadata.additional_titles
0,https://prism.northwestern.edu/api/records/ccs...,https://prism.northwestern.edu/records/ccsqm-d...,https://prism.northwestern.edu/doi/10.18131/g3...,https://doi.org/10.18131/g3-fczs-gg86,https://prism.northwestern.edu/api/records/qsh...,https://prism.northwestern.edu/records/qshxd-j...,https://prism.northwestern.edu/doi/10.18131/qs...,https://prism.northwestern.edu/api/iiif/record...,https://prism.northwestern.edu/api/iiif/record...,https://prism.northwestern.edu/api/records/ccs...,...,1,563906,{'img007.pdf': {'id': '5e9935f1-235c-4520-a0ef...,"{'enabled': False, 'order': [], 'count': 0, 't...","{'is_deleted': False, 'status': 'P'}","{'this_version': {'views': 82, 'unique_views':...",,,,
1,https://prism.northwestern.edu/api/records/9vf...,https://prism.northwestern.edu/records/9vf4w-n...,https://prism.northwestern.edu/doi/10.18131/g3...,https://doi.org/10.18131/g3-09q8-1v93,https://prism.northwestern.edu/api/records/pdv...,https://prism.northwestern.edu/records/pdvsy-b...,https://prism.northwestern.edu/doi/10.18131/pd...,https://prism.northwestern.edu/api/iiif/record...,https://prism.northwestern.edu/api/iiif/record...,https://prism.northwestern.edu/api/records/9vf...,...,1,288585,{'img015.pdf': {'id': '73c4d1d4-b582-4ba2-b2c1...,"{'enabled': False, 'order': [], 'count': 0, 't...","{'is_deleted': False, 'status': 'P'}","{'this_version': {'views': 46, 'unique_views':...",,,,
2,https://prism.northwestern.edu/api/records/h9j...,https://prism.northwestern.edu/records/h9jwj-6...,https://prism.northwestern.edu/doi/10.18131/g3...,https://doi.org/10.18131/g3-fr15-6752,https://prism.northwestern.edu/api/records/48q...,https://prism.northwestern.edu/records/48qwt-q...,https://prism.northwestern.edu/doi/10.18131/48...,https://prism.northwestern.edu/api/iiif/record...,https://prism.northwestern.edu/api/iiif/record...,https://prism.northwestern.edu/api/records/h9j...,...,1,453339,{'img002.pdf': {'id': '396581d2-c861-43e1-9d1f...,"{'enabled': False, 'order': [], 'count': 0, 't...","{'is_deleted': False, 'status': 'P'}","{'this_version': {'views': 10, 'unique_views':...",,,,
3,https://prism.northwestern.edu/api/records/fga...,https://prism.northwestern.edu/records/fgazv-c...,https://prism.northwestern.edu/doi/10.18131/g3...,https://doi.org/10.18131/g3-96h1-e620,https://prism.northwestern.edu/api/records/vmd...,https://prism.northwestern.edu/records/vmdva-b...,https://prism.northwestern.edu/doi/10.18131/vm...,https://prism.northwestern.edu/api/iiif/record...,https://prism.northwestern.edu/api/iiif/record...,https://prism.northwestern.edu/api/records/fga...,...,1,614689,{'img052.pdf': {'id': 'bec39567-f3ff-4b8b-97c7...,"{'enabled': False, 'order': [], 'count': 0, 't...","{'is_deleted': False, 'status': 'P'}","{'this_version': {'views': 6, 'unique_views': ...",,,,
4,https://prism.northwestern.edu/api/records/sj6...,https://prism.northwestern.edu/records/sj60f-d...,https://prism.northwestern.edu/doi/10.18131/g3...,https://doi.org/10.18131/g3-vk0s-fp85,https://prism.northwestern.edu/api/records/3ms...,https://prism.northwestern.edu/records/3msmw-h...,https://prism.northwestern.edu/doi/10.18131/3m...,https://prism.northwestern.edu/api/iiif/record...,https://prism.northwestern.edu/api/iiif/record...,https://prism.northwestern.edu/api/records/sj6...,...,1,956817,{'img034.pdf': {'id': '146cf278-18b5-410e-a9e0...,"{'enabled': False, 'order': [], 'count': 0, 't...","{'is_deleted': False, 'status': 'P'}","{'this_version': {'views': 2, 'unique_views': ...",,,,


In [42]:
## Print the column indices and column names as needed
# print("Column indices and names:")
# for index, column_name in enumerate(data_1_df.columns):
#     print("Index:", index, "| Column Name:", column_name)

In [43]:
## Print data as neded to check various things
# pprint(data_1_df.at[0, 'metadata.creators'])  

In [44]:
# ## Save dataframe to a CSV

with open(r"output/data_1_df.csv", 'w', encoding='utf-8') as file:
    data_1_df.to_csv(file, lineterminator='\n', index=True)
    file.close()

In [45]:
### Use for flattening specific keys (i.e. those that aren't too complicated to faltten) 

def flatten_dict(d, parent_key='', sep='.'):
    """
    Recursively flattens a nested dictionary.

    Parameters:
    d (dict): The dictionary to flatten.
    parent_key (str): The base key string for nested keys.
    sep (str): The separator between parent and child keys.

    Returns:
    dict: A flattened dictionary.
    """
    items = []
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)

def flatten_metadata(data_df, metadata_key):
    """
    Flattens the specified metadata key from the DataFrame.

    Parameters:
    data_df (pd.DataFrame): The DataFrame containing metadata.
    metadata_key (str): The key in the metadata to flatten.

    Returns:
    pd.DataFrame: A DataFrame with flattened metadata.
    """
    # Initialize a dictionary to store lists for each column
    columns = {}
    
    # Iterate through each list of entries in the specified metadata column
    for entries in data_df[f'metadata.{metadata_key}']:
        
        # Handle NaN values (which are float types in Pandas)
        if isinstance(entries, float) and np.isnan(entries):
            entries = []  # Replace NaN with an empty list

        # Ensure entries is always iterable
        elif not isinstance(entries, list):
            entries = [entries]  # Convert single non-list values to a list

        # Initialize temporary lists for each entry
        temp_data = {}

        for entry in entries:
            if isinstance(entry, dict):
                flattened_entry = flatten_dict(entry)
                for key, value in flattened_entry.items():
                    column_key = f'{metadata_key}.{key}'
                    temp_data.setdefault(column_key, []).append(value)
            else:
                #print(f"Ignoring non-dictionary entry: {entry}")
        
        # Append temporary lists to the main lists in columns dictionary
        for column_key, temp_list in temp_data.items():
            if column_key not in columns:
                columns[column_key] = []  # Ensure all columns exist
            
            # Fill missing values with None to match the longest list
            while len(columns[column_key]) < len(entries):
                columns[column_key].append(None)

            columns[column_key].extend(temp_list)

    # Ensure all lists have the same length
    max_length = max(len(col) for col in columns.values()) if columns else 0
    for col_key in columns:
        while len(columns[col_key]) < max_length:
            columns[col_key].append(None)  # Fill missing values

    # Create a pandas DataFrame with lists as values
    flattened_field_df = pd.DataFrame(columns)
    
    return flattened_field_df

def flatten_metadata_list(data_df, metadata_key_list):
    """
    Flattens the specified metadata fields from the DataFrame.

    Parameters:
    data_df (pd.DataFrame): The DataFrame containing metadata.
    metadata_key_list (list): The list of metadata keys to flatten.

    Returns:
    pd.DataFrame: A DataFrame with flattened metadata.
    """
    flattened_data_df = data_df.copy()
    
    for metadata_key in metadata_key_list:
        flattened_field_df = flatten_metadata(data_df, metadata_key)
        flattened_data_df = pd.concat([flattened_data_df, flattened_field_df], axis=1)
    
    return flattened_data_df


# Specify the metadata fields to flatten
metadata_key_list = ['additional_descriptions',
                     'additional_titles',
                     'dates',
                     'languages',
                     #'publisher',
                     'resource_type',
                     'rights'
                    ]

# Call the function
data_2_df = flatten_metadata_list(data_1_df, metadata_key_list)
data_2_df.head()


IndentationError: expected an indented block after 'else' statement on line 58 (3791371611.py, line 62)

In [None]:
# ## Print the column indices and column names
# print("Column indices and names:")
# for index, column_name in enumerate(data_2_df.columns):
#     print("Index:", index, "| Column Name:", column_name)

In [None]:
# ## Save dataframe to a CSV

with open(r"output/data_2_df.csv", 'w', encoding='utf-8') as file:
    data_2_df.to_csv(file, lineterminator='\n', index=True)
    file.close()

In [None]:
# pprint(data_2_df.at[0, 'metadata.creators'])  

In [None]:
## Flatten the metadata for creator, contributor

def process_creator_contributor_metadata(data_df, metadata_key):
    names_list = []
    types_list = []
    orcids_list = []
    affiliations_list = []
    affiliation_ids_list = []
    role_ids_list = []
    role_titles_list = []

    # Iterate through the data and extract required information
    for metadata_list in data_df[f"metadata.{metadata_key}"]:
        # Initialize temporary lists to store values for the current metadata_list
        names = []
        types = []
        orcids = []
        affiliations = []
        affiliation_ids = []
        role_ids = []
        role_titles = []

        # Check if the element is a list and not NaN
        if isinstance(metadata_list, list):
            for item in metadata_list:
                if isinstance(item, dict):
                    person_or_org = item.get('person_or_org', {})
                    role = item.get('role', {})

                    # Extract and concatenate names
                    names.append(person_or_org.get('name', ''))

                    # Extract and concatenate types
                    types.append(person_or_org.get('type', ''))

                    # Extract and concatenate ORCID identifiers
                    identifiers = person_or_org.get('identifiers', [])
                    orcids.append('|'.join([f"{i['identifier']}" for i in identifiers if i.get('scheme') == 'orcid']))

                    # Extract and concatenate affiliations
                    affils = item.get('affiliations', [])
                    affil_names = [a['name'] for a in affils if 'name' in a]
                    affil_ids = [f"{a['id']}" for a in affils if 'id' in a]

                    affiliations.append('|'.join(affil_names))
                    affiliation_ids.append('|'.join(affil_ids))

                    # Extract and concatenate role ids and titles
                    role_ids.append(role.get('id', ''))
                    role_titles.append(role.get('title', {}).get('en', ''))

        # Append the concatenated values for the current metadata_list to the main lists
        names_list.append('; '.join(names))
        types_list.append('; '.join(types))
        orcids_list.append('; '.join(orcids))
        affiliations_list.append('; '.join(affiliations))
        affiliation_ids_list.append('; '.join(affiliation_ids))
        role_ids_list.append('; '.join(role_ids))
        role_titles_list.append('; '.join(role_titles))

    return pd.DataFrame({
        f'{metadata_key}.person_or_org.name': names_list,
        f'{metadata_key}.person_or_org.type': types_list,
        f'{metadata_key}.person_or_org.identifiers.orcid': orcids_list,
        f'{metadata_key}.person_or_org.affiliation': affiliations_list,
        f'{metadata_key}.affiliations.id': affiliation_ids_list,
        f'{metadata_key}.role.id': role_ids_list,
        f'{metadata_key}.role.title.en': role_titles_list
    })

metadata_key_list = ['creators', 'contributors']
final_dfs = []

# Process each metadata key and collect the resulting DataFrames
for key in metadata_key_list:
    final_dfs.append(process_creator_contributor_metadata(data_2_df, key))

# Concatenate the DataFrames
creator_contributor_df = pd.concat(final_dfs, axis=1)

# Print the final DataFrame
creator_contributor_df.head()




In [None]:
# pprint(creator_contributor_df.at[0, 'creators.person_or_org.affiliation'])  

In [None]:
## Concatendate the creator_df to data_df
data_3_df = pd.concat([data_2_df, creator_contributor_df], axis=1)
data_3_df.head(50)

In [None]:
# ## Print the column indices and column names
# print("Column indices and names:")
# for index, column_name in enumerate(data_3_df.columns):
#     print("Index:", index, "| Column Name:", column_name)

In [None]:
# ## Save dataframe to a CSV

with open(r"output/data_3_df.csv", 'w', encoding='utf-8') as file:
    data_3_df.to_csv(file, lineterminator='\n', index=True)
    file.close()

In [None]:
### Use to flatten the metadata for subjects 

def flatten_dict(d, parent_key='', sep='.'):
    """
    Recursively flattens a nested dictionary.

    Parameters:
    d (dict): The dictionary to flatten.
    parent_key (str): The base key string for nested keys.
    sep (str): The separator between parent and child keys.

    Returns:
    dict: A flattened dictionary.
    """
    items = []  # Initialize an empty list to store the flattened key-value pairs.
    
    for k, v in d.items():
        # Create a new key by appending the current key to the parent key with the separator.
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        
        if isinstance(v, dict):
            # If the value is a dictionary, recursively flatten it and extend the items list.
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        else:
            # If the value is not a dictionary, append the new key-value pair to the items list.
            items.append((new_key, v))
    
    return dict(items)  # Convert the list of items into a dictionary and return it.


def process_subject_metadata(data_df, metadata_key, config):
    """
    Processes the specified metadata key from the DataFrame based on the configuration.

    Parameters:
    data_df (pd.DataFrame): The DataFrame containing metadata.
    metadata_key (str): The key in the metadata to process.
    config (dict): The configuration specifying how to handle different schemes/types.

    Returns:
    pd.DataFrame: A DataFrame with processed metadata.
    """
    # Initialize lists to store extracted information
    # lists_dict is initialized as a dictionary of lists to store extracted information. 
    # It has keys formatted as <metadata_key>.<scheme>.<item>, 
    # for example: {'subjects.MeSH.id': [['https://id.nlm.nih.gov/mesh/D000080908', 'https://id.nlm.nih.gov/mesh/D000907', 'https://id.nlm.nih.gov/mesh/D000908'],
    # and an additional key <metadata_key> to store subjects without a scheme, for example: subjects
    
    lists_dict = {f"{metadata_key}.{scheme}.{item}": [] for scheme in config.keys() for item in config[scheme].keys()}
    lists_dict[metadata_key] = []

    # Iterate through each list of entries in the specified metadata column
    for entries in data_df[f'metadata.{metadata_key}']:
        
        # For each entry, temp_lists_dict is initialized, similarly to lists_dict, 
        # for example: temp_lists_dict: {'subjects.MeSH.id': ['https://id.nlm.nih.gov/mesh/D000080908'], 'subjects.MeSH.subject': ['Broadly Neutralizing Antibodies'], 'subjects.LCSH.id': ['https://id.loc.gov/authorities/subjects/sh2009006392', 'https://id.loc.gov/authorities/subjects/sh2010002443'], 'subjects.LCSH.subject': ['Ketolide antibiotics', 'Antibiotics--Biotechnology--Congresses'], 'subjects.LCNAF.id': [], 'subjects.LCNAF.subject': [], 'subjects': ['Vulputate dignissim', 'Accumsan sit amet nulla']}
        # to temporarily hold extracted data for the current entry.
        # temp_lists_dict = {f"{metadata_key}.{scheme}.{item}": [] for scheme in config.keys() for item in config[scheme].keys()}
        # temp_lists_dict[metadata_key] = []
        
        temp_lists_dict = {}

        for scheme in config.keys():
            for item in config[scheme].keys():
                temp_lists_dict[f"{metadata_key}.{scheme}.{item}"] = []
        temp_lists_dict[metadata_key] = []

        ## If entries is a list, 
        if isinstance(entries, list):
            
            # then iterates through each entry in entries.
            for entry in entries:
                
                ## If entry is a dictionary, then extract the scheme from entry.
                if isinstance(entry, dict):
                    scheme = entry.get('scheme', '')
                    
                    ### If scheme is in config, 
                    if scheme in config:
                        
                        # then iterate through each item specified in config for the given scheme and 
                        # append the corresponding value from entry to temp_lists_dict.
                        for item_key, column_name in config[scheme].items():
                            temp_lists_dict[f"{metadata_key}.{scheme}.{item_key}"].append(entry.get(item_key, ''))
                    else:
                        temp_lists_dict[metadata_key].append(entry.get('subject', ''))
                
                # Print temp_lists_dict to inspect its contents
#                 print("temp_lists_dict:")
#                 print(temp_lists_dict)
                
        # Append collected values to main lists
        for key in temp_lists_dict.keys():
            lists_dict[key].append(temp_lists_dict[key])
    
    # Print lists_dict to inspect its contents
#     print("lists_dict:")
#     print(lists_dict)
    
    # Create DataFrame
    flattened_df = pd.DataFrame(lists_dict)
    
    return flattened_df

def flatten_subject_metadata_list(data_df, metadata_key_list, config):
    """
    Flattens the specified metadata fields from the DataFrame based on the configuration.

    Parameters:
    data_df (pd.DataFrame): The DataFrame containing metadata.
    metadata_key_list (list): The list of metadata keys to flatten.
    config (dict): The configuration specifying how to handle different schemes/types.

    Returns:
    pd.DataFrame: A DataFrame with flattened metadata.
    """
    flattened_data_df = data_df.copy()
    
    for metadata_key in metadata_key_list:
        flattened_field_df = process_subject_metadata(data_df, metadata_key, config)
        flattened_data_df = pd.concat([flattened_data_df, flattened_field_df], axis=1)
    
    return flattened_data_df

# Configuration for handling different schemes
config = {
    'MeSH': {
        'id': 'id',
        'subject': 'subject'
    },
    'LCSH': {
        'id': 'id',
        'subject': 'subject'
    },
    'LCNAF': {
        'id': 'id',
        'subject': 'subject'
    }
}

# Specify the metadata fields to flatten
metadata_key_list = ['subjects']

# Call the function
data_4_df = flatten_subject_metadata_list(data_3_df, metadata_key_list, config)
data_4_df.head()


In [None]:
# pprint(flattened_final_df.at[0, 'subjects.subject.MeSH.subject'])  

In [None]:
# ## Print the column indices and column names
# print("Column indices and names:")
# for index, column_name in enumerate(data_4_df.columns):
#     print("Index:", index, "| Column Name:", column_name)

In [None]:
# ## Save dataframe to a CSV

with open(r"output/data_4_df.csv", 'w', encoding='utf-8') as file:
    data_4_df.to_csv(file, lineterminator='\n', index=True)
    file.close()

In [None]:
## Flatten the metadata for Funder

# Check if 'metadata.funding' exists in the DataFrame
if 'metadata.funding' in data_4_df.columns:
    rows = []  # Store concatenated values

    # Iterate through the data
    for funding_data in data_4_df['metadata.funding']:
        # Handle NaN values (which are float types in Pandas)
        if isinstance(funding_data, float) and np.isnan(funding_data):
            funding_data = []

        if isinstance(funding_data, list) and len(funding_data) > 0:
            funder_names = []
            funder_ids = []
            award_identifiers = []
            award_schemes = []
            award_numbers = []
            award_titles = []

            for item in funding_data:
                if isinstance(item, dict):  # Ensure item is a dictionary
                    funder = item.get('funder', {})
                    award = item.get('award', {})
                    identifiers = award.get('identifiers', [])

                    # Convert None to empty string before joining
                    funder_names.append(str(funder.get('name', '')))
                    funder_ids.append(str(funder.get('id', '')))

                    # Ensure identifiers exist
                    award_identifiers.append('; '.join([str(i.get('identifier', '')) for i in identifiers]))
                    award_schemes.append('; '.join([str(i.get('scheme', '')) for i in identifiers]))

                    # Handle cases where title might be None
                    title_data = award.get('title', {})
                    award_titles.append(title_data.get('en', '') if isinstance(title_data, dict) else '')

                    award_numbers.append(str(award.get('number', '')))

            # Create a dictionary for each row
            row = {
                'funding.funder.name': '; '.join(funder_names),
                'funding.funder.id': '; '.join(funder_ids),
                'funding.funder.award.identifiers.identifier': '; '.join(award_identifiers),
                'funding.funder.award.identifiers.scheme': '; '.join(award_schemes),
                'funding.funder.award.number': '; '.join(award_numbers),
                'funding.funder.award.title.en': '; '.join(award_titles)
            }
            rows.append(row)
        else:
            # Handle cases where funding_data is not a list or is an empty list
            row = {
                'funding.funder.name': '',
                'funding.funder.id': '',
                'funding.funder.award.identifiers.identifier': '',
                'funding.funder.award.identifiers.scheme': '',
                'funding.funder.award.number': '',
                'funding.funder.award.title.en': ''
            }
            rows.append(row)

    # Create a DataFrame
    funding_df = pd.DataFrame(rows)

    # Print the DataFrame
    print(funding_df.head())
else:
    print("'metadata.funding' column not found. Skipping processing.")


In [None]:
## Concatenate dataframes
if 'funding_df' in locals() and not funding_df.empty:  # Check if funding_df exists and is not empty
    data_5_df = pd.concat([data_4_df, funding_df], axis=1)
    print("funding_df exists and was merged with data_4_df.")
else:
    data_5_df = data_4_df.copy()  # Just copy data_4_df if funding_df is missing
    print("funding_df does not exist. Using data_4_df as data_5_df.")


In [None]:
# ## Save dataframe to a CSV

with open(r"output/data_5_df.csv", 'w', encoding='utf-8') as file:
    data_5_df.to_csv(file, lineterminator='\n', index=True)
    file.close()

In [None]:
#### Flatten identifiers ####

# # Check if 'metadata.funding' exists in the DataFrame
# if 'metadata.identifiers' in data_5_df.columns:

#     # Initialize lists to store the concatenated values for each data_list item
#     rows = []

#     # Iterate through the data and extract required information
#     for identifier_data in data_5_df['metadata.identifiers']:
    
#         identifiers_by_scheme = {}
#         if isinstance(identifier_data , list):
#             for item in identifier_data: 
#                 if isinstance(item, dict):
#                     scheme = item.get('scheme')
#                     identifier = item.get('identifier')
            
#                     if scheme not in identifiers_by_scheme:
#                         identifiers_by_scheme[scheme] = []
#                     identifiers_by_scheme[scheme].append(identifier)
    
            
#             # Create a dictionary for each row without using dictionary comprehension
#             row = {}
#             for scheme, ids in identifiers_by_scheme.items():
#                 row[f'identifiers.identifier.{scheme}'] = '; '.join(ids)
#             rows.append(row)
#         else:
#             # Handle cases where item_data is not a dictionary or does not have 'identifiers'
#             row = {f'identifiers.identifier.{scheme}': '' for scheme in identifiers_by_scheme.keys()}
#             rows.append(row)

# # Create a DataFrame from the list of row dictionaries
# identifiers_df = pd.DataFrame(rows)
# identifiers_df.head()

import pandas as pd
  
# Check if 'metadata.identifiers' exists in the DataFrame
if 'metadata.identifiers' in data_5_df.columns:
    for identifier_data in data_5_df['metadata.identifiers']:
        #Ensure rows is always defined
        rows = []
        # Handle NaN values
        if isinstance(identifier_data, float) and pd.isna(identifier_data):
            identifier_data = []  # Replace NaN with an empty list

        identifiers_by_scheme = {}

        if isinstance(identifier_data, list):  # Ensure it's a list before iterating
            for item in identifier_data: 
                if isinstance(item, dict):
                    scheme = str(item.get('scheme', 'unknown'))  # Ensure scheme is a string
                    identifier = str(item.get('identifier', ''))  # Convert to string
                    
                    if scheme not in identifiers_by_scheme:
                        identifiers_by_scheme[scheme] = []
                    identifiers_by_scheme[scheme].append(identifier)
    
            # Create a dictionary for each row
            row = {f'identifiers.identifier.{scheme}': '; '.join(ids) for scheme, ids in identifiers_by_scheme.items()}
            rows.append(row)
        else:
            # Ensure rows is always appended to avoid empty DataFrame errors
            rows.append({})

    # Create DataFrame only if rows exist
    identifiers_df = pd.DataFrame(rows)

    # Print the DataFrame
    identifiers_df.head()

else:
    # Handle missing column case
    identifiers_df = pd.DataFrame()  # Assign an empty DataFrame
    print("'metadata.identifiers' column not found. Skipping processing.")




In [None]:
## Concatenate dataframes
if 'identifiers_df' in locals() and not identifiers_df.empty:  # Check if identifiers_df exists and is not empty
    data_6_df = pd.concat([data_5_df, identifiers_df], axis=1)
    print("identifiers_df exists and was merged with data_5_df.")
else:
    data_6_df = data_5_df.copy()  # Just copy data_5_df if identifiers_df is missing
    print("identifiers_df does not exist. Using data_5_df as data_6_df.")


In [None]:
# ## Save dataframe to a CSV

with open(r"output/data_6_df.csv", 'w', encoding='utf-8') as file:
    data_6_df.to_csv(file, lineterminator='\n', index=True)
    file.close()

In [None]:
## Flatten the field for related_identifiers

# Check if 'metadata.related_identifiers' exists in the DataFrame
if 'related_identifiers' in data_6_df.columns:

    # Initialize lists to store the concatenated values
    identifiers = []
    relation_type_ids = []
    relation_type_titles = []
    resource_type_ids = []
    resource_type_titles = []
    schemes = []

    # Iterate through the data and extract required information
    for related_identifier_data in data_6_df['related_identifiers']:
        temp_identifiers = []
        temp_relation_type_ids = []
        temp_relation_type_titles = []
        temp_resource_type_ids = []
        temp_resource_type_titles = []
        temp_schemes = []
        
        if isinstance(related_identifier_data, list):
            for item in related_identifier_data:
                if isinstance(item, dict):
                    temp_identifiers.append(item.get('identifier', ''))
                    relation_type = item.get('relation_type', {})
                    temp_relation_type_ids.append(relation_type.get('id', ''))
                    temp_relation_type_titles.append(relation_type.get('title', {}).get('en', ''))
                    resource_type = item.get('resource_type', {})
                    temp_resource_type_ids.append(resource_type.get('id', ''))
                    temp_resource_type_titles.append(resource_type.get('title', {}).get('en', ''))
                    temp_schemes.append(item.get('scheme', ''))
    
        identifiers.append('; '.join(temp_identifiers))
        relation_type_ids.append('; '.join(temp_relation_type_ids))
        relation_type_titles.append('; '.join(temp_relation_type_titles))
        resource_type_ids.append('; '.join(temp_resource_type_ids))
        resource_type_titles.append('; '.join(temp_resource_type_titles))
        schemes.append('; '.join(temp_schemes))
    
    # Create a DataFrame
    related_identifiers = pd.DataFrame({
        'related_identifiers.identifier': identifiers,
        'related_identifiers.relation_type.id': relation_type_ids,
        'related_identifiers.relation_type.title.en': relation_type_titles,
        'related_identifiers.resource_type.id': resource_type_ids,
        'related_identifiers.resource_type.title.en': resource_type_titles,
        'related_identifiers.scheme': schemes
    })

# Print the DataFrame
related_identifiers.head()


In [None]:
## Concatendate the creator_df to data_df
data_7_df = pd.concat([data_6_df, related_identifiers], axis=1)
data_7_df.head(50)

In [None]:
# ## Save dataframe to a CSV

with open(r"output/data_7_df.csv", 'w', encoding='utf-8') as file:
    data_7_df.to_csv(file, lineterminator='\n', index=True)
    file.close()

In [None]:
## Print the column indices and column names
print("Column indices and names:")
for index, column_name in enumerate(data_7_df.columns):
    ##print("Index:", index, "| Column Name:", column_name)
    print(f"'{column_name}',")

In [None]:
## Drop columns that have been flattened

columns_to_drop = ['metadata.additional_descriptions'
                   ,'metadata.additional_titles'
                   ,'metadata.contributors'
                   ,'metadata.creators'
                   ,'metadata.dates'
                   #,'metadata.funding'
                   #,'metadata.identifiers'
                   ,'metadata.languages'
                  # ,'metadata.publisher'
                   #,'metadata.related_identifiers'
                   ,'metadata.resource_type'
                   ,'metadata.rights'
                   ,'metadata.subjects']
# Drop the columns
data_8_df = data_7_df.drop(columns=columns_to_drop)

# Display the resulting DataFrame
data_8_df.head()

In [None]:
#### Export dataframe to excel #####

import pandas as pd
from openpyxl import Workbook
from openpyxl.styles import PatternFill, Font, Border, Side

# Column groups with their respective background colors

# Column groups and corresponding colors

blue_columns = [
     
    [
        'metadata.description', 
        'additional_descriptions.description', 
        'additional_descriptions.type.id', 
        'additional_descriptions.type.title.en', 
        'additional_titles.title', 
        'additional_titles.type.id', 
        'additional_titles.type.title.de', 
        'additional_titles.type.title.en', 

    ],
    [
        'dates.date',
        'dates.description',
        'dates.type.id',
        'dates.type.title.de',
        'dates.type.title.en'
    ],
    [
        'languages.id',
        'languages.title.en'
    ],
    [
        'rights.description.en',
        'rights.icon',
        'rights.id',
        'rights.props.scheme',
        'rights.props.url',
        'rights.title.en',
        'rights.link',
    ],
    [
        'creators.person_or_org.name',
        'creators.person_or_org.type',
        'creators.person_or_org.identifiers.orcid',
        'creators.person_or_org.affiliation',
        'creators.affiliations.id',
        'creators.role.id',
        'creators.role.title.en'
    ],
    [
        'contributors.person_or_org.name',
        'contributors.person_or_org.type',
        'contributors.person_or_org.identifiers.orcid',
        'contributors.person_or_org.affiliation',
        'contributors.affiliations.id',
        'contributors.role.id',
        'contributors.role.title.en'
    ],
    [
        'subjects.MeSH.id',
        'subjects.MeSH.subject',
        'subjects.LCSH.id',
        'subjects.LCSH.subject',
        'subjects.LCNAF.id',
        'subjects.LCNAF.subject',
        'subjects'
    ],
    [
        'funding.funder.name',
        'funding.funder.id',
        'funding.funder.award.identifiers.identifier',
        'funding.funder.award.identifiers.scheme',
        'funding.funder.award.number',
        'funding.funder.award.title.en'
    ],
    [
        'identifiers.identifier.ark',
        'identifiers.identifier.arxiv',
        'identifiers.identifier.bibcode',
        'identifiers.identifier.doi',
        'identifiers.identifier.ean13',
        'identifiers.identifier.eissn',
        'identifiers.identifier.handle',
        'identifiers.identifier.igsn',
        'identifiers.identifier.isbn',
        'identifiers.identifier.issn',
        'identifiers.identifier.istc',
        'identifiers.identifier.lissn',
        'identifiers.identifier.lsid',
        'identifiers.identifier.pmid',
        'identifiers.identifier.purl',
        'identifiers.identifier.upc',
        'identifiers.identifier.url',
        'identifiers.identifier.urn',
        'identifiers.identifier.w3id',
        'identifiers.identifier.other',
    ],
    ['related_identifiers.identifier',
        'related_identifiers.relation_type.id',
        'related_identifiers.relation_type.title.en',
        'related_identifiers.resource_type.id',
        'related_identifiers.resource_type.title.en',
        'related_identifiers.scheme'
    ]
]
   
green_columns = [
    ['links.self', 
    'links.self_html', 
    'links.self_doi', 
    'links.doi', 
    'links.parent', 
    'links.parent_html', 
    'links.parent_doi', 
    'links.self_iiif_manifest', 
    'links.self_iiif_sequence', 
    'links.files', 
    'links.media_files', 
    'links.archive', 
    'links.archive_media', 
    'links.latest', 
    'links.latest_html', 
    'links.draft', 
    'links.versions', 
    'links.access_links', 
    'links.access_grants', 
    'links.access_users', 
    'links.access_groups', 
    'links.access_request', 
    'links.access', 
    'links.reserve_doi', 
    'links.communities', 
    'links.communities-suggestions', 
    'links.requests'
    ],
    ['parent.id', 
    'parent.access', 
    'parent.communities', 
    'parent.pids', 
    'versions.is_latest', 
    'versions.index', 
    'pids.doi', 
    'pids.oai', 
    'metadata.title', 
    'metadata.publisher', 
    'metadata.publication_date', 
    'metadata.sizes', 
    'metadata.formats', 
    'metadata.version', 
    'access.record', 
    'access.files', 
    'access.embargo', 
    'access.status', 

    ],
   
    ['files.enabled', 
    'files.order', 
    'files.count', 
    'files.total_bytes', 
    'files.entries', 
    'media_files', 
    'deletion_status', 
    'stats'
    ],
    # ['id',
    # 'is_draft',
    # 'is_published'
    # ]
]


# Shades of blue, green, and orange

blue_shades = ['#E6F0FF', '#CCE1FF', '#B3D1FF', '#99C2FF', '#80B3FF','#66A3FF','#4D94FF','#3385FF', '#1A75FF','#0066FF','#0055CC','#0044AA']
green_shades = ['#E6FFE6', '#CCFFCC', '#B3FFB3', '#99FF99', '#80FF80']
orange_shades = ['#FFE6CC', '#FFCC99']

# Function to apply color fills and borders to columns
def apply_column_colors(ws, col_indices_groups, colors):
    for col_indices, color in zip(col_indices_groups, colors):
        fill = PatternFill(start_color=color.replace('#', ''), end_color=color.replace('#', ''), fill_type='solid')
        thick_border = Border(left=Side(style='thick'), right=Side(style='thick'))
        
        for col_index in col_indices:
            for row in range(1, ws.max_row + 1):
                ws.cell(row=row, column=col_index + 1).fill = fill
                
                # Apply thick border to the first and last columns of the group
                if col_index == col_indices[0]:
                    ws.cell(row=row, column=col_index + 1).border = Border(left=Side(style='thick'), right=ws.cell(row=row, column=col_index + 1).border.right)
                if col_index == col_indices[-1]:
                    ws.cell(row=row, column=col_index + 1).border = Border(left=ws.cell(row=row, column=col_index + 1).border.left, right=Side(style='thick'))

# Create a workbook and worksheet
wb = Workbook()
ws = wb.active

# Define bold font and thin border
bold_font = Font(bold=True)
thin_border = Border(left=Side(style='thin'), right=Side(style='thin'), top=Side(style='thin'), bottom=Side(style='thin'))

# Write column headers to the first row with bold font and border
for c_idx, col_name in enumerate(data_8_df.columns, 1):
    cell = ws.cell(row=1, column=c_idx, value=col_name)
    cell.font = bold_font
    cell.border = thin_border

# Write DataFrame to worksheet starting from the second row with borders
for r_idx, row in enumerate(data_8_df.itertuples(index=False), 2):
    for c_idx, value in enumerate(row, 1):
        cell = ws.cell(row=r_idx, column=c_idx, value=str(value) if isinstance(value, (dict, list)) else value)
        cell.border = thin_border

# Apply borders to ALL cells (headers + data)
for row in ws.iter_rows():
    for cell in row:
        cell.border = thin_border  # Ensures borders are applied to all rows and columns

# Freeze the top row
ws.freeze_panes = ws['A2']

# Apply colors and borders to the specified column groups
apply_column_colors(ws, 
    [[data_8_df.columns.get_loc(col) for col in group if col in data_8_df.columns] 
     for group in blue_columns], blue_shades)

apply_column_colors(ws, 
    [[data_8_df.columns.get_loc(col) for col in group if col in data_8_df.columns] 
     for group in green_columns], green_shades)

#apply_column_colors(ws, [[data_8_df.columns.get_loc(col) for col in group] for group in orange_columns], orange_shades)

# Save the workbook
wb.save("output/styled_dataframe.xlsx")

In [None]:
##### TO DO LIST FOR NEXT STEPS
## Reverse engineer to recreate json
## Test post to repository!
## Figure out how to grab file by name

#### Code below is to reconstruct the json - it's under construction!!

In [None]:
# Open the CSV file and load it into a DataFrame
with open(r"output/data_7_df.csv", 'r', encoding='utf-8') as file:
    data_7_df = pd.read_csv(file)
    file.close()

# Display the DataFrame
data_7_df.head()

In [None]:
### Reverse engineer metadata from datagrame into json: access, created, custom_fields, etc.
import pandas as pd
import json

def unflatten_dict(d, sep='.'):
    """
    Converts a flattened dictionary back to its original nested form.
    """
    result = {}
    for key, value in d.items():
        keys = key.split(sep)
        d = result
        for sub_key in keys[:-1]:
            if sub_key not in d:
                d[sub_key] = {}
            d = d[sub_key]
        d[keys[-1]] = value
    return result

def unflatten_metadata_row(row, flatten_keys):
    """
    Unflattens a single row of metadata from the DataFrame.
    """
    row_data = {}
    
    for key, value in row.items():
        # Determine the prefix (key) for the current column
        prefix = key.split('.')[0]
        
        if prefix in flatten_keys:
            # Handle flattened dictionary keys
            nested_key = key[len(prefix) + 1:]  # Remove prefix
            unflattened = unflatten_dict({nested_key: value})
            # Merge into row_data
            if prefix in row_data:
                row_data[prefix].update(unflattened)
            else:
                row_data[prefix] = unflattened
        else:
            # Ignore columns not in flatten_keys
            continue
    
    return row_data

def unflatten_metadata(df, flatten_keys):
    """
    Unflattens the specified columns of the DataFrame into JSON format.
    """
    # Filter the DataFrame to include only the specified columns
    filtered_df = df[[col for col in df.columns if col.split('.')[0] in flatten_keys]]
    
    original_data = []
    
    for _, row in filtered_df.iterrows():
        original_data.append(unflatten_metadata_row(row, flatten_keys))
    
    return original_data

# # Example DataFrame with flattened metadata
# data = {
#     'access.date': ['2024-01-01', '2024-02-01'],
#     'created.date': ['2024-01-01', '2024-02-01'],
#     'custom_fields.field': ['Field 1', 'Field 2'],
#     'files.file': ['File 1', 'File 2'],
#     'id.value': [1, 2],
#     'is_draft.status': [True, False],
#     'is_published.status': [False, True],
#     'links.url': ['http://example.com/1', 'http://example.com/2'],
#     'parent.parent_id': [101, 102],
#     'pids.pid': ['PID1', 'PID2'],
#     'revision_id.value': [1, 2],
#     'status.state': ['active', 'inactive'],
#     'updated.date': ['2024-01-01', '2024-02-01'],
#     'versions.version': ['v1.0', 'v2.0']
# }

# data_df = pd.DataFrame(data)

# Specify the metadata fields to flatten
flatten_keys = ['access', 
                'created', 
                'custom_fields', 
                'files', 
                'id', 
                'is_draft', 
                'is_published', 
                'links', 
                'parent', 
                'pids', 
                'revision_id', 
                'status', 
                'updated', 
                'versions']

# Unflatten the metadata
general_metadata_1_list = unflatten_metadata(data_7_df, flatten_keys)

# Convert to JSON and print
general_metadata_1_json = json.dumps(general_metadata_1_list, indent=4)
print(general_metadata_1_json)




In [None]:
### Reverse engineer metadata from datagrame into json: additional_descriptions, additional_titles, dates, languages, publisher, resource_types, rights

def unflatten_dict(d, sep='.'):
    """
    Converts a flattened dictionary back to its original nested form.
    """
    result = {}
    for key, value in d.items():
        keys = key.split(sep)
        d = result
        for sub_key in keys[:-1]:
            if sub_key not in d:
                d[sub_key] = {}
            d = d[sub_key]
        d[keys[-1]] = value
    return result

def unflatten_metadata_row(row, metadata_key_list):
    """
    Unflattens a single row of metadata from the DataFrame.
    """
    row_data = {}
    for metadata_key in metadata_key_list:
        columns = {col: row[col] for col in row.index if col.startswith(metadata_key)}
        
        for key, value in columns.items():
            nested_key = key[len(metadata_key) + 1:]  # Remove metadata_key prefix
            unflattened = unflatten_dict({nested_key: value})
            row_data.update(unflattened)
    
    return row_data

def unflatten_metadata(df, metadata_key_list):
    """
    Unflattens the metadata from the DataFrame where each row represents an item in JSON.
    """
    original_data = []
    
    for _, row in df.iterrows():
        original_data.append(unflatten_metadata_row(row, metadata_key_list))
    
    return original_data

# # Example DataFrame with flattened metadata
# data_1_df = pd.DataFrame({
#     'additional_descriptions.description': [['Description 1'], ['Description 2']],
#     'additional_titles.title': [['Title 1'], ['Title 2']],
#     'dates.date': [['2024-01-01'], ['2024-02-01']],
#     'languages.language': [['English'], ['Spanish']],
#     'publisher.publisher': [['Publisher 1'], ['Publisher 2']],
#     'resource_type.resource_type': [['Type 1'], ['Type 2']],
#     'rights.right': [['Right 1'], ['Right 2']]
# })

metadata_key_list = ['additional_descriptions',
                     'additional_titles',
                     'dates',
                     'languages',
                     'publisher',
                     'resource_type',
                     'rights']

# Unflatten the metadata
general_metadata_2_list = unflatten_metadata(data_7_df, metadata_key_list)


general_metadata_2_json = json.dumps(general_metadata_2_list, indent=4)
print(general_metadata_2_json)




In [None]:
# import pandas as pd
# import json

# def unflatten_dict(d, sep='.'):
#     """
#     Converts a flattened dictionary back to its original nested form.
#     """
#     result = {}
#     for key, value in d.items():
#         keys = key.split(sep)
#         d = result
#         for sub_key in keys[:-1]:
#             if sub_key not in d:
#                 d[sub_key] = {}
#             d = d[sub_key]
#         d[keys[-1]] = value
#     return result

# def unflatten_metadata_row(row, metadata_key_list):
#     """
#     Unflattens a single row of metadata from the DataFrame.
#     """
#     row_data = {}
#     for metadata_key in metadata_key_list:
#         columns = {col: row[col] for col in row.index if col.startswith(metadata_key)}
        
#         for key, value in columns.items():
#             nested_key = key[len(metadata_key) + 1:]  # Remove metadata_key prefix
#             if isinstance(value, list) and len(value) == 1:
#                 value = value[0]  # Unwrap single-element lists
#             unflattened = unflatten_dict({nested_key: value})
#             row_data.update(unflattened)
    
#     return row_data

# def unflatten_metadata(df, metadata_key_list):
#     """
#     Unflattens the metadata from the DataFrame where each row represents an item in JSON.
#     """
#     original_data = []
    
#     for _, row in df.iterrows():
#         # Convert each row into a list of values, assuming values are lists
#         list_row = {col: row[col] if isinstance(row[col], list) else [row[col]] for col in df.columns}
#         original_data.append(unflatten_metadata_row(list_row, metadata_key_list))
    
#     return original_data

# # Example DataFrame with flattened metadata
# data_7_df = pd.DataFrame({
#     'additional_descriptions.description': [['Description 1'], ['Description 2']],
#     'additional_titles.title': [['Title 1'], ['Title 2']],
#     'dates.date': [['2024-01-01'], ['2024-02-01']],
#     'languages.language': [['English'], ['Spanish']],
#     'publisher.publisher': [['Publisher 1'], ['Publisher 2']],
#     'resource_type.resource_type': [['Type 1'], ['Type 2']],
#     'rights.right': [['Right 1'], ['Right 2']]
# })

# metadata_key_list = ['additional_descriptions',
#                      'additional_titles',
#                      'dates',
#                      'languages',
#                      'publisher',
#                      'resource_type',
#                      'rights']

# # Unflatten the metadata
# original_data = unflatten_metadata(data_7_df, metadata_key_list)

# # Print the reconstructed data
# print(json.dumps(original_data, indent=4))


In [None]:
### Reverse engineer into metadata in dataframe into json: creator
import json

def reverse_engineer_creator_data(df):
    creator_list = []

    for _, row in df.iterrows():
        names = row['creators.person_or_org.name'].split('; ')
        types = row['creators.person_or_org.type'].split('; ')
        orcids = row['creators.person_or_org.identifiers.orcid'].split('; ')
        affiliations = row['creators.person_or_org.affiliation'].split('; ')
        affiliation_ids = row['creators.affiliations.id'].split('; ')
        role_ids = row['creators.role.id'].split('; ')
        role_titles = row['creators.role.title.en'].split('; ')
        
        row_data = []
        for i in range(len(names)):
            person_or_org = {
                'name': names[i],
                'type': types[i],
                'identifiers': [{'identifier': orcids[i], 'scheme': 'orcid'}] if orcids[i] else []
            }
            affiliations_data = [{'name': aff, 'id': aff_id} for aff, aff_id in zip(affiliations[i].split('|'), affiliation_ids[i].split('|')) if aff and aff_id]
            role_data = {'id': role_ids[i], 'title': {'en': role_titles[i]}}
            
            row_data.append({
                'person_or_org': person_or_org,
                'affiliations': affiliations_data,
                'role': role_data
            })

        creator_list.append({'creator': row_data})

    return creator_list

# Reverse engineer the data and create the JSON structure
reversed_creator_list = reverse_engineer_creator_data(data_7_df)

# Convert the list to a JSON string if needed
reversed_creator_json = json.dumps(reversed_creator_list, indent=4)

# Output the first item in the list for verification
print(reversed_creator_json)




In [None]:
# Assuming original_data is already a Python list from the unflattening process
# Print the first item in the list
if reversed_creator_list:
    print(json.dumps(reversed_creator_list, indent=4))
else:
    print("No data available")

In [None]:
### Reverse engineer into metadata in dataframe into json: contributor

import json

def reverse_engineer_contributor_data(df):
    contributor_list = []

    for _, row in df.iterrows():
        names = row['contributors.person_or_org.name'].split('; ')
        types = row['contributors.person_or_org.type'].split('; ')
        orcids = row['contributors.person_or_org.identifiers.orcid'].split('; ')
        affiliations = row['contributors.person_or_org.affiliation'].split('; ')
        affiliation_ids = row['contributors.affiliations.id'].split('; ')
        role_ids = row['contributors.role.id'].split('; ')
        role_titles = row['contributors.role.title.en'].split('; ')
        
        row_data = []
        for i in range(len(names)):
            person_or_org = {
                'name': names[i],
                'type': types[i],
                'identifiers': [{'identifier': orcids[i], 'scheme': 'orcid'}] if orcids[i] else []
            }
            affiliations_data = [{'name': aff, 'id': aff_id} for aff, aff_id in zip(affiliations[i].split('|'), affiliation_ids[i].split('|')) if aff and aff_id]
            role_data = {'id': role_ids[i], 'title': {'en': role_titles[i]}}
            
            row_data.append({
                'person_or_org': person_or_org,
                'affiliations': affiliations_data,
                'role': role_data
            })

        contributor_list.append({'contributors': row_data})

    return contributor_list

# Reverse engineer the data and create the JSON structure
reversed_contributor_list = reverse_engineer_contributor_data(data_7_df)

# Convert the list to a JSON string if needed
reversed_contributor_json = json.dumps(reversed_contributor_list, indent=4)

# Output the first item in the list for verification
print(reversed_contributor_json)




In [None]:
### Reverse engineer metdata into json: funding

def reverse_engineer_funding_metadata(df):
    funding_list = []
    
    for index, row in df.iterrows():
        funders = row['funding.funder.name'].split('; ')
        ids = row['funding.funder.id'].split('; ')
        identifiers = row['funding.funder.award.identifiers.identifier'].split('; ')
        schemes = row['funding.funder.award.identifiers.scheme'].split('; ')
        numbers = row['funding.funder.award.number'].split('; ')
        titles = row['funding.funder.award.title.en'].split('; ')
        
        row_data = []
        
        for i in range(len(funders)):
            funding_item = {
                'funder': {
                    'name': funders[i],
                    'id': ids[i]
                },
                'award': {
                    'identifiers': [{
                        'identifier': identifiers[i],
                        'scheme': schemes[i]
                    }],
                    'number': numbers[i],
                    'title': {
                        'en': titles[i]
                    }
                }
            }
            row_data.append(funding_item)
        
        funding_list.append({'funding': row_data})
    
    return funding_list

# Reverse engineer the DataFrame back to JSON format
reversed_funder_list = reverse_engineer_funding_metadata(data_7_df)

reversed_funder_json = json.dumps(reversed_funder_list, indent = 4)
print(reversed_funder_json)


In [None]:
### Reverse engineer metadata into json: subjects
import pandas as pd
import json
import ast

def convert_strings_to_lists(df, columns):
    for col in columns:
        df[col] = df[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    return df

def reconstruct_metadata_to_json(data_df, metadata_key, config):
    subject_list = []
    
    for index, row in data_df.iterrows():
#         print(f"Processing row {index}:")
#         print(row)
        
        row_data = []
        
        for scheme in config.keys():
            subjects = row.get(f"{metadata_key}.{scheme}.subject", [])
            ids = row.get(f"{metadata_key}.{scheme}.id", [])
            
            if not isinstance(subjects, list):
                print(f"Warning: {metadata_key}.{scheme}.subject is not a list")
                subjects = []
            if not isinstance(ids, list):
                print(f"Warning: {metadata_key}.{scheme}.id is not a list")
                ids = []
            
            if len(subjects) != len(ids):
                print(f"Warning: Mismatch in length of subjects and ids for scheme {scheme}")
            
            for subject, id_ in zip(subjects, ids):
                if subject and id_:
                    row_data.append({
                        'id': id_,
                        'scheme': scheme,
                        'subject': subject
                    })
        
        plain_subjects = row.get(metadata_key, [])
        if not isinstance(plain_subjects, list):
            print(f"Warning: {metadata_key} is not a list")
            plain_subjects = []
        
        for subject in plain_subjects:
            if subject:
                row_data.append({
                    'subject': subject
                })
        
        # Add the row data to the reconstructed list
        subject_list.append({"subjects":row_data})
    
    return subject_list

# Convert string representations to lists
columns_to_convert = [
    'subjects.MeSH.id',
    'subjects.MeSH.subject',
    'subjects.LCSH.id',
    'subjects.LCSH.subject',
    'subjects.LCNAF.id',
    'subjects.LCNAF.subject',
    'subjects'
]
data_10_df = convert_strings_to_lists(data_7_df, columns_to_convert)

# Configuration for handling different schemes
config = {
    'MeSH': {
        'id': 'id',
        'subject': 'subject'
    },
    'LCSH': {
        'id': 'id',
        'subject': 'subject'
    },
    'LCNAF': {
        'id': 'id',
        'subject': 'subject'
    }
}

# Call the function to reverse the flattening and return the JSON-like list of dictionaries
reversed_subject_list = reconstruct_metadata_to_json(data_10_df, 'subjects', config)

# Convert the list to a JSON string if needed
reversed_subject_json = json.dumps(reversed_subject_list, indent=4)

print(reversed_subject_json)



In [None]:
## Reverse engineer metadata in dataframe into json: identifiers

def reverse_engineer_identifiers(df):
    """
    Reconstructs JSON format from a DataFrame with identifiers in separate columns.

    Parameters:
    df (pd.DataFrame): The DataFrame containing identifier columns.

    Returns:
    list: A list of dictionaries where each dictionary is a JSON object.
    """
    identifier_list = []

    # Iterate over each row in the DataFrame
    for _, row in df.iterrows():
        # Initialize the dictionary to hold the reconstructed identifiers
        reconstructed_identifiers = []

        # Iterate through the columns to extract identifiers
        for col in df.columns:
            if col.startswith('identifiers.identifier.'):
                scheme = col[len('identifiers.identifier.'):]
                identifier = row[col]

                if pd.notna(identifier) and identifier != '':
                    reconstructed_identifiers.append({
                        'scheme': scheme,
                        'identifier': identifier
                    })

        # Append the reconstructed identifiers for the current row to the list
        identifier_list.append({"identifiers":reconstructed_identifiers})

    return identifier_list

# # Example DataFrame with separate identifier columns and multiple rows
# data_5_df = pd.DataFrame({
#     'identifiers.identifier.ark': ['ark:/c8131/g3rp42', 'ark:/c8131/g3rp43'],
#     'identifiers.identifier.arxiv': ['arXiv:2109.13768', 'arXiv:2109.13769'],
#     'identifiers.identifier.bibcode': ['1974AJ.....79..819H', '1974AJ.....80..820H'],
#     'identifiers.identifier.doi': ['10.1002/acn3.51179', '10.1002/acn3.51180'],
#     'identifiers.identifier.ean13': ['1.23457E+12', '1.23457E+13'],
#     'identifiers.identifier.eissn': ['1470-7330', '1470-7331'],
#     'identifiers.identifier.handle': ['20.1000/100', '20.1000/101'],
#     'identifiers.identifier.igsn': ['HRV003M16', 'HRV003M17'],
#     'identifiers.identifier.isbn': ['978-3-16-148410-0', '978-3-16-148411-0'],
#     'identifiers.identifier.issn': ['1740-5025', '1740-5026'],
#     'identifiers.identifier.istc': ['A02-2009-000004BE-A', 'A02-2009-000004BE-B'],
#     'identifiers.identifier.lissn': ['1748-7188', '1748-7189'],
#     'identifiers.identifier.lsid': ['urn:lsid:zoobank.org:pub:CDC8D258-8F57-41DC-B560-247E17D3DC8C', 'urn:lsid:zoobank.org:pub:CDC8D259-8F58-41DC-B561-247E17D3DC8D'],
#     'identifiers.identifier.pmid': ['23193287', '23193288'],
#     'identifiers.identifier.purl': ['https://purl.fdlp.gov/GPO/gpo53258', 'https://purl.fdlp.gov/GPO/gpo53259'],
#     'identifiers.identifier.upc': ['42100005264', '42100005265'],
#     'identifiers.identifier.url': ['https://www.cnn.com', 'https://www.bbc.com'],
#     'identifiers.identifier.urn': ['urn:isan:0000-0000-2CEA-0000-1-0000-0000-Y', 'urn:isan:0000-0000-2CEA-0000-2-0000-0000-Y'],
#     'identifiers.identifier.w3id': ['https://w3id.org/tree', 'https://w3id.org/tree2'],
#     'identifiers.identifier.other': ['2-s2.0-85178352590', '2-s2.0-85178352591']
# })

# Reverse engineer the DataFrame back to JSON format
reversed_identifiers_list = reverse_engineer_identifiers(data_10_df)


# Convert the list to a JSON string if needed
reversed_identifiers_json = json.dumps(reversed_identifiers_list, indent=4)

print(reversed_identifiers_json)


In [None]:
#### Reverse Engineer metadata in dataframe: Related Identifiers #####

def reverse_engineer_related_identifiers(df):
    """
    Reconstructs JSON format from a DataFrame with concatenated related identifiers.

    Parameters:
    df (pd.DataFrame): The DataFrame containing concatenated related identifiers.

    Returns:
    list: A list of dictionaries where each dictionary represents a row in JSON format.
    """
    related_identifier_list = []

    # Iterate over each row in the DataFrame
    for _, row in df.iterrows():
        # Initialize lists to hold extracted values
        identifiers = row['related_identifiers.identifier'].split('; ')
        relation_type_ids = row['related_identifiers.relation_type.id'].split('; ')
        relation_type_titles = row['related_identifiers.relation_type.title.en'].split('; ')
        resource_type_ids = row['related_identifiers.resource_type.id'].split('; ')
        resource_type_titles = row['related_identifiers.resource_type.title.en'].split('; ')
        schemes = row['related_identifiers.scheme'].split('; ')
        
        # Reconstruct the JSON structure
        reconstructed_related_identifiers = []
        for i in range(len(identifiers)):
            related_identifier = {
                'identifier': identifiers[i],
                'relation_type': {
                    'id': relation_type_ids[i] if i < len(relation_type_ids) else '',
                    'title': {
                        'en': relation_type_titles[i] if i < len(relation_type_titles) else ''
                    }
                },
                'resource_type': {
                    'id': resource_type_ids[i] if i < len(resource_type_ids) else '',
                    'title': {
                        'en': resource_type_titles[i] if i < len(resource_type_titles) else ''
                    }
                },
                'scheme': schemes[i] if i < len(schemes) else ''
            }
            reconstructed_related_identifiers.append(related_identifier)
        
        # Append to the JSON list
        related_identifier_list.append({"related_identifiers":reconstructed_related_identifiers})
    
    return related_identifier_list

# # Example DataFrame with concatenated related identifiers
# data_6_df = pd.DataFrame({
#     'related_identifiers.identifier': ['identifier1; identifier2', 'identifier3'],
#     'related_identifiers.relation_type.id': ['relationTypeId1; relationTypeId2', 'relationTypeId3'],
#     'related_identifiers.relation_type.title.en': ['relationTypeTitle1; relationTypeTitle2', 'relationTypeTitle3'],
#     'related_identifiers.resource_type.id': ['resourceTypeId1; resourceTypeId2', 'resourceTypeId3'],
#     'related_identifiers.resource_type.title.en': ['resourceTypeTitle1; resourceTypeTitle2', 'resourceTypeTitle3'],
#     'related_identifiers.scheme': ['scheme1; scheme2', 'scheme3']
# })

# Reverse engineer the DataFrame back to JSON format
reversed_related_identifiers_list = reverse_engineer_related_identifiers(data_10_df)

# Print the JSON list
reversed_related_identifiers_json = json.dumps(reversed_related_identifiers_list, indent=4)
print(reversed_related_identifiers_json)

In [None]:
# def format_as_metadata(lists):
#     """
#     Formats a list of lists, where each list contains dictionaries, into a list of metadata entries.
    
#     Each entry in the result list will have a 'metadata' key with the dictionaries from the lists.
#     """
#     result = []
    
#     # Determine the number of items in the lists
#     num_items = len(lists[0]) if lists else 0
    
#     for i in range(num_items):
#         metadata_entry = {
#             'metadata': {}
#         }
#         for lst in lists:
#             if i < len(lst):
#                 item = lst[i]
#                 if isinstance(item, dict):
#                     metadata_entry['metadata'].update(item)
#                 else:
#                     print(f"Error: List item at index {i} in one of the lists is not a dictionary. Item: {item}")
#             else:
#                 print(f"Error: List at index {i} does not have an item at index {i}.")
                
#         result.append(metadata_entry)
    
#     return result

# lists = [reversed_creator_list, 
#          reversed_funder_list,
#          reconstructed_subject_list, 
#          reversed_identifiers_list,
#          reversed_related_identifiers_list] 

# result = format_as_metadata(lists)

# import json
# print(json.dumps(result, indent=4))



In [None]:
all_lists = [general_metadata_1_list,
             general_metadata_2_list,
             reversed_creator_list, 
             reversed_funder_list,
             reversed_subject_list, 
             reversed_identifiers_list,
             reversed_related_identifiers_list] 

for item in all_lists:
    print(len(item))
    print(type(item))
    print(type(item[1]))

# print(reversed_creator_list[0])
# print('--------------------------------------------------')
# print(reversed_creator_list[0][0])
# print('--------------------------------------------------')
# print(reversed_creator_list[0][1])

In [None]:
# metadata_lists = [general_metadata_1_list,
#                   general_metadata_2_list,
#                   reversed_creator_list, 
#                   reversed_funder_list,
#                   reversed_subject_list, 
#                   reversed_identifiers_list,
#                   reversed_related_identifiers_list]

# metadata_json = []

# for list_item in metadata_lists: 
#     for item in list_item:
#         # Since 'metadata_dict' is a dictionary, we should use 'update' to add items
#         metadata_dict = {}  # Reset the dictionary for each list_item
#         metadata_dict.update(item)  # Add the current item to the dictionary

#     # Now add this dictionary to the 'metadata_json' list, wrapping it in "metadata"
#     metadata_json.append({"metadata": metadata_dict})

# # Print the resulting metadata_json
# import json
# print(json.dumps(metadata_json, indent=4))


# print(metadata_json[0])
# print("__________________________________________________")
# print(metadata_json[1])

In [None]:
# def format_as_metadata(lists):
#     """
#     Creates a dictionary where the first item from each list is assigned to the 'metadata' key.

#     Parameters:
#     lists (list of lists): Each list contains two items which are dictionaries.

#     Returns:
#     dict: A dictionary where each 'metadata' key contains the first item from each list.
#     """
#     result = {}
    
#     for lst in lists:
#         if len(lst) != 2:
#             print(f"Error: List must contain exactly two items. List: {lst}")
#             continue
        
#         # Ensure both items in the list are dictionaries
#         if not all(isinstance(item, dict) for item in lst):
#             print(f"Error: All items in the list must be dictionaries. List: {lst}")
#             continue
        
#         # Create or update the 'metadata' entry
#         key = 'metadata'
#         if key not in result:
#             result[key] = {}
        
#         # Add the first item from each list to the metadata
#         result[key].update(lst[0])
    
#     return result


# # Lists to process
# lists = [reversed_creator_list, 
#          reversed_funder_list,
#          reversed_subject_list, 
#          reversed_identifiers_list,
#          reversed_related_identifiers_list]

# # Format the data
# result = format_as_metadata(lists)

# import json
# print(json.dumps(result, indent=4))


In [None]:
# # Combine lists into a list of lists
# all_lists = [general_metadata_1_list,
#              general_metadata_2_list,
#              reversed_creator_list, 
#              reversed_funder_list,
#              reversed_subject_list, 
#              reversed_identifiers_list,
#              reversed_related_identifiers_list] 

# # Separate items by index
# item_1 = [lst[0] for lst in all_lists]  # First items
# item_2 = [lst[1] for lst in all_lists]  # Second items

# # Merge items
# merged_item_1 = {}
# for item in item_1:
#     merged_item_1.update(item)

# merged_item_2 = {}
# for item in item_2:
#     merged_item_2.update(item)

# # Convert merged items back to JSON format if needed
# merged_item_1_json = json.dumps(merged_item_1, indent=4)
# merged_item_2_json = json.dumps(merged_item_2, indent=4)

# print("Merged Item 1:")
# print(merged_item_1_json)

# print("-----------------------------------------------------------------------------")
# print("Merged Item 2:")
# print(merged_item_2_json)

In [None]:
import json

# Combine lists into a list of lists
all_lists = [general_metadata_1_list,
             general_metadata_2_list,
             reversed_creator_list, 
             reversed_funder_list,
             reversed_subject_list, 
             reversed_identifiers_list,
             reversed_related_identifiers_list] 

# Find the total number of records
num_records = len(all_lists[0])  # Assuming all lists have the same length

# Create a list to hold all metadata records
metadata_records = []

# Iterate over each record index
for i in range(num_records):
    merged_item = {}  # Create a new dictionary for each metadata record
    
    for lst in all_lists:
        merged_item.update(lst[i])  # Merge items at the same index across lists
    
    metadata_records.append(merged_item)  # Add to final list

# Convert to JSON format
metadata_json = json.dumps(metadata_records, indent=4)

# Save to a file
with open("output/metadata_records.json", "w", encoding="utf-8") as file:
    file.write(metadata_json)

# Print JSON output
print(metadata_json)


In [None]:
##### THE FOLLOWING CODE CAN BE USED TO FLATTEN JSON DATA

In [None]:
## This works with df for one key with many layers!! ####
# import pandas as pd

# def flatten_dict(d, parent_key='', sep='.'):
#     """
#     Recursively flattens a nested dictionary.

#     Parameters:
#     d (dict): The dictionary to flatten.
#     parent_key (str): The base key string for nested keys.
#     sep (str): The separator between parent and child keys.

#     Returns:
#     dict: A flattened dictionary.
#     """
#     items = []
#     for k, v in d.items():
#         new_key = f"{parent_key}{sep}{k}" if parent_key else k
#         if isinstance(v, dict):
#             items.extend(flatten_dict(v, new_key, sep=sep).items())
#         else:
#             items.append((new_key, v))
#     return dict(items)

# def flatten_metadata(data_df, metadata_key):
#     """
#     Flattens the specified metadata key from the DataFrame.

#     Parameters:
#     data_df (pd.DataFrame): The DataFrame containing metadata.
#     metadata_key (str): The key in the metadata to flatten.

#     Returns:
#     pd.DataFrame: A DataFrame with flattened metadata.
#     """
#     # Initialize a dictionary to store lists for each column
#     columns = {}
    
  

#     # Iterate through each list of entries in the specified metadata column
#     for entries in data_df[f'metadata.{metadata_key}']:
        
#         # Check if the list of entries is empty
#         if not entries:
#             continue
        
#         # Initialize temporary lists for each entry
#         temp_data = {}
        
#         for entry in entries:
#             if isinstance(entry, dict):
#                 flattened_entry = flatten_dict(entry)
#                 for key, value in flattened_entry.items():
#                     # Ensure the column key exists in the columns dictionary
#                     column_key = f'{metadata_key}.{key}'
#                     if column_key not in columns:
#                         columns[column_key] = []
#                     temp_data.setdefault(column_key, []).append(value)
#             else:
#                 print(f"Ignoring non-dictionary entry: {entry}")
        
#         # Append temporary lists to the main lists in columns dictionary
#         for column_key, temp_list in temp_data.items():
#             if column_key in columns:
#                 columns[column_key].append(temp_list)

#     # Create a pandas DataFrame with lists as values
#     flattened_df = pd.DataFrame(columns)
    
#     return flattened_df

# # Example usage
# # Assuming `data_df` is your DataFrame and 'additional_descriptions' is the key to flatten
# flattened_additional_descriptions = flatten_metadata(data_df, 'additional_descriptions')
# flattened_additional_descriptions.head()


In [None]:
### This works with JSON for one key with many layers !! ####
# import pandas as pd

# def flatten_dict(d, parent_key='', sep='.'):
#     """
#     Recursively flattens a nested dictionary.

#     Parameters:
#     d (dict): The dictionary to flatten.
#     parent_key (str): The base key string for nested keys.
#     sep (str): The separator between parent and child keys.

#     Returns:
#     dict: A flattened dictionary.
#     """
#     items = []
#     for k, v in d.items():
#         new_key = f"{parent_key}{sep}{k}" if parent_key else k
#         if isinstance(v, dict):
#             items.extend(flatten_dict(v, new_key, sep=sep).items())
#         else:
#             items.append((new_key, v))
#     return dict(items)

# def flatten_metadata(json_data, metadata_key):
#     """
#     Flattens the specified metadata key from the JSON data.

#     Parameters:
#     json_data (list): The JSON data containing metadata.
#     metadata_key (str): The key in the metadata to flatten.

#     Returns:
#     pd.DataFrame: A DataFrame with flattened metadata.
#     """
#     # Initialize a dictionary to store lists for each column
#     columns = {}

#     # Iterate through each list of entries in the specified metadata key
#     for data_entry in json_data:
#         entries = data_entry.get('metadata', {}).get(metadata_key, [])
        
#         # Initialize temporary lists for each entry
#         temp_data = {}
        
#         for entry in entries:
#             if isinstance(entry, dict):
#                 flattened_entry = flatten_dict(entry)
#                 for key, value in flattened_entry.items():
#                     # Ensure the column key exists in the columns dictionary
#                     column_key = f'{metadata_key}.{key}'
#                     if column_key not in columns:
#                         columns[column_key] = []
#                     temp_data.setdefault(column_key, []).append(value)
#             else:
#                 print(f"Ignoring non-dictionary entry: {entry}")
        
#         # Append temporary lists to the main lists in columns dictionary
#         for column_key, temp_list in temp_data.items():
#             if column_key in columns:
#                 columns[column_key].append(temp_list)

#     # Determine the maximum length of the lists
#     max_length = max(len(v) for v in columns.values())

#     # Pad lists with None to match the maximum length
#     for k, v in columns.items():
#         if len(v) < max_length:
#             columns[k].extend([None] * (max_length - len(v)))

#     # Create a pandas DataFrame with lists as values
#     flattened_df = pd.DataFrame(columns)
    
#     return flattened_df

# # Example JSON data
# json_data = [
#     {
#         "metadata": {
#             "additional_descriptions": [
#                 {
#                     "description": "Description 1",
#                     "lang": {"id": "en", "title": {"en": "English"}},
#                     "type": {"id": "type1", "title": {"en": "Type 1"}}
#                 },
#                 {
#                     "description": "Description 2",
#                     "lang": {"id": "de", "title": {"de": "German"}},
#                     "type": {"id": "type2", "title": {"de": "Type 2"}}
#                 }
#             ]
#         }
#     },
#     {
#         "metadata": {
#             "additional_descriptions": [
#                 {
#                     "description": "Description 3",
#                     "lang": {"id": "fr", "title": {"en": "French"}},
#                     "type": {"id": "type3", "title": {"en": "Type 3"}}
#                 }
#             ]
#         }
#     }
# ]

# # Flatten the additional_descriptions metadata
# flattened_additional_descriptions = flatten_metadata(json_data, 'additional_descriptions')
# flattened_additional_descriptions.head()



In [None]:
##### THE FOLLOWING CODE CAN BE USED TO FLATTEN PORTIONS OF THE METADATA from a dataframe

In [None]:
#### Flatten Access.Embargos #####

# # Initialize lists to store flattened data
# access_embargo_active = []
# access_embargo_reason = []

# # Iterate through the data (assuming only one dictionary in the list for this example)
# for entry in data_df["access.embargo"]:
#     access_embargo_active.append(entry['active'])
#     access_embargo_reason.append(entry['reason'])

# # Create a pandas DataFrame with a single row
# access_embargo_df = pd.DataFrame({
#     'access.embargo.active': access_embargo_active,
#     'access.embargo.reason': access_embargo_reason,

# })

# # Display the DataFrame
# access_embargo_df.head()

# # Append sto data_df
# data_df = pd.concat([data_df, access_embargo_df], axis=1)
# data_df.head(50)

In [None]:
# ### Flatten additional_descriptions ####
# import pandas as pd

# # Initialize empty lists to store flattened data
# additional_descriptions_description_list = []
# additional_descriptions_language_id_list = []
# additional_descriptions_language_title_en_list = []
# additional_descriptions_language_title_de_list = []
# additional_descriptions_type_id_list = []
# additional_descriptions_type_title_en_list = []
# additional_descriptions_type_title_de_list = []

# # Iterate through each list of additional descriptions in the "metadata.additional_descriptions" column of data_df
# for additional_descriptions in data_df['metadata.additional_descriptions']:
    
#     # Check if subjects_info is empty, append an empty row to the DataFrame
#     if not additional_descriptions:
#         empty_df = pd.DataFrame(columns=additional_descriptions.columns)
#         dfs.append(empty_df)
    
#     temp_additional_descriptions_description_list = []
#     temp_additional_descriptions_language_id_list = []
#     temp_additional_descriptions_language_title_en_list = []
#     temp_additional_descriptions_language_title_de_list = []
#     temp_additional_descriptions_type_id_list = []
#     temp_additional_descriptions_type_title_en_list = []
#     temp_additional_descriptions_type_title_de_list = []
    
#     # Iterate through each entry in the current list of additional descriptions
#     for entry in additional_descriptions:
#         print(entry)
#         # Check if the entry is a dictionary
#         if isinstance(entry, dict):
#             try:
#                 # Append 'description' value to respective list
#                 temp_additional_descriptions_description_list.append(entry['description'])
                
#                 # Check if 'lang' key exists in the entry
#                 if 'lang' in entry:
#                     temp_additional_descriptions_language_id_list.append(entry['lang']['id'])
#                     temp_additional_descriptions_language_title_en_list.append(entry['lang']['title'].get('en', None))  # Use .get() to handle missing keys
#                     temp_additional_descriptions_language_title_de_list.append(entry['lang']['title'].get('de', None))  # Use .get() to handle missing keys
#                 else:
#                     # Raise a KeyError if 'lang' key is missing
#                     raise KeyError("'lang' key is missing")

#                 # Check if 'type' key exists in the entry
#                 if 'type' in entry:
#                     temp_additional_descriptions_type_id_list.append(entry['type']['id'])
#                     temp_additional_descriptions_type_title_en_list.append(entry['type']['title'].get('en', None))  # Use .get() to handle missing keys
#                     temp_additional_descriptions_type_title_de_list.append(entry['type']['title'].get('de', None))  # Use .get() to handle missing keys
#                 else:
#                     # Raise a KeyError if 'type' key is missing
#                     raise KeyError("'type' key is missing")

#             except KeyError as e:
#                 # Handle KeyError exceptions
#                 print(f"Error: {str(e)} in data entry: {entry}")

#                 # Append None to maintain alignment with other lists in case of error
#                 temp_additional_descriptions_language_id_list.append(None)
#                 temp_additional_descriptions_language_title_en_list.append(None)
#                 temp_additional_descriptions_language_title_de_list.append(None)
#                 temp_additional_descriptions_type_id_list.append(None)
#                 temp_additional_descriptions_type_title_en_list.append(None)
#                 temp_additional_descriptions_type_title_de_list.append(None)
                
#         else:
#             # Handle non-dictionary entries
#             print(f"Ignoring non-dictionary entry: {entry}")
            
#     # Append collected values to main lists
#     additional_descriptions_description_list.append(temp_additional_descriptions_description_list)
#     additional_descriptions_language_id_list.append(temp_additional_descriptions_language_id_list)
#     additional_descriptions_language_title_en_list.append(temp_additional_descriptions_language_title_en_list)
#     additional_descriptions_language_title_de_list.append(temp_additional_descriptions_language_title_de_list)
#     additional_descriptions_type_id_list.append(temp_additional_descriptions_type_id_list)
#     additional_descriptions_type_title_en_list.append(temp_additional_descriptions_type_title_en_list)
#     additional_descriptions_type_title_de_list.append(temp_additional_descriptions_type_title_de_list)

# # Create a pandas DataFrame with a single row containing lists as values
# additional_description_df = pd.DataFrame({
#     'additional_descriptions.description': additional_descriptions_description_list,
#     'additional_descriptions.language.id': additional_descriptions_language_id_list,
#     'additional_descriptions.language.title.en': additional_descriptions_language_title_en_list,
#     'additional_descriptions.language.title.de': additional_descriptions_language_title_de_list,
#     'additional_descriptions.type.id': additional_descriptions_type_id_list,
#     'additional_descriptions.type.title.en': additional_descriptions_type_title_en_list,
#     'additional_descriptions.type.title.de': additional_descriptions_type_title_de_list
# })

# # Display the DataFrame
# additional_description_df.head()


In [None]:
# #### Flatten Metadata.Creators #######

# import pandas as pd

# # Initialize lists to store creator information
# personal_family_name_list = []
# personal_given_name_list = []
# personal_name_list = []
# organizational_name_list = []

# # Iterate through the data and extract required information
# for creators_list in data_df["metadata.creators"]:
#     # Check if the element is a list
#     if isinstance(creators_list, list):
#         # Initialize temporary lists for each record
#         temp_personal_family_name_list = []
#         temp_personal_given_name_list = []
#         temp_personal_name_list = []
#         temp_organizational_name_list = []
        
#         for creator_info in creators_list:
#             if isinstance(creator_info, dict):
#                 if creator_info['person_or_org']['type'] == 'personal':
#                     temp_personal_family_name_list.append(creator_info['person_or_org'].get('family_name', ''))
#                     temp_personal_given_name_list.append(creator_info['person_or_org'].get('given_name', ''))
#                     temp_personal_name_list.append(creator_info['person_or_org'].get('name', ''))

#                 elif creator_info['person_or_org']['type'] == 'organizational':
#                     temp_organizational_name_list.append(creator_info['person_or_org'].get('name', ''))

        
#         # Append collected values to main lists
#         personal_family_name_list.append(temp_personal_family_name_list)
#         personal_given_name_list.append(temp_personal_given_name_list)
#         personal_name_list.append(temp_personal_name_list)
#         organizational_name_list.append(temp_organizational_name_list)
#     else:
#         # If the element is not a list, append empty values
#         personal_family_name_list.append([''])
#         personal_given_name_list.append([''])
#         personal_name_list.append([''])
#         organizational_name_list.append([''])

# # Create DataFrame
# creator_data = {
#     'creators.person_or_org.personal.family_name': personal_family_name_list,
#     'creators.person_or_org.personal.given_name': personal_given_name_list,
#     'creators.person_or_org.personal.name': personal_name_list,
#     'creators.person_or_org.organizational.name': organizational_name_list
# }

# creator_df = pd.DataFrame(creator_data)

# # Display DataFrame
# # creator_df.head()

# # Append creator_df to data_df
# data_df = pd.concat([data_df, creator_df], axis=1)

# # Display DataFrame
# data_df.head()


In [None]:
# pprint(creator_df.at[0, 'creators.person_or_org.organizational.name'])  

In [None]:
# ### Use for flattening creator, works but not great ####

# import pandas as pd

# def flatten_dict(d, parent_key='', sep='.'):
#     items = []
#     for k, v in d.items():
#         new_key = f"{parent_key}{sep}{k}" if parent_key else k
#         if isinstance(v, dict):
#             items.extend(flatten_dict(v, new_key, sep=sep).items())
#         else:
#             items.append((new_key, v))
#     return dict(items)

# def initialize_lists_dict(metadata_key, config):
#     lists_dict = {}
#     for scheme, items in config.items():
#         if isinstance(items, dict):
#             for sub_item in items:
#                 if isinstance(items[sub_item], dict):
#                     for sub_sub_item in items[sub_item]:
#                         lists_dict[f"{metadata_key}.{scheme}.{sub_item}.{sub_sub_item}"] = []
#                 else:
#                     lists_dict[f"{metadata_key}.{scheme}.{sub_item}"] = []
#         else:
#             lists_dict[f"{metadata_key}.{scheme}"] = []
#     lists_dict[metadata_key] = []
#     return lists_dict

# def process_metadata(data_df, metadata_key, config):
#     lists_dict = initialize_lists_dict(metadata_key, config)

#     for entries in data_df[f'metadata.{metadata_key}']:
#         temp_lists_dict = initialize_lists_dict(metadata_key, config)

#         if isinstance(entries, list):
#             for entry in entries:
#                 if isinstance(entry, dict):
#                     if metadata_key == 'creators':
#                         if 'affiliations' in entry:
#                             for affiliation in entry['affiliations']:
#                                 temp_lists_dict[f"{metadata_key}.affiliations.id"].append(affiliation.get('id', ''))
#                                 temp_lists_dict[f"{metadata_key}.affiliations.name"].append(affiliation.get('name', ''))
#                         if 'person_or_org' in entry:
#                             for key, value in entry['person_or_org'].items():
#                                 if key == 'identifiers':
#                                     for identifier in value:
#                                         temp_lists_dict[f"{metadata_key}.person_or_org.identifiers.identifier"].append(identifier.get('identifier', ''))
#                                         temp_lists_dict[f"{metadata_key}.person_or_org.identifiers.scheme"].append(identifier.get('scheme', ''))
#                                 else:
#                                     temp_lists_dict[f"{metadata_key}.person_or_org.{key}"].append(value)
#                         if 'role' in entry:
#                             temp_lists_dict[f"{metadata_key}.role.id"].append(entry['role'].get('id', ''))
#                             temp_lists_dict[f"{metadata_key}.role.title"].append(entry['role'].get('title', ''))
#                     else:
#                         scheme = entry.get('scheme', '')
#                         if scheme in config:
#                             for item_key, column_name in config[scheme].items():
#                                 temp_lists_dict[f"{metadata_key}.{scheme}.{item_key}"].append(entry.get(item_key, ''))
#                         else:
#                             temp_lists_dict[metadata_key].append(entry.get('subject', ''))

#         for key in temp_lists_dict.keys():
#             lists_dict[key].append(temp_lists_dict[key])
    
#     flattened_df = pd.DataFrame(lists_dict)
#     return flattened_df

# def flatten_metadata_list(data_df, metadata_key_list, config_list):
#     flattened_data_df = data_df.copy()
    
#     for metadata_key, config in zip(metadata_key_list, config_list):
#         flattened_field_df = process_metadata(data_df, metadata_key, config)
#         flattened_data_df = pd.concat([flattened_data_df, flattened_field_df], axis=1)
    
#     return flattened_data_df

# # # Example DataFrame
# # data = {
# #     'id': [1, 2],
# #     'metadata.subjects': [
# #         [
# #             {"subject": "Subject 1"},
# #             {"scheme": "MeSH", "id": "M1", "subject": "MeSH Subject 1"},
# #             {"scheme": "LCSH", "id": "L1", "subject": "LCSH Subject 1"},
# #             {"scheme": "LCNAF", "id": "N1", "subject": "LCNAF Subject 1"}
# #         ],
# #         [
# #             {"subject": "Subject 2"},
# #             {"scheme": "MeSH", "id": "M2", "subject": "MeSH Subject 2"},
# #             {"scheme": "LCSH", "id": "L2", "subject": "LCSH Subject 2"},
# #             {"scheme": "LCNAF", "id": "N2", "subject": "LCNAF Subject 2"}
# #         ]
# #     ],
# #     'metadata.creators': [
# #         [
# #             {'affiliations': [{'name': 'Northwestern University'},
# #                               {'name': 'Another department'},
# #                               {'id': '01bj3aw27', 'name': 'United States Department of Energy'}],
# #              'person_or_org': {'family_name': 'Doe', 'given_name': 'Jane A', 'identifiers': [{'identifier': '0000-0002-6747-0985', 'scheme': 'orcid'}],
# #                                'name': 'Doe, Jane A', 'type': 'personal'},
# #              'role': {'id': 'role-data-curator', 'title': {'en': 'Data curator role'}}},
# #             {'affiliations': [{'name': 'Northwestern University'},
# #                               {'name': 'a department'},
# #                               {'id': '02fh5m162', 'name': 'Národní Pedagogické Muzeum a Knihovna J. A. Komenského'}],
# #              'person_or_org': {'family_name': 'Taylor', 'given_name': 'John', 'identifiers': [{'identifier': '0000-0002-7619-0205', 'scheme': 'orcid'}],
# #                                'name': 'Taylor, John', 'type': 'personal'},
# #              'role': {'id': 'role-editor', 'title': {'en': 'Editor role'}}}
# #         ],
# #         [
# #             {'affiliations': [{'name': 'Company'},
# #                               {'name': 'Norcross'},
# #                               {'id': '000e0be47', 'name': 'Northwestern University'}],
# #              'person_or_org': {'identifiers': [{'identifier': '00by51808', 'scheme': 'ror'}],
# #                                'name': 'Mölnlycke Health Care (United States)', 'type': 'organizational'},
# #              'role': {'id': 'role-formal-analysis', 'title': {'en': 'Formal analysis role'}}},
# #             {'affiliations': [{'name': 'Raleigh'},
# #                               {'id': '000e0be47', 'name': 'Northwestern University'}],
# #              'person_or_org': {'identifiers': [{'identifier': '030bsqk21', 'scheme': 'ror'}],
# #                                'name': 'Plant Health Care (United States)', 'type': 'organizational'},
# #              'role': {'id': 'role-education-and-training', 'title': {'en': 'Education and training role'}}}
# #         ]
# #     ]
# # }

# # data_df = pd.DataFrame(data)

# # Configuration for handling different schemes
# config_subjects = {
#     'MeSH': {
#         'id': 'id',
#         'subject': 'subject'
#     },
#     'LCSH': {
#         'id': 'id',
#         'subject': 'subject'
#     },
#     'LCNAF': {
#         'id': 'id',
#         'subject': 'subject'
#     }
# }

# # Configuration for creators metadata
# config_creators = {
#     'affiliations': {
#         'id': 'affiliations.id',
#         'name': 'affiliations.name'
#     },
#     'person_or_org': {
#         'family_name': 'person_or_org.family_name',
#         'given_name': 'person_or_org.given_name',
#         'identifiers': {
#             'identifier': 'person_or_org.identifiers.identifier',
#             'scheme': 'person_or_org.identifiers.scheme'
#         },
#         'name': 'person_or_org.name',
#         'type': 'person_or_org.type'
#     },
#     'role': {
#         'id': 'role.id',
#         'title': 'role.title'
#     }
# }

# # Specify the metadata fields to flatten
# metadata_key_list = ['subjects', 'creators']

# # Specify the corresponding configurations for each metadata key
# config_list = [config_subjects, config_creators]

# # Call the function
# flattened_final_df = flatten_metadata_list(data_df, metadata_key_list, config_list)
# flattened_final_df.head()


In [None]:
# ## Flatten Metadata.Subjects ####
# import pandas as pd

# # Initialize lists to store subject information
# # record_id_list = []
# subject_list = []
# mesh_id_list = []
# mesh_subject_list = []
# lcnaf_id_list = []
# lcnaf_subject_list = []
# lcsh_id_list = []
# lcsh_subject_list = []

# # Initialize an empty list to store DataFrames
# dfs = []

# # Iterate through the data and extract required information
# for idx, subjects_info in enumerate(data_df["metadata.subjects"]):

#     # If subjects_info is empty, append an empty row to the DataFrame
#     if not subjects_info:
#         empty_df = pd.DataFrame(columns=subjects_df.columns)
#         dfs.append(empty_df)
    
#     temp_subject_list = []
#     temp_mesh_id_list = []
#     temp_mesh_subject_list = []
#     temp_lcnaf_id_list = []
#     temp_lcnaf_subject_list = []
#     temp_lcsh_id_list = []
#     temp_lcsh_subject_list = []

#     if isinstance(subjects_info, list):
#         for subject_info in subjects_info:
#             if isinstance(subject_info, dict):
#                 subject = subject_info.get('subject', '')
#                 scheme = subject_info.get('scheme', '')

#                 if scheme == 'MeSH':
#                     temp_mesh_id_list.append(subject_info.get('id', ''))
#                     temp_mesh_subject_list.append(subject)
#                 elif scheme == 'LCSH':
#                     temp_lcsh_id_list.append(subject_info.get('id', ''))
#                     temp_lcsh_subject_list.append(subject)
#                 elif scheme == 'LCNAF':
#                     temp_lcnaf_id_list.append(subject_info.get('id', ''))
#                     temp_lcnaf_subject_list.append(subject)
#                 else:
#                     temp_subject_list.append(subject)

#     # Append collected values to main lists
#     subject_list.append(temp_subject_list)
#     mesh_id_list.append(temp_mesh_id_list)
#     mesh_subject_list.append(temp_mesh_subject_list)
#     lcsh_id_list.append(temp_lcsh_id_list)
#     lcsh_subject_list.append(temp_lcsh_subject_list)
#     lcnaf_id_list.append(temp_lcnaf_id_list)
#     lcnaf_subject_list.append(temp_lcnaf_subject_list)

# # Create DataFrame
# subjects_df = pd.DataFrame({
# #     'record_id': record_id_list,
#     'subjects.subject': subject_list,
#     'subjects.subject.MeSH.id': mesh_id_list,
#     'subjects.subject.MeSH.subject': mesh_subject_list,
#     'subjects.subject.LCSH.id': lcsh_id_list,
#     'subjects.subject.LCSH.subject': lcsh_subject_list,
#     'subjects.subject.LCNAF.id': lcnaf_id_list,
#     'subjects.subject.LCNAF.subject': lcnaf_subject_list
# })

# subjects_df.head()

In [None]:
# ### For flattening subjects, works but doesn't group subject by id

# import pandas as pd

# def flatten_dict(d, parent_key='', sep='.'):
#     """
#     Recursively flattens a nested dictionary.

#     Parameters:
#     d (dict): The dictionary to flatten.
#     parent_key (str): The base key string for nested keys.
#     sep (str): The separator between parent and child keys.

#     Returns:
#     dict: A flattened dictionary.
#     """
#     items = []
#     for k, v in d.items():
#         new_key = f"{parent_key}{sep}{k}" if parent_key else k
#         if isinstance(v, dict):
#             items.extend(flatten_dict(v, new_key, sep=sep).items())
#         else:
#             items.append((new_key, v))
#     return dict(items)

# def process_metadata(data_df, metadata_key, config):
#     """
#     Processes the specified metadata key from the DataFrame based on the configuration.

#     Parameters:
#     data_df (pd.DataFrame): The DataFrame containing metadata.
#     metadata_key (str): The key in the metadata to process.
#     config (dict): The configuration specifying how to handle different schemes/types.

#     Returns:
#     pd.DataFrame: A DataFrame with processed metadata.
#     """
#     # Initialize lists to store extracted information
#     lists_dict = {key: [] for key in config.keys()}
#     lists_dict[metadata_key] = []

#     # Iterate through each list of entries in the specified metadata column
#     for entries in data_df[f'metadata.{metadata_key}']:
#         temp_lists_dict = {key: [] for key in config.keys()}
#         temp_lists_dict[metadata_key] = []

#         if isinstance(entries, list):
#             for entry in entries:
#                 if isinstance(entry, dict):
#                     scheme = entry.get('scheme', '')
#                     for scheme_key, scheme_config in config.items():
#                         if scheme == scheme_key:
#                             for item_key, column_name in scheme_config.items():
#                                 temp_lists_dict[scheme_key].append(entry.get(item_key, ''))
#                     if scheme not in config.keys():
#                         temp_lists_dict[metadata_key].append(entry.get('subject', ''))
        
#         # Append collected values to main lists
#         for key in temp_lists_dict.keys():
#             lists_dict[key].append(temp_lists_dict[key])
    
#     # Create DataFrame
#     flattened_df = pd.DataFrame(lists_dict)
    
#     return flattened_df

# def flatten_metadata_list(data_df, metadata_key_list, config):
#     """
#     Flattens the specified metadata fields from the DataFrame based on the configuration.

#     Parameters:
#     data_df (pd.DataFrame): The DataFrame containing metadata.
#     metadata_key_list (list): The list of metadata keys to flatten.
#     config (dict): The configuration specifying how to handle different schemes/types.

#     Returns:
#     pd.DataFrame: A DataFrame with flattened metadata.
#     """
#     flattened_data_df = data_df.copy()
    
#     for metadata_key in metadata_key_list:
#         flattened_field_df = process_metadata(data_df, metadata_key, config)
#         flattened_data_df = pd.concat([flattened_data_df, flattened_field_df], axis=1)
    
#     return flattened_data_df



# # Configuration for handling different schemes
# config = {
#     'MeSH': {
#         'id': 'subjects.subject.MeSH.id',
#         'subject': 'subjects.subject.MeSH.subject'
#     },
#     'LCSH': {
#         'id': 'subjects.subject.LCSH.id',
#         'subject': 'subjects.subject.LCSH.subject'
#     },
#     'LCNAF': {
#         'id': 'subjects.subject.LCNAF.id',
#         'subject': 'subjects.subject.LCNAF.subject'
#     }
# }

# # Specify the metadata fields to flatten
# metadata_key_list = ['subjects']

# # Call the function
# flattened_final_df = flatten_metadata_list(data_df, metadata_key_list, config)
# flattened_final_df.head()


In [None]:
# Append subjects_df to data_df
# data_df = pd.concat([data_df, subjects_df], axis=1)
# data_df.head(50)

In [None]:
#### Flatten Dates #####

# # Initialize empty lists to store date information
# dates_date_list = []
# dates_description_list = []
# dates_type_id_list = []
# dates_type_title_de_list = []
# dates_type_title_en_list = []

# # Initialize an empty list to store DataFrames
# dfs = []

# # Iterate through each list of dates in the "metadata.dates" column of data_df
# for dates in data_df["metadata.dates"]:
    
#         # If subjects_info is empty, append an empty row to the DataFrame
#     if not dates:
#         empty_df = pd.DataFrame(columns=dates.columns)
#         dfs.append(empty_df)
    
#     temp_dates_date_list = []
#     temp_dates_description_list = []
#     temp_dates_type_id_list = []
#     temp_dates_type_title_de_list = []
#     temp_dates_type_title_en_list = []
    
#     # Iterate through each entry in the current list of dates
#     for entry in dates:
        
#         # Check if the entry is a dictionary
#         if isinstance(entry, dict):
#             try:
#                 # Append 'date' and 'description' values to respective lists
#                 temp_dates_date_list.append(entry['date'])
#                 temp_dates_description_list.append(entry['description'])

#                 # Check if 'type' key exists in the entry
#                 if 'type' in entry:
#                     # Append 'id', 'title.de', and 'title.en' values to respective lists
#                     temp_dates_type_id_list.append(entry['type']['id'])
#                     temp_dates_type_title_de_list.append(entry['type']['title'].get('de', None))  # Use .get() to handle missing keys
#                     temp_dates_type_title_en_list.append(entry['type']['title'].get('en', None))  # Use .get() to handle missing keys
#                 else:
#                     # Raise a KeyError if 'type' key is missing
#                     raise KeyError("'type' key is missing")

#             except KeyError as e:
#                 # Handle KeyError exceptions
#                 print(f"Error: {str(e)} in data entry: {entry}")

#                 # Append None to maintain alignment with other lists in case of error
#                 temp_dates_type_id_list.append(None)
#                 temp_dates_type_title_de_list.append(None)
#                 temp_dates_type_title_en_list.append(None)

        
#         else:
#             # Handle non-dictionary entries
#             print(f"Ignoring non-dictionary entry: {entry}")
            
#     # Append collected values to main lists
#     dates_date_list.append(temp_dates_date_list)
#     dates_description_list.append(temp_dates_description_list)
#     dates_type_id_list.append(temp_dates_type_id_list)
#     dates_type_title_de_list.append(temp_dates_type_title_de_list)
#     dates_type_title_en_list.append(temp_dates_type_title_en_list)

# # Create a pandas DataFrame with a single row containing lists as values
# dates_df = pd.DataFrame({
#     'dates.date': dates_date_list,
#     'dates.description': dates_description_list,
#     'dates.type.id': dates_type_id_list,
#     'dates.type.title.de': dates_type_title_de_list,
#     'dates.type.title.en': dates_type_title_en_list
# })

# # Display the DataFrame
# dates_df.head()


In [None]:
# # Append dates_df to data_df
# data_df = pd.concat([data_df, dates_df], axis=1)
# data_df.head(50)

In [None]:
### THE FOLLOWING GODE CAN BE USED TO FLATTEN JSON OR NESTED DATA IN DATAFRAMES ######

In [None]:
# ## Test JSON data for dates: 
# import pandas as pd

# # Define your data
# data = {
#     'dates': [
#         {'date': '2022-04-01',
#          'description': 'date this was accepted',
#          'type': {'id': 'accepted',
#                   'title': {'de': 'Angenommen',
#                             'en': 'Accepted'}}},
#         {'date': '1945/1966',
#          'description': 'date this was available',
#          'type': {'id': 'available',
#                   'title': {'en': 'Available'}}},
#         {'date': '1987',
#          'description': 'date this was collected',
#          'type': {'id': 'collected',
#                   'title': {'en': 'Collected'}}},
#         {'date': '1988',
#          'description': 'another date this was collected',
#          'type': {'id': 'collected',
#                   'title': {'en': 'Collected'}}},
#         {'date': '2024-01-01',
#          'description': 'date this was copyrighted',
#          'type': {'id': 'copyrighted',
#                   'title': {'de': 'Mit Copyright versehen',
#                             'en': 'Copyrighted'}}},
#         {'date': '2024',
#          'description': 'date this was created',
#          'type': {'id': 'created',
#                   'title': {'de': 'Erstellt',
#                             'en': 'Created'}}},
#         {'date': '2010',
#          'description': 'date this was issued',
#          'type': {'id': 'issued',
#                   'title': {'de': 'Veröffentlicht',
#                             'en': 'Issued'}}},
#         # Add an example of incomplete data
#         {'date': '1999',
#          'description': 'missing type information'}
#     ]
# }


In [None]:
### This works for two layers ###
# import pandas as pd

# def flatten_metadata(data_df, metadata_key):
#     # Initialize a dictionary to store lists for each column
#     columns = {}

#     # Iterate through each list of entries in the specified metadata column
#     for entries in data_df[f'metadata.{metadata_key}']:
        
#         # Check if the list of entries is empty
#         if not entries:
#             continue
        
#         # Initialize temporary lists for each entry
#         temp_data = {}
        
#         for entry in entries:
#             if isinstance(entry, dict):
#                 for key, value in entry.items():
#                     # Create a sub-key for nested dictionaries
#                     if isinstance(value, dict):
#                         for sub_key, sub_value in value.items():
#                             # Ensure the column key exists in the columns dictionary
#                             column_key = f'{metadata_key}.{key}.{sub_key}'
#                             if column_key not in columns:
#                                 columns[column_key] = []
#                             temp_data.setdefault(column_key, []).append(sub_value)
#                     else:
#                         column_key = f'{metadata_key}.{key}'
#                         if column_key not in columns:
#                             columns[column_key] = []
#                         temp_data.setdefault(column_key, []).append(value)
#             else:
#                 print(f"Ignoring non-dictionary entry: {entry}")
        
#         # Append temporary lists to the main lists in columns dictionary
#         for column_key, temp_list in temp_data.items():
#             if column_key in columns:
#                 columns[column_key].append(temp_list)

#     # Create a pandas DataFrame with lists as values
#     flattened_df = pd.DataFrame(columns)
    
#     return flattened_df

# # Example usage
# # Assuming `data_df` is your DataFrame and 'additional_descriptions' is the key to flatten
# flattened_additional_descriptions = flatten_metadata(data_df, 'additional_descriptions')
# flattened_additional_descriptions.head()


In [None]:
## Works with JSON but does not flatten every layer ###

import pandas as pd

# def flatten_dict(d, parent_key='', sep='.'):
#     """
#     Recursively flattens a nested dictionary.

#     Parameters:
#     d (dict): The dictionary to flatten.
#     parent_key (str): The base key string for nested keys.
#     sep (str): The separator between parent and child keys.

#     Returns:
#     dict: A flattened dictionary.
#     """
#     items = []
#     for k, v in d.items():
#         new_key = f"{parent_key}{sep}{k}" if parent_key else k
#         if isinstance(v, dict):
#             items.extend(flatten_dict(v, new_key, sep=sep).items())
#         else:
#             items.append((new_key, v))
#     return dict(items)

# def flatten_metadata(json_data):
#     """
#     Recursively flattens all keys under 'metadata' from the JSON data and includes sibling keys.

#     Parameters:
#     json_data (list): The JSON data containing metadata.

#     Returns:
#     pd.DataFrame: A DataFrame with flattened metadata.
#     """
#     # Initialize a dictionary to store lists for each column
#     columns = {}

#     # Iterate through each entry in the JSON data
#     for data_entry in json_data:
#         flattened_data_entry = flatten_dict(data_entry)
        
#         # Remove 'metadata' key to avoid redundancy
#         if 'metadata' in flattened_data_entry:
#             flattened_data_entry.pop('metadata')
        
#         # Flatten all keys under 'metadata'
#         metadata = data_entry.get('metadata', {})
#         flattened_metadata = flatten_dict(metadata, parent_key='metadata')
        
#         for key, value in flattened_metadata.items():
#             # Ensure the column key exists in the columns dictionary
#             column_key = f'metadata.{key}'
#             if column_key not in columns:
#                 columns[column_key] = []
#             columns[column_key].append(value)

#         # Append other non-metadata columns
#         for key, value in flattened_data_entry.items():
#             if key != 'metadata':
#                 if key not in columns:
#                     columns[key] = []
#                 columns[key].append(value)

#     # Ensure all lists are of the same length
#     max_length = max(len(v) for v in columns.values())
#     for k, v in columns.items():
#         if len(v) < max_length:
#             columns[k].extend([None] * (max_length - len(v)))  # Use None to fill missing values

#     # Create a pandas DataFrame with lists as values
#     flattened_df = pd.DataFrame(columns)
    
#     return flattened_df

# # Example JSON data
# json_data = [
#     {
#         "id": {"count": "1", "available": "many"},
#         "name": "Item 1",
#         "metadata": {
#             "additional_descriptions": [
#                 {
#                     "description": "Description 1",
#                     "lang": {"id": "en", "title": {"en": "English"}},
#                     "type": {"id": "type1", "title": {"en": "Type 1"}}
#                 },
#                 {
#                     "description": "Description 2",
#                     "lang": {"id": "de", "title": {"de": "German"}},
#                     "type": {"id": "type2", "title": {"de": "Type 2"}}
#                 }
#             ],
#             "other_metadata": {
#                 "field1": "value1",
#                 "field2": "value2"
#             }
#         },
#         "category": "Category A",
#         "tags": ["tag1", "tag2"]
#     },
#     {
#         "id": {"count": "20", "available": "some"},
#         "name": "Item 2",
#         "metadata": {
#             "additional_descriptions": [
#                 {
#                     "description": "Description 3",
#                     "lang": {"id": "fr", "title": {"en": "French"}},
#                     "type": {"id": "type3", "title": {"en": "Type 3"}}
#                 }
#             ],
#             "other_metadata": {
#                 "field1": "value3",
#                 "field3": "value4"
#             }
#         },
#         "category": "Category B",
#         "tags": ["tag3"]
#     }
# ]

# # Flatten all metadata
# flattened_data_df = flatten_metadata(json_data)
# flattened_data_df.head()


In [None]:
##### Works better with JSON but still does not flatten every layer #####
# import pandas as pd

# def flatten_dict(d, parent_key='', sep='.'):
#     """
#     Recursively flattens a nested dictionary.

#     Parameters:
#     d (dict): The dictionary to flatten.
#     parent_key (str): The base key string for nested keys.
#     sep (str): The separator between parent and child keys.

#     Returns:
#     dict: A flattened dictionary.
#     """
#     items = []
#     for k, v in d.items():
#         new_key = f"{parent_key}{sep}{k}" if parent_key else k
#         if isinstance(v, dict):
#             items.extend(flatten_dict(v, new_key, sep=sep).items())
#         else:
#             items.append((new_key, v))
#     return dict(items)

# def flatten_metadata(json_data, metadata_key):
#     """
#     Flattens the specified metadata key from the JSON data and includes sibling keys.

#     Parameters:
#     json_data (list): The JSON data containing metadata.
#     metadata_key (str): The key in the metadata to flatten.

#     Returns:
#     pd.DataFrame: A DataFrame with flattened metadata.
#     """
#     # Initialize a dictionary to store lists for each column
#     columns = {}
#     other_columns = []

#     # Iterate through each entry in the JSON data
#     for data_entry in json_data:
#         flattened_data_entry = flatten_dict(data_entry)
#         other_columns.append(flattened_data_entry)

#         entries = data_entry.get('metadata', {}).get(metadata_key, [])
        
#         # Initialize temporary lists for each entry
#         temp_data = {}
        
#         for entry in entries:
#             if isinstance(entry, dict):
#                 flattened_entry = flatten_dict(entry)
#                 for key, value in flattened_entry.items():
#                     # Ensure the column key exists in the columns dictionary
#                     column_key = f'{metadata_key}.{key}'
#                     if column_key not in columns:
#                         columns[column_key] = []
#                     temp_data.setdefault(column_key, []).append(value)
#             else:
#                 print(f"Ignoring non-dictionary entry: {entry}")
        
#         # Append temporary lists to the main lists in columns dictionary
#         for column_key, temp_list in temp_data.items():
#             if column_key in columns:
#                 columns[column_key].append(temp_list)

#     # Ensure all lists are of the same length
#     max_length = max(len(v) for v in columns.values())
#     for k, v in columns.items():
#         if len(v) < max_length:
#             columns[k].extend([[]] * (max_length - len(v)))  # Use empty lists instead of None

#     # Combine flattened metadata with other columns
#     for other_column in other_columns:
#         for key, value in other_column.items():
#             if key not in columns:
#                 columns[key] = [value] * max_length

#     # Create a pandas DataFrame with lists as values
#     flattened_df = pd.DataFrame(columns)
    
#     return flattened_df


# json_data = [
#     {
#         "id":{"count":"1", "available": "many"},
#         "name": "Item 1",
#         "metadata": {
#             "additional_descriptions": [
#                 {
#                     "description": "Description 1",
#                     "lang": {"id": "en", "title": {"en": "English"}},
#                     "type": {"id": "type1", "title": {"en": "Type 1"}}
#                 },
#                 {
#                     "description": "Description 2",
#                     "lang": {"id": "de", "title": {"de": "German"}},
#                     "type": {"id": "type2", "title": {"de": "Type 2"}}
#                 }
#             ]
#         },
#         "category": "Category A",
#         "tags": ["tag1", "tag2"]
#     },
#     {
#         "id": {"count" :"20", "available": "some"},
#         "name": "Item 2",
#         "metadata": {
#             "additional_descriptions": [
#                 {
#                     "description": "Description 3",
#                     "lang": {"id": "fr", "title": {"en": "French"}},
#                     "type": {"id": "type3", "title": {"en": "Type 3"}}
#                 }
#             ]
#         },
#         "category": "Category B",
#         "tags": ["tag3"]
#     }
# ]



# # Flatten the additional_descriptions metadata
# flattened_data_df = flatten_metadata(json_data, 'additional_descriptions')
# flattened_data_df.head()



In [None]:
#### Using an object-oriented approach ###

# import pandas as pd

# class MetadataFlattener:
#     def __init__(self, data_df):
#         self.data_df = data_df

#     @staticmethod
#     def flatten_dict(d, parent_key='', sep='.'):
#         """
#         Recursively flattens a nested dictionary.
#         """
#         items = []
#         for k, v in d.items():
#             new_key = f"{parent_key}{sep}{k}" if parent_key else k
#             if isinstance(v, dict):
#                 items.extend(MetadataFlattener.flatten_dict(v, new_key, sep=sep).items())
#             else:
#                 items.append((new_key, v))
#         return dict(items)

#     def flatten_metadata(self, metadata_key):
#         """
#         Flattens the specified metadata key from the DataFrame.
#         """
#         # Initialize a dictionary to store lists for each column
#         columns = {}
        
#         # Iterate through each list of entries in the specified metadata column
#         for entries in self.data_df[f'metadata.{metadata_key}']:
#             # Check if the list of entries is empty
#             if not entries:
#                 continue
            
#             # Initialize temporary lists for each entry
#             temp_data = {}
            
#             for entry in entries:
#                 if isinstance(entry, dict):
#                     flattened_entry = MetadataFlattener.flatten_dict(entry)
#                     for key, value in flattened_entry.items():
#                         # Ensure the column key exists in the columns dictionary
#                         column_key = f'{metadata_key}.{key}'
#                         if column_key not in columns:
#                             columns[column_key] = []
#                         temp_data.setdefault(column_key, []).append(value)
#                 else:
#                     print(f"Ignoring non-dictionary entry: {entry}")
            
#             # Append temporary lists to the main lists in columns dictionary
#             for column_key, temp_list in temp_data.items():
#                 if column_key in columns:
#                     columns[column_key].append(temp_list)
        
#         # Create a pandas DataFrame with lists as values
#         flattened_field_df = pd.DataFrame(columns)
        
#         return flattened_field_df

#     def process_metadata(self, metadata_key, config):
#         """
#         Processes the specified metadata key from the DataFrame based on the configuration.
#         """
#         # Initialize lists to store extracted information
#         lists_dict = {f"{metadata_key}.{scheme}.{item}": [] for scheme in config.keys() for item in config[scheme].keys()}
#         lists_dict[metadata_key] = []

#         # Iterate through each list of entries in the specified metadata column
#         for entries in self.data_df[f'metadata.{metadata_key}']:
#             temp_lists_dict = {f"{metadata_key}.{scheme}.{item}": [] for scheme in config.keys() for item in config[scheme].keys()}
#             temp_lists_dict[metadata_key] = []

#             if isinstance(entries, list):
#                 for entry in entries:
#                     if isinstance(entry, dict):
#                         scheme = entry.get('scheme', '')
#                         if scheme in config:
#                             for item_key, column_name in config[scheme].items():
#                                 temp_lists_dict[f"{metadata_key}.{scheme}.{item_key}"].append(entry.get(item_key, ''))
#                         else:
#                             temp_lists_dict[metadata_key].append(entry.get('subject', ''))
            
#             # Append collected values to main lists
#             for key in temp_lists_dict.keys():
#                 lists_dict[key].append(temp_lists_dict[key])
        
#         # Create DataFrame
#         flattened_df = pd.DataFrame(lists_dict)
        
#         return flattened_df

#     def flatten_metadata_list(self, metadata_key_list, config=None):
#         """
#         Flattens the specified metadata fields from the DataFrame.
#         """
#         flattened_data_df = self.data_df.copy()
        
#         for metadata_key in metadata_key_list:
#             if config and metadata_key in config:
#                 flattened_field_df = self.process_metadata(metadata_key, config[metadata_key])
#             else:
#                 flattened_field_df = self.flatten_metadata(metadata_key)
#             flattened_data_df = pd.concat([flattened_data_df, flattened_field_df], axis=1)
        
#         return flattened_data_df

# # Example usage:
# data = {
#     'id': [1, 2],
#     'metadata.subjects': [
#         [
#             {"scheme": "LCSH", "id": "1", "subject": "Subject 1"},
#             {"scheme": "MeSH", "id": "2", "subject": "Subject 2"}
#         ],
#         [
#             {"scheme": "LCSH", "id": "3", "subject": "Subject 3"},
#             {"scheme": "MeSH", "id": "4", "subject": "Subject 4"}
#         ]
#     ],
#     'metadata.additional_descriptions': [
#         [
#             {"description": "Description 1"},
#             {"type": "Type 1"}
#         ],
#         [
#             {"description": "Description 2"},
#             {"type": "Type 2"}
#         ]
#     ],
#     'metadata.additional_titles': [
#         [
#             {"title": "Title 1"},
#             {"type": "Type 1"}
#         ],
#         [
#             {"title": "Title 2"},
#             {"type": "Type 2"}
#         ]
#     ]
# }

# data_df = pd.DataFrame(data)

# # Configuration for handling different schemes
# config = {
#     'subjects': {
#         'MeSH': {
#             'id': 'id',
#             'subject': 'subject'
#         },
#         'LCSH': {
#             'id': 'id',
#             'subject': 'subject'
#         }
#     }
# }

# # Specify the metadata fields to flatten
# metadata_key


In [None]:
# # Define a function to flatten JSON
# def flatten_json(json_data):
#     flat_data = {}

#     def flatten(x, name=''):
#         if isinstance(x, dict):
#             for a in x:
#                 flatten(x[a], name + a + '_')
#         elif isinstance(x, list):
#             for i, a in enumerate(x):
#                 flatten(a, name + str(i) + '_')
#         else:
#             flat_data[name[:-1]] = x

#     flatten(json_data)
#     return flat_data


# # Flatten the JSON data
# flattened_data = flatten_json(multiple_identifier_list)
# print(flattened_data)
# # # Convert to DataFrame
# df_1 = pd.DataFrame.from_dict(flattened_data, orient='index', columns=['value'])

# df_1.head()

In [None]:
# # Define a function to flatten JSON
# def flatten_json(json_data):
#     flat_data = {}

#     def flatten(x, name=''):
#         if isinstance(x, dict):
#             for a in x:
#                 flatten(x[a], name + a + '_')
#         elif isinstance(x, list):
#             for i, a in enumerate(x):
#                 flatten(a, name + str(i) + '_')
#         else:
#             flat_data[name[:-1]] = x

#     flatten(json_data)
#     return flat_data

# # Flatten the JSON data
# flattened_data = flatten_json(multiple_identifier_list)

# # Convert to DataFrame
# df_1 = pd.DataFrame([flattened_data])

# df_1.head()


In [None]:
# import pandas as pd

# # Define a function to flatten JSON
# def flatten_json(json_data):
#     flat_data = {}

#     def flatten(x, name=''):
#         if isinstance(x, dict):
#             for a in x:
#                 flatten(x[a], name + a + '_')
#         elif isinstance(x, list):
#             for i, a in enumerate(x):
#                 flatten(a, name + str(i) + '_')
#         else:
#             flat_data[name[:-1]] = x

#     flatten(json_data)
#     return flat_data

# # Flatten the JSON data
# flattened_data = flatten_json(multiple_identifier_list)

# # Create DataFrame from flattened data
# df_1 = pd.DataFrame([flattened_data])

# # Transpose the DataFrame so that each row represents one record
# df_1 = df_1.transpose()

# # Reset index to have the keys as a column
# df_1.reset_index(inplace=True)

# # Rename columns
# df_1.columns = ['Column Header', 'Value']

# df_1.head()


In [None]:
# Create DataFrame
df = pd.DataFrame(multiple_identifier_list)

# Flatten the DataFrame
df_flattened = pd.json_normalize(df.to_dict(orient='records'))

# Display the flattened DataFrame
df_flattened.head()

In [None]:
# import pandas as pd
# from collections.abc import MutableMapping

# def flatten_dict(d, parent_key='', sep='_'):
#     items = []
#     for k, v in d.items():
#         new_key = parent_key + sep + k if parent_key else k
#         if isinstance(v, MutableMapping):
#             items.extend(flatten_dict(v, new_key, sep=sep).items())
#         else:
#             items.append((new_key, v))
#     return dict(items)

# # Flatten each dictionary in the list
# flattened_data = [flatten_dict(record) for record in multiple_identifier_list]

# # Convert to DataFrame
# df = pd.DataFrame(flattened_data)

# # Display DataFrame
# df.head()

In [None]:
# def flatten_dict(d, parent_key='', sep='_'):
#     items = []
#     for k, v in d.items():
#         new_key = parent_key + sep + k if parent_key else k
#         if isinstance(v, dict):
#             items.extend(flatten_dict(v, new_key, sep=sep).items())
#         else:
#             items.append((new_key, v))
#     return dict(items)

# # Flatten each dictionary in the list
# flattened_data = [flatten_dict(record) for record in multiple_identifier_list]

# # Convert to DataFrame
# df = pd.DataFrame(flattened_data)

# # Display DataFrame
# df.head()

In [None]:
import pandas as pd

def flatten_dict(d, parent_key='', sep='_'):
    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        elif isinstance(v, list):
            for i, elem in enumerate(v):
                if isinstance(elem, dict):
                    items.extend(flatten_dict(elem, new_key + sep + str(i), sep=sep).items())
                else:
                    items.append((new_key + sep + str(i), elem))
        else:
            items.append((new_key, v))
    return dict(items)

# Flatten each dictionary in the list
flattened_data = [flatten_dict(record) for record in multiple_identifier_list]

# Convert to DataFrame
df = pd.DataFrame(flattened_data)

# Display DataFrame
df.head(50)

In [None]:
for column in df.columns:
    print(column)

In [None]:
print(df.at[4, 'metadata.additional_descriptions'])  

In [None]:
import pandas as pd

def flatten_dict(d, parent_key='', sep='_'):
    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        elif isinstance(v, list):
            for i, elem in enumerate(v):
                if isinstance(elem, dict):
                    items.extend(flatten_dict(elem, new_key + sep + str(i), sep=sep).items())
                else:
                    items.append((new_key + sep + str(i), elem))
        else:
            items.append((new_key, v))
    return dict(items)

# Sample data
data = [
    {'access': {'embargo': {'active': False, 'reason': None},
                'files': 'public',
                'record': 'public',
                'status': 'open'},
     'created': '2023-03-31T16:36:26.988908+00:00',
     'custom_fields': {},
     'files': {'enabled': True, 'order': []},
     'id': 'y9pb3-wtb18',
     'is_draft': False,
     'is_published': True,
     'links': {'access_links': 'https://prism.northwestern.edu/api/records/y9pb3-wtb18/access/links',
                'archive': 'https://prism.northwestern.edu/api/records/y9pb3-wtb18/files-archive',
                'doi': 'https://doi.org/10.18131/g3-myvr-1x72',
                'draft': 'https://prism.northwestern.edu/api/records/y9pb3-wtb18/draft',
                'files': 'https://prism.northwestern.edu/api/records/y9pb3-wtb18/files',
                'latest': 'https://prism.northwestern.edu/api/records/y9pb3-wtb18/versions/latest',
                'latest_html': 'https://prism.northwestern.edu/records/y9pb3-wtb18/latest',
                'reserve_doi': 'https://prism.northwestern.edu/api/records/y9pb3-wtb18/draft/pids/doi',
                'self': 'https://prism.northwestern.edu/api/records/y9pb3-wtb18',
                'self_doi': 'https://prism.northwestern.edu/doi/10.18131/g3-myvr-1x72',
                'self_html': 'https://prism.northwestern.edu/records/y9pb3-wtb18',
                'self_iiif_manifest': 'https://prism.northwestern.edu/api/iiif/record:y9pb3-wtb18/manifest',
                'self_iiif_sequence': 'https://prism.northwestern.edu/api/iiif/record:y9pb3-wtb18/sequence/default',
                'versions': 'https://prism.northwestern.edu/api/records/y9pb3-wtb18/versions'},
     'metadata': {'creators': [{'person_or_org': {'name': 'Creator not identified.', 'type': 'organizational'}}],
                  'dates': [{'date': '1943',
                             'description': 'When the item was originally created.',
                             'type': {'id': 'created', 'title': {'de': 'Erstellt', 'en': 'Created'}}}],
                  'description': 'View of a garden outside a villa.',
                  'formats': ['image/tiff'],
                  'publication_date': '1943',
                  'publisher': 'DigitalHub. Galter Health Sciences Library & Learning Center',
                  'resource_type': {'id': 'image-photograph', 'title': {'en': 'Photograph'}},
                  'rights': [{'description': {'en': 'This mark indicates that this work has been\xa0'
                                                     'identified\xa0as being free of known restrictions under '
                                                     'copyright law, including all related and neighboring '
                                                     'rights. The mark should only be used for works already '
                                                     'free of known copyright and database restrictions and in '
                                                     'the public domain throughout the world.'},
                              'id': 'cc-pdm-1.0',
                              'props': {'scheme': 'custom',
                                        'url': 'http://creativecommons.org/publicdomain/mark/1.0'},
                              'title': {'en': 'Creative Commons Public Domain Mark 1.0'}}],
                  'subjects': [{'subject': '12th General Hospital'},
                               {'subject': 'World War II'},
                               {'subject': 'Northwestern University'},
                               {'id': 'https://id.nlm.nih.gov/mesh/D047789', 'scheme': 'MeSH',
                                'subject': 'World War II'},
                               {'id': 'https://id.nlm.nih.gov/mesh/D006772', 'scheme': 'MeSH',
                                'subject': 'Hospitals, Military'},
                               {'id': 'https://id.nlm.nih.gov/mesh/D058749', 'scheme': 'MeSH',
                                'subject': 'Military Facilities'},
                               {'id': 'https://id.loc.gov/authorities/names/no2019036071',
                                'scheme': 'LCNAF', 'subject': 'ʻAyn al-Turk (Algeria)'}],
                  'title': 'Ain-El-Turck 087', 'version': 'v1.0.0'},
     'parent': {'communities': {'default': 'ff968ff7-5cb4-4ad7-aff3-f62cf1707920',
                                'ids': ['ff968ff7-5cb4-4ad7-aff3-f62cf1707920']},
                'id': '0jyda-j3836'},
     'pids': {'doi': {'client': 'datacite', 'identifier': '10.18131/g3-myvr-1x72',
                       'provider': 'datacite'},
              'oai': {'identifier': 'oai:prism.northwestern.edu:y9pb3-wtb18', 'provider': 'oai'}},
     'revision_id': 3,
     'status': 'published',
     'updated': '2023-03-31T16:36:27.311928+00:00',
     'versions': {'index': 1, 'is_latest': True}}
]

# Flatten each dictionary in the list
flattened_data = [flatten_dict(record) for record in data]

# Convert to DataFrame
df = pd.DataFrame(flattened_data)

# Display DataFrame
df.head()


In [None]:
def flatten_json(json_data):
    flat_data = {}

    def flatten(x, name=''):
        if isinstance(x, dict):
            for a in x:
                if isinstance(x[a], list):
                    if a == 'subjects':
                        for item in x[a]:
                            scheme = item.get('scheme', 'Author Keywords')
                            subject = item.get('subject', '')
                            subject_id = item.get('id', '')
                            if scheme not in flat_data:
                                flat_data[scheme] = {'subject': [], 'id': []}
                            flat_data[scheme]['subject'].append(subject)
                            flat_data[scheme]['id'].append(subject_id)
                    else:
                        flatten(x[a], name + a + '_')
                else:
                    flatten(x[a], name + a + '_')
        elif isinstance(x, list):
            for i, a in enumerate(x):
                flatten(a, name + str(i) + '_')
        else:
            flat_data[name[:-1]] = x

    flatten(json_data)
    return flat_data

# Flatten each dictionary in the list
flattened_data = [flatten_dict(record) for record in data]

# Convert to DataFrame
df = pd.DataFrame(flattened_data)

# Display DataFrame
df.head()


In [None]:
def flatten_json(json_data):
    flat_data = []

    def flatten(x, name=''):
        if isinstance(x, dict):
            flat_record = {}
            for a in x:
                if isinstance(x[a], list):
                    if a == 'subjects':
                        for item in x[a]:
                            scheme = item.get('scheme')
                            print(scheme)
#                             subject = item.get('subject', '')
#                             subject_id = item.get('id', '')
#                             if scheme not in flat_record:
#                                 flat_record[scheme] = {'subject': [], 'id': []}
#                             flat_record[scheme]['subject'].append(subject)
#                             flat_record[scheme]['id'].append(subject_id)
                    else:
                        flatten(x[a], name + a + '_')
                else:
                    flatten(x[a], name + a + '_')
                flat_record[name[:-1]] = x[a]
            flat_data.append(flat_record)
        elif isinstance(x, list):
            for i, a in enumerate(x):
                flatten(a, name + str(i) + '_')
        else:
            flat_data[name[:-1]] = x

    flatten(json_data)
    return flat_data

# Flatten each dictionary in the list
flattened_data = [flatten_dict(record) for record in data]

# Convert to DataFrame
df = pd.DataFrame(flattened_data)

# # Display DataFrame
# df.head()

In [None]:
for column in df.columns:
    print(column)