In [1]:
import glob
import json
import os

from tqdm import tqdm

In [2]:
class ReadAndExtractRefs:
    """Collect the ``scopus-eid`` of every reference stored in a folder of JSON files.

    Every ``*.json`` file directly inside ``path_to_ref_files`` is loaded and is
    expected to contain one JSON object.  Each key of that object maps to its
    references, given as a list of reference objects, a single reference
    object, or an empty/null value.  The result, accumulated in
    ``final_eid_reference_dict``, maps each key to the list of ``scopus-eid``
    values found among its references.

    Parameters
    ----------
    path_to_ref_files : str
        Directory that contains the JSON files.  A trailing path separator is
        optional.
    """

    def __init__(self, path_to_ref_files="."):
        self.path_to_ref_files = path_to_ref_files
        # os.path.join makes the pattern correct whether or not the directory
        # ends with a separator; plain concatenation turned "." into ".*.json".
        # Sorting keeps the processing order (and therefore which file wins
        # when a key occurs in several files) stable between runs.
        self.files = sorted(
            glob.glob(os.path.join(self.path_to_ref_files, "*.json"))
        )
        self.final_eid_reference_dict = {}

    def read_json_file(self, filename):
        """Return the parsed content of ``filename``.

        Returns ``None`` (after printing a message) when the file is missing
        or its content cannot be decoded as UTF-8 JSON.
        """
        try:
            # JSON is exchanged as UTF-8; do not depend on the platform default.
            with open(filename, "r", encoding="utf-8") as f:
                return json.load(f)
        except FileNotFoundError:
            print(f"File not found: {filename}")
            return None
        except (json.JSONDecodeError, UnicodeDecodeError):
            print(f"Error decoding JSON in file: {filename}")
            return None

    @staticmethod
    def _collect_eids(value):
        """Return the ``scopus-eid`` values contained in one references entry."""
        if isinstance(value, dict):
            # A lone reference object is handled like a one-element list so it
            # goes through exactly the same filtering as list entries.
            value = [value]
        elif not isinstance(value, (list, tuple)):
            # None, empty values and scalars carry no references.
            return []
        return [
            reference["scopus-eid"]
            for reference in value
            if isinstance(reference, dict) and "scopus-eid" in reference
        ]

    def extract_eids(self, data):
        """Store the reference EIDs of every key in ``data``.

        ``data`` must be a dict mapping a key to its references.  Entries are
        written into ``final_eid_reference_dict``; a key seen before is
        overwritten.  References without a ``scopus-eid`` key are skipped.
        """
        for key, value in data.items():
            self.final_eid_reference_dict[key] = self._collect_eids(value)

    def run(self):
        """Process all discovered files and return ``final_eid_reference_dict``.

        Files that cannot be read, are empty, or do not hold a JSON object at
        the top level are skipped.
        """
        for f in tqdm(self.files):
            data = self.read_json_file(f)
            if not data:
                continue
            if not isinstance(data, dict):
                # e.g. a top-level JSON array: there are no keys to map from.
                print(f"Unexpected JSON structure (expected an object) in file: {f}")
                continue
            self.extract_eids(data)
        return self.final_eid_reference_dict

In [3]:
# Directory that holds the raw reference JSON files to scan.
path_to_ref_files = "../data/01-raw/scopus/references/"

# Read every JSON file in that directory and gather the reference EIDs per key.
extractor = ReadAndExtractRefs(path_to_ref_files=path_to_ref_files)
final_eid_reference_dict = extractor.run()

100%|██████████| 86/86 [01:26<00:00,  1.00s/it]


In [4]:
# Save the key -> reference-EID mapping as JSON.
p = "../data/02-clean/references/eid_reference_dict.json"

# The output folder may not exist yet (e.g. on a fresh checkout); open() in
# write mode does not create missing directories.
os.makedirs(os.path.dirname(p), exist_ok=True)

with open(p, "w", encoding="utf-8") as f:
    json.dump(final_eid_reference_dict, f, indent=4)