This converter turns XML test records into a reduced JSON list:
- input: one XML file containing test records from the psytkom database (to start with; later, a folder of such files)
- output: one JSON file containing all test records from the input file (later, one JSON file with all tests from all XML files in the folder)
    - purpose: make a reduced version of the test database usable in a search index (OpenSearch), so that the PSYNDEXER cataloging tool can link a work to a test via the test's identifier

Libraries used:
- xml.etree.ElementTree
- json


Fields a test record can contain:
- id (four digits)
- shortName
- longName
- otherNames (array)
- authors (array)
- publicationYear (only included if the source value is a four-digit number)
- classifications (array)


Example:

```json
[
    {
        "id": "0001",
        "shortName": "80 WT",
        "longName": "80-Wort-Test",
        "otherNames": [
            "80-Wort-Test/zpid",
            "80-Wort-Test (80 WT)"
        ],
        "includedTestVariants": [
            { "shortName": "80 WT", "longName": "80-Wort-Test" }
        ],
        "publicationYear": "1973",
        "authors": [
            "Burgstaller, F."
        ],
        "classifications": [
            "Spelling Tests"
        ]
    }
]
```

In [84]:
from ast import pattern
import re
import xml.etree.ElementTree as ET
import json

# Convert the test records of one XML export file into a reduced JSON list.
#
# Each <Record> element is turned into one dict by convert_record(); the
# complete list is written to JSON_PATH once, after all records are converted.

# Input XML export and output JSON file of this conversion run.
XML_PATH = "../testdb-xml-star/230803_175109/xml/records-005.xml"
JSON_PATH = "testdb5.json"


def field_text(record, tag):
    """Return the text of the first child element *tag* of *record*.

    Returns None when the child is missing or has no text, so callers can
    treat "absent" and "empty" the same way instead of crashing on
    ``None.text``.
    """
    element = record.find(tag)
    return None if element is None else element.text


def convert_record(record):
    """Build the reduced test object (a dict) for one <Record> element.

    Args:
        record: an ElementTree element with the child fields ND (id),
            SNAM (short name), LNAM (long name), ENAM, SYN (synonyms,
            separated by "; "), PY (publication year) and PTSH
            (classifications, separated by "; "). All fields except ND
            are optional.

    Returns:
        A dict with the keys "id", "shortName", "longName", "otherNames",
        "classifications" and - only if PY is a four-digit number -
        "publicationYear".

    Raises:
        ValueError: if the record has no ND field, because a test without
            an identifier cannot be linked to.
    """
    test_id = field_text(record, "ND")
    if test_id is None:
        raise ValueError("Record has no ND (id) field")

    test = {
        # Keep only the last four characters of longer ids; shorter ids
        # are kept unchanged.
        "id": test_id[-4:],
        "shortName": field_text(record, "SNAM"),
        "longName": field_text(record, "LNAM"),
    }

    other_names = []
    enam = field_text(record, "ENAM")
    if enam is not None:
        # ENAM ends in "/" plus a short origin marker (/zpid, /author, /autor
        # or /journal). Cut at the LAST slash so that a slash inside the name
        # itself does not truncate the name.
        other_names.append(enam.rsplit("/", 1)[0])
        # NOTE for the "big" export: the title of the test is not necessarily
        # the title of the publication it is in!
        # TODO: separate "name of the original English version of this German
        # adaptation" from "name of this German test translated into English",
        # e.g. for "Teacher Behaviors Inventory (TBI; Murray, H.G., 1983) -
        # German modified version" store the part before the final " - " as
        # "english_original".
    syn = field_text(record, "SYN")
    if syn is not None:
        other_names.extend(syn.split("; "))
    # A missing ENAM contributes nothing (rather than a None entry).
    test["otherNames"] = other_names

    # Only a true four-digit number counts as a year; anything containing
    # letters is not a year and is left out.
    year = field_text(record, "PY")
    if year is not None and len(year) == 4 and year.isdigit():
        test["publicationYear"] = year

    ptsh = field_text(record, "PTSH")
    test["classifications"] = ptsh.split("; ") if ptsh is not None else []

    return test


# True both when run as a script and inside a notebook kernel; keeps the file
# I/O from running when the helpers above are imported.
if __name__ == "__main__":
    root = ET.parse(XML_PATH)

    test_array = [convert_record(record) for record in root.findall("Record")]

    # Write the file once, after the loop. Explicit UTF-8 is required because
    # ensure_ascii=False emits umlauts as-is and the platform default encoding
    # may not be able to represent them.
    with open(JSON_PATH, "w", encoding="utf-8") as outfile:
        json.dump(test_array, outfile, ensure_ascii=False)
        
        
   

In [None]:
# Given a list of allcaps names, find the truecase for each and return a list of truecase names.

# import truecase

# lnams = [
#     "Abzeichentest", "ARBEITSBESCHREIBUNGSBOGEN"
# ]

