In [14]:
import pdfplumber

def extract_text(path):
    """Return the text of every page of the PDF at ``path`` joined by newlines.

    A page whose ``extract_text()`` result is falsy (``None`` or ``""``)
    contributes an empty string, so the page separators are always kept.
    """
    page_texts = []
    with pdfplumber.open(path) as document:
        for page in document.pages:
            text = page.extract_text()
            page_texts.append(text if text else "")
    return "\n".join(page_texts)

# NOTE(review): absolute, machine-specific Windows path; this cell only runs
# on a machine where this exact file exists.
pdf_path = r"C:\Users\ADMIN\Documents\GitHub\Digital-Calibration-Certificate-for-National-Metrology-Laboratory-of-the-Philippines\valid pdf\1.pdf"
# Full text of the certificate; extract_calibration_info() parses it in a later cell.
raw_text = extract_text(pdf_path)


In [15]:
# Display the extracted text to see exactly what the regex-based parser will receive.
raw_text

'CALIBRATION CERTIFICATE\nNo. 02-2025-FORC-0043\nDate of Calibration : February 24, 2025\nCalibration Item : Axle Weighing Scale\nCapacity : 15 000 kgf\nMeasurement Range : 0 kgf to 15 000 kgf\nResolution : 5 0 kgf\nMake / Model : Intercomp;\nSerial No. : 1122YL23002\nCustomer : SAVVICE CORPORATION\n3rd Floor BMWC Bldg., Cagayan Valley Road\nSta. Rita, Guiginto, Bulacan\nMEASUREMENT RESULTS:\nRelative Relative\nApplied Indicated Deviation\nExpanded Accuracy\nForce Force (Indicated Force -\nApplied Force) Uncertainty Error\nkgf kgf kgf % %\n0.00 0.00 0 0.00 0.00\n3 000 2 850 -150 1.04 5.26\n6 000 5 700 -300 0.56 5.26\n9 000 8 550 -450 0.41 5.26\n12 000 11 433 -567 0.45 4.96\n15 000 14 300 -700 0.31 4.90\nUNCERTAINTY OF MEASUREMENT:\nThe uncertainty stated is the expanded uncertainty obtained by multiplying the standard uncertainty\nby the coverage factor k = 2. It has been determined in accordance with the “JCGM 100:2008\nEvaluation of measurement data- Guide to the expression of uncert

In [16]:
import pdfplumber

def extract_tables(path):
    """Collect every table pdfplumber finds in the PDF at ``path``.

    Returns one flat list holding the tables of all pages in page order.
    Pages for which ``extract_tables()`` returns nothing are skipped.
    """
    with pdfplumber.open(path) as document:
        return [
            table
            for page in document.pages
            for table in (page.extract_tables() or [])
        ]

# NOTE(review): same absolute, machine-specific path as the text-extraction cell.
pdf_path = r"C:\Users\ADMIN\Documents\GitHub\Digital-Calibration-Certificate-for-National-Metrology-Laboratory-of-the-Philippines\valid pdf\1.pdf"
tables = extract_tables(pdf_path)

# Print every table row by row for visual inspection.
for i, table in enumerate(tables):
    print(f"Table {i+1}:")
    for row in table:
        # Replace falsy cells (None or "") with "" so each row prints as strings;
        # no cell is removed, so the column positions are preserved.
        cleaned_row = [cell if cell else "" for cell in row]
        print(cleaned_row)
    print("\n")

Table 1:
['Applied\nForce', 'Indicated\nForce', 'Deviation\n(Indicated Force -\nApplied Force)', 'Relative\nExpanded\nUncertainty', 'Relative\nAccuracy\nError']
['kgf', 'kgf', 'kgf', '%', '%']
['0.00\n3 000\n6 000\n9 000\n12 000\n15 000', '0.00\n2 850\n5 700\n8 550\n11 433\n14 300', '0\n-150\n-300\n-450\n-567\n-700', '0.00\n1.04\n0.56\n0.41\n0.45\n0.31', '0.00\n5.26\n5.26\n5.26\n4.96\n4.90']


Table 2:
['Name of Standard', 'Make/Model', 'Calibration Certificate No.', 'Traceability']
['Force Measuring\nInstrument\nSN 1251056K0094', 'Shimadzu/ UH-\nF1000kNX', '11-2020-FORC-0116', 'Traceable to the SI\nthrough NMD-ITDI']




In [17]:
import re

def extract_calibration_info(raw_text):
    """Parse the header fields of a calibration certificate from its plain text.

    Parameters
    ----------
    raw_text : str
        Newline-separated text of the certificate.

    Returns
    -------
    dict
        String values keyed by ``certificate_number``, ``calibration_date``,
        ``calibration_item``, ``capacity``, ``measurement_range``,
        ``resolution``, ``make_model``, ``serial_number``, ``customer_name``,
        ``customer_address``, ``temperature`` and ``humidity``.  These keys are
        always present; a field that is not found stays ``''``.  For up to
        three detected signatories the keys ``resp_person<N>_name`` and
        ``resp_person<N>_role`` are added (only when found).
    """
    info = {
        'certificate_number': '',
        'calibration_date': '',
        'calibration_item': '',
        'capacity': '',
        'measurement_range': '',
        'resolution': '',
        'make_model': '',
        'serial_number': '',
        'customer_name': '',
        'customer_address': '',
        # Initialised like every other field so callers can index them even
        # when the certificate has no ambient-conditions lines.
        'temperature': '',
        'humidity': '',
    }

    lines = raw_text.split('\n')

    # re.search returns the first "No. <token>" occurrence in the document.
    cert_match = re.search(r'No\.\s+([\w\-]+)', raw_text)
    if cert_match:
        info['certificate_number'] = cert_match.group(1)

    # "Label : value" fields; the value is the rest of the line after the colon.
    # The two environmental patterns additionally drop optional parentheses.
    labelled_fields = (
        ('calibration_date', r'Date of Calibration\s*:\s*([^\n]+)'),
        ('calibration_item', r'Calibration Item\s*:\s*([^\n]+)'),
        ('capacity', r'Capacity\s*:\s*([^\n]+)'),
        ('measurement_range', r'Measurement Range\s*:\s*([^\n]+)'),
        ('resolution', r'Resolution\s*:\s*([^\n]+)'),
        ('make_model', r'Make / Model\s*:\s*([^\n]+)'),
        ('serial_number', r'Serial No\.\s*:\s*([^\n]+)'),
        ('temperature', r'Ambient Temperature\s*:\s*\(?([^\)\n]+)\)?'),
        ('humidity', r'Relative Humidity\s*:\s*\(?([^\)\n]+)\)?'),
    )
    for key, pattern in labelled_fields:
        found = re.search(pattern, raw_text)
        if found:
            info[key] = found.group(1).strip()

    # Customer name plus the address lines that follow it.
    address_block = range(0)
    customer_match = re.search(r'Customer\s*:\s*([^\n]+)', raw_text)
    if customer_match:
        info['customer_name'] = customer_match.group(1).strip()
        info['customer_address'], address_block = _find_customer_address(lines)

    # Signatories; address lines are excluded so an all-caps address line
    # cannot be mistaken for a person's name.
    info.update(_find_responsible_persons(lines, address_block))

    # Single normalisation pass: collapse whitespace, then remove the space
    # that text extraction sometimes leaves before punctuation.
    for key, val in list(info.items()):
        if isinstance(val, str):
            val = re.sub(r'\s+', ' ', val).strip()
            val = val.replace(' ,', ',').replace(' .', '.')
            val = val.replace(' ;', ';').replace(' :', ':')
            info[key] = val

    return info


def _find_customer_address(lines):
    """Return ``(address, block)`` for the lines following the customer line.

    ``block`` is the range of line indices between the customer line and the
    next line containing 'MEASUREMENT' or 'RESULTS' (empty when either
    boundary is missing).
    """
    # Same label pattern as the customer-name regex, applied per line.
    customer_idx = next(
        (i for i, line in enumerate(lines) if re.search(r'Customer\s*:', line)),
        -1,
    )
    if customer_idx == -1 or customer_idx + 1 >= len(lines):
        return '', range(0)

    address = ''
    first_line = lines[customer_idx + 1].strip()
    # A lone following line counts as the address only if it does not look
    # like a section header (all caps or mentioning MEASUREMENT).
    if first_line and not first_line.isupper() and 'MEASUREMENT' not in first_line:
        address = first_line

    section_idx = next(
        (i for i in range(customer_idx + 1, len(lines))
         if 'MEASUREMENT' in lines[i] or 'RESULTS' in lines[i]),
        -1,
    )
    if section_idx == -1:
        return address, range(0)

    block = range(customer_idx + 1, section_idx)
    # Two or more lines before the section header: join all non-empty ones.
    if customer_idx + 1 < section_idx - 1:
        parts = [lines[i].strip() for i in block if lines[i].strip()]
        if parts:
            address = ' '.join(parts)
    return address, block


def _find_responsible_persons(lines, skip_indices):
    """Return ``resp_person<N>_name/_role`` entries for up to three signatories.

    A name is a line made only of capitals, spaces, commas, dots and hyphens,
    with at least two words, not starting with a known section header and not
    located at one of ``skip_indices``.  The role is the first non-empty,
    different line among the four lines following the name.
    """
    header_prefixes = (
        'CALIBRATION', 'UNCERTAINTY', 'STANDARD', 'ENVIRONMENTAL',
        'PAGE', 'MEASUREMENT', 'REMARKS', 'DATE', '-END'
    )
    name_pattern = re.compile(r'^[A-Z][A-Z\s,\.\-]+$')

    candidates = []
    for i, line in enumerate(lines):
        txt = line.strip()
        if (i not in skip_indices
                and name_pattern.match(txt)
                and len(txt.split()) > 1
                and not txt.startswith(header_prefixes)):
            candidates.append(i)

    persons = {}
    for number, idx in enumerate(candidates[:3], start=1):
        name = lines[idx].strip()
        persons[f'resp_person{number}_name'] = name
        for follower in lines[idx + 1:idx + 5]:
            role = follower.strip()
            if role and role != name:
                persons[f'resp_person{number}_role'] = role
                break
    return persons

# Parse the header fields from the certificate text; later cells add the
# reference-standard fields to this same dict and read it to fill the XML.
calibration_info = extract_calibration_info(raw_text)
calibration_info

{'certificate_number': '02-2025-FORC-0043',
 'calibration_date': 'February 24, 2025',
 'calibration_item': 'Axle Weighing Scale',
 'capacity': '15 000 kgf',
 'measurement_range': '0 kgf to 15 000 kgf',
 'resolution': '5 0 kgf',
 'make_model': 'Intercomp;',
 'serial_number': '1122YL23002',
 'customer_name': 'SAVVICE CORPORATION',
 'customer_address': '3rd Floor BMWC Bldg., Cagayan Valley Road Sta. Rita, Guiginto, Bulacan',
 'temperature': '22 ± 2',
 'humidity': '40 ± 5',
 'resp_person1_name': 'AHDRIAN CAMILO C. GERNALE',
 'resp_person1_role': 'Science Research Specialist II',
 'resp_person2_name': 'RADLEY F. MANALO',
 'resp_person2_role': 'Senior Science Research Specialist'}

In [18]:
import pdfplumber

def extract_table_columns(path):
    """Turn each table of the PDF at ``path`` into a ``{header: [cells]}`` dict.

    The first row of a table is taken as its header row.  Tables with fewer
    than two rows are skipped.  Empty headers get no column, and falsy cells
    (``None`` or ``""``) are not appended, so the lists of one table may differ
    in length.  Returns one dict per retained table, in document order.
    """
    result = []

    with pdfplumber.open(path) as document:
        for page in document.pages:
            for table in page.extract_tables():
                # Nothing to split into columns without at least one data row.
                if not table or len(table) < 2:
                    continue

                header_row, body_rows = table[0], table[1:]
                columns = {name: [] for name in header_row if name}

                for row in body_rows:
                    # zip stops at the shorter sequence, so cells beyond the
                    # last header are ignored.
                    for name, cell in zip(header_row, row):
                        if name and cell:
                            columns[name].append(cell)

                result.append(columns)

    return result

# Usage
# NOTE(review): same absolute, machine-specific path as the earlier cells.
pdf_path = r"C:\Users\ADMIN\Documents\GitHub\Digital-Calibration-Certificate-for-National-Metrology-Laboratory-of-the-Philippines\valid pdf\1.pdf"
table_columns = extract_table_columns(pdf_path)

# Print each table's columns for inspection. Header names are printed verbatim,
# so a header containing "\n" spreads over several output lines.
for i, table in enumerate(table_columns):
    print(f"Table {i+1}:")
    for column_name, column_values in table.items():
        print(f"Column: {column_name}")
        print(f"Values: {column_values}")
        print()
    print("=" * 50)

Table 1:
Column: Applied
Force
Values: ['kgf', '0.00\n3 000\n6 000\n9 000\n12 000\n15 000']

Column: Indicated
Force
Values: ['kgf', '0.00\n2 850\n5 700\n8 550\n11 433\n14 300']

Column: Deviation
(Indicated Force -
Applied Force)
Values: ['kgf', '0\n-150\n-300\n-450\n-567\n-700']

Column: Relative
Expanded
Uncertainty
Values: ['%', '0.00\n1.04\n0.56\n0.41\n0.45\n0.31']

Column: Relative
Accuracy
Error
Values: ['%', '0.00\n5.26\n5.26\n5.26\n4.96\n4.90']

Table 2:
Column: Name of Standard
Values: ['Force Measuring\nInstrument\nSN 1251056K0094']

Column: Make/Model
Values: ['Shimadzu/ UH-\nF1000kNX']

Column: Calibration Certificate No.
Values: ['11-2020-FORC-0116']

Column: Traceability
Values: ['Traceable to the SI\nthrough NMD-ITDI']



In [19]:
# Extract the reference standard's name and serial number from Table 2.
# NOTE(review): table_columns[1] assumes the standards table is always the
# second table found in the PDF — confirm for other certificates.
std_entry = table_columns[1]['Name of Standard'][0]
# The cell spans several lines; the last line is taken as the serial number
# (verbatim, including any "SN " prefix) and the preceding lines as the name.
lines = std_entry.split('\n')
instrument_name = ' '.join(lines[:-1]).strip()
instrument_serial = lines[-1].strip()

# Make/Model from Table 2 with line breaks and repeated whitespace collapsed to single spaces.
make_model = " ".join(table_columns[1]['Make/Model'][0].split())

# Calibration certificate number of the standard, taken as-is from Table 2.
cert_number = table_columns[1]['Calibration Certificate No.'][0]

# Traceability statement from Table 2 with whitespace collapsed to single spaces.
traceability = " ".join(table_columns[1]['Traceability'][0].split())


# Add the standard's fields to the dict produced by extract_calibration_info()
# (mutates calibration_info in place).
calibration_info['standard_item'] = instrument_name
calibration_info['standard_serial_number'] = instrument_serial
calibration_info["standard_model"] = make_model
calibration_info['standard_cert_number'] = cert_number
calibration_info['standard_traceability'] = traceability


# Display the combined dict.
calibration_info

{'certificate_number': '02-2025-FORC-0043',
 'calibration_date': 'February 24, 2025',
 'calibration_item': 'Axle Weighing Scale',
 'capacity': '15 000 kgf',
 'measurement_range': '0 kgf to 15 000 kgf',
 'resolution': '5 0 kgf',
 'make_model': 'Intercomp;',
 'serial_number': '1122YL23002',
 'customer_name': 'SAVVICE CORPORATION',
 'customer_address': '3rd Floor BMWC Bldg., Cagayan Valley Road Sta. Rita, Guiginto, Bulacan',
 'temperature': '22 ± 2',
 'humidity': '40 ± 5',
 'resp_person1_name': 'AHDRIAN CAMILO C. GERNALE',
 'resp_person1_role': 'Science Research Specialist II',
 'resp_person2_name': 'RADLEY F. MANALO',
 'resp_person2_role': 'Senior Science Research Specialist',
 'standard_item': 'Force Measuring Instrument',
 'standard_serial_number': 'SN 1251056K0094',
 'standard_model': 'Shimadzu/ UH- F1000kNX',
 'standard_cert_number': '11-2020-FORC-0116',
 'standard_traceability': 'Traceable to the SI through NMD-ITDI'}

In [None]:
# #display customer name
# print("Customer Name:", calibration_info['customer_name'])

Customer Name: SYCWIN COATING & WIRES, INC.


In [None]:
# from datetime import datetime
# import re

# import xml.etree.ElementTree as ET


# ## CORE DATA
# # 1) your extracted values
# software_name          = "DigiCert" #STATIC
# software_release       = "v0.0" #STATIC
# country_code_iso       = "PH" #STATIC
# used_lang_code         = "en" #STATIC
# mandatory_lang_code    = "en" #STATIC
# # take the actual certificate number string, not the token list

# # CERTIFICATE NO. (TSR Number)
# unique_identifier = "Calibration No. " + calibration_info["certificate_number"]


# # DATE
# raw_date = calibration_info.get("calibration_date", "")
# # ensure a space between month name and day, then normalize commas and whitespace
# step1 = re.sub(r"([A-Za-z]+)(\d)", r"\1 \2", raw_date)
# step2 = step1.replace(",", ", ")
# norm  = re.sub(r"\s+", " ", step2).strip()

# try:
#     # parse “FullMonthName DD, YYYY” → “YYYY‑MM‑DD”
#     dt = datetime.strptime(norm, "%B %d, %Y")
#     begin_performance_date = dt.strftime("%Y-%m-%d")
# except ValueError:
#     begin_performance_date = ""

# end_performance_date   = ""
# performance_location   = "LABORATORY" 

# ## ITEMS
# # Calibration item
# item_name             = calibration_info.get("calibration_item", "")
# item_model   = calibration_info["make_model"]

# # Identification
# id_issuer = "customer"
# id_name = calibration_info["calibration_item"]
# id_serialnum = calibration_info["serial_number"]

# # Description
# capacity = calibration_info["capacity"]
# measurement_range = calibration_info["measurement_range"]
# resolution = calibration_info["resolution"]

# #Standard item
# standard_name    = ""
# standard_model = ""

# # Identification
# standard_id_issuer = "LABORATORY"
# standard_id_name = ""
# standard_id_serialnum = ""

# # Description
# standard_certificate_number = ""
# standard_traceability = ""


# ## CALIBRATION LABORATORY
# calibration_labcode = "FORC"
# calibration_contactname = "National Metrology Laboratory - Industrial Technology Development Institute"
# calibration_labcity              = "Taguig" #STATIC
# calibration_labcountrycode      = "PH" #STATIC
# calibration_lab_postcode         = "1633" #STATIC
# calibration_labstreet            = "General Santos Ave" #STATIC


# ## Responsible Persons
# resp1_name            = ""
# resp1_role            = ""
# resp2_name            = ""
# resp2_role            = ""
# resp3_name            = ""
# resp3_role            = ""


# ## Customer
# customer_name         = calibration_info['customer_name']
# customer_address      = calibration_info.get("customer_address", "")



# ### MEASUREMENT RESULTS
# measurement_item = calibration_info["calibration_item"]
# measurement_method = "" 
# measurement_desc = ""

# # Influence Conditions
# influencecondition1 = "Ambient Temperature"
# temperature = ""
# unit1 = "°C"

# influencecondition2 = "Relative Humidity"
# humidity = ""
# unit2 = "%"

# #Results 
# measurement_standard = ""
# measurement_standard_values = ""
# measurement_standard_unit = "" # count /n + 1

# measured_item = ""
# measured_item_values = ""
# measured_item_unit = "" # count /n + 1


# measurement_error = ""
# measurement_error_values = ""
# measurement_error_unit = "" # count /n + 1


# ## COMMENT
# calibrationprocedure = ""
# remarks = ""


SyntaxError: invalid syntax (528737574.py, line 41)

In [None]:

# # 2) parse + register namespaces
# template = r"valid xml\empty.xml"
# output   = r"valid xml\filled.xml"

# ns = {
    
#     "si":  "https://ptb.de/si",
#     "dcc": "https://ptb.de/dcc"
# }
# for prefix, uri in ns.items():
#     ET.register_namespace(prefix, uri)

# tree = ET.parse(template)
# root = tree.getroot()

# def set_text(elem, txt, lang=None):
#     if txt is None:
#         return
#     elem.text = txt
#     if lang:
#         elem.set("lang", lang)


In [None]:

# # 3) fill in
# set_text(root.find(".//dcc:software/dcc:name/dcc:content", ns), software_name)
# set_text(root.find(".//dcc:software/dcc:release",           ns), software_release)
# set_text(root.find(".//dcc:coreData/dcc:countryCodeISO3166_1", ns), country_code_iso)
# set_text(root.find(".//dcc:coreData/dcc:usedLangCodeISO639_1",  ns), used_lang_code)
# set_text(root.find(".//dcc:coreData/dcc:mandatoryLangCodeISO639_1", ns), mandatory_lang_code)
# set_text(root.find(".//dcc:coreData/dcc:uniqueIdentifier",       ns), unique_identifier)
# set_text(root.find(".//dcc:coreData/dcc:beginPerformanceDate",   ns), begin_performance_date)
# set_text(root.find(".//dcc:coreData/dcc:endPerformanceDate",     ns), end_performance_date)
# set_text(root.find(".//dcc:coreData/dcc:performanceLocation",    ns), performance_location)

# # items
# set_text(root.find(".//dcc:items/dcc:name/dcc:content", ns), item_name, lang=used_lang_code)
# ec = root.find(".//dcc:items/dcc:equipmentClass", ns)
# if ec is not None:
#     ec.set("refType", equipment_ref_type)
#     set_text(ec.find("dcc:reference", ns), equipment_reference)
#     set_text(ec.find("dcc:classID",   ns), equipment_class_id)

# # sub‐item + identification
# si_el = root.find(".//dcc:item", ns)
# if si_el is not None:
#     set_text(si_el.find("dcc:name/dcc:content", ns), subitem_name, lang=used_lang_code)
#     set_text(si_el.find("dcc:model", ns), subitem_model)
#     ident = si_el.find("dcc:identifications/dcc:identification", ns)
#     if ident is not None:
#         set_text(ident.find("dcc:issuer",               ns), ident_issuer)
#         set_text(ident.find("dcc:value",                ns), ident_value)
#         set_text(ident.find("dcc:name/dcc:content",     ns), ident_name, lang=used_lang_code)

# # laboratory
# set_text(root.find(".//dcc:calibrationLaboratory/dcc:calibrationLaboratoryCode", ns), lab_code)
# set_text(root.find(".//dcc:calibrationLaboratory/dcc:contact/dcc:name/dcc:content", ns), lab_contact_name, lang=used_lang_code)
# set_text(root.find(".//dcc:calibrationLaboratory/dcc:contact/dcc:location/dcc:city",        ns), lab_city)
# set_text(root.find(".//dcc:calibrationLaboratory/dcc:contact/dcc:location/dcc:countryCode",   ns), lab_country_code)
# set_text(root.find(".//dcc:calibrationLaboratory/dcc:contact/dcc:location/dcc:postCode",     ns), lab_post_code)
# set_text(root.find(".//dcc:calibrationLaboratory/dcc:contact/dcc:location/dcc:street",       ns), lab_street)

# # respPersons
# resp_nodes = root.findall(".//dcc:respPersons/dcc:respPerson", ns)
# for idx, (name, role) in enumerate([
#     (resp1_name, resp1_role),
#     (resp2_name, resp2_role),
#     (resp3_name, resp3_role),
# ]):
#     if idx < len(resp_nodes):
#         rp = resp_nodes[idx]
#         set_text(rp.find("dcc:person/dcc:name/dcc:content", ns), name, lang=used_lang_code)
#         set_text(rp.find("dcc:role", ns), role)

# # customer
# cust = root.find(".//dcc:customer", ns)
# set_text(cust.find("dcc:name/dcc:content", ns), customer_name)

# # Fix: Set text directly to the location element, not looking for a nested location
# set_text(cust.find("dcc:location", ns), customer_address)

# # comment
# set_text(root.find(".//dcc:comment", ns), comment)

# # 4) write
# tree.write(output, encoding="utf-8", xml_declaration=True)

In [24]:
from datetime import datetime
import re
import xml.etree.ElementTree as ET

## CORE DATA
# Static software / locale metadata written into the DCC header.
software_name          = "DigiCert"
software_release       = "v0.0"
country_code_iso       = "PH"
used_lang_code         = "en"
mandatory_lang_code    = "en"
unique_identifier      = "Calibration No. " + calibration_info["certificate_number"]

# Normalise the extracted date before parsing:
#   step1 inserts a space where letters run directly into a digit ("February24"),
#   step2 guarantees a space after each comma,
#   norm  collapses the resulting whitespace runs.
raw_date               = calibration_info.get("calibration_date", "")
step1                  = re.sub(r"([A-Za-z]+)(\d)", r"\1 \2", raw_date)
step2                  = step1.replace(",", ", ")
norm                   = re.sub(r"\s+", " ", step2).strip()
try:
    # "February 24, 2025" -> "2025-02-24"
    dt = datetime.strptime(norm, "%B %d, %Y")
    begin_performance_date = dt.strftime("%Y-%m-%d")
except ValueError:
    # Best effort: an unparseable date leaves the field empty.
    begin_performance_date = ""
end_performance_date   = ""
performance_location   = "LABORATORY"

## ITEMS
# Calibration item (device under test).
item_name           = calibration_info.get("calibration_item", "")
item_model          = calibration_info["make_model"]
id_issuer           = "customer"
id_name             = calibration_info["calibration_item"]
id_serialnum        = calibration_info["serial_number"]
capacity            = "Capacity: " + calibration_info["capacity"]
measurement_range   = "Measurement Range: " + calibration_info["measurement_range"]
resolution          = "Resolution: " + calibration_info["resolution"]
# Reference standard; these keys are added by the Table 2 post-processing cell.
standard_name       = calibration_info["standard_item"]
standard_model      = calibration_info["standard_model"]
standard_id_issuer  = "LABORATORY"
standard_id_name    = calibration_info["standard_item"]
standard_id_serialnum = calibration_info["standard_serial_number"]
standard_certificate_number = "Calibration Certificate No.: " + calibration_info["standard_cert_number"]
standard_traceability      = "Traceability: " + calibration_info["standard_traceability"]

## CALIBRATION LAB
calibration_labcode     = "FORC"
calibration_contactname = "National Metrology Laboratory - Industrial Technology Development Institute"
calibration_labcity     = "Taguig"
calibration_labcountrycode = "PH"
calibration_lab_postcode   = "1633"
calibration_labstreet      = "General Santos Ave"

## RESPONSIBLE PERSONS
# Use the signatories found by extract_calibration_info(); a person or role
# that was not detected falls back to "" (the previous hard-coded value).
resp1_name = calibration_info.get("resp_person1_name", "")
resp1_role = calibration_info.get("resp_person1_role", "")
resp2_name = calibration_info.get("resp_person2_name", "")
resp2_role = calibration_info.get("resp_person2_role", "")
resp3_name = calibration_info.get("resp_person3_name", "")
resp3_role = calibration_info.get("resp_person3_role", "")

## CUSTOMER
customer_name    = calibration_info["customer_name"]
customer_address = calibration_info.get("customer_address", "")

## MEASUREMENT RESULTS
measurement_item   = calibration_info["calibration_item"]
measurement_method = ""
measurement_desc   = ""
influencecondition1 = "Ambient Temperature"
# .get(): these keys may be missing when the certificate has no such line.
temperature         = calibration_info.get("temperature", "")
unit1               = "°C"
influencecondition2 = "Relative Humidity"
humidity            = calibration_info.get("humidity", "")
unit2               = "%"
# Result columns are not populated yet; the placeholders are written as empty text.
measurement_standard         = ""
measurement_standard_values  = ""
measurement_standard_unit    = ""
measured_item                = ""
measured_item_values         = ""
measured_item_unit           = ""
measurement_error            = ""
measurement_error_values     = ""
measurement_error_unit       = ""
calibrationprocedure = ""
remarks              = ""


In [23]:
# Debug check of the extracted humidity value.
# NOTE(review): this cell's execution count (23) is lower than the preceding cell's (24).
calibration_info["humidity"]

'40 ± 5'

In [25]:

# Load the DCC XML template and register the namespace prefixes used when writing.
# NOTE(review): the backslash-separated relative paths assume Windows and a
# working directory that contains the "valid xml" folder.
template = r"valid xml\template.xml"
output   = r"valid xml\filled.xml"
# Prefix -> namespace URI map; also passed to every find()/findall() below.
ns = {"dcc":"https://ptb.de/dcc","si":"https://ptb.de/si"}
# Registering the prefixes makes ElementTree serialise with dcc:/si: instead of ns0:/ns1:.
for p,u in ns.items(): ET.register_namespace(p,u)

from xml.etree.ElementTree import ParseError
try:
    tree = ET.parse(template)
except ParseError as e:
    # fallback to lxml recovery if available
    try:
        from lxml import etree as LET
    except ImportError:
        # lxml is not installed: surface the original parse error.
        raise e
    # recover=True lets lxml parse a malformed template instead of failing.
    # NOTE(review): on this path `tree` is an lxml tree, so the
    # ET.register_namespace() calls above presumably do not affect its output — confirm.
    parser = LET.XMLParser(recover=True)
    tree = LET.parse(template, parser)
root = tree.getroot()

def set_text(elem, txt, lang=None):
    """Store ``txt`` as the text of ``elem`` and optionally set its ``lang`` attribute.

    Nothing happens when ``elem`` or ``txt`` is ``None``, which lets callers
    pass the result of ``find()`` directly.  The ``lang`` attribute is only set
    for a truthy ``lang``.
    """
    if elem is not None and txt is not None:
        elem.text = txt
        if lang:
            elem.set("lang", lang)


In [26]:

# 1) software
# NOTE: root.find() returns None when the template lacks the element; sw.find()
# (and cd.find() below) would then raise AttributeError. set_text() only
# tolerates a missing *leaf* element.
sw = root.find(".//dcc:software", ns)
set_text(sw.find("dcc:name/dcc:content", ns), software_name)
set_text(sw.find("dcc:release",           ns), software_release)

# 2) coreData
# Country/language codes, certificate identifier and performance dates/location.
cd = root.find(".//dcc:coreData", ns)
set_text(cd.find("dcc:countryCodeISO3166_1",   ns), country_code_iso)
set_text(cd.find("dcc:usedLangCodeISO639_1",   ns), used_lang_code)
set_text(cd.find("dcc:mandatoryLangCodeISO639_1", ns), mandatory_lang_code)
set_text(cd.find("dcc:uniqueIdentifier",       ns), unique_identifier)
set_text(cd.find("dcc:beginPerformanceDate",   ns), begin_performance_date)
set_text(cd.find("dcc:endPerformanceDate",     ns), end_performance_date)
set_text(cd.find("dcc:performanceLocation",    ns), performance_location)

# 3) items: first is calibration, second is standard
items = root.findall(".//dcc:items/dcc:item", ns)

# calibration item
if items:
    ci = items[0]
    set_text(ci.find("dcc:name/dcc:content",             ns), item_name, lang=used_lang_code)
    # Manufacturer is not extracted separately; the element is written as empty text.
    set_text(ci.find("dcc:manufacturer/dcc:name/dcc:content", ns), "", lang=used_lang_code)
    set_text(ci.find("dcc:model",                        ns), item_model)
    ident = ci.find("dcc:identifications/dcc:identification", ns)
    if ident is not None:
        set_text(ident.find("dcc:issuer",             ns), id_issuer)
        # The serial number is the identification value and the item name is
        # its label, matching how the standard item is filled below
        # (the two arguments were previously swapped).
        set_text(ident.find("dcc:value",              ns), id_serialnum)
        set_text(ident.find("dcc:name/dcc:content",   ns), id_name, lang=used_lang_code)
    desc = ci.find("dcc:description", ns)
    if desc is not None:
        # The description's content elements are filled positionally:
        # capacity, measurement range, resolution.
        cont = desc.findall("dcc:content", ns)
        if len(cont)>0: set_text(cont[0], capacity,          lang=used_lang_code)
        if len(cont)>1: set_text(cont[1], measurement_range, lang=used_lang_code)
        if len(cont)>2: set_text(cont[2], resolution,        lang=used_lang_code)

# standard item
if len(items)>1:
    si_el = items[1]
    set_text(si_el.find("dcc:name/dcc:content",             ns), standard_name, lang=used_lang_code)
    set_text(si_el.find("dcc:manufacturer/dcc:name/dcc:content", ns), "", lang=used_lang_code)
    set_text(si_el.find("dcc:model",                        ns), standard_model)
    ident2 = si_el.find("dcc:identifications/dcc:identification", ns)
    if ident2 is not None:
        set_text(ident2.find("dcc:issuer",           ns), standard_id_issuer)
        set_text(ident2.find("dcc:value",            ns), standard_id_serialnum)
        set_text(ident2.find("dcc:name/dcc:content", ns), standard_id_name, lang=used_lang_code)
    desc2 = si_el.find("dcc:description", ns)
    if desc2 is not None:
        # Positional again: certificate number, then traceability statement.
        cont2 = desc2.findall("dcc:content", ns)
        if len(cont2)>0: set_text(cont2[0], standard_certificate_number, lang=used_lang_code)
        if len(cont2)>1: set_text(cont2[1], standard_traceability,       lang=used_lang_code)

# 4) calibrationLaboratory
# NOTE: `lab` and `loc` are dereferenced without a None check, so a template
# without these elements raises AttributeError here.
lab = root.find(".//dcc:calibrationLaboratory", ns)
set_text(lab.find("dcc:calibrationLaboratoryCode",        ns), calibration_labcode)
set_text(lab.find("dcc:contact/dcc:name/dcc:content",     ns), calibration_contactname, lang=used_lang_code)
loc = lab.find("dcc:contact/dcc:location", ns)
set_text(loc.find("dcc:city",        ns), calibration_labcity)
set_text(loc.find("dcc:countryCode", ns), calibration_labcountrycode)
set_text(loc.find("dcc:postCode",    ns), calibration_lab_postcode)
set_text(loc.find("dcc:street",      ns), calibration_labstreet)

# 5) respPersons
# Fill at most three respPerson nodes, pairing them by position with the
# (name, role) tuples; pairs beyond the nodes present in the template are ignored.
resp_nodes = root.findall(".//dcc:respPersons/dcc:respPerson", ns)
for idx,(name,role) in enumerate([(resp1_name,resp1_role),(resp2_name,resp2_role),(resp3_name,resp3_role)]):
    if idx<len(resp_nodes):
        rp = resp_nodes[idx]
        set_text(rp.find("dcc:person/dcc:name/dcc:content", ns), name, lang=used_lang_code)
        set_text(rp.find("dcc:role", ns), role)

# 6) customer
# The whole address goes into location/further/content as one string.
# NOTE: `cust` is dereferenced without a None check.
cust = root.find(".//dcc:customer", ns)
set_text(cust.find("dcc:name/dcc:content", ns), customer_name)
f = cust.find("dcc:location/dcc:further/dcc:content", ns)
set_text(f, customer_address, lang=used_lang_code)

# 7) measurementResults
# NOTE: `mr` and `um` are dereferenced without a None check.
mr = root.find(".//dcc:measurementResults", ns)
set_text(mr.find("dcc:name/dcc:content", ns), measurement_item, lang=used_lang_code)
um = mr.find("dcc:usedMethods/dcc:usedMethod", ns)
set_text(um.find("dcc:name/dcc:content", ns), measurement_method, lang=used_lang_code)
set_text(um.find("dcc:description/dcc:content", ns), measurement_desc, lang=used_lang_code)

# influenceConditions
# First condition = ambient temperature, second = relative humidity (positional).
# The condition name is written both on the condition and on its quantity.
# NOTE: `dq` and `real` are dereferenced without a None check.
# NOTE(review): the extracted strings (e.g. "22 ± 2") are written into si:value
# unchanged — confirm this is what the schema expects.
ic = mr.find("dcc:usedMethods/dcc:influenceConditions", ns)
if ic is not None:
    conds = ic.findall("dcc:influenceCondition", ns)
    if conds:
        infl = conds[0]
        set_text(infl.find("dcc:name/dcc:content", ns), influencecondition1, lang=used_lang_code)
        dq = infl.find("dcc:data/dcc:quantity", ns)
        set_text(dq.find("dcc:name/dcc:content", ns), influencecondition1, lang=used_lang_code)
        real = dq.find("si:real", ns)
        set_text(real.find("si:value", ns), temperature)
        set_text(real.find("si:unit", ns), unit1)
    if len(conds) > 1:
        infl = conds[1]
        set_text(infl.find("dcc:name/dcc:content", ns), influencecondition2, lang=used_lang_code)
        dq = infl.find("dcc:data/dcc:quantity", ns)
        set_text(dq.find("dcc:name/dcc:content", ns), influencecondition2, lang=used_lang_code)
        real = dq.find("si:real", ns)
        set_text(real.find("si:value", ns), humidity)
        set_text(real.find("si:unit", ns), unit2)

# results
# Up to three result nodes are filled by position: standard values, measured
# item values, measurement error. Each takes a name plus value and unit lists.
# NOTE: hy0/hy1/hy2 are dereferenced without a None check.
res = mr.find("dcc:results", ns)
if res is not None:
    rl = res.findall("dcc:result", ns)
    if rl:
        r0 = rl[0]
        set_text(r0.find("dcc:name/dcc:content", ns), measurement_standard, lang=used_lang_code)
        hy0 = r0.find("dcc:quantity/si:hybrid/si:realListXMLList", ns)
        set_text(hy0.find("si:valueXMLList", ns), measurement_standard_values)
        set_text(hy0.find("si:unitXMLList", ns), measurement_standard_unit)
    if len(rl) > 1:
        r1 = rl[1]
        set_text(r1.find("dcc:name/dcc:content", ns), measured_item, lang=used_lang_code)
        hy1 = r1.find("dcc:quantity/si:hybrid/si:realListXMLList", ns)
        set_text(hy1.find("si:valueXMLList", ns), measured_item_values)
        set_text(hy1.find("si:unitXMLList", ns), measured_item_unit)
    if len(rl) > 2:
        r2 = rl[2]
        set_text(r2.find("dcc:name/dcc:content", ns), measurement_error, lang=used_lang_code)
        hy2 = r2.find("dcc:quantity/si:hybrid/si:realListXMLList", ns)
        set_text(hy2.find("si:valueXMLList", ns), measurement_error_values)
        set_text(hy2.find("si:unitXMLList", ns), measurement_error_unit)

# 8) comment
# First content element = calibration procedure, second = remarks (positional).
# NOTE: `comm` is dereferenced without a None check.
comm = root.find(".//dcc:comment", ns)
cc = comm.findall("dcc:content", ns)
if cc: set_text(cc[0], calibrationprocedure, lang=used_lang_code)
if len(cc)>1: set_text(cc[1], remarks, lang=used_lang_code)

# write
# Serialise the filled tree to `output` as UTF-8 with an XML declaration.
tree.write(output, encoding="utf-8", xml_declaration=True)