# XML Generator for CrossRef

This notebook generates the XML file for CrossRef using the output from the DL Batch Revise Export.

For this one we're following the example metadata for Conference Proceedings with Papers: [https://gitlab.com/crossref/schema/-/blob/master/best-practice-examples/conf_series5.3.0.xml](https://gitlab.com/crossref/schema/-/blob/master/best-practice-examples/conf_series5.3.0.xml)


In [None]:
import pandas as pd
from lxml import etree
from datetime import datetime

## (ASIDE) Helper Functions


In [None]:
# Create a function to check the state of XML at any point:

def view_xml_result():
    """Print the current `doi_batch` tree as pretty-printed XML.

    Serializes the module-level `doi_batch` element as UTF-8 with an XML
    declaration and writes the resulting text to standard output.
    Returns nothing.
    """
    serialized = etree.tostring(
        doi_batch,
        encoding="utf-8",
        xml_declaration=True,
        pretty_print=True,
    )
    # tostring() yields bytes when an encoding is given; decode for printing
    print(serialized.decode("utf-8"))

# VARIABLES

These are the variables you can edit to control how the XML is generated.

In [None]:
# MODIFY THESE VARIABLES

# Base name (no extension) of the input CSV; the same base name is used for the output XML file.
DATA_FILE_NAME = "drs2018"

# Prefix of the <doi_batch_id>; "__" and the current date (yymmdd) are appended when the head is built.
BATCH_ID_NAME = "drs2018"
# "name" is used for <conference_name> and <proceedings_title>; "acronym" for <conference_acronym>.
CONFERENCE = {"name": "DRS2018: Design as a catalyst for change",
              "acronym": "DRS2018"}

# Conference start and end dates, written as YYYY-MM-DD (they are split on "-" later).
START_DATE = "2018-06-25"
END_DATE = "2018-06-28"


# Written to <issn> inside <series_metadata>.
ISSN = "23983132"

# Written to <titles><title> inside <series_metadata>.
SERIES_TITLE = "Proceedings of DRS"

# ISBN of this volume. Leave it as an empty string when there is none:
# a <noisbn reason="simple_series"> element is emitted instead of <isbn>.
# ISBN = "9781912294626"
ISBN = ""

# DOI and landing-page URL written to the <doi_data> of <proceedings_series_metadata>.
CONFERENCE_VOLUME_DOI = "10.21606/drs.2018.v1"
CONFERENCE_VOLUME_URL = "https://dl.designresearchsociety.org/conference-volumes/36/"

# Largest number of authors on any single paper. Check the CSV file to find it;
# it decides how many author{N}_fname / _lname / _institution columns are read.
MAX_AUTHORS = 7

In [None]:
# DO NOT MODIFY THESE VARIABLES
# Captured once; reused for both the timestamp below and the batch id suffix in the head.
today = datetime.now()

# Local date-time as YYYYMMDDHHMMSS followed by four literal zeros; becomes the <timestamp> in the head.
SUBMISSION_TIMESTAMP = today.strftime("%Y%m%d%H%M%S") + "0000"

# Generate root element

In [None]:
# Define namespaces without 'xmlns:' in the keys
# (lxml takes a prefix -> URI mapping; the key None declares the default namespace)
namespaces = {
    "xsi": "http://www.w3.org/2001/XMLSchema-instance",
    None: "http://www.crossref.org/schema/5.3.0",  # Default namespace
    "jats": "http://www.ncbi.nlm.nih.gov/JATS1",
    "fr": "http://www.crossref.org/fundref.xsd",
    "mml": "http://www.w3.org/1998/Math/MathML",
}

# Create the root element with namespaces
# NOTE(review): tags here and below are created unqualified ("doi_batch", "head", ...);
# they appear under the default namespace in the serialized output — confirm this is
# sufficient if the in-memory tree is ever validated directly.
doi_batch = etree.Element("doi_batch", nsmap=namespaces)

# Schema version attribute on the root element
doi_batch.set("version", "5.3.0")

# xsi:schemaLocation is set via its fully qualified {namespace}name form
doi_batch.set(
    "{http://www.w3.org/2001/XMLSchema-instance}schemaLocation",
    "http://www.crossref.org/schema/5.3.0 https://www.crossref.org/schemas/crossref5.3.0.xsd"
)

# Add head and body
# (both are module-level names that later cells populate)
head = etree.SubElement(doi_batch, "head")
body = etree.SubElement(doi_batch, "body")

# 01 - Populate head

In [None]:
# Start from an empty <head> so that re-running this cell replaces the previous
# content instead of appending a second copy of every child element.
head.clear()

# Two-digit-year date used as the suffix of the batch id
today_short = today.strftime("%y%m%d")

# Each name below refers to the created element; its text is set on the next line.
doi_batch_id = etree.SubElement(head, "doi_batch_id")
doi_batch_id.text = BATCH_ID_NAME + "__" + today_short

timestamp = etree.SubElement(head, "timestamp")
timestamp.text = SUBMISSION_TIMESTAMP

# <depositor> groups the depositor name and contact e-mail
depositor = etree.SubElement(head, "depositor")
depositor_name = etree.SubElement(depositor, "depositor_name")
depositor_name.text = "desres:desres"
email_address = etree.SubElement(depositor, "email_address")
email_address.text = "dl@designresearchsociety.org"

registrant = etree.SubElement(head, "registrant")
registrant.text = "Digital Library"

view_xml_result()

<?xml version='1.0' encoding='utf-8'?>
<doi_batch xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.crossref.org/schema/5.3.0" xmlns:jats="http://www.ncbi.nlm.nih.gov/JATS1" xmlns:fr="http://www.crossref.org/fundref.xsd" xmlns:mml="http://www.w3.org/1998/Math/MathML" version="5.3.0" xsi:schemaLocation="http://www.crossref.org/schema/5.3.0 https://www.crossref.org/schemas/crossref5.3.0.xsd">
  <head>
    <doi_batch_id>drs2018__250418</doi_batch_id>
    <timestamp>202504182210270000</timestamp>
    <depositor>
      <depositor_name>desres:desres</depositor_name>
      <email_address>dl@designresearchsociety.org</email_address>
    </depositor>
    <registrant>Digital Library</registrant>
  </head>
  <body/>
</doi_batch>



# 02 - Load Data

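Load the exported CSV. For now we only need a copy of the first row, which supplies the `conference_dates` text; the start and end dates themselves are parsed from the `START_DATE` and `END_DATE` variables set above.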


In [None]:
# Read the export named by DATA_FILE_NAME (comma-separated)
df = pd.read_csv(DATA_FILE_NAME + ".csv", sep=",")

# Independent copy of row 0 with missing values blanked and every value as text
first_row = df.iloc[0].copy().fillna('').astype(str)
# Only this field is needed later, as the text of <conference_date>
first_row_metadata = first_row[["conference_dates"]]

# START_DATE and END_DATE are split into day / month / year parts for the
# attributes of <conference_date> and for the <publication_date> elements.
def extract_date_parts(datetime_str):
    """Split an ISO-style date string into ``[day, month, year]`` strings.

    Accepts a plain date (``"2018-06-25"``) as well as a date followed by a
    time component separated by whitespace or ``T``
    (``"2018-06-25 09:00:00"``, ``"2018-06-25T09:00:00"``); the time
    component is ignored. Each part is returned exactly as written, so any
    zero padding is preserved.

    Args:
        datetime_str: String that starts with ``YYYY-MM-DD``.

    Returns:
        list[str]: ``[day, month, year]``.

    Raises:
        ValueError: If the date portion is missing or is not made of three
            ``-``-separated groups of digits.
    """
    # Normalise the "T" separator to a space, then keep only the first token
    tokens = datetime_str.strip().replace("T", " ").split()
    if not tokens:
        raise ValueError("empty date string")

    pieces = tokens[0].split("-")
    if len(pieces) != 3 or not all(piece.isdigit() for piece in pieces):
        raise ValueError(f"expected a date like YYYY-MM-DD, got {datetime_str!r}")

    year, month, day = pieces
    return [day, month, year]

# Both lists are ordered [day, month, year], as returned by extract_date_parts.
start_date_parts = extract_date_parts(START_DATE)
end_date_parts = extract_date_parts(END_DATE)

# 03 - Create body structure



In [None]:
# conference is the container for everything in body
# (module-level name: make_body_structure and generate_papers append to whichever
# element this name refers to at the time they are called)
conference = etree.SubElement(body, "conference")


def make_body_structure():
    """Append the event and series metadata to the module-level `conference` element.

    Adds, in this order:
      * <event_metadata>: conference name, acronym and <conference_date>
        (text from the first CSV row, start_/end_ attributes from the
        parsed START_DATE / END_DATE parts);
      * <proceedings_series_metadata>: series title and ISSN, proceedings
        title, publisher, publication date, <isbn> or <noisbn>, and the
        volume's <doi_data>.

    Reads the module-level configuration values and returns nothing.
    """

    def add_text(parent, tag, text):
        # Create <tag> under parent and fill in its text content
        node = etree.SubElement(parent, tag)
        node.text = text
        return node

    # ---- event_metadata: information about this particular proceeding
    event = etree.SubElement(conference, "event_metadata")
    add_text(event, "conference_name", CONFERENCE["name"])
    add_text(event, "conference_acronym", CONFERENCE["acronym"])

    # Free-text dates come from the CSV; the structured attributes from the config dates
    date_node = etree.SubElement(event, "conference_date")
    date_node.text = first_row_metadata["conference_dates"]
    labelled_parts = zip(["day", "month", "year"], start_date_parts, end_date_parts)
    for label, begins, ends in labelled_parts:
        date_node.set(f"start_{label}", begins)
        date_node.set(f"end_{label}", ends)

    # ---- proceedings_series_metadata: information about the series
    series_block = etree.SubElement(conference, "proceedings_series_metadata")

    # series_metadata lives inside proceedings_series_metadata
    series = etree.SubElement(series_block, "series_metadata")
    series_titles = etree.SubElement(series, "titles")
    series_title = etree.SubElement(series_titles, "title")
    series_title.text = SERIES_TITLE
    add_text(series, "issn", ISSN)

    add_text(series_block, "proceedings_title", CONFERENCE["name"])

    publisher = etree.SubElement(series_block, "publisher")
    add_text(publisher, "publisher_name", "Design Research Society")

    # publication_date children are written as month, day, year;
    # the parts list is ordered [day, month, year]
    published = etree.SubElement(series_block, "publication_date")
    for tag, position in (("month", 1), ("day", 0), ("year", 2)):
        add_text(published, tag, start_date_parts[position])

    # Without an ISBN, a <noisbn> element with a reason is emitted instead
    if len(ISBN) == 0:
        etree.SubElement(series_block, "noisbn").set("reason", "simple_series")
    else:
        add_text(series_block, "isbn", ISBN)

    volume_doi = etree.SubElement(series_block, "doi_data")
    add_text(volume_doi, "doi", CONFERENCE_VOLUME_DOI)
    add_text(volume_doi, "resource", CONFERENCE_VOLUME_URL)

In [None]:
# Rebuild the body from scratch so this cell can be re-run safely.
body.clear()
# body.clear() detaches the previous <conference> element from body. Create a
# fresh one that is attached; otherwise make_body_structure() fills an orphaned
# element and the printed XML shows an empty <body/>.
conference = etree.SubElement(body, "conference")
make_body_structure()
view_xml_result()

<?xml version='1.0' encoding='utf-8'?>
<doi_batch xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.crossref.org/schema/5.3.0" xmlns:jats="http://www.ncbi.nlm.nih.gov/JATS1" xmlns:fr="http://www.crossref.org/fundref.xsd" xmlns:mml="http://www.w3.org/1998/Math/MathML" version="5.3.0" xsi:schemaLocation="http://www.crossref.org/schema/5.3.0 https://www.crossref.org/schemas/crossref5.3.0.xsd">
  <head>
    <doi_batch_id>drs2018__250418</doi_batch_id>
    <timestamp>202504182210270000</timestamp>
    <depositor>
      <depositor_name>desres:desres</depositor_name>
      <email_address>dl@designresearchsociety.org</email_address>
    </depositor>
    <registrant>Digital Library</registrant>
  </head>
  <body/>
</doi_batch>



# 04 - Select Columns for papers

The `df` we loaded before has too many columns for dealing with our papers. Let's select only the ones we need.

In [None]:
# Read the export again so this cell works from the full, unmodified frame
df = pd.read_csv(DATA_FILE_NAME+".csv", sep=",")

# Paper-level columns first, then fname / lname / institution for author 1..MAX_AUTHORS
author_fields = ("fname", "lname", "institution")
content_cols = ["title", "calc_url", "doi"] + [
    f"author{x}_{y}"
    for x in range(1, MAX_AUTHORS+1)
    for y in author_fields
]

# Keep only those columns, blank out missing values, and treat everything as text
df_core = df[content_cols].fillna('').astype(str)
df_core.columns

Index(['title', 'calc_url', 'doi', 'author1_fname', 'author1_lname',
       'author1_institution', 'author2_fname', 'author2_lname',
       'author2_institution', 'author3_fname', 'author3_lname',
       'author3_institution', 'author4_fname', 'author4_lname',
       'author4_institution', 'author5_fname', 'author5_lname',
       'author5_institution', 'author6_fname', 'author6_lname',
       'author6_institution', 'author7_fname', 'author7_lname',
       'author7_institution'],
      dtype='object')

# 05 - Create conference paper


In [None]:
# Set to True to print every column of each row while generating papers
DETAILED_INFO = False


def _add_text_child(parent, tag, text):
    """Append a child element named `tag` holding `text`, and return it."""
    child = etree.SubElement(parent, tag)
    child.text = text
    return child


def _add_author(contributors, row, number, sequence):
    """Append author `number` of `row` to `contributors` as a <person_name>.

    Blank cells do not produce empty elements: <given_name> is omitted when
    the first name is blank and <affiliations> is omitted when the
    institution is blank. A slot whose first and last name are both blank
    is skipped entirely.

    Returns:
        bool: True when a <person_name> was added, False when the slot was empty.
    """
    given = row[f"author{number}_fname"].strip()
    family = row[f"author{number}_lname"].strip()
    organisation = row[f"author{number}_institution"].strip()

    if not given and not family:
        return False

    person = etree.SubElement(contributors, "person_name")
    person.set("sequence", sequence)
    person.set("contributor_role", "author")

    if given:
        _add_text_child(person, "given_name", given)
    if not family:
        # The surname is still written so the gap is visible in the output.
        print(f"  WARNING: author{number} has no surname; check the CSV")
    _add_text_child(person, "surname", family)

    if organisation:
        affiliations = etree.SubElement(person, "affiliations")
        institution = etree.SubElement(affiliations, "institution")
        _add_text_child(institution, "institution_name", organisation)

    return True


def generate_papers():
    """Append one <conference_paper> per row of `df_core` to `conference`.

    Each paper gets, in order: <contributors>, <titles>, <publication_date>
    (media_type "online", taken from the conference start date) and
    <doi_data> (DOI and resource URL from the row).

    Every author slot from 1 to MAX_AUTHORS is examined, so a blank slot in
    the middle no longer drops the authors after it. The first author that
    is actually written gets sequence "first"; the rest get "additional".
    When a row has no authors at all, the empty <contributors> element is
    removed and a warning is printed.

    Progress is printed for every row. Returns nothing.
    """
    for index, row in df_core.iterrows():
        print("-----\n")
        print(f"Working on row {index}: {row['title']}")

        if DETAILED_INFO:
            print("Row details:")
            for col in row.index:
                print(f"  {col}: {row[col]}")

        paper = etree.SubElement(conference, "conference_paper")
        paper.set("language", "en")
        paper.set("publication_type", "full_text")

        # contributors is created first so it stays the first child; it is filled below
        contributors = etree.SubElement(paper, "contributors")

        titles = etree.SubElement(paper, "titles")
        _add_text_child(titles, "title", row["title"])

        # Children are written as month, day, year; the parts list is [day, month, year]
        publication_date = etree.SubElement(paper, "publication_date")
        publication_date.set("media_type", "online")
        _add_text_child(publication_date, "month", start_date_parts[1])
        _add_text_child(publication_date, "day", start_date_parts[0])
        _add_text_child(publication_date, "year", start_date_parts[2])

        doi_data = etree.SubElement(paper, "doi_data")
        _add_text_child(doi_data, "doi", row["doi"])
        _add_text_child(doi_data, "resource", row["calc_url"])

        print(row["doi"], row["author1_fname"])

        authors_added = 0
        for number in range(1, MAX_AUTHORS + 1):
            sequence = "first" if authors_added == 0 else "additional"
            if _add_author(contributors, row, number, sequence):
                authors_added += 1

        if authors_added == 0:
            # NOTE(review): assumes <contributors> may be omitted for a paper — confirm against the schema
            paper.remove(contributors)
            print("  WARNING: no authors found for this row; <contributors> omitted")

    print("====\n ALL DONE")


In [None]:
# Empty the body so re-running this cell does not duplicate content
body.clear()
# conference is the container for everything in body
# (re-created here because body.clear() detached the previous one)
conference = etree.SubElement(body, "conference")
# Event and series metadata first, then one <conference_paper> per CSV row
make_body_structure()
generate_papers()
# view_xml_result()

-----

Working on row 0: Introducing Design-Driven Innovation into Brazilian MSMEs: barriers and next challenges of design support
10.21606/drs.2018.442 Mariana
-----

Working on row 1: Combining Practices in Craft and Design
10.21606/drs.2018.537 Micheal
-----

Working on row 2: Co-creating Happy Moments: A Case Study of Designing for People with Mental Health Challenges
10.21606/drs.2018.214 Hong
-----

Working on row 3: The Design and Social Enterprise Ecosystem: How can design be applied to a developing social enterprise ecosystem?
10.21606/drs.2018.283 Kwon
-----

Working on row 4: Editorial: How Organisations Employ Design as Vehicle for Change
10.21606/drs.2018.78 Chris
-----

Working on row 5: Editorial: Designing Social Innovation in Cultural Diversity and with Sensitivity
10.21606/drs.2018.007 Joon
-----

Working on row 6: Social Hierarchy in Design and Social Innovation: perspectives from Thailand
10.21606/drs.2018.420 Cyril
-----

Working on row 7: Healthy Self-Management C

# 06 - Save XML

In [None]:
# Serialize the whole tree (with XML declaration) and decode the UTF-8 bytes to text
xml_string = etree.tostring(doi_batch, pretty_print=True, xml_declaration=True, encoding="utf-8").decode("utf-8")

# The output file shares its base name with the input CSV
output_file = f"{DATA_FILE_NAME}.xml"
# Open the file in write mode and save the XML string
# (written as UTF-8 to match the encoding stated in the XML declaration)
with open(output_file, "w", encoding="utf-8") as f:
    f.write(xml_string)

print(f"XML has been saved to {output_file}")

XML has been saved to drs2018.xml
