# In [5]:
# This code will create a new XML file for each <speech> element in the original XML document, including the <debateinfo> in each new file. If there are no <debateinfo> elements in the document, it will raise a ValueError. If there are multiple <debateinfo> elements, it will use the first one. If you want to use a different <debateinfo>, you can adjust the index in debateinfo = debateinfos[0].

import os
from xml.etree.ElementTree import ElementTree, Element, SubElement, tostring, parse

# Split one debate XML file into one output file per <speech> element.
# Each output file is a fresh <root> element wrapping the document's first
# <debateinfo> followed by a single <speech>.

# Create the output directory (no error if it already exists)
new_dir = 'segmented_files'
os.makedirs(new_dir, exist_ok=True)

# Parse the XML file from the data directory
filename = '19010509_reps_1_1.xml'
tree = parse(os.path.join('data', filename))
root = tree.getroot()

# Collect every <debateinfo> element in the document, at any depth
debateinfos = root.findall('.//debateinfo')

if not debateinfos:
    raise ValueError("No <debateinfo> element found in the XML document")

# Only the first <debateinfo> is used; change the index to pick another one
debateinfo = debateinfos[0]

# Find all <speech> elements, at any depth
speeches = root.findall('.//speech')

# Output names reuse the input name minus its extension; splitext handles
# an extension of any length, unlike slicing off a fixed four characters
base_name = os.path.splitext(filename)[0]

# Segments are numbered from 1, in the order findall returned the speeches
for counter, speech in enumerate(speeches, start=1):
    # Wrap the shared <debateinfo> and this <speech> in a new <root>.
    # append() only adds them to new_root's child list, so the same
    # debateinfo element can be reused for every segment.
    new_root = Element('root')
    new_root.append(debateinfo)
    new_root.append(speech)

    # Save the segment under a unique name in the output directory
    new_tree = ElementTree(new_root)
    new_tree.write(os.path.join(new_dir, f'{base_name}_seg_{counter}.xml'))