In [23]:
import xml.etree.ElementTree as ET
import random

def parse_xml(file_path):
    """Parse the XML document at ``file_path`` and return its root element.

    Args:
        file_path: Path (or file object) accepted by ``ET.parse``.

    Returns:
        The root ``Element`` of the parsed tree.
    """
    return ET.parse(file_path).getroot()

def extract_texts(root):
    """Collect the stripped text of every ``<p>`` that is a direct child of a ``<block>``.

    Every ``<block>`` descendant of ``root`` is visited in document order.
    Paragraphs that are empty or contain only whitespace are skipped, so the
    result never contains empty strings (which would otherwise turn into
    empty sentences downstream).

    Note: only ``p.text`` is read, i.e. the text before the first nested
    child element of ``<p>``; text following nested elements is not included.

    Args:
        root: An ``xml.etree.ElementTree.Element`` to search.

    Returns:
        A list of non-empty, whitespace-stripped paragraph strings.
    """
    texts = []
    for block in root.findall(".//block"):
        for p in block.findall("p"):
            # p.text is None for an empty element; strip before testing so
            # whitespace-only paragraphs are dropped as well.
            stripped = (p.text or "").strip()
            if stripped:
                texts.append(stripped)
    return texts

def format_conll(texts):
    """Render each text as a CoNLL-style sentence, one token per line.

    Each text is split on whitespace and every token is followed by the
    placeholder tags "O O O"; replace these with real tags if they become
    available. The output has exactly one entry per input text (a text with
    no tokens yields an empty string).

    Args:
        texts: Iterable of strings.

    Returns:
        A list of strings, each holding the newline-joined token lines of one
        sentence.
    """
    return [
        "\n".join(f"{token} O O O" for token in text.split())
        for text in texts
    ]

def write_conll_format(sentences, file_path):
    """Write sentences to ``file_path`` in CoNLL layout.

    The file starts with a ``-DOCSTART-`` header line followed by a blank
    line; each sentence is then written followed by a blank line. Any existing
    file at ``file_path`` is overwritten.

    Args:
        sentences: Iterable of pre-formatted sentence strings (token lines
            joined by newlines).
        file_path: Destination path.
    """
    # Pin the encoding: the platform default may not be able to represent
    # non-ASCII tokens and would make the output differ between machines.
    with open(file_path, 'w', encoding='utf-8') as out:
        out.write("-DOCSTART- -X- -X- O\n\n")
        for sentence in sentences:
            out.write(sentence + "\n\n")

def split_data(sentences, train_ratio=0.8, valid_ratio=0.1, test_ratio=0.1, seed=None):
    """Shuffle ``sentences`` and split them into train/validation/test lists.

    The input list is not modified; a shuffled copy is split instead.

    Args:
        sentences: Sequence of items to split.
        train_ratio: Fraction of items for the training split (truncated to
            a whole number of items).
        valid_ratio: Fraction of items for the validation split (truncated to
            a whole number of items).
        test_ratio: Accepted for interface compatibility but not used in the
            computation: the test split always receives every item left over
            after the train and validation splits.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` generator is used, so the result follows
            any global ``random.seed(...)`` call.

    Returns:
        A ``(train_sentences, valid_sentences, test_sentences)`` tuple of lists.
    """
    # Shuffle a copy so the caller's list keeps its original order.
    shuffled = list(sentences)
    rng = random.Random(seed) if seed is not None else random
    rng.shuffle(shuffled)

    total = len(shuffled)
    train_end = int(train_ratio * total)
    valid_end = train_end + int(valid_ratio * total)

    train_sentences = shuffled[:train_end]
    valid_sentences = shuffled[train_end:valid_end]
    test_sentences = shuffled[valid_end:]

    return train_sentences, valid_sentences, test_sentences

def main(xml_file_path='0000000.xml', output_prefix='nyt'):
    """Convert one XML file into shuffled CoNLL train/valid/test files.

    Pipeline: parse the XML, extract paragraph texts, format them as
    CoNLL sentences with placeholder tags, split them, and write each split
    to ``<output_prefix>_train.txt``, ``<output_prefix>_valid.txt`` and
    ``<output_prefix>_test.txt``.

    Args:
        xml_file_path: Path of the XML file to read. Defaults to the file the
            notebook was originally written for.
        output_prefix: Prefix of the three output file names.
    """
    root = parse_xml(xml_file_path)
    texts = extract_texts(root)
    formatted_texts = format_conll(texts)

    train_texts, valid_texts, test_texts = split_data(formatted_texts)

    train_path = f"{output_prefix}_train.txt"
    valid_path = f"{output_prefix}_valid.txt"
    test_path = f"{output_prefix}_test.txt"

    write_conll_format(train_texts, train_path)
    write_conll_format(valid_texts, valid_path)
    write_conll_format(test_texts, test_path)

    print(f"Data has been formatted and saved to {train_path}, {valid_path}, and {test_path}")

# Entry point: run the conversion pipeline when this code is executed directly.
# Executing this notebook cell satisfies the guard, so main() runs here (its
# confirmation message is the cell's output).
if __name__ == "__main__":
    main()


Data has been formatted and saved to nyt_train.txt, nyt_valid.txt, and nyt_test.txt


In [22]:
# Preview one of the generated files. The label is built from the path so the
# heading can never disagree with the file actually being printed.
# NOTE(review): print_file is not defined in the visible cells, and this cell's
# execution count precedes the cell above it; confirm the helper is defined
# earlier in the notebook so Restart & Run All succeeds.
preview_path = 'nyt_train.txt'
print(f"Contents of {preview_path}:")
print_file(preview_path)

Contents of valid.txt:
-DOCSTART- -X- -X- O

9,066,000 O O O

9,066,000 O O O

61,040,000 O O O

Net O O O
inc O O O

10,479,000 O O O

.71 O O O

Net O O O
inc O O O

*3*** O O O
COMPANY O O O
REPORTS O O O
** O O O

6mo O O O
sales O O O

Net O O O
inc O O O

9,932,000 O O O

142,283,000 O O O

3,953,000 O O O

Sales O O O

Share O O O
earns O O O

The O O O
company O O O
said O O O
the O O O
1986 O O O
shares O O O
outstanding O O O
reflects O O O
the O O O
pro O O O
rata O O O
effect O O O
of O O O
the O O O
issuance O O O
of O O O
1.35 O O O
million O O O
shares O O O
in O O O
public O O O
offering O O O
in O O O
August O O O
1986. O O O

9,069,000 O O O

Shares O O O
outst O O O

.59 O O O

5,300,000 O O O

Shares O O O
outst O O O

Shares O O O
outst O O O

7,054,000 O O O

75,907,000 O O O

61,040,000 O O O

.59 O O O

.71 O O O

Sales O O O

Shares O O O
outst O O O

75,907,000 O O O

114,876,000 O O O

1985 O O O

Share O O O
earns O O O

Net O O O
inc O O O

.59 O O O

Share