In [4]:
import re
import os
from xml.etree import ElementTree as ET

def parse_reuters_document(text):
    """Return every ``<REUTERS ...>...</REUTERS>`` span found in *text*.

    The match is non-greedy and uses ``re.DOTALL``, so each article may span
    several lines and adjacent articles are returned as separate strings, in
    the order they appear. An empty list is returned when nothing matches.
    """
    article_spans = re.finditer(r'<REUTERS.*?</REUTERS>', text, re.DOTALL)
    return [span.group(0) for span in article_spans]

def clean_xml(xml_text):
    """Strip text that would make the XML parser reject *xml_text*.

    Two passes are applied, in this order:

    1. every decimal numeric character reference (``&#<digits>;``) is removed;
    2. raw control characters below ``\\x20`` are removed, except tab
       (``\\x09``), line feed (``\\x0A``) and carriage return (``\\x0D``).

    Returns the cleaned string.
    """
    removal_patterns = (
        r'&#\d+;',                            # decimal character references
        r'[\x00-\x08\x0B-\x0C\x0E-\x1F]',     # control chars (tab/LF/CR kept)
    )
    for removal_pattern in removal_patterns:
        xml_text = re.sub(removal_pattern, '', xml_text)
    return xml_text

def extract_fields(reuters_xml):
    """Parse one ``<REUTERS>`` article string into a flat dictionary.

    The string is passed through ``clean_xml`` and then parsed with
    ``ElementTree``. If parsing fails, a ``Parse error: ...`` line is printed
    and ``None`` is returned.

    On success the returned dict holds:

    * ``newid``, ``oldid``, ``topics_tag``, ``lewissplit`` - the root
      attributes ``NEWID``, ``OLDID``, ``TOPICS`` and ``LEWISSPLIT``
      (``''`` when absent);
    * ``date`` - stripped text of the ``DATE`` child;
    * ``title``, ``dateline``, ``body`` - stripped text of the matching
      children of ``TEXT``;
    * ``topics``, ``places``, ``people``, ``orgs``, ``exchanges``,
      ``companies`` - lists of the stripped, non-empty ``<D>`` values found
      under the corresponding upper-case element.

    Missing elements yield ``''`` or ``[]`` respectively.
    """
    try:
        root = ET.fromstring(clean_xml(reuters_xml))
    except ET.ParseError as e:
        print(f"Parse error: {e}")
        return None

    def stripped_text(parent, tag):
        # '' when the parent or child is missing, or the child has no text
        node = None if parent is None else parent.find(tag)
        if node is None or not node.text:
            return ''
        return node.text.strip()

    def category_values(tag):
        # stripped <D> entries under <tag>, skipping empty/whitespace-only ones
        container = root.find(tag)
        if container is None:
            return []
        return [
            entry.text.strip()
            for entry in container.findall('D')
            if entry.text and entry.text.strip()
        ]

    text_elem = root.find('TEXT')

    # key order is kept exactly as before so serialized output is unchanged
    return {
        'newid': root.get('NEWID', ''),
        'oldid': root.get('OLDID', ''),
        'topics_tag': root.get('TOPICS', ''),
        'lewissplit': root.get('LEWISSPLIT', ''),
        'date': stripped_text(root, 'DATE'),
        'title': stripped_text(text_elem, 'TITLE'),
        'dateline': stripped_text(text_elem, 'DATELINE'),
        'body': stripped_text(text_elem, 'BODY'),
        'topics': category_values('TOPICS'),
        'places': category_values('PLACES'),
        'people': category_values('PEOPLE'),
        'orgs': category_values('ORGS'),
        'exchanges': category_values('EXCHANGES'),
        'companies': category_values('COMPANIES'),
    }

# main execution
# Preview run: read the first three .sgm files, print a few sample documents
# and summary statistics for everything that parsed.
data_path = r"C:\Users\asus\Desktop\NewsIndexing\data"

# raw <REUTERS>...</REUTERS> strings collected from the files read below
all_documents = []

# find all sgm files; sorted because os.listdir() returns names in arbitrary
# order, which would make the "first three files" below non-deterministic
sgm_files = sorted(f for f in os.listdir(data_path) if f.endswith('.sgm'))
print(f"Found {len(sgm_files)} SGM files\n")

# read each file (only the first three for this preview)
for filename in sgm_files[:3]:
    filepath = os.path.join(data_path, filename)
    print(f"Reading {filename}...")

    # errors='ignore' silently drops any byte sequence that is not valid UTF-8
    # NOTE(review): confirm the corpus encoding; if it is not UTF-8, decoding
    # with the real encoding would avoid losing characters.
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
        content = f.read()

    docs = parse_reuters_document(content)
    all_documents.extend(docs)
    print(f"  Found {len(docs)} documents")

print(f"\nTotal documents: {len(all_documents)}")
print("=" * 80)

# show sample documents
print("\nSample Documents:")
for i, doc_xml in enumerate(all_documents[:5]):
    doc = extract_fields(doc_xml)

    # extract_fields returns None (after printing the error) on a parse failure
    if doc is None:
        continue

    print(f"\n[Document {i+1}] ID: {doc['newid']}")
    print("-" * 80)
    print(f"Date: {doc['date']}")
    print(f"Title: {doc['title']}")
    print(f"Dateline: {doc['dateline']}")

    body_preview = doc['body'][:150] + "..." if len(doc['body']) > 150 else doc['body']
    print(f"Body: {body_preview}")

    # category lines are printed only when the document has values for them
    for label, key in (
        ("Places", 'places'),
        ("Topics", 'topics'),
        ("People", 'people'),
        ("Organizations", 'orgs'),
        ("Exchanges", 'exchanges'),
        ("Companies", 'companies'),
    ):
        if doc[key]:
            print(f"{label}: {', '.join(doc[key])}")

print("\n" + "=" * 80)

# calculate statistics
print("\nDataset Statistics:")
print("-" * 80)

all_docs = [extract_fields(doc_xml) for doc_xml in all_documents]
all_docs = [d for d in all_docs if d is not None]

docs_with_topics = sum(1 for d in all_docs if d['topics'])
docs_with_places = sum(1 for d in all_docs if d['places'])
docs_with_people = sum(1 for d in all_docs if d['people'])
docs_with_orgs = sum(1 for d in all_docs if d['orgs'])
docs_with_exchanges = sum(1 for d in all_docs if d['exchanges'])
docs_with_companies = sum(1 for d in all_docs if d['companies'])
docs_with_body = sum(1 for d in all_docs if d['body'])

total = len(all_docs)
for label, count in (
    ("body text", docs_with_body),
    ("places", docs_with_places),
    ("topics", docs_with_topics),
    ("people", docs_with_people),
    ("organizations", docs_with_orgs),
    ("exchanges", docs_with_exchanges),
    ("companies", docs_with_companies),
):
    # guard against ZeroDivisionError when no document could be parsed
    share = 100 * count / total if total else 0.0
    print(f"Documents with {label}: {count}/{total} ({share:.1f}%)")

# show unique values
all_places = set()
all_topics = set()
for d in all_docs:
    all_places.update(d['places'])
    all_topics.update(d['topics'])

print(f"\nUnique places ({len(all_places)}): {sorted(all_places)[:20]}")
if len(all_places) > 20:
    print(f"  ... and {len(all_places) - 20} more")

print(f"\nUnique topics ({len(all_topics)}): {sorted(all_topics)}")


Found 22 SGM files

Reading reut2-000.sgm...
  Found 1000 documents
Reading reut2-001.sgm...
  Found 1000 documents
Reading reut2-002.sgm...
  Found 1000 documents

Total documents: 3000

Sample Documents:

[Document 1] ID: 1
--------------------------------------------------------------------------------
Date: 26-FEB-1987 15:01:01.79
Title: BAHIA COCOA REVIEW
Dateline: SALVADOR, Feb 26 -
Body: Showers continued throughout the week in
the Bahia cocoa zone, alleviating the drought since early
January and improving prospects for the coming temp...
Places: el-salvador, usa, uruguay
Topics: cocoa

[Document 2] ID: 2
--------------------------------------------------------------------------------
Date: 26-FEB-1987 15:02:20.00
Title: STANDARD OIL <SRD> TO FORM FINANCIAL UNIT
Dateline: CLEVELAND, Feb 26 -
Body: Standard Oil Co and BP North America
Inc said they plan to form a venture to manage the money market
borrowing and investment activities of both compa...
Places: usa

[Document 3] ID: 

In [None]:
import re
import os
import json
from xml.etree import ElementTree as ET

def parse_reuters_document(text):
    """Split *text* into its individual ``<REUTERS>...</REUTERS>`` articles.

    Matching is non-greedy with ``re.DOTALL`` so an article can span multiple
    lines. Returns the matched strings in document order (``[]`` if none).
    """
    article_re = re.compile(r'<REUTERS.*?</REUTERS>', re.DOTALL)
    return article_re.findall(text)

def clean_xml(xml_text):
    """Remove content from *xml_text* that the XML parser would choke on.

    First every decimal numeric character reference (``&#<digits>;``) is
    dropped, then every raw control character below ``\\x20`` other than tab,
    line feed and carriage return. The cleaned string is returned.
    """
    without_char_refs = re.sub(r'&#\d+;', '', xml_text)
    return re.sub(r'[\x00-\x08\x0B-\x0C\x0E-\x1F]', '', without_char_refs)

def extract_fields(reuters_xml):
    """Turn one ``<REUTERS>`` article string into a JSON-friendly dictionary.

    The input is cleaned with ``clean_xml`` and parsed with ``ElementTree``.
    When parsing fails, ``Parse error: ...`` is printed and ``None`` is
    returned.

    Keys of the returned dict, in order:

    * ``newid``, ``oldid``, ``topics_tag``, ``lewissplit``, ``cgisplit`` -
      root attributes ``NEWID``, ``OLDID``, ``TOPICS``, ``LEWISSPLIT``,
      ``CGISPLIT`` (``''`` when absent);
    * ``date`` - stripped text of ``DATE``;
    * ``title``, ``dateline``, ``body`` - stripped text of the matching
      children of ``TEXT``;
    * ``topics``, ``places``, ``people``, ``orgs``, ``exchanges``,
      ``companies`` - stripped, non-empty ``<D>`` values under the
      corresponding upper-case element.

    Missing elements leave the field as ``''`` or ``[]``.
    """
    try:
        root = ET.fromstring(clean_xml(reuters_xml))
    except ET.ParseError as e:
        print(f"Parse error: {e}")
        return None

    record = {
        'newid': root.get('NEWID', ''),
        'oldid': root.get('OLDID', ''),
        'topics_tag': root.get('TOPICS', ''),
        'lewissplit': root.get('LEWISSPLIT', ''),
        'cgisplit': root.get('CGISPLIT', ''),
        'date': '',
        'title': '',
        'dateline': '',
        'body': '',
    }

    # single-valued fields as (dict key, parent element, child tag); the
    # TEXT-based ones are only looked up when a TEXT element exists
    scalar_sources = [('date', root, 'DATE')]
    text_elem = root.find('TEXT')
    if text_elem is not None:
        scalar_sources.extend([
            ('title', text_elem, 'TITLE'),
            ('dateline', text_elem, 'DATELINE'),
            ('body', text_elem, 'BODY'),
        ])
    for key, parent, tag in scalar_sources:
        node = parent.find(tag)
        if node is not None and node.text:
            record[key] = node.text.strip()

    # multi-valued category fields, appended in the original key order
    category_tags = (
        ('topics', 'TOPICS'),
        ('places', 'PLACES'),
        ('people', 'PEOPLE'),
        ('orgs', 'ORGS'),
        ('exchanges', 'EXCHANGES'),
        ('companies', 'COMPANIES'),
    )
    for key, tag in category_tags:
        container = root.find(tag)
        entries = container.findall('D') if container is not None else []
        record[key] = [
            entry.text.strip()
            for entry in entries
            if entry.text and entry.text.strip()
        ]

    return record

# main execution
# Full run: parse every .sgm file in data_path, write the parsed documents to
# a JSON file, then print a sample document and summary statistics.
data_path = r"C:\Users\asus\Desktop\NewsIndexing\data"
output_file = "reuters_documents.json"

all_documents = []  # reset here but not populated by this cell
parsed_docs = []

# find all sgm files (sorted so files are processed in name order)
sgm_files = sorted(f for f in os.listdir(data_path) if f.endswith('.sgm'))
print(f"Found {len(sgm_files)} SGM files\n")

# read each file
for filename in sgm_files:
    filepath = os.path.join(data_path, filename)
    print(f"Processing {filename}...", end=" ")

    # errors='ignore' silently drops any byte sequence that is not valid UTF-8
    # NOTE(review): confirm the corpus encoding; if it is not UTF-8, decoding
    # with the real encoding would avoid losing characters.
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
        content = f.read()

    docs = parse_reuters_document(content)
    print(f"Found {len(docs)} documents")

    # parse each document; extract_fields returns None on a parse failure
    for doc_xml in docs:
        doc = extract_fields(doc_xml)
        if doc is not None:
            parsed_docs.append(doc)

print(f"\nSuccessfully parsed {len(parsed_docs)} documents")

# save to json file
print(f"\nSaving to {output_file}...")
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(parsed_docs, f, indent=2, ensure_ascii=False)

print(f"Done! Saved {len(parsed_docs)} documents to {output_file}")

# show sample (guarded: parsed_docs[0] raises IndexError when nothing parsed)
print("\nSample document:")
if parsed_docs:
    print(json.dumps(parsed_docs[0], indent=2))
else:
    print("(no documents were parsed)")

# show statistics
print("\nDataset Statistics:")
print("-" * 60)
total = len(parsed_docs)
docs_with_body = sum(1 for d in parsed_docs if d['body'])
docs_with_places = sum(1 for d in parsed_docs if d['places'])
docs_with_topics = sum(1 for d in parsed_docs if d['topics'])
docs_with_people = sum(1 for d in parsed_docs if d['people'])
docs_with_orgs = sum(1 for d in parsed_docs if d['orgs'])
docs_with_exchanges = sum(1 for d in parsed_docs if d['exchanges'])
docs_with_companies = sum(1 for d in parsed_docs if d['companies'])

print(f"Total documents: {total}")
for label, count in (
    ("body text", docs_with_body),
    ("places", docs_with_places),
    ("topics", docs_with_topics),
    ("people", docs_with_people),
    ("organizations", docs_with_orgs),
    ("exchanges", docs_with_exchanges),
    ("companies", docs_with_companies),
):
    # guard against ZeroDivisionError when no document could be parsed
    share = 100 * count / total if total else 0.0
    print(f"With {label}: {count} ({share:.1f}%)")

# show unique values
all_places = set()
all_topics = set()
for d in parsed_docs:
    all_places.update(d['places'])
    all_topics.update(d['topics'])

print(f"\nUnique places: {len(all_places)}")
print(f"Unique topics: {len(all_topics)}")
print(f"\nSample places: {sorted(all_places)[:10]}")
print(f"Sample topics: {sorted(all_topics)[:10]}")

Found 22 SGM files

Processing reut2-000.sgm... Found 1000 documents
Processing reut2-001.sgm... Found 1000 documents
Processing reut2-002.sgm... Found 1000 documents
Processing reut2-003.sgm... Found 1000 documents
Processing reut2-004.sgm... Found 1000 documents
Processing reut2-005.sgm... Found 1000 documents
Processing reut2-006.sgm... Found 1000 documents
Processing reut2-007.sgm... Found 1000 documents
Processing reut2-008.sgm... Found 1000 documents
Processing reut2-009.sgm... Found 1000 documents
Processing reut2-010.sgm... Found 1000 documents
Processing reut2-011.sgm... Found 1000 documents
Processing reut2-012.sgm... Found 1000 documents
Processing reut2-013.sgm... Found 1000 documents
Processing reut2-014.sgm... Found 1000 documents
Processing reut2-015.sgm... Found 1000 documents
Processing reut2-016.sgm... Found 1000 documents
Processing reut2-017.sgm... Found 1000 documents
Processing reut2-018.sgm... Found 1000 documents
Processing reut2-019.sgm... Found 1000 documents


: 