# Read GlobalGiving-style projects XML and preview a few records

This notebook cell will parse the large XML file at `Input/projects.xml` efficiently using a streaming parser, so it won't load the whole file into memory. It will:

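- Locate `projects.xml` by checking a list of candidate locations (paths relative to the current working directory, plus one absolute fallback path)
- Use `project` as the record tag, falling back to a guessed repeating tag if no `project` records are found
- Flatten nested XML into a tabular dict structure
- Parse the first 5 records as a quick sanity check, then load all records into a pandas DataFrame (`df_all`)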

The full DataFrame can later be written to CSV/Parquet if needed.

In [8]:
from pathlib import Path
from xml.etree.ElementTree import iterparse
from typing import Dict, Iterator, Optional
import pandas as pd
from IPython.display import display


def _strip_ns(tag: str) -> str:
    # Remove XML namespace like '{ns}tag' -> 'tag'
    if tag and tag[0] == '{':
        return tag.split('}', 1)[1]
    return tag


def _find_existing_path() -> Path:
    candidates = [
        Path("../projects.xml"),
        Path("/home/thiesen/Documents/AI-Innoscence_Ecosystem/Input/projects.xml"),
        Path("Input/projects.xml"),
    ]
    for p in candidates:
        if p.exists():
            return p.resolve()
    raise FileNotFoundError(
        "Couldn't locate projects.xml. Checked: " + ", ".join(map(str, candidates))
    )


def _flatten_element(elem, parent_key: Optional[str] = None) -> Dict[str, Optional[str]]:
    """Flatten an XML element and its descendants into a single-level dict.

    Key naming:

    * Attributes are stored as ``"<parent_key>@<attr>"`` (or ``"@<attr>"``
      when ``parent_key`` is empty/None).
    * Non-empty stripped text is stored under ``parent_key`` when one is
      given. Without a ``parent_key`` it is stored under ``"<tag>_text"``
      if the element has children, or ``"<tag>"`` if it has none.
    * Children are flattened recursively using a dotted path
      (``"<parent_key>.<child_tag>"``, or just ``"<child_tag>"`` at the top).
    * When a child produces a key that already exists, the new value is
      stored under ``"<key>__2"``, ``"<key>__3"``, ... (first free suffix).

    Tail text (``elem.tail``) is not read.

    Args:
        elem: The ElementTree element to flatten.
        parent_key: Key prefix accumulated from the ancestors, or None for
            the record root.

    Returns:
        A dict mapping flattened keys to string values.
    """
    flat: Dict[str, Optional[str]] = {}
    own_tag = _strip_ns(elem.tag)

    # Attributes: prefix is empty at the record root.
    prefix = parent_key if parent_key else ""
    for attr_name, attr_value in elem.attrib.items():
        flat[f"{prefix}@{attr_name}"] = attr_value

    # Own text: only recorded when non-empty after stripping whitespace.
    stripped = (elem.text or "").strip()
    if stripped:
        if parent_key:
            text_key = parent_key
        elif len(elem):
            text_key = f"{own_tag}_text"
        else:
            text_key = own_tag
        flat[text_key] = stripped

    # Children: merge each child's flattened dict, de-duplicating keys.
    for child in elem:
        local = _strip_ns(child.tag)
        nested_key = f"{parent_key}.{local}" if parent_key else local
        for key, value in _flatten_element(child, nested_key).items():
            # NOTE: the suffix is chosen independently per key, so fields of the
            # n-th repeated child only share a suffix if every earlier
            # repetition also contained that field.
            target = key
            suffix = 1
            while target in flat:
                suffix += 1
                target = f"{key}__{suffix}"
            flat[target] = value
    return flat


def _guess_record_tag(xml_path: Path, sample_events: int = 50000) -> Optional[str]:
    """Guess the repeating record tag by sampling the start of an XML file.

    Streams up to ``sample_events`` element-end events, counting how often
    each (namespace-stripped) tag closes and remembering which tags were seen
    with at least one child element. The guess is the most frequent tag among
    those that had children; ties go to the tag that was seen first.

    Args:
        xml_path: Path of the XML file to sample.
        sample_events: Maximum number of end events to count.

    Returns:
        The guessed tag name, or None if no sampled element had children.
    """
    counts: Dict[str, int] = {}
    tags_with_children = set()

    # Open the file ourselves so the handle is closed deterministically even
    # though we stop reading long before the end of the document.
    with open(xml_path, "rb") as source:
        for i, (_, elem) in enumerate(iterparse(source, events=("end",))):
            # Check before counting so exactly `sample_events` events are sampled.
            if i >= sample_events:
                break
            tag = _strip_ns(elem.tag)
            counts[tag] = counts.get(tag, 0) + 1
            if len(elem) > 0:
                tags_with_children.add(tag)
            # Safe here: this element's own end event has already been inspected.
            elem.clear()

    candidates = [(t, c) for t, c in counts.items() if t in tags_with_children]
    if not candidates:
        return None
    # max() returns the first maximal item, matching a stable descending sort.
    return max(candidates, key=lambda item: item[1])[0]


def stream_records(
    xml_path: Path, record_tag: Optional[str] = None, limit: Optional[int] = None
) -> Iterator[Dict[str, Optional[str]]]:
    """Lazily yield flattened records from an XML file.

    The file is parsed incrementally; each element whose namespace-stripped
    tag equals ``record_tag`` is flattened with ``_flatten_element`` and
    yielded, then cleared to release its children.

    Args:
        xml_path: Path of the XML file to read.
        record_tag: Tag name of one record. If None, the tag is guessed with
            ``_guess_record_tag``; if that also yields None, ``"project"``
            is used.
        limit: Maximum number of records to yield; None means no limit.
            A limit of zero or less yields nothing.

    Yields:
        One flat dict per matching element, in document order.
    """
    # Honour non-positive limits up front instead of yielding one record first.
    if limit is not None and limit <= 0:
        return

    if record_tag is None:
        record_tag = _guess_record_tag(xml_path)
    if record_tag is None:
        record_tag = "project"  # sensible default for GlobalGiving datasets

    count = 0
    # Own the file handle so it is closed when the generator finishes, is
    # closed by the consumer, or stops early because of `limit`.
    with open(xml_path, "rb") as source:
        for _, elem in iterparse(source, events=("end",)):
            if _strip_ns(elem.tag) != record_tag:
                # IMPORTANT: do not clear non-matching elements here; that
                # erases children before the parent record is processed.
                continue
            yield _flatten_element(elem, None)
            # Drop the record's children; the emptied element itself stays
            # attached to its parent.
            elem.clear()
            count += 1
            if limit is not None and count >= limit:
                break


# Locate the file
xml_path = _find_existing_path()
print(f"Using XML file: {xml_path}")

# Parse a handful of records first as a cheap sanity check before the full load.
preview_limit = 5
# Try the expected GlobalGiving 'project' records first; if none, fall back to a guessed tag.
record_tag = "project"
records = list(stream_records(xml_path, record_tag=record_tag, limit=preview_limit))
if not records:
    record_tag = _guess_record_tag(xml_path)
    if record_tag is not None:
        records = list(stream_records(xml_path, record_tag=record_tag, limit=preview_limit))

if not records:
    raise RuntimeError(
        "No records found in the XML. If unexpected, set record_tag explicitly when calling stream_records()."
    )

# Load every record with the same tag that produced the sample above, so a
# fallback to the guessed tag is not silently ignored by the full load.
df_all = pd.DataFrame(
    stream_records(xml_path, record_tag=record_tag)
)
print(f"Loaded all records: {len(df_all)} rows, {len(df_all.columns)} columns")

Using XML file: /home/thiesen/Documents/AI-Innoscence_Ecosystem/Input/projects.xml
Loaded all records: 49646 rows, 699 columns
Loaded all records: 49646 rows, 699 columns


In [10]:
# Number of projects whose contact country is exactly "Serbia" (vectorized count).
print((df_all['contactCountry'] == "Serbia").sum())

47


In [None]:
# Number of projects whose contact country is exactly "Moldova" (vectorized count).
print((df_all['contactCountry'] == "Moldova").sum())

0


In [13]:
# Number of projects whose contact country is exactly "Germany" (vectorized count).
print((df_all['contactCountry'] == "Germany").sum())

256


In [14]:
# Number of projects whose contact city is exactly "Novi Sad" (vectorized count).
print((df_all['contactCity'] == "Novi Sad").sum())

3


In [15]:
# Keep only the rows whose contact city is exactly "Novi Sad" and show the first few.
df_novi_sad = df_all.loc[df_all['contactCity'].eq("Novi Sad")]
df_novi_sad.head()

Unnamed: 0,active,activities,additionalDocumentation,approvedDate,contactAddress,contactAddress2,contactCity,contactCountry,contactName,contactPostal,...,countries.country.iso3166CountryCode__14,countries.country.name__14,countries.country.iso3166CountryCode__15,countries.country.name__15,countries.country.iso3166CountryCode__16,countries.country.name__16,donationOptions.donationOption.amount__29,donationOptions.donationOption.description__29,donationOptions.donationOption.amount__30,donationOptions.donationOption.description__30
23617,False,"The Igman Initiative will provide training, to...",https://www.globalgiving.org/pfil/35009/projdo...,2018-09-10T09:00:00-04:00,Laze Teleckog 6/1,,Novi Sad,Serbia,Aleksandra Popov,21000,...,,,,,,,,,,
26493,False,The image of refugees has to be transformed; i...,,2019-03-11T09:00:00-04:00,Staroiriski Put 28,Sremska Kamenica,Novi Sad,Serbia,N'Deane Helajzen,21208,...,,,,,,,,,,
29980,False,The image of refugees has to be transformed; i...,https://www.globalgiving.org/pfil/44013/projdo...,2019-12-02T23:59:31-05:00,Staroiriski Put 28,Sremska Kamenica,Novi Sad,Serbia,N'Deane Helajzen,21208,...,,,,,,,,,,
