In [None]:
import xml.etree.ElementTree as ET
from collections import defaultdict
import json

def _collect_feats(element):
    """Return the ``att`` -> ``val`` pairs of *element*'s direct ``feat`` children.

    ``feat`` elements without an ``att`` attribute are ignored, and a missing
    ``val`` attribute is stored as an empty string, so every key and value in
    the result is a string. If an ``att`` name repeats, the last one wins.
    """
    feats = {}
    for feat in element.findall('feat'):
        att = feat.get('att')
        if att is None:
            # A feat without a name carries no usable key; ignore it.
            continue
        # Normalise a missing val to '' so callers never see None.
        feats[att] = feat.get('val') or ''
    return feats


def parse_lexical_resource(xml_file):
    """
    Parse XML lexical resource and create a lookup dictionary.

    Every ``LexicalEntry`` element that has a ``Lemma/FormRepresentation``
    descendant contributes one record for each of its ``WordForm``
    descendants that carries a non-empty ``writtenForm`` feature. Entries
    without a lemma representation are skipped entirely.

    Args:
        xml_file: File name or file object, passed to ``ET.parse``.

    Returns a dictionary where:
    - Keys are word forms (writtenForm)
    - Values are lists of dictionaries with the keys ``lemma``, ``lemgram``,
      ``partOfSpeech``, ``paradigm`` (taken from the lemma) and ``msd``
      (taken from the word form). A feature that is absent, or present
      without a value, is reported as an empty string.
    """
    root = ET.parse(xml_file).getroot()

    # Dictionary to store the lexicalization lookup
    lexicon_lookup = defaultdict(list)

    for entry in root.findall('.//LexicalEntry'):
        lemma_elem = entry.find('.//Lemma/FormRepresentation')
        if lemma_elem is None:
            continue

        # The lemma-level fields are shared by every word form of the entry,
        # so they are computed once per entry.
        lemma_info = _collect_feats(lemma_elem)
        lemma_fields = {
            'lemma': lemma_info.get('writtenForm', ''),
            'lemgram': lemma_info.get('lemgram', ''),
            'partOfSpeech': lemma_info.get('partOfSpeech', ''),
            'paradigm': lemma_info.get('paradigm', ''),
        }

        for wordform in entry.findall('.//WordForm'):
            form_info = _collect_feats(wordform)
            written_form = form_info.get('writtenForm', '')
            if not written_form:
                # Without a written form there is no key to index under.
                continue
            # Each record is a fresh dict so callers can mutate one safely.
            lexicon_lookup[written_form].append(
                {**lemma_fields, 'msd': form_info.get('msd', '')}
            )

    # Convert defaultdict to regular dict for cleaner output
    return dict(lexicon_lookup)


def save_lookup_dict(lookup_dict, output_file='lexicon_lookup.json'):
    """
    Write the lookup dictionary to ``output_file`` as JSON.

    The file is opened for writing as UTF-8. The JSON is indented by two
    spaces and non-ASCII characters are written as-is rather than escaped.
    A confirmation line is printed once the file has been written.
    """
    with open(output_file, 'w', encoding='utf-8') as json_handle:
        json.dump(
            lookup_dict,
            json_handle,
            indent=2,
            ensure_ascii=False,
        )

    print(f"Lookup dictionary saved to {output_file}")


def demonstrate_lookup(lookup_dict):
    """
    Print the full contents of the lookup dictionary.

    Word forms are listed in sorted order. Under each word form, its entries
    are numbered from 1 and shown with lemma, part of speech, MSD, lemgram
    and paradigm, followed by a blank line.
    """
    print("\n=== Lexicalization Lookup Dictionary ===\n")

    # Keys of a dict are unique, so sorting the keys gives the same order
    # as sorting the (key, value) pairs.
    for word_form in sorted(lookup_dict):
        print(f"Word form: '{word_form}'")

        for number, record in enumerate(lookup_dict[word_form], start=1):
            print(f"  Entry {number}:")
            print(f"    Lemma: {record['lemma']}")
            print(f"    Part of Speech: {record['partOfSpeech']}")
            print(f"    MSD: {record['msd']}")
            print(f"    Lemgram: {record['lemgram']}")
            print(f"    Paradigm: {record['paradigm']}")

        print()


if __name__ == "__main__":
    # Script entry point: parse the resource, show it, save it, then do one
    # sample lookup.
    xml_file = 'lexical_resource.xml'

    print("Parsing XML lexical resource...")
    lookup_dict = parse_lexical_resource(xml_file)
    print(f"Created lookup dictionary with {len(lookup_dict)} word forms\n")

    # Print every word form with its entries
    demonstrate_lookup(lookup_dict)

    # Write the dictionary to the default JSON file
    save_lookup_dict(lookup_dict)

    # Look up a single word form and report its lemma(s)
    print("\n=== Example Lookup ===")
    test_word = "dväljs"
    if test_word not in lookup_dict:
        print(f"Word '{test_word}' not found in dictionary")
    else:
        print(f"Looking up '{test_word}':")
        for entry in lookup_dict[test_word]:
            print(f"  → Lemma: {entry['lemma']} ({entry['msd']})")