In [1]:
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from xml.etree import ElementTree as ET

def extract_text_from_xml(filename):
    with open(filename, 'r', encoding='UTF-8') as f:
        tree = ET.parse(f)
    root = tree.getroot()

    # Traverse the XML tree to get all the text
    texts = [elem.text for elem in root.iter() if elem.text and elem.text.strip() != '']
    return ' '.join(texts)

def contains_numbers(word):
    return bool(re.search(r'\d', word))

def extract_important_words_from_folder(folder_path, top_n=15):
    # Load all XML files from the folder
    xml_files = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.xml')]
    
    documents = [extract_text_from_xml(xml_file) for xml_file in xml_files]
    
    # Compute TF-IDF scores
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

    feature_names = tfidf_vectorizer.get_feature_names_out()
    
    important_words = {}
    for doc_idx, doc in enumerate(xml_files):
        # Get feature indices sorted by TF-IDF score
        sorted_feature_idxs = tfidf_matrix[doc_idx].tocoo().col.argsort()[::-1]
        
        # Get top-n words for each document, excluding words with numbers
        words = [feature_names[i] for i in sorted_feature_idxs if not contains_numbers(feature_names[i])]
        important_words[os.path.basename(doc)] = words[:top_n]

    return important_words

folder_path = 'x-oli-workbook_page'
important_words_dict = extract_important_words_from_folder(folder_path)

# Print the results
for file_name, words in important_words_dict.items():
    print(f"{file_name}: {', '.join(words)}")

a02cc691384d43edbaf707f306841c3b.xml: addresses, achieved, accordingly, accurately, accounts, action, adapt, annotators, american, activities, alignment, ages, able, analyzed, actual
a0abc3fa0e6b44aaaf5e70e36602b61e.xml: 
a0d6e696158d44cb993ac9175f224c55.xml: 
a15a0946f8364c0a81c2d538954588ec.xml: 
a28b204fb8954fceb52a3e846fdd5698.xml: 
a2aba8cb20d84feb92654647946ecbb8.xml: academy, abilities, ability, abide, academic, acceptable, abcdef, absolute, acceptance, absolved, accept, able, accepted, academia
a2c9cadf120741b1b0709f4ea57eea38.xml: advisable, accounts, address, addressed, add, affects, advanced, agglomerating, advocated, alignment, abide, actual, able, active, accordingly
a3e60ec52fd1485daf95a85dac31bbff.xml: abcdef, access, academy, abilities, absolute, accept, able, acceptable, accepted, accessed, ability, academic, academia, abide, acceptance
a4167b26abaf41c999d256ed199c8127.xml: 
a47d2cb6edc24ca7a5bb1b42b9021790.xml: 
a8503d398c5545a28640f6fda9980856.xml: 
a9f596c78e124ac4a