In [None]:
import pandas as pd
from lxml import etree
import os
import re

In [None]:
### parse

def _joined_paragraphs(element, tag):
    """Join the text of every <p> found directly under each direct <tag> child.

    Returns None when ``element`` has no direct ``<tag>`` child; otherwise the
    paragraph texts (inline child markup flattened via ``itertext``) joined
    with '; '. A ``<tag>`` without ``<p>`` children yields an empty string.
    """
    sections = element.findall('./' + tag)
    if not sections:
        return None
    return '; '.join(
        ''.join(p.itertext())
        for section in sections
        for p in section.findall('./p')
    )


def _access_terms(controlaccess, tag):
    """Collect texts and 'source' attributes of all <tag> descendants.

    Returns an empty dict when there is no such descendant; otherwise a dict
    with the keys '<tag>s' and '<tag>s_source', each a '; '-joined string.
    """
    nodes = controlaccess.findall('.//' + tag)
    if not nodes:
        return {}
    sources = [node.get('source') for node in nodes]
    return {
        # Nodes whose .text is None are skipped: joining None raises TypeError,
        # which previously caused the whole file to be dropped.
        tag + 's': '; '.join(node.text for node in nodes if node.text is not None),
        tag + 's_source': '; '.join(src for src in sources if src is not None),
    }


def parse_xml_to_df(xml_file):
    """Parse one EAD XML finding aid into a DataFrame.

    One candidate row is built per direct child of the XML root. Every row
    carries the file-level values (source filename, <eadid> text, the id taken
    from the eadid 'publicid' attribute, <titleproper> text) plus whatever
    abstract / langmaterial / scopecontent / controlaccess / bioghist /
    custodhist content is found under that child.

    Args:
        xml_file: Path of the XML file to parse.

    Returns:
        A pandas DataFrame of the rows that have more than four fields, or
        None when the file could not be parsed (the error is printed).
    """
    try:
        root = etree.parse(xml_file).getroot()

        # File-level values are the same for every row, so look them up once.
        # Note: basename keeps the '.xml' extension.
        header = {'source_filename': os.path.basename(xml_file)}

        eadid = root.find('.//eadid')
        if eadid is not None:
            # <eadid/> with no text gives text None; treat it as empty.
            header['ead_id'] = (eadid.text or '').strip()

            # Only read the attribute when <eadid> exists.
            publicid = eadid.get('publicid')
            if publicid is not None:
                result = re.search(r'::(.*)\.xml', publicid)
                if result:
                    # Keep the last '::'-separated segment before '.xml'.
                    header['public_id'] = result.group(1).split('::')[-1]

        titleproper = root.find('.//titleproper')
        if titleproper is not None:
            header['titleproper'] = titleproper.text

        data = []
        for element in root:
            element_data = dict(header)

            abstract = element.find('.//abstract')
            if abstract is not None:
                element_data['abstract'] = abstract.text

            language = element.find('.//langmaterial')
            if language is not None:
                element_data['language'] = ''.join(language.itertext())

            scopecontent = _joined_paragraphs(element, 'scopecontent')
            if scopecontent is not None:
                element_data['scopecontent'] = scopecontent

            # Only the first <controlaccess> descendant is inspected.
            controlaccess = element.find('.//controlaccess')
            if controlaccess is not None:
                for tag in ('subject', 'genreform', 'geogname',
                            'persname', 'corpname', 'famname'):
                    element_data.update(_access_terms(controlaccess, tag))

            for tag in ('bioghist', 'custodhist'):
                joined = _joined_paragraphs(element, tag)
                if joined is not None:
                    element_data[tag] = joined

            data.append(element_data)

        # Keep only rows with more than four fields in total; when all four
        # file-level fields are present this means at least one content field.
        df = pd.DataFrame([d for d in data if len(d) > 4])

    except Exception as e:
        # Best effort: report the problem and let the caller skip this file.
        print("Error parsing file:", xml_file, e)
        df = None

    return df


def parse_xml_folder_to_df(folder_path):
    """Parse every '.xml' file in a folder and stack the results.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A single DataFrame with the rows of all successfully parsed files,
        re-indexed from 0. Files for which ``parse_xml_to_df`` returns None
        are skipped. An empty DataFrame is returned when nothing was parsed.
    """
    dfs = []

    # Sort so the row order does not depend on the file system's listing order.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".xml"):
            continue
        df = parse_xml_to_df(os.path.join(folder_path, filename))
        # None signals a file that failed to parse.
        if df is not None:
            dfs.append(df)

    # pd.concat raises ValueError on an empty list.
    if not dfs:
        return pd.DataFrame()

    return pd.concat(dfs, ignore_index=True)

# CUSTOMIZE: replace the folder below with the location of your own XML finding aids.
df1 = parse_xml_folder_to_df(folder_path="RCRC_Finding_Aid_List_Bentley/Finding_Aids")
# Save the combined table, row index included, for inspection or reuse.
df1.to_csv(path_or_buf='parse_df1.csv', index=True)



## term matching

# Read the term list: one term or phrase per line.
with open('terms_all.txt', 'r') as f:
    # Skip blank lines: an empty term turns into the pattern r'\b\b', which
    # matches at any word boundary and would report a hit for nearly every field.
    terms = [line.strip() for line in f if line.strip()]


def match_terms(row, terms, columns):
    """Find whole-word, case-insensitive occurrences of terms in one record.

    Args:
        row: Record supporting ``row[col]`` and ``row.get(key, default)``
            (for example a DataFrame row or a dict).
        terms: Iterable of terms or phrases to look for.
        columns: Names of the fields of ``row`` to search.

    Returns:
        A list of dicts, one per (term, field, newline-separated paragraph)
        in which the term occurs, with the keys 'Term',
        'Occurrence (ead_ID)', 'Field', 'Collection' and 'Context'.
        'Context' is the first sentence of the paragraph containing the term,
        or the whole paragraph when no single sentence contains it.
    """
    # Split after '.' or '?' followed by whitespace, except after patterns
    # such as "e.g." or "Mr." that the look-behinds exclude.
    sentence_splitter = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s')

    results = []
    for term in terms:
        # An empty term would compile to r'\b\b' and match almost anything.
        if not term.strip():
            continue
        # Compile once per term instead of once per paragraph and sentence.
        term_pattern = re.compile(r'\b' + re.escape(term) + r'\b', re.IGNORECASE)

        for col in columns:
            value = row[col]
            # Only text can be searched; this skips NaN (float) as before and
            # also None or numeric values, which used to crash on .split().
            if not isinstance(value, str):
                continue

            for paragraph in value.split('\n'):
                if not term_pattern.search(paragraph):
                    continue
                sentences = sentence_splitter.split(paragraph)
                matched_sentence = next(
                    (sentence for sentence in sentences if term_pattern.search(sentence)),
                    paragraph,
                )
                results.append({
                    'Term': term,
                    # .get avoids a KeyError when the record has no ead_id.
                    'Occurrence (ead_ID)': row.get('ead_id', None),
                    'Field': col,
                    'Collection': row.get('titleproper', None),
                    'Context': matched_sentence,
                })
    return results


def match_and_visualize(df, name):
    """Match the global term list against every row of ``df`` and save the hits.

    Args:
        df: DataFrame whose rows are passed to ``match_terms``; all of its
            columns are searched.
        name: Label used in the printed message and in the output file name
            'matched_results_<name>.csv'.

    Returns:
        The matches as a DataFrame sorted by 'Term' (empty, but with the
        expected columns, when nothing matched).
    """
    # Naming the columns up front keeps 'Term' present when there are no
    # matches; otherwise sort_values would raise KeyError on the empty frame.
    result_columns = ['Term', 'Occurrence (ead_ID)', 'Field', 'Collection', 'Context']
    matches = [
        result
        for index, row in df.iterrows()
        for result in match_terms(row, terms, df.columns)
    ]
    results_df = pd.DataFrame(matches, columns=result_columns)

    # Sort results by 'Term'
    sorted_results_df = results_df.sort_values(by='Term', ascending=True)

    print("Matched results for ", name)

    # Export to CSV without the index
    sorted_results_df.to_csv('matched_results_' + name + '.csv', index=False)
    return sorted_results_df
  

# CUSTOMIZE: run the term matching; replace 'Bentley' with a label for your own data.
match_and_visualize(df=df1, name='Bentley')

In [None]:
# =================================== More granular variant: also capture the five words preceding and the five words following each matched term/phrase

In [4]:
import os
import re
import pandas as pd
from lxml import etree

def _joined_paragraphs(element, tag):
    """Join the text of every <p> found directly under each direct <tag> child.

    Returns None when ``element`` has no direct ``<tag>`` child; otherwise the
    paragraph texts (inline child markup flattened via ``itertext``) joined
    with '; '. A ``<tag>`` without ``<p>`` children yields an empty string.
    """
    sections = element.findall('./' + tag)
    if not sections:
        return None
    return '; '.join(
        ''.join(p.itertext())
        for section in sections
        for p in section.findall('./p')
    )


def _access_terms(controlaccess, tag):
    """Collect texts and 'source' attributes of all <tag> descendants.

    Returns an empty dict when there is no such descendant; otherwise a dict
    with the keys '<tag>s' and '<tag>s_source', each a '; '-joined string.
    """
    nodes = controlaccess.findall('.//' + tag)
    if not nodes:
        return {}
    sources = [node.get('source') for node in nodes]
    return {
        # Nodes whose .text is None are skipped: joining None raises TypeError,
        # which previously caused the whole file to be dropped.
        tag + 's': '; '.join(node.text for node in nodes if node.text is not None),
        tag + 's_source': '; '.join(src for src in sources if src is not None),
    }


def parse_xml_to_df(xml_file):
    """Parse one EAD XML finding aid into a DataFrame.

    One candidate row is built per direct child of the XML root. Every row
    carries the file-level values (source filename, <eadid> text, the id taken
    from the eadid 'publicid' attribute, <titleproper> text) plus whatever
    abstract / langmaterial / scopecontent / controlaccess / bioghist /
    custodhist content is found under that child.

    Args:
        xml_file: Path of the XML file to parse.

    Returns:
        A pandas DataFrame of the rows that have more than four fields, or
        None when the file could not be parsed (the error is printed).
    """
    try:
        root = etree.parse(xml_file).getroot()

        # File-level values are the same for every row, so look them up once.
        # Note: basename keeps the '.xml' extension.
        header = {'source_filename': os.path.basename(xml_file)}

        eadid = root.find('.//eadid')
        if eadid is not None:
            # <eadid/> with no text gives text None; treat it as empty.
            header['ead_id'] = (eadid.text or '').strip()

            # Only read the attribute when <eadid> exists.
            publicid = eadid.get('publicid')
            if publicid is not None:
                result = re.search(r'::(.*)\.xml', publicid)
                if result:
                    # Keep the last '::'-separated segment before '.xml'.
                    header['public_id'] = result.group(1).split('::')[-1]

        titleproper = root.find('.//titleproper')
        if titleproper is not None:
            header['titleproper'] = titleproper.text

        data = []
        for element in root:
            element_data = dict(header)

            abstract = element.find('.//abstract')
            if abstract is not None:
                element_data['abstract'] = abstract.text

            language = element.find('.//langmaterial')
            if language is not None:
                element_data['language'] = ''.join(language.itertext())

            scopecontent = _joined_paragraphs(element, 'scopecontent')
            if scopecontent is not None:
                element_data['scopecontent'] = scopecontent

            # Only the first <controlaccess> descendant is inspected.
            controlaccess = element.find('.//controlaccess')
            if controlaccess is not None:
                for tag in ('subject', 'genreform', 'geogname',
                            'persname', 'corpname', 'famname'):
                    element_data.update(_access_terms(controlaccess, tag))

            for tag in ('bioghist', 'custodhist'):
                joined = _joined_paragraphs(element, tag)
                if joined is not None:
                    element_data[tag] = joined

            data.append(element_data)

        # Keep only rows with more than four fields in total; when all four
        # file-level fields are present this means at least one content field.
        df = pd.DataFrame([d for d in data if len(d) > 4])

    except Exception as e:
        # Best effort: report the problem and let the caller skip this file.
        print("Error parsing file:", xml_file, e)
        df = None

    return df


def parse_xml_folder_to_df(folder_path):
    """Parse every '.xml' file in a folder and stack the results.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A single DataFrame with the rows of all successfully parsed files,
        re-indexed from 0. Files for which ``parse_xml_to_df`` returns None
        are skipped. An empty DataFrame is returned when nothing was parsed.
    """
    dfs = []

    # Sort so the row order does not depend on the file system's listing order.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".xml"):
            continue
        df = parse_xml_to_df(os.path.join(folder_path, filename))
        # None signals a file that failed to parse.
        if df is not None:
            dfs.append(df)

    # pd.concat raises ValueError on an empty list.
    if not dfs:
        return pd.DataFrame()

    return pd.concat(dfs, ignore_index=True)

# CUSTOMIZE: replace the folder below with the location of your own XML finding aids.
df1 = parse_xml_folder_to_df(folder_path="RCRC_Finding_Aid_List_Bentley/Finding_Aids")
# Save the combined table, row index included, for inspection or reuse.
df1.to_csv(path_or_buf='parse_df1.csv', index=True)



## term matching

# Read the term list: one term or phrase per line.
with open('terms_all.txt', 'r') as f:
    # Skip blank lines: an empty term turns into the pattern r'\b\b', which
    # matches at any word boundary and would report a hit for nearly every field.
    terms = [line.strip() for line in f if line.strip()]


def match_terms(row, terms, columns):
    """Find whole-word, case-insensitive occurrences of terms in one record.

    Args:
        row: Record supporting ``row[col]`` and ``row.get(key, default)``
            (for example a DataFrame row or a dict).
        terms: Iterable of terms or phrases to look for.
        columns: Names of the fields of ``row`` to search.

    Returns:
        A list of dicts, one per occurrence of a term inside the first
        sentence that contains it, for each (term, field, newline-separated
        paragraph). Keys: 'Term', 'Occurrence (ead_ID)', 'Field',
        'Collection', 'Context' (that sentence, or the whole paragraph when
        no single sentence contains the term) and 'Context (extended)' (up to
        five words before and after the occurrence).
    """
    # Split after '.' or '?' followed by whitespace, except after patterns
    # such as "e.g." or "Mr." that the look-behinds exclude.
    sentence_splitter = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s')

    results = []
    for term in terms:
        # An empty term would compile to r'\b\b' and match almost anything.
        if not term.strip():
            continue
        # Compile once per term; the same pattern also handles multi-word phrases.
        term_pattern = re.compile(r'\b' + re.escape(term) + r'\b', re.IGNORECASE)

        for col in columns:
            value = row[col]
            # Only text can be searched; this skips NaN (float) as before and
            # also None or numeric values, which used to crash on .split().
            if not isinstance(value, str):
                continue

            for paragraph in value.split('\n'):
                if not term_pattern.search(paragraph):
                    continue
                sentences = sentence_splitter.split(paragraph)
                matched_sentence = next(
                    (sentence for sentence in sentences if term_pattern.search(sentence)),
                    paragraph,
                )

                for match in term_pattern.finditer(matched_sentence):
                    # Last five words before and first five words after the match.
                    pre_context = matched_sentence[:match.start()].strip().split()[-5:]
                    post_context = matched_sentence[match.end():].strip().split()[:5]
                    # NOTE: the term as spelled in the term list is inserted
                    # here, not the matched text with its original casing.
                    context = ' '.join(pre_context + [term] + post_context)

                    results.append({
                        'Term': term,
                        # .get avoids a KeyError when the record has no ead_id.
                        'Occurrence (ead_ID)': row.get('ead_id', None),
                        'Field': col,
                        'Collection': row.get('titleproper', None),
                        'Context': matched_sentence,  # Original matched sentence
                        'Context (extended)': context  # Five words before and after
                    })
    return results


def match_and_visualize(df, name):
    """Match the global term list against every row of ``df`` and save the hits.

    Args:
        df: DataFrame whose rows are passed to ``match_terms``; all of its
            columns are searched.
        name: Label used in the printed message and in the output file name
            'matched_results_<name>.csv'.

    Returns:
        The matches as a DataFrame sorted by 'Term' (empty, but with the
        expected columns, when nothing matched).
    """
    # Naming the columns up front keeps 'Term' present when there are no
    # matches; otherwise sort_values would raise KeyError on the empty frame.
    result_columns = [
        'Term', 'Occurrence (ead_ID)', 'Field', 'Collection',
        'Context', 'Context (extended)',
    ]
    matches = [
        result
        for index, row in df.iterrows()
        for result in match_terms(row, terms, df.columns)
    ]
    results_df = pd.DataFrame(matches, columns=result_columns)

    # Sort results by 'Term'
    sorted_results_df = results_df.sort_values(by='Term', ascending=True)

    print("Matched results for ", name)

    # Export to CSV without the index
    sorted_results_df.to_csv('matched_results_' + name + '.csv', index=False)
    return sorted_results_df
  

# CUSTOMIZE: run the term matching; replace 'Bentley' with a label for your own data.
matched_df = match_and_visualize(df=df1, name='Bentley')

# Show the sorted matches as the cell output.
matched_df


Matched results for  Bentley


Unnamed: 0,Term,Occurrence (ead_ID),Field,Collection,Context,Context (extended)
138,Benevolent Assimilation,umich-bhl-86354,bioghist,Finding Aid for Dean C. Worcester Papers,"McKinley asked Worcester to join a ""civilian c...",of the U.S. program of Benevolent Assimilation...
162,Colonial,umich-bhl-8772,bioghist,Finding aid for Luce Philippine Project interv...,In 1977 the University of Michigan Center for ...,"(former diplomats, teachers, missionaries, ser..."
99,Colonial,umich-bhl-85419,scopecontent,"Finding aid for Owen A. Tomlinson papers, 1899...",Within the Photograph series will be found six...,"U.S. Army personnel, and other Colonial offici..."
69,Colonial,umich-bhl-851733,bioghist,Finding Aid for Harry Burns Hutchins papers,Mary Hutchins was a member of many organizatio...,"flag, the Michigan Chapter of Colonial Dames, ..."
72,Colonial,umich-bhl-851764,abstract,"Finding aid for George A. Malcolm papers, 1896...","Correspondence, scrapbooks, printed reports, a...",Colonialism: memoirs of an American Colonial C...
...,...,...,...,...,...,...
159,Types,umich-bhl-87265.0,bioghist,Finding aid for News and Information Services ...,News Service has continued to expand its media...,continued to expand its media Types and now in...
46,Types,umich-bhl-2014136,bioghist,Finding aid for University Herbarium (Universi...,The U-M Herbarium is also a leader in digitizi...,1977 database of seed plant Types (Reznicek).
52,Types,umich-bhl-851285,scopecontent,Finding Aid for Thomas Francis Papers,Types of records in these unprocessed subserie...,Types of records in these unprocessed
82,Types,umich-bhl-85193,scopecontent,Finding Aid for Philip A. Hart Papers,Hart himself and his staff had discarded certa...,his staff had discarded certain Types of files...
