In [13]:
from bs4 import BeautifulSoup
import pandas as pd
import os

In [None]:
# Get the list of all XML files in the current directory.
# os.listdir() returns entries in arbitrary order, so the names are sorted to make the
# resulting DataFrame row order reproducible; os.path.isfile() skips any directory whose
# name happens to end in '.xml', which would otherwise fail when opened for reading.
files = sorted(f for f in os.listdir('.') if f.endswith('.xml') and os.path.isfile(f))

In [75]:
# Iterate over the list of XML files and collect one record per article: title, author,
# published date, updated date, word count, publication name, body text, and indexing terms
# (only the className text and the item score). The records are then assembled into a
# pandas DataFrame with one row per file.

# Column order of the resulting DataFrame. Passing it explicitly keeps the columns present
# even when no XML files were found, so later cells can still index df['indexing_terms'].
ARTICLE_COLUMNS = ['title', 'author', 'published', 'updated', 'word_count',
                   'publication_name', 'body_text', 'indexing_terms']


def _text_or_none(parent, tag_name):
    """Return the text of the first ``tag_name`` element found under ``parent``.

    Parameters:
        parent: a BeautifulSoup object or tag to search in; may be None.
        tag_name: name of the element to look for.

    Returns:
        The element's text, or None when ``parent`` is None or the element is absent.
        (``find`` returns None for a missing element, so calling ``.text`` on the result
        directly would raise AttributeError.)
    """
    if parent is None:
        return None
    tag = parent.find(tag_name)
    return tag.text if tag is not None else None


data = []
for item in files:
    # Load the XML file
    with open(item, 'r', encoding='utf-8') as file:
        xml_data = file.read()

    # parse the XML file using BeautifulSoup
    soup = BeautifulSoup(xml_data, 'lxml-xml')

    # A missing element yields None for that field instead of aborting the whole run;
    # df.isna().sum() afterwards shows how many articles lacked each field.
    title = _text_or_none(soup, 'title')

    # get the author name from the <author><name> tag. The lookup is scoped to <author> so an
    # unrelated <name> element earlier in the document is not picked up; when there is no
    # <author> element at all, fall back to the first <name> in the document.
    author_tag = soup.find('author')
    author = _text_or_none(author_tag if author_tag is not None else soup, 'name')

    # get the published date from the <published> tag (kept as the raw text, not parsed into a datetime)
    published = _text_or_none(soup, 'published')
    # get the updated date from the <updated> tag (also kept as raw text)
    updated = _text_or_none(soup, 'updated')

    # get the word count from the 'number' attribute of the <wordCount> tag (kept as a string)
    word_count_tag = soup.find('wordCount')
    word_count = word_count_tag.get('number') if word_count_tag is not None else None

    # get the publication name from the <publicationName> tag
    publication_name = _text_or_none(soup, 'publicationName')

    # get the body text from the <bodyText> tag and remove extra spaces and line breaks
    raw_body_text = _text_or_none(soup, 'bodyText')
    body_text = ' '.join(raw_body_text.split()) if raw_body_text is not None else None

    # for any <classificationGroup classificationScheme="indexing-terms"> get the className text
    # and its parent classificationItem score attribute
    indexing_terms = []
    for classification_group in soup.find_all('classificationGroup', classificationScheme='indexing-terms'):
        for classification_item in classification_group.find_all('classificationItem'):
            class_name = _text_or_none(classification_item, 'className')
            if class_name is None:
                # items without a <className> carry no usable term, so they are skipped
                print("A classificationItem without a className was found and skipped.")
                continue
            # the score stays the raw attribute string (None when the attribute is absent)
            item_score = classification_item.get('score')
            indexing_terms.append({'className': class_name, 'itemScore': item_score})

    # append the data to the data list
    data.append({'title': title, 'author': author, 'published': published, 'updated': updated, 'word_count': word_count, 'publication_name': publication_name, 'body_text': body_text, 'indexing_terms': indexing_terms})

# create a pandas DataFrame from the data list. The indexing_terms column is NOT flattened:
# each cell holds that article's list of {'className', 'itemScore'} dictionaries.
df = pd.DataFrame(data, columns=ARTICLE_COLUMNS)

A classificationItem without a className was found and skipped.
A classificationItem without a className was found and skipped.
A classificationItem without a className was found and skipped.
A classificationItem without a className was found and skipped.
A classificationItem without a className was found and skipped.
A classificationItem without a className was found and skipped.
A classificationItem without a className was found and skipped.
A classificationItem without a className was found and skipped.
A classificationItem without a className was found and skipped.
A classificationItem without a className was found and skipped.
A classificationItem without a className was found and skipped.


In [76]:
# Display the assembled DataFrame as the cell output (the rendered table elides the middle rows).
df

Unnamed: 0,title,author,published,updated,word_count,publication_name,body_text,indexing_terms
0,US Congress Bans Use Of Microsoft AI Copilot -...,LexisNexis,2024-04-01T00:00:00Z,2024-06-17T19:57:14Z,513,silicon.co.uk,Risk of leaks. US House of Representatives imp...,"[{'className': 'MICROSOFT CORP', 'itemScore': ..."
1,Calltower Launches Microsoft Copilot,LexisNexis,2024-02-13T00:00:00Z,2024-06-17T19:55:20Z,410,MENAFN - Press Releases (English),Link to Story CallTower offers Copilot for cus...,"[{'className': 'MICROSOFT CORP', 'itemScore': ..."
2,Lighthouse Launches New Gen AI Assessment for ...,LexisNexis,2024-02-14T00:00:00Z,2024-06-17T19:54:07Z,378,Entertainment Close-Up,"Lighthouse, a provider of technology-enabled e...","[{'className': 'MICROSOFT CORP', 'itemScore': ..."
3,"ThinkSmart's Webinar Empowers Users with AI, M...",LexisNexis,2024-03-22T00:00:00Z,2024-06-17T19:52:45Z,325,bizbahrain,"ThinkSmart for Development and Training, a lea...","[{'className': 'MICROSOFT CORP', 'itemScore': ..."
4,Microsoft Unveils 'AI-ready' PCs,LexisNexis,2024-05-21T00:00:00Z,2024-06-17T19:57:54Z,651,Newstex Blogs International Business Times Aus...,The views expressed in any and all content dis...,"[{'className': 'MICROSOFT CORP', 'itemScore': ..."
...,...,...,...,...,...,...,...,...
120,Accelerates Secure Adoption of Microsoft Copil...,LexisNexis,2024-01-23T00:00:00Z,2024-06-17T19:53:21Z,713,Market News Publishing,"VARONIS SYSTEMS INC (""VRNS-Q"") MICROSOFT CORP ...","[{'className': 'VARONIS SYSTEMS INC', 'itemSco..."
121,Microsoft Announces AI-Driven Microsoft Copilo...,LexisNexis,2023-09-27T00:00:00Z,2024-06-17T19:54:05Z,559,Business World,The AI-companion is designed to empower people...,"[{'className': 'MICROSOFT CORP', 'itemScore': ..."
122,US Congress bans staff from using Microsoft Co...,LexisNexis,2024-04-01T00:00:00Z,2024-06-17T19:57:09Z,375,TechRadar (UK),The US House of Representatives has reportedly...,"[{'className': 'MICROSOFT CORP', 'itemScore': ..."
123,How to buy Microsoft Copilot Pro subscription,LexisNexis,2024-01-17T00:00:00Z,2024-06-17T19:56:22Z,303,The Times of India (TOI),Microsoft has unveiled a new subscription mode...,"[{'className': 'MICROSOFT CORP', 'itemScore': ..."


In [77]:
# Each cell of indexing_terms holds a list of dictionaries, one per indexing term.
# Count the entries per article, then take the largest count across all articles.
terms_per_article = df['indexing_terms'].map(len)
max_indexing_terms = terms_per_article.max()
max_indexing_terms

58

In [78]:
# For every article, list the className of each indexing term whose itemScore equals '90'.
# The comparison is against the string '90' because itemScore holds the raw attribute text.
df['indexing_terms'].map(
    lambda terms: [term['className'] for term in terms if term['itemScore'] == '90']
)

0      [MICROSOFT CORP, SIC7372 PREPACKAGED SOFTWARE,...
1      [MICROSOFT CORP, SIC7372 PREPACKAGED SOFTWARE,...
2      [MICROSOFT CORP, SIC7372 PREPACKAGED SOFTWARE,...
3      [MICROSOFT CORP, SIC7372 PREPACKAGED SOFTWARE,...
4      [ARTIFICIAL INTELLIGENCE, TECHNOLOGY, ARTIFICI...
                             ...                        
120                          [ONLINE SECURITY & PRIVACY]
121    [ARTIFICIAL INTELLIGENCE, ARTIFICIAL INTELLIGE...
122    [MICROSOFT CORP, SIC7372 PREPACKAGED SOFTWARE,...
123    [MICROSOFT CORP, SIC7372 PREPACKAGED SOFTWARE,...
124    [MICROSOFT CORP, SIC7372 PREPACKAGED SOFTWARE,...
Name: indexing_terms, Length: 125, dtype: object