In [1]:
# for parsing XML files from Nexis Uni bulk download API
from bs4 import BeautifulSoup
import pandas as pd
import os

In [2]:
# get the list of all XML files in the current directory
files = [f for f in os.listdir('.') if f.endswith('.xml')]

In [3]:
# iterate over the list of XML files and get the title, author, published date, updated date, word count, publication name, body text, and indexing terms (extract only the className and the item score). Store the data in a pandas DataFrame

data = []
for item in files:
    # Load the XML file
    with open(item, 'r', encoding='utf-8') as file:
        xml_data = file.read()
    
    # parse the XML file using BeautifulSoup
    soup = BeautifulSoup(xml_data, 'lxml-xml')
    
    # use soup.find to get the elements of the XML file
    title = soup.find('title').text
    # get the author name from the <author><name> tag
    author = soup.find('name').text
    # get the published date from the <published> tag. Stored as a datetime object
    published = soup.find('published').text
    # get the updated date from the <updated> tag
    updated = soup.find('updated').text
    # get the word count from the <wordCount> tag
    word_count = soup.find('wordCount')['number']
    # get the publication name from the <publicationName> tag
    publication_name = soup.find('publicationName').text
    # get the body text from the <bodyText> tag and remove extra spaces and line breaks
    body_text = ' '.join(soup.find('bodyText').get_text().split())
    
    # for any <classificationGroup classificationScheme="indexing-terms"> get the className text and its parent classificationItem score attribute
    indexing_terms = []
    for classification_group in soup.find_all('classificationGroup', classificationScheme='indexing-terms'):
        for classification_item in classification_group.find_all('classificationItem'):
            try:
                class_name = classification_item.find('className').text
                item_score = classification_item.get('score')
                indexing_terms.append({'className': class_name, 'itemScore': item_score})
            except AttributeError:
                print(f"A classificationItem in {item} without a className was found and skipped.")
    
    # append the data to the data list
    data.append({'title': title, 'author': author, 'published': published, 'updated': updated, 'word_count': word_count, 'publication_name': publication_name, 'body_text': body_text, 'indexing_terms': indexing_terms})

# create a pandas DataFrame from the data list. Each row in the DataFrame represents an article
df = pd.DataFrame(data)

A classificationItem in urn:contentItem:6BSC-PBR1-JCMC-W01G-00000-00.xml without a className was found and skipped.
A classificationItem in urn:contentItem:6BW4-SHK1-JBCM-F00M-00000-00.xml without a className was found and skipped.
A classificationItem in urn:contentItem:6C17-2CY1-DXY4-X07D-00000-00.xml without a className was found and skipped.
A classificationItem in urn:contentItem:6C81-BJ41-F03R-N2S1-00000-00.xml without a className was found and skipped.
A classificationItem in urn:contentItem:6C2H-7M91-JCMC-W4SX-00000-00.xml without a className was found and skipped.


In [4]:
df

Unnamed: 0,title,author,published,updated,word_count,publication_name,body_text,indexing_terms
0,Biden losing support among Black voters in swi...,LexisNexis,2024-04-11T00:00:00Z,2024-06-18T19:51:16Z,317,TheHill.com,President Biden’s support amount Black voters ...,"[{'className': 'WALL STREET JOURNAL', 'itemSco..."
1,Donald Trump Leads Joe Biden in Every Battlegr...,LexisNexis,2024-04-27T00:00:00Z,2024-06-18T19:50:31Z,934,Newsweek.com,Polling averages show an uphill battle for the...,"[{'className': 'Constitutional Law', 'itemScor..."
2,Trump Holds on to Lead in Post-Guilty Verdict ...,LexisNexis,2024-06-07T00:00:00Z,2024-06-18T19:47:21Z,363,Newstex Blogs Mediaite,The views expressed in any and all content dis...,"[{'className': 'FOX ENTERTAINMENT GROUP INC', ..."
3,The aspect that benefits Donald Trump over Joe...,LexisNexis,2024-06-13T00:00:00Z,2024-06-18T19:52:33Z,514,CE Noticias Financieras English,The U.S. presidential election will paralyze t...,"[{'className': 'CAMPAIGNS & ELECTIONS', 'itemS..."
4,Donald Trump Stung by Double Polling Blow,LexisNexis,2024-05-03T00:00:00Z,2024-06-18T19:49:35Z,464,Newsweek.com,Two polls have suggested the former president ...,"[{'className': 'FLORIDA ATLANTIC UNIVERSITY', ..."
...,...,...,...,...,...,...,...,...
95,Biden supporters mostly back him in 2024 elect...,LexisNexis,2024-06-10T00:00:00Z,2024-06-18T19:52:08Z,415,The Guardian (London),Poll comes as the Biden campaign has ramped up...,"[{'className': 'POLLS & SURVEYS', 'itemScore':..."
96,Biden leads Trump in polls for first time sinc...,LexisNexis,2024-05-06T00:00:00Z,2024-06-18T19:47:38Z,212,TheHill.com,President Biden is leading former President Tr...,"[{'className': 'ELECTIONS', 'itemScore': '79'}..."
97,CNN polling guru: Biden ‘loved’ surveys four y...,LexisNexis,2024-05-15T00:00:00Z,2024-06-18T19:47:46Z,497,TheHill.com,CNN polling expert Harry Enten signaled Tuesda...,"[{'className': 'VOTERS & VOTING', 'itemScore':..."
98,Fox News' Jessica Tarlov Gets Jesse Watters to...,LexisNexis,2024-04-05T00:00:00Z,2024-06-18T19:52:45Z,315,Newstex Blogs The Wrap,The views expressed in any and all content dis...,"[{'className': 'FOX ENTERTAINMENT GROUP INC', ..."


In [5]:
# indexing_terms contains a list of dictionaries. Find the maximum number of indexing terms in a single article
max_indexing_terms = df['indexing_terms'].apply(len).max()
max_indexing_terms

55

In [6]:
# find all indexing terms with an itemScore of 90
df['indexing_terms'].apply(lambda x: [i['className'] for i in x if i['itemScore'] == '90'])

0     [ELECTIONS & POLITICS, REPORTS, REVIEWS & SECT...
1     [ELECTIONS & POLITICS, HEADS OF GOVERNMENT ELE...
2     [HEADS OF GOVERNMENT ELECTIONS, NEGATIVE PERSO...
3     [ELECTIONS, HEADS OF GOVERNMENT ELECTIONS, POL...
4     [ELECTIONS, US PRESIDENTIAL CANDIDATES 2008, U...
                            ...                        
95    [NEGATIVE PERSONAL NEWS, POLITICAL CANDIDATES,...
96                                                   []
97                                    [VOTERS & VOTING]
98                      [POLLS & SURVEYS, DONALD TRUMP]
99    [CAMPAIGNS & ELECTIONS, POLLS & SURVEYS, VOTER...
Name: indexing_terms, Length: 100, dtype: object

In [7]:
# save the DataFrame to a CSV file
df.to_csv('articles.csv', index=False)