# Article Metadata to DataFrame

This notebook shows how to extract the metadata from a collection of previously downloaded journal articles (JATS XML files) and store it in a `pandas.DataFrame` object. First we need to import BeautifulSoup, os and pandas.

In [18]:
from bs4 import BeautifulSoup
import bs4, os
import pandas as pd

These functions search the `front` element of the parsed JATS XML file (the global `soup` object) and return the content of selected metadata tags.

In [22]:
#Functions to extract information
        
def find_article_id():
    """Return the string of the first ``article-id`` tag under ``soup.front``.

    Reads the module-level ``soup`` object. Returns ``None`` when no such tag
    is found.
    """
    id_tag = soup.front.find("article-id")
    # getattr with a default covers the "tag not found" case (find() -> None).
    return getattr(id_tag, "string", None)
    
def find_article_title():
    """Return the string of the first ``article-title`` tag under ``soup.front``.

    Reads the module-level ``soup`` object. Returns ``None`` when no such tag
    is found.
    """
    title_tag = soup.front.find("article-title")
    # getattr with a default covers the "tag not found" case (find() -> None).
    return getattr(title_tag, "string", None)
    
def find_authors():
    """Return the authors found under ``soup.front`` as one comma-separated string.

    Each author is rendered as ``"<given names> <surname>"``. The given names
    are looked up inside the same parent element as the surname, so an author
    without ``given-names`` no longer shifts the pairing of the authors that
    follow (nor raises ``IndexError``); such an author is rendered with the
    surname alone. Tags whose ``.string`` is empty or ``None`` are skipped
    instead of raising ``TypeError``.

    Reads the module-level ``soup`` object. Returns an empty string when no
    ``surname`` tag is found.
    """
    authors = []
    for surname in soup.front.find_all("surname"):
        parent = surname.parent
        given_names = parent.find("given-names") if parent is not None else None
        # Given names first, then surname; drop whichever part has no text.
        parts = [
            tag.string
            for tag in (given_names, surname)
            if tag is not None and tag.string
        ]
        if parts:
            authors.append(" ".join(parts))
    return ",".join(authors)
  

def find_pub_date():
    """Return the publication date of the article as ``"month-year"``.

    The ``month`` and ``year`` tags are looked up by name inside the first
    ``pub-date`` tag under ``soup.front`` rather than by fixed position in
    its children, so the result does not depend on a ``day`` tag being
    present or on whitespace between the child tags. When only one of the
    two parts is available, that part alone is returned.

    Reads the module-level ``soup`` object. Returns ``None`` when there is no
    ``pub-date`` tag or it holds neither a month nor a year.
    """
    pub_date = soup.front.find("pub-date")
    if pub_date is None:
        return None
    parts = []
    for tag_name in ("month", "year"):
        text = getattr(pub_date.find(tag_name), "string", None)
        if text:
            parts.append(text)
    return "-".join(parts) if parts else None
    
def find_volume():
    """Return the string of the first ``volume`` tag under ``soup.front``.

    Reads the module-level ``soup`` object. Returns ``None`` when no such tag
    is found.
    """
    volume_tag = soup.front.find("volume")
    # getattr with a default covers the "tag not found" case (find() -> None).
    return getattr(volume_tag, "string", None)

           
def find_fpage():
    """Return the string of the first ``fpage`` tag under ``soup.front``.

    Reads the module-level ``soup`` object. Returns ``None`` when no such tag
    is found.
    """
    first_page_tag = soup.front.find("fpage")
    # getattr with a default covers the "tag not found" case (find() -> None).
    return getattr(first_page_tag, "string", None)

def find_lpage():
    """Return the string of the first ``lpage`` tag under ``soup.front``.

    Reads the module-level ``soup`` object. Returns ``None`` when no such tag
    is found.
    """
    last_page_tag = soup.front.find("lpage")
    # getattr with a default covers the "tag not found" case (find() -> None).
    return getattr(last_page_tag, "string", None)
    
def find_journal_title():
    """Return the string of the first ``journal-title`` tag under ``soup.front``.

    Reads the module-level ``soup`` object. Returns ``None`` when no such tag
    is found.
    """
    journal_tag = soup.front.find("journal-title")
    # getattr with a default covers the "tag not found" case (find() -> None).
    return getattr(journal_tag, "string", None)
    
def find_key_words():
    """Return the English keywords under ``soup.front`` joined with ``", "``.

    Only ``kwd`` tags carrying the attribute ``lng="en"`` are considered.
    When a keyword's ``.string`` is ``None`` (for example because the tag
    contains nested markup) its full text is used instead, and keywords with
    no text at all are skipped; previously such a tag made ``join`` raise an
    uncaught ``TypeError``.

    Reads the module-level ``soup`` object. Returns an empty string when no
    matching keyword is found.
    """
    terms = []
    for kwd in soup.front.find_all("kwd", lng="en"):
        text = kwd.string
        if text is None:
            # .string gives up on tags with several children; take all text.
            text = kwd.get_text()
        if text:
            terms.append(text)
    return ", ".join(terms)
        

    
def find_abstract():
    """Return the text of the first ``abstract`` tag under ``soup.front``.

    ``.string`` is ``None`` whenever the tag has more than one child (several
    paragraphs, or a paragraph surrounded by whitespace), which used to make
    this function return ``None`` for such abstracts. In that case the text of
    all descendants is collected instead, with the pieces stripped and joined
    by single spaces.

    Reads the module-level ``soup`` object. Returns ``None`` when there is no
    ``abstract`` tag or it contains no text.
    """
    abstract = soup.front.find("abstract")
    if abstract is None:
        return None
    text = abstract.string
    if text is None:
        text = abstract.get_text(" ", strip=True)
    return text if text else None
        
          
    


Next, we write the body text of each previously downloaded article to a plain-text file. We then call the functions above on each article to build a dictionary with the metadata, which we convert to a DataFrame.

In [23]:
xml_dir = "D:/journal_articles/xml_articles/"
txt_dir = "D:/journal_articles/txt_articles/"

# Creates txt files in txt_dir for all articles in the xml_dir

# Make sure the output directory exists before writing into it.
os.makedirs(txt_dir, exist_ok=True)

with os.scandir(xml_dir) as entries:
    for entry in entries:
        # Only article files are processed: xml_dir also receives metadata.csv
        # (saved at the end of this notebook), which must not be parsed as an
        # article when the notebook is run again.
        if not (entry.is_file() and entry.name.endswith('.xml')):
            continue
        name = entry.name.replace('.xml', '.txt')
        with open(entry, "r", encoding="ISO-8859-1") as file:
            soup = BeautifulSoup(file, "lxml")
        # An article without a body yields an empty text file instead of a crash.
        body = soup.find("body")
        elementos = body.find_all("p") if body is not None else []
        text = [elem.text for elem in elementos]
        # Write the file once per article, after all paragraphs are collected
        # (it used to be reopened and rewritten for every paragraph).
        with open(os.path.join(txt_dir, name), "w", encoding="utf-8") as f:
            f.write("\n".join(text))


#From the xml_dir creates a DataFrame with articles meta-data

# One list per DataFrame column; position i in every list describes the same article.
file_name = []
article_id = []
authors = []
article_title = []
journal_title = []
pub_date = []
abstract = []
key_words = []
volume = []
fpage = []
lpage = []

with os.scandir(xml_dir) as entries:
    # Keep only the article files: xml_dir also receives metadata.csv (saved at
    # the end of this notebook), which has no front matter and would make the
    # find_* functions fail on a re-run. Sorting by name makes the row order
    # reproducible, since os.scandir yields entries in arbitrary order.
    xml_entries = sorted(
        (entry for entry in entries
         if entry.is_file() and entry.name.endswith(".xml")),
        key=lambda entry: entry.name,
    )
    for entry in xml_entries:
        with open(entry, "r", encoding="ISO-8859-1") as file:
            # The find_* functions read this module-level ``soup`` object.
            soup = BeautifulSoup(file, "html.parser")
            file_name.append(entry.name)
            article_id.append(find_article_id())
            authors.append(find_authors())
            article_title.append(find_article_title())
            journal_title.append(find_journal_title())
            pub_date.append(find_pub_date())
            abstract.append(find_abstract())
            key_words.append(find_key_words())
            volume.append(find_volume())
            fpage.append(find_fpage())
            lpage.append(find_lpage())

data = {
    "file_name": file_name,
    "article_id": article_id,
    "authors": authors,
    "article_title": article_title,
    "journal_title": journal_title,
    "pub_date": pub_date,
    "abstract": abstract,
    "key_words": key_words,
    "volume": volume,
    "fpage": fpage,
    "lpage": lpage,
}

df = pd.DataFrame(data)

# Adds the paths to xml and txt files to the DataFrame

def xml_file_path(name):
    """Return the path of the XML file ``name`` by prefixing it with ``xml_dir``."""
    path = xml_dir + name
    return path

def txt_file_path(name):
    """Return the path of the text file that corresponds to the XML file ``name``.

    Every ``.xml`` in ``name`` is replaced by ``.txt`` and the result is
    prefixed with ``txt_dir``.
    """
    txt_name = name.replace('.xml', '.txt')
    return txt_dir + txt_name
    
# Build one path per row, in the same order as df["file_name"].
xml_path = [xml_file_path(xml_name) for xml_name in df["file_name"]]
txt_path = [txt_file_path(xml_name) for xml_name in df["file_name"]]

df["xml_path"] = xml_path
df["txt_path"] = txt_path



Finally, we save the DataFrame to a CSV file so that we can access it locally and run different queries.

In [25]:
# Save the DataFrame as a csv file inside xml_dir and print where it was written

csv_file = "metadata.csv"
csv_file_path = xml_dir + csv_file

df.to_csv(csv_file_path)
print(csv_file_path)

D:/journal_articles/xml_articles/metadata.csv
