In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup
import json
import os

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
title_doi_df = pd.read_csv('data/titles_doi.csv')

In [4]:
driver = webdriver.Safari()

In [7]:
first_url = title_doi_df['DOI'][0]
first_url

'https://doi.org/10.1093/ehr/cead151'

In [10]:
driver = webdriver.Safari()
driver.get(first_url)

soup = BeautifulSoup(driver.page_source)

title_element = soup.find('h1', class_='wi-article-title')
title = title_element.get_text(strip=True)

# Extract the author
author_element = soup.find('button', class_='linked-name')
author = author_element.get_text(strip=True)

# Extract the publication date
date_element = soup.find('div', class_='citation-date')
date = date_element.get_text(strip=True)

# Print the results
print("Title:", title)
print("Author:", author)
print("Publication Date:", date)

Title: Catholic Intellectuals and Transnational Anti-Communism: Pax Romana from the Spanish Civil War to the post-1945 World Order*
Author: Michael Richards
Publication Date: 21 September 2023


In [54]:
driver = webdriver.Safari()
driver.get(first_url)

time.sleep(3)

soup = BeautifulSoup(driver.page_source)

paragraphs = soup.find_all('p', class_='chapter-para')

# Initialize a variable to store the extracted text and citations
result = ""

# Iterate through paragraphs
for paragraph in paragraphs:
    # Extract the text within the paragraph
    text = ''
    citations = []
    for element in paragraph.contents:
        if element.name == 'a':
            # Extract the citation number from the sup tag
            citation_number = int(element.find('sup').get_text())
            # Add the citation to the list
            citations.append(citation_number)
            # Add a placeholder in the text
            text += f'[CITATION-{citation_number}] '
        elif element and hasattr(element, 'strip'):
            # Add the text content (if not None and has a strip method)
            try:
                text += element.strip()
            except:
                pass
            
    result += text 

In [56]:
driver.quit()

In [18]:
def scrape_publication(url, output_file, title):
    # Check if the JSON file already exists
    if os.path.exists(output_file):
            with open(output_file, 'r') as json_file:
                existing_data = json.load(json_file)
            # Check if the title already exists in the JSON data
            if 'title' in existing_data and existing_data['title'] == title:
                print(f"Publication with title '{title}' already exists in {output_file}. Skipping.")
                return

    # Initialize a dictionary to store the data
    publication_data = {}

    # Initialize a WebDriver
    driver = webdriver.Safari()
    driver.get(url)

    # Wait for the page to load (adjust the timeout as needed)
    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'wi-article-title')))

    # Get the page source after it has loaded
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')

    # Extract the title
    title_element = soup.find('h1', class_='wi-article-title')
    title = title_element.get_text(strip=True)
    publication_data['title'] = title

    # Extract the author
    author_element = soup.find('button', class_='linked-name')
    author = author_element.get_text(strip=True)
    publication_data['author'] = author

    # Extract the publication date
    date_element = soup.find('div', class_='citation-date')
    date = date_element.get_text(strip=True)
    publication_data['date'] = date

    paragraphs = soup.find_all('p', class_='chapter-para')

    # Initialize a variable to store the extracted text and citations
    result = ""

    # Iterate through paragraphs
    for paragraph in paragraphs:
        # Extract the text within the paragraph
        text = ''
        citations = []
        for element in paragraph.contents:
            if element.name == 'a':
                # Extract the citation number from the sup tag
                try:
                    citation_number = int(element.find('sup').get_text())
                    # Add the citation to the list
                    citations.append(citation_number)
                    # Add a placeholder in the text
                    text += f'[CITATION-{citation_number}] '
                except:
                    print(element, url)
                
            elif element and hasattr(element, 'strip'):
                # Add the text content (if not None and has a strip method)
                try:
                    text += element.strip()  
                except:
                    pass
                
        result += text

    publication_data['text'] = result

    # Close the WebDriver
    driver.quit()
    
    return publication_data


In [19]:
output_file = "data/combined_publications.json" 

In [21]:
combined_data = []

def scrape_and_save(row):
    title = row['Title']
    doi = row['DOI']

    # Call the scrape_publication function
    publication_data = scrape_publication(doi, output_file, title)
    combined_data.append(publication_data)

# Iterate through the DataFrame and call scrape_and_save for each row
for index, row in title_doi_df.iterrows():
    scrape_and_save(row)

# Save the combined data to a single JSON file
with open(output_file, 'w') as json_file:
    json.dump(combined_data, json_file, indent=4)


<a class="link xref-fig" data-google-interstitial="false" data-modal-source-id="F1" href="javascript:;">Figure 1</a> https://doi.org/10.1093/ehr/cead107
<a class="link xref-fig" data-google-interstitial="false" data-modal-source-id="F2" href="javascript:;">Figure 2</a> https://doi.org/10.1093/ehr/cead107
<a class="link xref-fig" data-google-interstitial="false" data-modal-source-id="F3" href="javascript:;">Figure 3</a> https://doi.org/10.1093/ehr/cead107
<a class="link xref-fig" data-google-interstitial="false" data-modal-source-id="F4" href="javascript:;">Figure 4</a> https://doi.org/10.1093/ehr/cead107
<a class="link xref-fig" data-google-interstitial="false" data-modal-source-id="F5" href="javascript:;">Figure 5</a> https://doi.org/10.1093/ehr/cead107
<a class="link xref-fig" data-google-interstitial="false" data-modal-source-id="F6" href="javascript:;">Figure 6</a> https://doi.org/10.1093/ehr/cead107
<a class="link link-reveal link-table xref-fig" data-google-interstitial="false" d

TimeoutException: Message: 


In [23]:
# Save the combined data to a single JSON file
with open('data/combined_publications.json', 'w') as json_file:
    json.dump(combined_data, json_file, indent=4)

In [22]:
len(combined_data)

32