In [1]:
import os
import re
from collections import Counter, defaultdict
from io import BytesIO
from urllib.parse import urlparse
from urllib.parse import urljoin

import matplotlib.pyplot as plt
import PyPDF2
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
def sanitize_filename(filename):
    r"""Return ``filename`` with every character that is invalid in file names
    (``< > : " / \ | ? *``) replaced by an underscore."""
    # One translation table handles all nine characters in a single pass.
    to_underscore = str.maketrans({bad_char: '_' for bad_char in '<>:"/\\|?*'})
    return filename.translate(to_underscore)

In [3]:
# Each yearly archive page lives at <base_url><year>; the bare base URL is
# appended last because the 2023 studies are listed there.
base_url = "https://www.ntnu.no/cerg/publikasjoner/"
years = range(2012, 2023)

yearly_links = [base_url + str(year) for year in years]
yearly_links.append(base_url)

In [4]:
# Scrape every yearly page and collect one record per publication button.
# Each record is a dict with 'link', 'title' and 'year' (all strings).
publication_objects = []

for yearly_link in yearly_links:
    # A timeout keeps one unresponsive page from hanging the whole cell.
    response = requests.get(yearly_link, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # Yearly pages end in a four-digit year; the bare base URL does not and
    # holds the 2023 studies. The fallback is a string so that 'year' has the
    # same type on every record (it used to be the int 2023 here).
    url_suffix = yearly_link[-4:]
    page_year = url_suffix if url_suffix.isdigit() else '2023'

    for a_tag in soup.find_all('a', class_='ntnu-ibtn primary'):
        # Skip buttons without a target instead of raising KeyError.
        link = a_tag.get('href')
        if not link:
            continue

        # The title is the last child node of the anchor when that node is
        # plain text; an empty anchor or a trailing tag gets the placeholder.
        title_content = a_tag.contents[-1] if a_tag.contents else None
        title = title_content.strip() if isinstance(title_content, str) else 'No Title Available'

        publication_objects.append({
            'link': link,
            'title': title,
            'year': page_year,
        })

for pub_object in publication_objects[:5]:
    print(pub_object)

print(f"Link count: {len(publication_objects)}")

{'link': 'https://www.tandfonline.com/doi/full/10.3109/15412555.2012.745843', 'title': 'Aerobic exercise training improves right- and left ventricular systolic function in patients with COPD', 'year': '2012'}
{'link': 'http://ntnu.no/cerg/publikasjoner/2012#2-13', 'title': 'Telomere Length and Long-Term Endurance Exercise: Does Exercise Training Affect Biological Age? A Pilot Study', 'year': '2012'}
{'link': 'https://link.springer.com/article/10.2165%2F11630760-000000000-00000', 'title': 'Exercise-Training Intervention Studies in Competitive Swimming', 'year': '2012'}
{'link': 'https://www.thieme-connect.com/products/ejournals/html/10.1055/s-0032-1316364', 'title': 'Recovery after an Intermittent Test', 'year': '2012'}
{'link': 'https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0050933', 'title': 'Insomnia and Endothelial Function – The HUNT 3 Fitness Study', 'year': '2012'}
Link count: 299


In [5]:
# Drop entries whose href is not a usable web address: it must begin with
# "http" (which also covers "https") and must not contain a space.
publication_objects = [
    pub
    for pub in publication_objects
    if pub["link"].startswith('http') and ' ' not in pub["link"]
]
print(f"Link count: {len(publication_objects)}")

Link count: 298


In [6]:
# Link substrings that mark a publication as not worth fetching.
# Candidates that are currently NOT excluded (kept here for reference):
#   content.iospress.com, downloads.hindawi.com, vev.medisin.ntnu.no,
#   mayoclinicproceedings.org (paid), sciencedirect.com, nature.com (paid)
excluded_domains = [
    'thieme-connect.com',          # needs login
    'ntnu.no/cerg/publikasjoner',  # bad link
    'insights.ovid.com',           # login
]

# Keep a publication only if none of the excluded substrings occurs in its link.
publication_objects = [
    pub
    for pub in publication_objects
    if all(domain not in pub['link'] for domain in excluded_domains)
]

# Report how many links survive the exclusion list.
print(f"Link count: {len(publication_objects)}")


Link count: 295


In [7]:
# Group the publication records by the network location (host) of their link.
publications_by_domain = defaultdict(list)
for pub in publication_objects:
    host = urlparse(pub['link']).netloc
    publications_by_domain[host].append(pub)


In [8]:
# Walk the remaining publications once, recording each link and its host.
publication_links = []
domains = []
for pub in publication_objects:
    publication_links.append(pub['link'])
    domains.append(urlparse(pub['link']).netloc)

# Collapse the host list to distinct values and report how many there are.
unique_domains = set(domains)
unique_domain_count = len(unique_domains)

print(f"Number of unique domains: {unique_domain_count}")

Number of unique domains: 53


In [9]:
# Create ../data/publications (including missing parents); a no-op when it
# already exists.
publications_dir = os.path.join('..', 'data', 'publications')
os.makedirs(publications_dir, exist_ok=True)

# Case-insensitive matcher for "pdf" / "epdf" in link text or hrefs. Any string
# containing "epdf" also contains "pdf", so the second alternative never
# changes whether a search succeeds.
pdf_pattern = re.compile(r'(pdf|epdf)', flags=re.IGNORECASE)

In [13]:
# Module-level Chrome session. generate_pdfs() below reads this global `driver`
# rather than taking it as an argument, and the full-run cell finishes by
# calling driver.quit() on it.
driver = webdriver.Chrome()

# XPath for <button> elements whose text or @value contains both "accept" and
# "cookie", compared case-insensitively via translate().
_COOKIE_BUTTON_XPATH = (
    "//button["
        "(contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'accept') and "
        "contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'cookie')) or "
        "(contains(translate(@value, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'accept') and "
        "contains(translate(@value, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'cookie'))"
    "]"
)


def _click_cookie_buttons():
    """Best-effort click on every cookie-consent button on the current page."""
    for button in driver.find_elements(By.XPATH, _COOKIE_BUTTON_XPATH):
        try:
            button.click()
        except WebDriverException:
            # A hidden, stale or covered button must not abort the scrape.
            continue


def _collect_pdf_urls():
    """Return absolute, de-duplicated PDF candidate URLs found on the current page."""
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    anchors = list(soup.find_all('a', href=True, string=pdf_pattern))
    anchors += [a for a in soup.find_all('a', href=True) if pdf_pattern.search(a['href'])]
    hrefs = [a['href'] for a in anchors]

    if not hrefs:
        # Fall back to Selenium's own link-text lookup when the parsed HTML
        # yields nothing.
        elements = driver.find_elements(By.PARTIAL_LINK_TEXT, '.pdf')
        hrefs = [element.get_attribute('href') for element in elements]

    # Relative hrefs belong to the page the browser ended up on, which can
    # differ from the requested link after a redirect.
    page_url = driver.current_url
    seen = set()
    urls = []
    for href in hrefs:
        if not href:
            # get_attribute('href') may return None.
            continue
        absolute = urljoin(page_url, href)
        if absolute not in seen:
            # An anchor can match on both its text and its href; try it once.
            seen.add(absolute)
            urls.append(absolute)
    return urls


def _download_to(pdf_url, target_path, timeout):
    """Stream ``pdf_url`` into ``target_path``; return True only on an HTTP 200 download."""
    try:
        # The context manager releases the streamed connection on every path.
        with requests.get(pdf_url, stream=True, timeout=timeout) as pdf_response:
            if pdf_response.status_code != 200:
                return False
            with open(target_path, 'wb') as pdf_file:
                for chunk in pdf_response.iter_content(chunk_size=8192):
                    pdf_file.write(chunk)
    except requests.RequestException:
        # Connection errors, timeouts and broken streams count as a failed
        # download; discard whatever was partially written.
        if os.path.exists(target_path):
            os.remove(target_path)
        return False
    return True


def _is_readable_pdf(path):
    """Return True when PyPDF2 can open the file at ``path`` without a PdfReadError."""
    try:
        with open(path, 'rb') as pdf_file:
            PyPDF2.PdfReader(pdf_file)
    except PyPDF2.errors.PdfReadError:
        return False
    return True


def generate_pdfs(domain, publication_objects, page_timeout=10, download_timeout=30):
    """Download one PDF per publication using the module-level Selenium ``driver``.

    Each publication page is opened in the browser, cookie-consent buttons are
    clicked on a best-effort basis, and every anchor whose text or href matches
    ``pdf_pattern`` is tried in order until one downloads and parses as a PDF.
    Files are saved as ``../data/publications/<year>_<first 7 title words>.pdf``.

    Args:
        domain: Label used only in the printed summary line.
        publication_objects: Iterable of dicts with 'link', 'title' and 'year'.
        page_timeout: Seconds to wait for the page's <body> element.
        download_timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        Tuple ``(success_count, failed_links, corrupted_links)``.
        ``failed_links`` holds publication links that could not be loaded or
        offered no candidate, plus candidate URLs whose download failed.
        ``corrupted_links`` holds candidate URLs whose content did not parse
        as a PDF.
    """
    success_count = 0
    failed_links = []
    corrupted_links = []

    for pub_object in publication_objects:
        link = pub_object['link']

        try:
            driver.get(link)
            WebDriverWait(driver, page_timeout).until(
                EC.presence_of_element_located((By.TAG_NAME, 'body'))
            )
            _click_cookie_buttons()
            pdf_urls = _collect_pdf_urls()
        except WebDriverException:
            # Page-load timeouts and navigation errors are recorded as a
            # failure for this publication instead of aborting the whole run.
            failed_links.append(link)
            continue

        if not pdf_urls:
            failed_links.append(link)
            continue

        # The target name depends only on the publication, so build it once.
        title_words = '_'.join(pub_object['title'].split(" ")[:7])
        title_shortened = sanitize_filename(title_words)
        file_name = f"{pub_object['year']}_{title_shortened}.pdf"
        file_path = os.path.join('..', 'data', 'publications', file_name)
        # Download next to the target and move into place only after
        # validation, so a bad download never clobbers an existing good file.
        partial_path = file_path + '.part'

        for pdf_url in pdf_urls:
            if not _download_to(pdf_url, partial_path, download_timeout):
                failed_links.append(pdf_url)
                continue

            if _is_readable_pdf(partial_path):
                os.replace(partial_path, file_path)
                success_count += 1
                break

            os.remove(partial_path)
            corrupted_links.append(pdf_url)

    print(f"Results for {domain}: {success_count} successes, {len(corrupted_links)} corruptions, {len(failed_links)} failures.")
    return success_count, failed_links, corrupted_links


In [11]:
def try_domain(domain, pub_by_domain):
    """Run ``generate_pdfs`` for one domain and summarise the outcome.

    Args:
        domain: Key into ``pub_by_domain``, e.g. "journals.lww.com".
        pub_by_domain: Mapping of domain to its list of publication dicts.

    Returns:
        dict with 'success_count', 'failure_count', 'corruption_count',
        'failed_links' and 'corrupted_links'.
    """
    # .get() avoids inserting an empty entry into a defaultdict when the
    # domain is unknown (plain indexing would add the key as a side effect).
    pubs = pub_by_domain.get(domain, [])
    # This used to call generate_pdfs2, which is not defined in any visible
    # notebook cell and raises NameError on a fresh kernel.
    success_count, failed_links, corrupted_links = generate_pdfs(domain, pubs)

    return {
        'success_count': success_count,
        'failure_count': len(failed_links),
        'corruption_count': len(corrupted_links),
        'failed_links': failed_links,
        'corrupted_links': corrupted_links
    }

### FULL RUN

In [14]:
# Per-domain scrape summary, keyed by domain.
results_dict = {}

try:
    for domain, pubs in publications_by_domain.items():
        success_count, failed_links, corrupted_links = generate_pdfs(domain, pubs)

        # Store the counts alongside the raw link lists for later inspection.
        results_dict[domain] = {
            'success_count': success_count,
            'failure_count': len(failed_links),
            'corruption_count': len(corrupted_links),
            'failed_links': failed_links,
            'corrupted_links': corrupted_links
        }
finally:
    # Always release the browser, even when the run raises or is interrupted;
    # otherwise the Chrome process is left running.
    driver.quit()

Results for www.tandfonline.com: 0 successes, 0 corruptions, 9 failures.
Results for link.springer.com: 17 successes, 0 corruptions, 0 failures.
Results for journals.plos.org: 26 successes, 0 corruptions, 1 failures.
Results for erj.ersjournals.com: 3 successes, 2 corruptions, 0 failures.


incorrect startxref pointer(1)


Results for academic.oup.com: 2 successes, 6 corruptions, 21 failures.
Results for journals.lww.com: 17 successes, 0 corruptions, 0 failures.
Results for tidsskriftet.no: 2 successes, 0 corruptions, 0 failures.
Results for www.ahajournals.org: 0 successes, 0 corruptions, 19 failures.
Results for journals.sagepub.com: 0 successes, 2 corruptions, 4 failures.
Results for onlinelibrary.wiley.com: 0 successes, 0 corruptions, 19 failures.
Results for pubmed.ncbi.nlm.nih.gov: 0 successes, 0 corruptions, 5 failures.
Results for www.sciencedirect.com: 0 successes, 0 corruptions, 60 failures.
Results for journals.physiology.org: 0 successes, 2 corruptions, 13 failures.
Results for www.ncbi.nlm.nih.gov: 0 successes, 0 corruptions, 60 failures.
Results for respiratory-research.biomedcentral.com: 2 successes, 0 corruptions, 0 failures.
Results for physoc.onlinelibrary.wiley.com: 0 successes, 0 corruptions, 7 failures.
Results for journal.chestnet.org: 0 successes, 0 corruptions, 15 failures.
Result

In [17]:
# Rank domains by number of failed links (worst first) and list the ten worst.
def _failure_count(item):
    """Sort key: the 'failure_count' of a (domain, results) pair."""
    return item[1]['failure_count']


sorted_items = sorted(results_dict.items(), key=_failure_count, reverse=True)
top_10_items = sorted_items[:10]

# Only the domain names are printed; a single domain can be retried with
# try_domain(domain, publications_by_domain).
for domain, results in top_10_items:
    print(f"{domain}")

www.mayoclinicproceedings.org
www.sciencedirect.com
www.ncbi.nlm.nih.gov
www.internationaljournalofcardiology.com
www.thelancet.com
academic.oup.com
www.ahajournals.org
onlinelibrary.wiley.com
journal.chestnet.org
journals.physiology.org


In [None]:
# The full-run cell above ends with driver.quit(), so on a top-to-bottom run
# the shared browser session is already closed here. Start a fresh session for
# this single-domain retry and close it afterwards.
# NOTE(review): assumes try_domain's downloader uses the module-level `driver` - confirm.
driver = webdriver.Chrome()
try:
    lww_results = try_domain("journals.lww.com", publications_by_domain)
finally:
    driver.quit()
lww_results

KeyboardInterrupt: 