In [24]:
import requests
import matplotlib.pyplot as plt
import re
import os
import PyPDF2
from io import BytesIO
from bs4 import BeautifulSoup
from selenium import webdriver
from urllib.parse import urlparse
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from collections import Counter, defaultdict

In [25]:
def sanitize_filename(filename):
    # List of invalid characters for file names
    invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*']
    for char in invalid_chars:
        filename = filename.replace(char, '_')  # Replace invalid characters with underscore
    return filename

In [26]:
base_url = "https://www.ntnu.no/cerg/publikasjoner/" # 2023 studies are here
years = range(2012, 2023)

yearly_links = [f"{base_url}{year}" for year in years] + [base_url]

In [27]:
# List to store the publication link objects
publication_objects = []

for yearly_link in yearly_links:
    response = requests.get(yearly_link)
    response.raise_for_status()
    
    soup = BeautifulSoup(response.text, 'html.parser')
    
    a_tags = soup.find_all('a', class_='ntnu-ibtn primary')
    
    for a_tag in a_tags:
        link = a_tag['href']
        
        title_content = a_tag.contents[-1]
        title = title_content.strip() if isinstance(title_content, str) else 'No Title Available'
        
        publication_object = {
            'link': link,
            'title': title,
            'year': (yearly_link[-4:] if yearly_link[-4:].isdigit() else 2023)
        }

        publication_objects.append(publication_object)

for pub_object in publication_objects[:5]:
    print(pub_object)

print(f"Link count: {len(publication_objects)}")

{'link': 'https://www.tandfonline.com/doi/full/10.3109/15412555.2012.745843', 'title': 'Aerobic exercise training improves right- and left ventricular systolic function in patients with COPD', 'year': '2012'}
{'link': 'http://ntnu.no/cerg/publikasjoner/2012#2-13', 'title': 'Telomere Length and Long-Term Endurance Exercise: Does Exercise Training Affect Biological Age? A Pilot Study', 'year': '2012'}
{'link': 'https://link.springer.com/article/10.2165%2F11630760-000000000-00000', 'title': 'Exercise-Training Intervention Studies in Competitive Swimming', 'year': '2012'}
{'link': 'https://www.thieme-connect.com/products/ejournals/html/10.1055/s-0032-1316364', 'title': 'Recovery after an Intermittent Test', 'year': '2012'}
{'link': 'https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0050933', 'title': 'Insomnia and Endothelial Function – The HUNT 3 Fitness Study', 'year': '2012'}
Link count: 299


In [28]:
# filter out non-links
publication_objects = [pub for pub in publication_objects if pub["link"].startswith(('http', 'https')) and ' ' not in pub["link"]]
print(f"Link count: {len(publication_objects)}")

Link count: 298


In [29]:
# List of domains to exclude
excluded_domains = [
    #'content.iospress.com',
    #'downloads.hindawi.com',
    #'vev.medisin.ntnu.no',
    'thieme-connect.com', # needs login
    'mayoclinicproceedings.org', # paid
    'ntnu.no/cerg/publikasjoner', # bad link
    'insights.ovid.com', # login
    "sciencedirect.com",
    "online.,"
    #'nature.com', # paid
]

# Filtering out links from the excluded domains
publication_objects = [
    pub for pub in publication_objects 
    if not any(domain in pub['link'] for domain in excluded_domains)
]

# Displaying the updated count of links
print(f"Link count: {len(publication_objects)}")


Link count: 292


In [30]:
# Organize publication_objects by domain
publications_by_domain = defaultdict(list)
for pub_object in publication_objects:
    domain = urlparse(pub_object['link']).netloc
    publications_by_domain[domain].append(pub_object)


In [31]:
# Assume publication_objects is your original list of publication objects
publication_links = [pub['link'] for pub in publication_objects]

# Extract domains
domains = [urlparse(link).netloc for link in publication_links]

# Count unique domains
unique_domains = set(domains)
unique_domain_count = len(unique_domains)

print(f"Number of unique domains: {unique_domain_count}")

Number of unique domains: 52


In [32]:
# Ensure the directory 'publications/' exists
os.makedirs(os.path.join('..', 'data', 'publications'), exist_ok=True)

# Updated Regex pattern to catch more variations of PDF links
pdf_pattern = re.compile(r'(pdf|epdf)', re.IGNORECASE)

def generate_pdfs(domain, publication_objects):
    success_count = 0
    failed_links = []
    corrupted_links = []

    driver = webdriver.Chrome()

    for pub_object in publication_objects:
        link = pub_object['link']
        pdf_links = []
        
        response = requests.get(link)
        if response.status_code != 200:
            failed_links.append(link)
            continue
        soup = BeautifulSoup(response.text, 'html.parser')
        pdf_links = soup.find_all('a', href=True, string=pdf_pattern) + [a for a in soup.find_all('a', href=True) if pdf_pattern.search(a['href'])]
        pdf_links = [a["href"] for a in pdf_links]
        
        if not pdf_links:
            failed_links.append(link)
            continue

        for pdf_url in pdf_links:
            if not pdf_url.startswith('http'):

                pdf_url = urlparse(link)._replace(path=pdf_url).geturl()
            
            pdf_response = requests.get(pdf_url, stream=True)

            if pdf_response.status_code != 200:
                failed_links.append(link)
                continue  
            
            title_words = '_'.join(pub_object['title'].split(" ")[:7])
            title_shortened = sanitize_filename(title_words)
            file_name = f"{pub_object['year']}_{title_shortened}.pdf"
            file_path = os.path.join('..', 'data', 'publications', file_name)

            with open(file_path, 'wb') as pdf_file:
                for chunk in pdf_response.iter_content(chunk_size=8192):
                    pdf_file.write(chunk)
            
            try:
                with open(file_path, 'rb') as pdf_file:
                    PyPDF2.PdfReader(pdf_file)
                success_count += 1 
                break  
            except PyPDF2.errors.PdfReadError:
                os.remove(file_path) 
                corrupted_links.append(pdf_url)

    driver.quit()

    print(f"Results for {domain}: {success_count} successes, {len(corrupted_links)} corruptions, {len(failed_links)} failures.")
    return success_count, failed_links, corrupted_links

In [33]:
results_dict = {}

for domain, pubs in publications_by_domain.items():
    success_count, failed_links, corrupted_links = generate_pdfs(domain, pubs)
    
    # Store the results in the results_dict
    results_dict[domain] = {
        'success_count': success_count,
        'failure_count': len(failed_links),
        'corruption_count': len(corrupted_links),
        'failed_links': failed_links,
        'corrupted_links': corrupted_links
    }


Results for www.tandfonline.com: 0 successes, 0 corruptions, 7 failures.
https://static-content.springer.com/esm/art%3A10.1007%2Fs00296-020-04713-2/MediaObjects/296_2020_4713_MOESM1_ESM.pdf
/content/pdf/10.1007/s12928-020-00655-5.pdf?pdf=button
https://static-content.springer.com/esm/art%3A10.1007%2Fs40279-021-01608-5/MediaObjects/40279_2021_1608_MOESM1_ESM.pdf
/content/pdf/10.1186/s12929-021-00733-7.pdf?pdf=button
/content/pdf/10.1007/s10620-022-07779-z.pdf?pdf=button
http://farmacia.udec.cl/wp-content/uploads/2020/07/2020_guidelines_web-ATC-OMS.pdf
/content/pdf/10.1007/s10654-023-01029-w.pdf?pdf=button
https://www.helsedirektoratet.no/rapporter/fysisk-aktivitet-kartleggingsrapporter/Fysisk%20aktivitet%20og%20sedat%20tid%20blant%20voksne%20og%20eldre%20i%20Norge.pdf/_/attachment/inline/7d460cdf-051a-4ecd-99d6-7ff8ee07cf06:eff5c93b46b28a3b1a4d2b548fc53b9f51498748/Fysisk%20aktivitet%20og%20sedat%20tid%20blant%20voksne%20og%20eldre%20i%20Norge.pdf
/content/pdf/10.1007/s10620-022-07779-z.

Exception ignored in: <function Service.__del__ at 0x0000018A34981E10>
Traceback (most recent call last):
  File "c:\Users\dlind\anaconda3\envs\masters2\lib\site-packages\selenium\webdriver\common\service.py", line 185, in __del__
    self.stop()
  File "c:\Users\dlind\anaconda3\envs\masters2\lib\site-packages\selenium\webdriver\common\service.py", line 146, in stop
    self.send_remote_shutdown_command()
  File "c:\Users\dlind\anaconda3\envs\masters2\lib\site-packages\selenium\webdriver\common\service.py", line 126, in send_remote_shutdown_command
    request.urlopen(f"{self.service_url}/shutdown")
  File "c:\Users\dlind\anaconda3\envs\masters2\lib\urllib\request.py", line 216, in urlopen
    return opener.open(url, data, timeout)
  File "c:\Users\dlind\anaconda3\envs\masters2\lib\urllib\request.py", line 519, in open
    response = self._open(req, data)
  File "c:\Users\dlind\anaconda3\envs\masters2\lib\urllib\request.py", line 536, in _open
    result = self._call_chain(self.handle_

http://www.icmje.org/coi_disclosure.pdf
http://www.icmje.org/coi_disclosure.pdf
/content/371/bmj.m3485.full.pdf
Results for www.bmj.com: 1 successes, 2 corruptions, 0 failures.


In [None]:
print(Counter(urlparse(link).netloc for link in failed_links))

Counter()
