In [49]:
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import re
from collections import Counter
from urllib.parse import urlparse
import PyPDF2
from io import BytesIO
import os
from collections import Counter, defaultdict
import paperscraper



In [50]:
def sanitize_filename(filename):
    # List of invalid characters for file names
    invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*']
    for char in invalid_chars:
        filename = filename.replace(char, '_')  # Replace invalid characters with underscore
    return filename

In [51]:
base_url = "https://www.ntnu.no/cerg/publikasjoner/" # 2023 studies are here
years = range(2012, 2023)

yearly_links = [f"{base_url}{year}" for year in years] + [base_url]

In [52]:
# List to store the publication link objects
publication_objects = []

for yearly_link in yearly_links:
    response = requests.get(yearly_link)
    response.raise_for_status()
    
    soup = BeautifulSoup(response.text, 'html.parser')
    
    a_tags = soup.find_all('a', class_='ntnu-ibtn primary')
    
    for a_tag in a_tags:
        link = a_tag['href']
        
        title_content = a_tag.contents[-1]
        title = title_content.strip() if isinstance(title_content, str) else 'No Title Available'
        
        publication_object = {
            'link': link,
            'title': title,
            'year': (yearly_link[-4:] if yearly_link[-4:].isdigit() else 2023)
        }

        publication_objects.append(publication_object)

for pub_object in publication_objects[:5]:
    print(pub_object)

print(f"Link count: {len(publication_objects)}")

{'link': 'https://www.tandfonline.com/doi/full/10.3109/15412555.2012.745843', 'title': 'Aerobic exercise training improves right- and left ventricular systolic function in patients with COPD', 'year': '2012'}
{'link': 'http://ntnu.no/cerg/publikasjoner/2012#2-13', 'title': 'Telomere Length and Long-Term Endurance Exercise: Does Exercise Training Affect Biological Age? A Pilot Study', 'year': '2012'}
{'link': 'https://link.springer.com/article/10.2165%2F11630760-000000000-00000', 'title': 'Exercise-Training Intervention Studies in Competitive Swimming', 'year': '2012'}
{'link': 'https://www.thieme-connect.com/products/ejournals/html/10.1055/s-0032-1316364', 'title': 'Recovery after an Intermittent Test', 'year': '2012'}
{'link': 'https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0050933', 'title': 'Insomnia and Endothelial Function – The HUNT 3 Fitness Study', 'year': '2012'}
Link count: 298


In [53]:
# filter out non-links
publication_objects = [pub for pub in publication_objects if pub["link"].startswith(('http', 'https')) and ' ' not in pub["link"]]
print(f"Link count: {len(publication_objects)}")

Link count: 297


In [54]:
# List of domains to exclude
excluded_domains = [
    'content.iospress.com',
    'downloads.hindawi.com',
    'vev.medisin.ntnu.no',
    'thieme-connect.com', # needs login
    'mayoclinicproceedings.org', # paid
    'ntnu.no/cerg/publikasjoner', # bad link
    'insights.ovid.com', # login
    'nature.com', # paid
]

# Filtering out links from the excluded domains
publication_objects = [
    pub for pub in publication_objects 
    if not any(domain in pub['link'] for domain in excluded_domains)
]

# Displaying the updated count of links
print(f"Link count: {len(publication_objects)}")


Link count: 280


In [55]:
domain = urlparse(pub_object['link']).netloc

In [56]:
# Organize publication_objects by domain
publications_by_domain = defaultdict(list)
for pub_object in publication_objects:
    domain = urlparse(pub_object['link']).netloc
    publications_by_domain[domain].append(pub_object)


In [57]:
# Assume publication_objects is your original list of publication objects
publication_links = [pub['link'] for pub in publication_objects]

# Extract domains
domains = [urlparse(link).netloc for link in publication_links]

# Count unique domains
unique_domains = set(domains)
unique_domain_count = len(unique_domains)

print(f"Number of unique domains: {unique_domain_count}")

Number of unique domains: 48


In [63]:
# Ensure the directory 'publications/' exists
os.makedirs(os.path.join('..', 'data', 'publications'), exist_ok=True)

# Updated Regex pattern to catch more variations of PDF links
pdf_pattern = re.compile(r'(pdf|download|fulltext|printable|epdf)', re.IGNORECASE)

def generate_pdfs(domain, publication_objects):
    # Counter variables
    success_count = 0
    failed_links = []
    corrupted_links = []

    # Iterate over each publication object
    for pub_object in publication_objects:
        link = pub_object['link']
        trx = paperscraper.extract_from_url("https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3418173/")
        if trx:
            print(trx)
        response = requests.get(link)
        if response.status_code != 200:
            failed_links.append(link)
            continue
        
        soup = BeautifulSoup(response.text, 'html.parser')
        pdf_links = soup.find_all('a', href=True, string=pdf_pattern) + [a for a in soup.find_all('a', href=True) if pdf_pattern.search(a['href'])]

        papers = paperscraper.scrape_url(link)
        if papers:
            pdf_links.extend([paper["url"] for paper in papers])

        if not pdf_links:
            failed_links.append(link)
            continue
        
        for pdf_link in pdf_links:
            # Construct the complete URL if necessary
            pdf_url = pdf_link['href']
            if not pdf_url.startswith('http'):
                pdf_url = urlparse(link)._replace(path=pdf_url).geturl()
            
            pdf_response = requests.get(pdf_url, stream=True)
            if pdf_response.status_code != 200:
                continue  # Try the next PDF link if the request wasn't successful
            
            # Construct file name and path
            title_words = '_'.join(pub_object['title'].split(" ")[:7])
            title_shortened = sanitize_filename(title_words)
            file_name = f"{pub_object['year']}_{title_shortened}.pdf"
            file_path = os.path.join('..', 'data', 'publications', file_name)

            # Save the PDF file
            with open(file_path, 'wb') as pdf_file:
                for chunk in pdf_response.iter_content(chunk_size=8192):
                    pdf_file.write(chunk)
            
            # Check for PDF corruption
            try:
                with open(file_path, 'rb') as pdf_file:
                    PyPDF2.PdfReader(pdf_file)
                success_count += 1  # PDF is not corrupted
                break  # Exit the loop as the PDF has been successfully downloaded and saved
            except PyPDF2.errors.PdfReadError:
                os.remove(file_path)  # Remove the corrupted PDF file
                corrupted_links.append(pdf_url)

    print(f"Results for {domain}: {success_count} successes, {len(corrupted_links)} corruptions, {len(failed_links)} failures.")
    return success_count, failed_links, corrupted_links

ImportError: cannot import name 'PaperScraper' from 'paperscraper' (c:\Users\dlind\anaconda3\envs\masters2\lib\site-packages\paperscraper\__init__.py)

In [62]:
pubs = publications_by_domain["www.sciencedirect.com"]
success_count, failed_links, corrupted_links = generate_pdfs("www.sciencedirect.com", pubs)

Results for www.sciencedirect.com: 0 successes, 0 corruptions, 60 failures.


In [61]:
publications_by_domain["www.sciencedirect.com"]

[{'link': 'https://www.sciencedirect.com/science/article/pii/S1440244013001849?via%3Dihub',
  'title': 'Does rating of perceived exertion result in target exercise intensity during interval training in cardiac rehabilitation? A study of the Borg scale versus a heart rate monitor',
  'year': '2013'},
 {'link': 'https://www.sciencedirect.com/science/article/pii/S0167527314019470',
  'title': 'Remote ischemic preconditioning preserves mitochondrial function and activates pro-survival protein kinase Akt in the left ventricle during cardiac surgery: A randomized trial',
  'year': '2014'},
 {'link': 'https://www.sciencedirect.com/science/article/pii/S0735109714058045',
  'title': 'High-Intensity Interval Exercise Effectively Improves Cardiac Function in Patients With Type 2 Diabetes Mellitus and Diastolic Dysfunction: A Randomized Controlled Trial',
  'year': '2014'},
 {'link': 'http://www.sciencedirect.com/science/article/pii/S0002914914017068',
  'title': 'Coronary atheroma regression and 

In [None]:
results_dict = {}

for domain, pubs in publications_by_domain.items():
    success_count, failed_links, corrupted_links = generate_pdfs(domain, pubs)
    
    # Store the results in the results_dict
    results_dict[domain] = {
        'success_count': success_count,
        'failure_count': len(failed_links),
        'corruption_count': len(corrupted_links),
        'failed_links': failed_links,
        'corrupted_links': corrupted_links
    }


Results for www.tandfonline.com: 0 successes, 0 corruptions, 7 failures.
Results for link.springer.com: 10 successes, 1 corruptions, 7 failures.
Results for journals.plos.org: 0 successes, 0 corruptions, 1 failures.
Results for erj.ersjournals.com: 3 successes, 24 corruptions, 0 failures.
Results for academic.oup.com: 0 successes, 0 corruptions, 19 failures.
Results for journals.lww.com: 0 successes, 0 corruptions, 17 failures.
Results for tidsskriftet.no: 2 successes, 0 corruptions, 0 failures.
Results for www.ahajournals.org: 0 successes, 0 corruptions, 14 failures.
Results for journals.sagepub.com: 0 successes, 0 corruptions, 4 failures.
Results for onlinelibrary.wiley.com: 0 successes, 0 corruptions, 16 failures.
Results for www.sciencedirect.com: 0 successes, 0 corruptions, 60 failures.
Results for journals.physiology.org: 0 successes, 0 corruptions, 10 failures.
Results for respiratory-research.biomedcentral.com: 2 successes, 0 corruptions, 0 failures.
Results for physoc.onlineli

In [None]:
print(Counter(urlparse(link).netloc for link in failed_links))

Counter()
