<a href="https://colab.research.google.com/github/iannickgagnon/itor_jan_23_oa_0038/blob/main/random_articles_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import requests
import numpy as np
from bs4 import BeautifulSoup

In [2]:
def gs_scrape(query: str, nb_pages: int = 5, year: int = None):
    """
    Scrapes a given number of pages of Google Scholar results for a given search query,
    with an optional year filter.

    Args:
        query (str): The search query. It is URL-encoded before being sent, so it may
            contain spaces and reserved characters such as '&' or '#'.
        nb_pages (int): The number of pages to scrape.
        year (int): The publication year to filter articles by. No filter is applied
            when the value is None (or otherwise falsy).

    Returns:
        (list): A list of articles stored as title-authors-link dictionaries. Results
            without a title element are skipped, a missing authors line is stored as
            an empty string and a missing link as None.

    Raises:
        requests.RequestException: If a request fails, times out or returns an HTTP
            error status.
    """

    # The number of results shown on each page
    NB_RESULTS_PER_PAGE = 10

    # Upper bound (in seconds) on each request so a stalled connection cannot hang the scrape
    REQUEST_TIMEOUT_S = 30

    # Search endpoint; the query string is built by requests from `params`
    base_url = "https://scholar.google.com/scholar"

    # Client identifier
    headers = {'User-Agent': 'Mozilla/5.0'}

    # Initialize container for output articles
    articles = []

    # Through each page's articles
    for page_index in range(nb_pages):
        # Passing the parameters as a dictionary lets requests URL-encode the query
        params = {
            'q': query,
            'hl': 'en',
            'start': page_index * NB_RESULTS_PER_PAGE,
        }
        if year:
            # Same lower and upper bound restricts the results to a single year
            params['as_ylo'] = year
            params['as_yhi'] = year

        # Send HTTP request
        response = requests.get(base_url,
                                params=params,
                                headers=headers,
                                timeout=REQUEST_TIMEOUT_S)
        response.raise_for_status()

        # Parse HTTP request response as HTML
        soup = BeautifulSoup(response.content, 'html.parser')
        for result in soup.find_all('div', class_='gs_ri'):
            # A result without a title cannot be identified: skip it instead of crashing
            title_tag = result.find('h3', class_='gs_rt')
            if title_tag is None:
                continue

            authors_tag = result.find('div', class_='gs_a')

            # Get the href attribute from the first hyperlink anchor of the result
            link_tag = result.find('a')

            # Store article in output list
            articles.append({
                'title': title_tag.get_text(),
                'authors': authors_tag.get_text() if authors_tag is not None else '',
                'link': link_tag.get('href') if link_tag is not None else None,
            })

    return articles


In [7]:
def download_articles(articles, 
                      download_folder="downloaded_articles",
                      max_size_kb=1000):
    """
    Downloads PDFs from a list of articles if the PDF link is available and the file size is under the specified limit.

    For each article, the page behind its link is fetched. If that page is itself a PDF it is
    used directly; otherwise the first anchor that looks PDF-related is followed. Failures are
    reported on standard output and never interrupt the processing of the remaining articles.

    Args:
        articles (list): List of dictionaries containing 'title', 'authors', and 'link'.
        download_folder (str): Directory to save the downloaded PDFs. Created if missing.
        max_size_kb (int): Maximum size, in kilobytes, of a PDF to save. Larger files are skipped.

    Returns:
        None
    """

    # Upper bound (in seconds) on each request so a stalled server cannot hang the loop
    REQUEST_TIMEOUT_S = 30

    # Keeps generated file names well below common file system limits
    MAX_FILENAME_LENGTH = 150

    # Signature expected near the start of a PDF file
    PDF_MAGIC = b'%PDF'

    # Client identifier
    headers = {'User-Agent': 'Mozilla/5.0'}

    # Ensure the download folder exists
    os.makedirs(download_folder, exist_ok=True)

    for article in articles:
        title = article.get('title') or 'untitled'
        page_url = article.get('link')

        if not page_url:
            print(f"No link available for article: {title}")
            continue

        try:
            # Get the general page of the article
            response = requests.get(page_url, headers=headers, timeout=REQUEST_TIMEOUT_S)
            response.raise_for_status()

            if PDF_MAGIC in response.content[:1024]:
                # The article link already points at the PDF itself
                pdf_content = response.content
            else:
                # Parse the page to find PDF link
                soup = BeautifulSoup(response.content, 'html.parser')
                pdf_link = None

                # Look for PDF-related links
                for anchor in soup.find_all('a', href=True):
                    href = anchor['href']
                    text = anchor.get_text().lower()
                    if ".pdf" in href.lower() or "pdf" in text or "download" in text:
                        pdf_link = href
                        break

                if not pdf_link:
                    print(f"No PDF link found for article: {title}")
                    continue

                # Handle relative URLs (resolved against the final URL, after redirects)
                pdf_link = requests.compat.urljoin(response.url, pdf_link)

                # Check file size before downloading; HEAD does not follow redirects
                # by default, which would measure the redirect instead of the file
                head_response = requests.head(pdf_link,
                                              headers=headers,
                                              allow_redirects=True,
                                              timeout=REQUEST_TIMEOUT_S)
                head_response.raise_for_status()

                file_size_kb = int(head_response.headers.get('Content-Length', 0)) / 1024
                if file_size_kb > max_size_kb:
                    print(f"Skipped: {title} (Size exceeds {max_size_kb} KB limit - {file_size_kb:.2f} KB)")
                    continue

                # Download the PDF
                pdf_response = requests.get(pdf_link, headers=headers, timeout=REQUEST_TIMEOUT_S)
                pdf_response.raise_for_status()
                pdf_content = pdf_response.content

            # Enforce the limit on the actual size: Content-Length may be missing or inaccurate
            file_size_kb = len(pdf_content) / 1024
            if file_size_kb > max_size_kb:
                print(f"Skipped: {title} (Size exceeds {max_size_kb} KB limit - {file_size_kb:.2f} KB)")
                continue

            # The link heuristic can land on an HTML page: do not save it as a PDF
            if PDF_MAGIC not in pdf_content[:1024]:
                print(f"Skipped: {title} (Downloaded content is not a PDF)")
                continue

            # Save the PDF file
            safe_title = "".join(c if c.isalnum() else "_" for c in title)[:MAX_FILENAME_LENGTH]
            file_path = os.path.join(download_folder, f"{safe_title}.pdf")

            with open(file_path, "wb") as pdf_file:
                pdf_file.write(pdf_content)

            print(f"Downloaded: {title} -> {file_path}")

        except Exception as e:
            # Best-effort batch: report the failure and move on to the next article
            print(f"Failed to download article: {title}. Error: {e}")

In [11]:
# Publication year used to filter the search results
YEAR = 2000

# Size limit (in KB) above which a PDF file is not downloaded
MAX_SIZE_KB = 1000

# How many results pages are scraped
NB_PAGES_TO_SCRAPE = 20

# How many articles are extracted in the current iteration
NB_ARTICLES = 5

In [14]:
# Scrape the first NB_PAGES_TO_SCRAPE results pages for articles published in YEAR
articles = gs_scrape('metaheuristics', nb_pages=NB_PAGES_TO_SCRAPE, year=YEAR)

# Display every scraped article
for article in articles:
    print(
        f"TITLE   : {article['title']}\n"
        f"AUTHORS : {article['authors']}\n"
        f"LINK    : {article['link']}\n\n"
    )

TITLE   : Meta-heuristics: The state of the art
AUTHORS : S Voß - Workshop on Local Search for Planning and …, 2000 - Springer
LINK    : https://link.springer.com/chapter/10.1007/3-540-45612-0_1


TITLE   : Solving vehicle routing problems using constraint programming and metaheuristics
AUTHORS : BD Backer, V Furnon, P Shaw, P Kilby, P Prosser - Journal of heuristics, 2000 - Springer
LINK    : https://link.springer.com/article/10.1023/A:1009621410177


TITLE   : [HTML][HTML] Using metaheuristics in multiobjective resource constrained project scheduling
AUTHORS : A Viana, JP de Sousa - European Journal of Operational Research, 2000 - Elsevier
LINK    : https://www.sciencedirect.com/science/article/pii/S0377221799001630


TITLE   : Wisdom: A metaheuristic (pragmatic) to orchestrate mind and virtue toward excellence.
AUTHORS : PB Baltes, UM Staudinger - American psychologist, 2000 - psycnet.apa.org
LINK    : https://psycnet.apa.org/fulltext/2000-13324-012.html


TITLE   : Optimization of 

In [5]:
# Draw up to NB_ARTICLES unique random indexes among the articles actually scraped.
# The bounds come from len(articles) rather than NB_PAGES_TO_SCRAPE * 10 because
# fewer results than expected may have been returned.
index_max = len(articles) - 1
nb_selected = min(NB_ARTICLES, len(articles))
if nb_selected > 0:
    # Sampling without replacement guarantees uniqueness and, unlike randint with an
    # exclusive upper bound of index_max, can also select the last article
    indexes = np.random.choice(len(articles), size=nb_selected, replace=False)
else:
    indexes = np.array([], dtype=int)

# Display articles
for i in indexes:
    print(f"TITLE   : {articles[i]['title']}\n"
          f"AUTHORS : {articles[i]['authors']}\n"
          f"LINK    : {articles[i]['link']}\n\n")

TITLE   : Applying combinatorial optimization metaheuristics to the golf scramble problem
AUTHORS : RG Dear, Z Drezner - International Transactions in Operational …, 2000 - Wiley Online Library
LINK    : https://onlinelibrary.wiley.com/doi/abs/10.1111/j.1475-3995.2000.tb00203.x


TITLE   : Metaheuristic methods for a class of the facility layout problem
AUTHORS : A Gomes de Alvarenga, FJ Negreiros-Gomes… - Journal of intelligent …, 2000 - Springer
LINK    : https://link.springer.com/article/10.1023/A:1008982420344


TITLE   : [PDF][PDF] On metaheuristic algorithms for combinatorial optimization problems
AUTHORS : M Yagiura, T Ibaraki - The Transactions of the Institute of Electronics …, 2000 - Citeseer
LINK    : https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=9b8ed5e5f89868c14b1ef6d3b1da7084f3878d9b


TITLE   : Optimization of a simplified fleet assignment problem with metaheuristics: Simulated annealing and GRASP
AUTHORS : D Sosnowska - … and Complexity in Numerical Opt

In [None]:
# Download the PDFs of the scraped articles, enforcing the configured size limit
download_articles(articles, max_size_kb=MAX_SIZE_KB)