<a href="https://colab.research.google.com/github/iannickgagnon/itor_jan_23_oa_0038/blob/main/random_articles_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import numpy as np
from bs4 import BeautifulSoup

In [2]:
def gs_scrape(query: str, nb_pages: int = 5):
    """
    Scrapes a given number of pages of Google Scholar results for a given search query.

    It looks for the following HTML element classes:

      gs_ri : Google Scholar Results Item
      gs_rt : Google Scholar Result Title
      gs_a  : Google Scholar Author

    It also extracts the href attribute (some_url) of the HTML hyperlink anchor
    <a href=some_url> located inside the result title.

    Args:
      query (str): The search query. It is URL-encoded before being sent, so it
        may contain spaces or reserved characters such as '&' and '#'.
      nb_pages (int): The number of pages to scrape the results from. A value
        of zero or less issues no request and returns an empty list.

    Returns:
      (list): A list of articles stored as title-authors-link dictionaries.
        'authors' is an empty string when the result has no gs_a element and
        'link' is None when the title carries no hyperlink.

    Raises:
      requests.HTTPError: If a results page comes back with an error status.
      requests.RequestException: If the request fails or times out.
    """

    # The number of results shown on each page
    NB_RESULTS_PER_PAGE = 10

    # Upper bound (in seconds) on how long a single request may block
    REQUEST_TIMEOUT_SECONDS = 30

    # Search endpoint; the query string is passed separately so it gets encoded
    SEARCH_URL = "https://scholar.google.com/scholar"

    # Client identifier
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

    # Initialize container for output articles
    articles = []

    # Through each page's articles
    for page_index in range(nb_pages):

        # Query-string parameters; 'start' is the offset of the page's first result
        params = {
            'q': query,
            'hl': 'en',
            'start': page_index * NB_RESULTS_PER_PAGE,
        }

        # Send HTTP request
        response = requests.get(SEARCH_URL,
                                params=params,
                                headers=headers,
                                timeout=REQUEST_TIMEOUT_SECONDS)

        # Fail loudly on an error status instead of parsing an error page
        # and silently returning fewer articles than expected
        response.raise_for_status()

        # Parse HTTP request response as HTML
        soup = BeautifulSoup(response.content, 'html.parser')

        for result in soup.find_all('div', class_='gs_ri'):

            # A result without a title element cannot be reported; skip it
            title_tag = result.find('h3', class_='gs_rt')
            if title_tag is None:
                continue

            authors_tag = result.find('div', class_='gs_a')

            # Look for the anchor inside the title only, so that an unrelated
            # link elsewhere in the result item is never mistaken for it
            anchor = title_tag.find('a')

            # Store article in output list
            articles.append({
                'title': title_tag.get_text(),
                'authors': authors_tag.get_text() if authors_tag is not None else '',
                'link': anchor.get('href') if anchor is not None else None,
            })

    return articles

In [3]:
# How many Google Scholar result pages are fetched
NB_PAGES_TO_SCRAPE: int = 10

# How many articles are drawn at random from the fetched results
NB_ARTICLES: int = 10

In [4]:
# Collect every article found on the first NB_PAGES_TO_SCRAPE results pages
articles = gs_scrape(query='metaheuristics', nb_pages=NB_PAGES_TO_SCRAPE)

In [5]:
# Draw up to NB_ARTICLES unique random indexes into the scraped articles.
# The bound comes from the number of articles actually retrieved rather than
# from NB_PAGES_TO_SCRAPE * 10, since a page may yield fewer than 10 results.
index_max = len(articles) - 1

# A truncated random permutation gives distinct indexes over the whole range
# 0..index_max (inclusive) in one step, with no retry loop. If fewer than
# NB_ARTICLES articles are available, all of them are selected.
indexes = np.random.permutation(len(articles))[:NB_ARTICLES]

In [6]:
# Print the title, authors and link of each randomly selected article
for i in indexes:
    selected = articles[i]
    print(
        f"TITLE   : {selected['title']}\n"
        f"AUTHORS : {selected['authors']}\n"
        f"LINK    : {selected['link']}\n\n"
    )

TITLE   : Metaheuristics for the team orienteering problem
AUTHORS : C Archetti, A Hertz, MG Speranza - Journal of Heuristics, 2007 - Springer
LINK    : https://link.springer.com/article/10.1007/s10732-006-9004-0


TITLE   : [BOOK][B] Tuning metaheuristics: a machine learning perspective
AUTHORS : M Birattari, J Kacprzyk - 2009 - Springer
LINK    : https://link.springer.com/content/pdf/10.1007/978-3-642-00483-4.pdf


TITLE   : [BOOK][B] Multi-objective optimization in computer networks using metaheuristics
AUTHORS : Y Donoso, R Fabregat - 2016 - books.google.com
LINK    : https://books.google.com/books?hl=en&lr=&id=C6m2vjd764kC&oi=fnd&pg=PP1&dq=metaheuristics&ots=f2xxjK4VNj&sig=Ir_TWuKWyC2IGetBqGf1Yr3qBGk


TITLE   : A survey on optimization metaheuristics
AUTHORS : I Boussaïd, J Lepagnot, P Siarry - Information sciences, 2013 - Elsevier
LINK    : https://www.sciencedirect.com/science/article/pii/S0020025513001588


TITLE   : [BOOK][B] Metaheuristics
AUTHORS : P Siarry - 2016 - Springe