# In [1]:
import os
from random import randint
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup



def scrape_keyword_gazeta_de_alagoas(keyword, max_page, save_path):
  """Scrape Gazeta de Alagoas search results for a keyword into an Excel file.

  Requests the search result pages ``1 .. max_page - 1`` one by one, waiting a
  random 0-3 seconds before each request. For every ``<article class="col-md-12">``
  card it stores the text of the first ``<h3>`` (title), the text of the first
  ``<p>`` (date and author line) and the ``href`` of the first ``<a>`` prefixed
  with the site base URL. Scraping stops early when a request fails, when the
  response status is not 200, or when the "no results" paragraph is found.

  Args:
    keyword: Search term interpolated into the ``q`` query parameter; also used
      in the output file name.
    max_page: Exclusive upper bound of the page range. NOTE: ``max_page`` itself
      is not requested (``range(1, max_page)``); pass ``last_page + 1`` to
      include the last page.
    save_path: Directory in which ``scraping_gazeta_de_alagoas-{keyword}.xlsx``
      is written.

  Returns:
    A ``pandas.DataFrame`` with the columns ``Titles``, ``Dates_Authors`` and
    ``Links`` (also written to the Excel file), or ``None`` if the collected
    lists end up with different lengths.
  """
  base_url = 'https://d.gazetadealagoas.com.br'
  # CSS classes of the paragraph the site renders when a search page has no results.
  no_results_class = 'gza-f-roboto gza-f-22 gza-lh-25 gza-c-black-3'

  # Column-wise storage of the scraped data; one entry per result card.
  titles        = []
  links         = []
  dates_authors = []

  # Starts scraping loop for the given page range
  for page in range(1, max_page):

    # Creates random delays to avoid blocking
    delay = randint(0, 3)
    print(f'Cool down of {delay}.\nScraping page number {page}')
    sleep(delay)

    url = f'{base_url}/?q={keyword}&page={page}'

    # A timeout keeps a stalled connection from hanging the whole run, and a
    # network failure stops the loop without discarding pages already scraped.
    try:
      response = requests.get(url, timeout=30)
    except requests.RequestException as error:
      print(url)
      print(f"Request error!\n{error}\nLoop stopped.")
      break

    # Stops loop on any non-200 answer, before spending time parsing the body.
    if response.status_code != 200:
      print(url)
      print(f"Request error!\n{response}\nLoop stopped.")
      break

    html_doc = BeautifulSoup(response.content, 'html.parser')

    # The "no results" paragraph marks the end of the available result pages.
    stop_sign = html_doc.find('p', class_=no_results_class)
    if stop_sign is not None:
      print(f'Found stop_sign: {stop_sign}')
      print('Stop trigger activated.')
      print(f'Stopped at search page {page}')
      break

    for card in html_doc.find_all('article', class_='col-md-12'):
      title       = card.find('h3')
      date_author = card.find('p')
      link        = card.find('a')
      href        = link.get('href') if link is not None else None

      # A missing element is stored as None instead of raising, so one
      # malformed card neither aborts the run nor desynchronises the lists.
      titles.append(title.text if title is not None else None)
      dates_authors.append(date_author.text if date_author is not None else None)
      links.append(f'{base_url}{href}' if href else None)

  # Checks lists lengths to ensure syncing
  a = len(titles)
  b = len(dates_authors)
  c = len(links)
  print(f'Length of scraped titles list = {a}')
  print(f'Length of scraped dates and authors list = {b}')
  print(f'Length of scraped links list = {c}')
  if not (a == b == c):
    print('Error!\nLists with different lengths.\nNo Data Frame created.')
    return None

  print(f'Scraped data lists with same lengths: {a} items.\nCreating Data Frame.')
  scraped_data_df = pd.DataFrame({
    'Titles'        : titles,
    'Dates_Authors' : dates_authors,
    'Links'         : links,
  })
  # os.path.join uses the platform's separator; the previous hard-coded
  # backslash only produced a file inside save_path on Windows.
  output_file = os.path.join(save_path, f'scraping_gazeta_de_alagoas-{keyword}.xlsx')
  scraped_data_df.to_excel(output_file, index=False)
  return scraped_data_df
