In [27]:
# Confirm which Selenium release is installed; the recorded output below shows 4.3.0.
import selenium
selenium.__version__

'4.3.0'

In [28]:
# Basic module that allows us to operate Chrome with Python Code
from selenium import webdriver

# Modules for downloading the ChromeDriver binary that Selenium needs in order to control Chrome
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromeService

# Module for selecting HTML elements
from selenium.webdriver.common.by import By

In [29]:
# Resolve the driver executable via webdriver_manager and wrap it in a
# Selenium service object that the later cells pass to webdriver.Chrome().
service = ChromeService(ChromeDriverManager().install())

# Run Chrome without a visible window.
# The flag is passed explicitly instead of through the `options.headless`
# setter: the setter is deprecated and absent from later Selenium 4 releases,
# whereas add_argument('--headless') adds the same flag on every version.
# NOTE(review): the later cells use the Page.printToPDF DevTools command, which
# presumably requires headless mode - confirm before removing this flag.
options = webdriver.ChromeOptions()
options.add_argument('--headless')




In [30]:
# Use for gathering and exporting search result information
import pandas as pd

# Used for stripping characters that Windows reserves in file names from the webpage's title before PDF export.
import re

# Use for controlling waiting time in web scraping
import time

# Use for exporting webpage to PDF
from base64 import b64decode

# Use for creating output directories
from os import makedirs

In [31]:
# Subject of the search; combined with each keyword below to form one query.
name = 'Bernard Madoff'

# Terms that are searched together with the name, one query per entry.
keywords = [
    'money laundering',
    'fraud',
    'corruption',
    'tax evasion',
    'arrest',
    'charge',
    'convict',
]

In [32]:
# Search DuckDuckGo once per keyword, save each results page as a PDF under
# output/summary/, and collect every hit (title, abstract, url) into
# df_search_results with a 1-based index and a `duplicated` flag on the url.

# Imported here so this cell stays runnable on its own; quote_plus URL-encodes
# the query so characters such as '&', '#' or '+' cannot corrupt the URL.
from urllib.parse import quote_plus

makedirs('output/summary/', exist_ok=True)

driver = webdriver.Chrome(service=service, options=options)

# One dict per search hit. The DataFrame is built once after the loop:
# DataFrame.append is deprecated (removed in pandas 2.0) and growing a frame
# row by row is quadratic.
search_records = []

try:

    for keyword_index, keyword in enumerate(keywords, start=1):

        # Spaces become '+', exactly as before; other reserved characters are escaped.
        search_string = quote_plus(name + ' ' + keyword)

        driver.get(f'https://duckduckgo.com/?q={search_string}')

        # Print the results page to PDF through the DevTools protocol;
        # the payload comes back base64-encoded under the 'data' key.
        pdf_encoded = driver.execute_cdp_cmd('Page.printToPDF', {})

        pdf_decoded = b64decode(pdf_encoded['data'], validate=True)

        keyword_file_name = (str(keyword_index) + ' ' + keyword + '.pdf').replace(' ','_')

        with open(f'output/summary/{keyword_file_name}', 'wb') as f:

            f.write(pdf_decoded)

        search_results = driver.find_elements(By.XPATH,'//article[@data-nrn]')

        for result in search_results:

            search_records.append({
                'name': name,
                'keyword': keyword,
                'title': result.find_element(By.XPATH,'./div/h2/a/span').text,
                # The first matching span is skipped; the remaining ones are joined into the abstract.
                'abstract': ' '.join([element.text for element in result.find_elements(By.XPATH,'./div/div/span')[1:]]),
                'url': result.find_element(By.XPATH,'./div/h2/a').get_attribute('href'),
            })

        # Pause between queries.
        time.sleep(5)

finally:

    # quit() ends the whole browser session even when a query fails part-way;
    # close() would only close the current window.
    driver.quit()

# Explicit columns keep the frame well-formed (and `duplicated` computable)
# even when no search produced any hit.
df_search_results = pd.DataFrame(search_records, columns=['name', 'keyword', 'title', 'abstract', 'url'])

df_search_results.index = range(1, len(df_search_results)+1)

df_search_results.index.name = 'index'

# True for every repeat of a url that already appeared in an earlier row.
df_search_results['duplicated'] = df_search_results.duplicated(subset=['url'])

In [None]:
# Visit every non-duplicated search hit, save the page as a PDF under
# output/results/, record the outcome per row in a `captured` column
# ('Success', 'Fail' or 'N/A' for duplicates) and export the summary to Excel.

makedirs('output/results/', exist_ok=True)

# Upper bound on the sanitised title used in a file name, so that very long
# page titles cannot exceed file-name length limits.
MAX_TITLE_LENGTH = 150

driver = webdriver.Chrome(service=service, options=options)

captured = []

try:

    for result_index, result in df_search_results.to_dict(orient='index').items():

        # Duplicated urls were already handled on their first occurrence.
        if result['duplicated']:

            captured.append('N/A')

            continue

        try:

            driver.get(result['url'])

            # Print the page to PDF through the DevTools protocol (base64 payload under 'data').
            pdf_encoded = driver.execute_cdp_cmd('Page.printToPDF', {})

            pdf_decoded = b64decode(pdf_encoded['data'], validate=True)

            # Replace everything except ASCII letters and digits with spaces, then collapse runs of whitespace.
            sterilized_title = re.sub('\\s+', ' ', re.sub('[^a-zA-Z0-9]', ' ', result['title']))

            result_file_name = (str(result_index) + '_' + sterilized_title[:MAX_TITLE_LENGTH] + '.pdf')

            with open(f'output/results/{result_file_name}', 'wb') as f:

                f.write(pdf_decoded)

            captured.append('Success')

        except Exception as error:

            # Best-effort capture: one failing page must not abort the run, but the
            # reason is reported. `Exception` (not a bare except) lets
            # KeyboardInterrupt/SystemExit stop the loop.
            print(f"Failed to capture result {result_index} ({result['url']}): {error!r}")

            captured.append('Fail')

finally:

    # quit() ends the whole browser session even if the loop is interrupted;
    # close() would only close the current window.
    driver.quit()

df_search_results['captured'] = captured

df_search_results.to_excel('output/summary/search_summary.xlsx')