In [13]:
#!pip install -r ../requirements.txt
#!pip install selenium
#!pip install beautifulsoup4
#!pip install lxml
#!C:\ProgramData\anaconda3\python.exe -m pip install seleniumbase

In [14]:
"""
Scraping class for various scraping-related functions.

Methods:
    __init__: Initialize the Scraping object.
    __del__: Destructor for the Scraping object.
    encode_code: Encode code as base64.
    decode_code: Decode base64-encoded code.
    decode_picture: Decode base64-encoded picture.
    get_result_meta: Get metadata for a given URL.
    take_screenshot: Take a screenshot of the browser window.
    get_real_url: Get the real URL after any redirects.

"""

import json
import base64
from bs4 import BeautifulSoup

from urllib import request
from urllib.parse import urlsplit
from urllib.parse import urlparse
import urllib.parse
import socket

import os
import inspect

import uuid #used to generate random file names

import time

class Scraping:
    """
    Stateless helper bundling the scraping utilities used by the scrapers:
    base64 encoding/decoding of page source, URL metadata lookup,
    screenshots and redirect resolution.
    """

    def __init__(self):
        """
        Initialize the Scraping object. The object holds no state.
        """

    def __del__(self):
        """
        Destructor for the Scraping object; prints a notice when it runs.
        """
        print('Helper object destroyed')

    def encode_code(self, code):
        """
        Encode code as base64.

        Args:
            code (str): Code to encode. Characters that cannot be encoded as
                UTF-8 are dropped.

        Returns:
            bytes: Base64-encoded UTF-8 representation of the code.
        """
        return base64.b64encode(code.encode('utf-8', 'ignore'))

    def decode_code(self, value):
        """
        Decode base64-encoded code.

        Args:
            value (str or bytes): Base64-encoded code.

        Returns:
            str: Decoded code after a round trip through BeautifulSoup's
            "html.parser", or the literal "decoding error" if decoding or
            parsing fails (the error is printed).
        """
        try:
            raw_code = base64.b64decode(value)
            return str(BeautifulSoup(raw_code, "html.parser"))
        except Exception as e:
            print(str(e))
            return "decoding error"

    def decode_picture(self, value):
        """
        Convert a picture buffer into an ASCII string.

        Args:
            value: Object exposing ``tobytes()`` whose bytes are ASCII text
                (presumably a memoryview holding the base64 screenshot;
                TODO confirm against the caller).

        Returns:
            str: The buffer content decoded as ASCII.
        """
        return value.tobytes().decode('ascii')

    def get_result_meta(self, url):
        """
        Get metadata for a given URL.

        Args:
            url (str): URL to get metadata for.

        Returns:
            dict: ``{"ip": ..., "main": ...}`` where "ip" is the resolved IPv4
            address of the URL's host ("-1" if it cannot be resolved) and
            "main" is ``scheme://netloc/`` of the URL.
        """
        ip = "-1"
        try:
            # Use .hostname rather than .netloc: netloc still carries the port
            # and any user:password part, which gethostbyname cannot resolve.
            hostname = urlparse(url).hostname
            if hostname:
                ip = socket.gethostbyname(hostname)
        except Exception as e:
            print(str(e))
            ip = "-1"

        try:
            main = '{0.scheme}://{0.netloc}/'.format(urlsplit(url))
        except Exception as e:
            print(str(e))
            main = url

        return {"ip": ip, "main": main}

    def take_screenshot(self, driver):
        """
        Take a screenshot of the browser window.

        The screenshot is written to a uniquely named PNG in the "tmp" folder
        below the current working directory, read back and deleted again.

        Args:
            driver: WebDriver instance (must provide ``maximize_window`` and
                ``save_screenshot``).

        Returns:
            bytes: Base64-encoded content of the screenshot file.
        """
        # Build the path portably and make sure the temporary folder exists.
        screenshot_folder = os.path.join(os.path.abspath(os.getcwd()), "tmp")
        os.makedirs(screenshot_folder, exist_ok=True)
        screenshot_file = os.path.join(screenshot_folder, str(uuid.uuid1()) + ".png")

        time.sleep(2)  # fixed pause before capturing

        driver.maximize_window()  # maximize browser window for screenshot
        try:
            driver.save_screenshot(screenshot_file)
            with open(screenshot_file, 'rb') as image_file:
                return base64.b64encode(image_file.read())
        finally:
            # Never leave the temporary file behind, even if reading failed.
            if os.path.exists(screenshot_file):
                os.remove(screenshot_file)

    @staticmethod
    def get_real_url(url, driver):
        """
        Get the real URL after any redirects.

        Declared as a static method because the signature has no ``self``;
        this way both ``Scraping.get_real_url(url, driver)`` and
        ``instance.get_real_url(url, driver)`` bind the arguments correctly.

        Note that the driver is quit after a successful lookup, so callers
        must pass a driver they no longer need. On failure the driver is not
        quit.

        Args:
            url (str): URL to get the real URL for.
            driver: WebDriver instance.

        Returns:
            str or None: URL reported by the driver after loading ``url``, or
            None if loading failed (the error is printed).
        """
        try:
            driver.get(url)
            time.sleep(4)
            current_url = driver.current_url  # read real url (redirected url)
            driver.quit()
            return current_url
        except Exception as e:
            print(str(e))
            return None

In [15]:
"""
This template provides a framework for creating a custom scraper for the RAT software. This scraper is designed to work with search services that offer search forms. For other types of search systems, modifications to this template may be necessary. Selenium is utilized as the primary tool for web scraping.

The scraper should be capable of returning the following fields:
- `result_title`: The title of the search result snippet.
- `result_description`: The description in the snippet of the result.
- `result_url`: The URL of the search result.
- `serp_code`: The HTML source code of the search result page, useful for further analysis.
- `serp_bin`: A screenshot of the search result page, if needed for additional analysis.
- `page`: The page number of search results, useful for paginated results or scrolling-based systems.

A typical scraper consists of the following functions:
- `run(query, limit, scraping, headless)`: The main function to execute the scraper with the given parameters.
- `get_search_results(driver, page)`: A helper function to retrieve search results from the given page.
- `check_captcha(driver)`: A helper function to check for CAPTCHA or similar blocks and handle them appropriately.

The variables and functionality described here can be adapted according to the specific search engine being scraped.

The search engine scraped in this notebook is CORE (https://core.ac.uk/). Change the parameters according to the search engine you want to scrape.
"""

#library with functions for web scraping

#import external libraries
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

from selenium.common.exceptions import TimeoutException #used to interrupt loading of websites and needed as workaround to download files with selenium
from selenium.webdriver.common.action_chains import ActionChains #used to simulate pressing of a key

from selenium.webdriver.support.ui import Select


import uuid #used to generate random file names

import time #used to do timeout breaks

import os #used for file management

#base64 encoding to convert the source code of webpages
import base64

#BeautifulSoup is necessary to beautify the source code after it has been decoded (especially useful to prevent character errors)
from bs4 import BeautifulSoup
from lxml import html

import random
import inspect
import re

import os
import inspect

# Resolve the extension folder relative to the location of the running code:
# currentdir is the directory of the current frame's file, parentdir is two
# directory levels above it, and ext_path is the folder later handed to the
# driver as its extension directory.
currentdir = os.path.dirname(
    os.path.abspath(inspect.getfile(inspect.currentframe()))
)
parentdir = os.path.dirname(os.path.dirname(currentdir))

ext_path = f"{parentdir}/i_care_about_cookies_unpacked"

from seleniumbase import Driver

def run(query, limit, scraping, headless):
    """
    Run the scraper against CORE (https://core.ac.uk/).

    Opens the start page, submits the query through the search box and then
    requests further result pages by URL until at least ``limit`` unique
    results were collected. Whole pages are appended, so the returned list
    can be longer than ``limit``.

    Args:
        query (str): The search query.
        limit (int): Number of unique results after which pagination stops.
        scraping: The Scraping object (its ``encode_code``,
            ``take_screenshot`` and ``get_real_url`` methods are used).
        headless (bool): If True, runs the browser in headless mode (without GUI).

    Returns:
        list or int: List of results, each
        ``[title, description, url, serp_code, serp_bin, page]``.
        ``-1`` is returned if a CAPTCHA marker is found, if a follow-up page
        yields no results, or if an unexpected exception occurs. If only the
        request for a follow-up page raises, the results collected so far are
        returned.
    """
    # Imported locally so this cell does not depend on another cell's imports.
    from urllib.parse import quote_plus

    # URL and selectors for the search engine
    search_url = "https://core.ac.uk/"  # URL for the search engine
    search_box = "styles-control-12Pze"  # Class name of the search box
    captcha = "g-recaptcha"  # Marker for CAPTCHA in the page source

    # The page counter is incremented before every follow-up request, so the
    # SERP reached through the search box is recorded as page -1 and the first
    # URL-based request uses page=0.
    page = -1
    search_results = []
    driver = None  # lets the cleanup below tell whether a browser was started

    def get_text(result, tag, css_class):
        """Return the stripped text of the first matching child, or "N/A"."""
        try:
            elem = result.find(tag, class_=[css_class])
            if elem:
                return elem.text.strip()
        except Exception:
            pass
        return "N/A"

    def get_search_results(driver, page):
        """
        Retrieve search results from the current page.

        Args:
            driver: Selenium WebDriver instance.
            page (int): Current SERP page.

        Returns:
            list: List of search results from the current page.
        """
        temp_search_results = []

        # Page source and screenshot are shared by every result of this page
        source = driver.page_source
        serp_code = scraping.encode_code(source)
        serp_bin = scraping.take_screenshot(driver)

        soup = BeautifulSoup(source, features="lxml")

        for result in soup.find_all("div", class_=["styles_search-results__2AZDM"]):
            result_title = get_text(result, "h3", "styles-title-1k6Ib")
            result_description = get_text(result, "div", "styles-content-35LN7")
            result_url = "N/A"

            try:
                url_elem = result.find("a")
                href = url_elem.get("href") if url_elem else None
                if href:
                    if "bing." in href:
                        # NOTE(review): carried over from the Bing template.
                        # get_real_url is declared as (url, driver) and quits
                        # the driver it is given, so the scraping driver must
                        # not be passed here. Until it is decided whether
                        # redirects need resolving, fall back to the raw link
                        # instead of storing None as the result URL.
                        try:
                            href = scraping.get_real_url(href) or href
                        except Exception:
                            pass
                    result_url = href
            except Exception:
                pass

            if result_url != "N/A":
                temp_search_results.append(
                    [result_title, result_description, result_url, serp_code, serp_bin, page]
                )

        return temp_search_results

    def check_captcha(driver):
        """
        Check if CAPTCHA is present on the page.

        Args:
            driver: Selenium WebDriver instance.

        Returns:
            bool: True if the CAPTCHA marker occurs in the page source.
        """
        return captcha in driver.page_source

    def remove_duplicates(search_results):
        """
        Removes duplicate search results based on the URL.

        Args:
            search_results (list): List of search results to deduplicate.

        Returns:
            list: Search results with duplicates removed; the first occurrence
            of every URL is kept and the order is preserved.
        """
        seen_urls = set()
        unique_results = []
        for result in search_results:
            url = result[2]
            if url not in seen_urls:
                seen_urls.add(url)
                unique_results.append(result)
        return unique_results

    try:
        # Initialize Selenium driver
        driver = Driver(
            browser="chrome",
            wire=True,
            uc=True,
            headless2=headless,
            incognito=False,
            agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            do_not_track=True,
            undetectable=True,
            extension_dir=ext_path,
            locale_code="de"
        )

        driver.maximize_window()
        driver.set_page_load_timeout(20)
        driver.implicitly_wait(30)
        driver.get(search_url)
        time.sleep(random.randint(2, 5))

        # A CAPTCHA on the start page means nothing can be scraped
        if check_captcha(driver):
            return -1

        search = driver.find_element(By.CLASS_NAME, search_box)  # Find search box
        search.send_keys(query)  # Enter search query
        search.send_keys(Keys.RETURN)  # Submit search
        time.sleep(random.randint(2, 5))  # Wait for results

        search_results = get_search_results(driver, page)
        results_number = len(search_results)

        # The query is part of the follow-up URLs, so it has to be escaped;
        # otherwise spaces, "&" or "#" in the query would corrupt the request.
        encoded_query = quote_plus(query)

        # Loop through pages until limit is reached or CAPTCHA appears
        while results_number < limit:
            if check_captcha(driver):
                search_results = -1
                break

            time.sleep(random.randint(2, 5))
            page += 1
            try:
                next_page_url = f"https://core.ac.uk/search/?q={encoded_query}&page={page}"  # Next page URL
                print(next_page_url)
                driver.get(next_page_url)
                extract_search_results = get_search_results(driver, page)
                print(f"Results extracted: {len(extract_search_results)}")

                if extract_search_results:
                    print("Appending results.")
                    search_results = remove_duplicates(search_results + extract_search_results)
                    results_number = len(search_results)
                else:
                    # NOTE(review): an empty follow-up page discards everything
                    # collected so far, whereas the exception path below keeps
                    # it. Kept as is because callers may rely on -1 here.
                    search_results = -1
                    break
            except Exception as e:
                print(f"Failed to get next page: {e}")
                break

        return search_results

    except Exception as e:
        print(f"Exception occurred: {e}")
        return -1

    finally:
        # Single cleanup point: close the browser on every exit path
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass

In [16]:
#Function to test the scraper. It shows all scraped details, or an error message if scraping fails.

def test_scraper(query, limit, scraper, headless):
    search_results = run(query, limit, scraper, headless)

    i = 0
    if search_results != -1:
        for sr in search_results:
            i+=1
            print(i)
            print(sr[0])
            print(sr[1])
            print(sr[2])
    else:
        print("Scraping failed")

In [19]:
#Initialise the scraper. Change the parameters below to test your scraper.
scraper = Scraping() #helper object handed to run() for encoding and screenshots

query = "test" #search query
#Number of results after which the scraper stops requesting further pages.
#Whole result pages are appended, so more results than this can be returned
#(not all search engines deliver a fixed number of results on every SERP).
limit = 10



In [20]:
#Run the scraper with the parameters defined above, without a visible browser window, and print the results
test_scraper(query, limit, scraper, headless=True)

Helper object destroyed
1
Test 2214: Kubota M7-132
ABOUT THE TEST REPORT AND USE OF THE DATA The test data contained in this report are a tabulation of the results of a series of tests. Due to the restricted format of these pages, only a limited amount of data and not all of the tractor specifications are included. The full OECD report contains usually about 30 pages of data and specifications. The test data were obtained for each tractor under similar conditions and therefore, provide a means of comparison of performance based on a limited set of reported data. EXPLANATION OF THE TEST PROCEDURES Purpose The purpose of the tests in this booklet, and available test reports is to provide users with data for comparisons of performance among tractor models. General Tractors are tested at the University of Nebraska according to test procedures of the OECD (Organization of Economic Cooperation and Development), the SAE (Society of Automotive Engineers) International and the ASABE (American S