In [1]:
import os
import re
from collections import OrderedDict

import bs4
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [12]:
class SoupScraper():
    def __init__(self, url) -> None:
        self.url = url
        self.soup = None

    def parse(self):
        response = requests.get(self.url)
        self.soup = bs4.BeautifulSoup(response.text, 'html.parser')

    def scrape_title(self):
        # extracts title from the bs4 object and stores it in the info dictionary
        if not self.soup:
            raise NameError('The url is not parsed.')

        self.title = self.soup.find('meta', {'name': 'citation_title'})['content']

class SeleniumScraper():
    def __init__(self, url, driver_path=None) -> None:
        self.url = url
        self.driver_path = driver_path
        self.soup = None
        self.title = None

    def parse(self):
        # loads a specific web page, scrapes HTML and stores it in a beautiful soup object
        service = Service(executable_path=ChromeDriverManager().install())

        # Load default profile
        options = webdriver.ChromeOptions()
        #options.add_argument("--headless")

        driver = webdriver.Chrome(service=service, options=options)
        driver.get(self.url)

        source = driver.page_source

        driver.quit()

        self.soup = bs4.BeautifulSoup(source, "html.parser")
    
    def scrape_title(self):
        # extracts title from the bs4 object and stores it in the info dictionary
        if not self.soup:
            raise NameError('The url is not parsed.')

        self.title = self.soup.find('meta', {'name':"citation_title"})["content"]
    
    def scrape_table(self):
        # extracts variable/value pairs from tables in the bs4 object and stores them in the info dictionary
        tables = self.soup.find_all('table')
        print("p")
        print(tables)
        for table in tables:
            #print(table)
            print("found ")
            variables = [variable.contents for variable in table.thead.tr.find_all('th') if variable.contents]
            values = []
            trs = table.tbody.find_all('tr')

            next = False
            for tr in trs:
                if next:
                    variables += [variable.contents for variable in tr.find_all('td') if variable.contents]
                    # print(variables)
                    next = False
                else:
                    values += [value.contents[0] for value in tr.find_all('td') if value.contents]
                    # print(values)
                    next = True

            print(variables)
            print(values)

            if len(variables) != len(values):
                continue

            for i in range(len(variables)):
                try:
                    float(values[i])
                except ValueError:
                    continue

                # name, unit, value = '', '', float(values[i])
                # print(variables[i])

                if isinstance(variables[i][0], bs4.element.Tag):
                    # replace mathml with python
                    mathml = variables[i][0].find('script', attrs={'id': re.compile(r'^MathJax')})
                    # print(mathml)
                    if mathml:
                        mathml_expr = '<math xmlns="http://www.w3.org/1998/Math/MathML">' + mathml.contents[0][6:]
                        #tex_expr = self.converter.mml2tex(mathml_expr)[0]
                        #ascii_expr = self.converter.tex2ascii(tex_expr)
                        # python_expr = self.converter.ascii2python(ascii_expr)
                        #variables[i][0] = ascii_expr

                variables[i] = [str(v) for v in variables[i]]

                expr = ''.join(variables[i])

                # separate units
                if expr[-1] == ')':
                    idx = expr.rfind(' (')
                    if idx <= 0:
                        name = expr
                        unit = 'Unknown'
                    else:
                        name = expr[:idx]
                        unit = expr[idx+1:]
                else:
                    name = expr
                    unit = 'Unknown'

                #name = self.converter.name_post(name)
                #unit = self.converter.unit_post(unit)

                if not name[0].isdigit():
                    self.info['var_dict'][' '.join([name, unit])] = float(values[i])

            # print(variables)
            # print(values)
            # print(self.info['var_dict'])

# soup_scraper = SoupScraper("https://www.sciencedirect.com/science/article/pii/S0749641904001664")
# soup_scraper.parse()
# soup_scraper.scrape_title()
# print(soup_scraper.soup)







In [14]:
sel_scraper = SeleniumScraper(url="https://www.sciencedirect.com/science/article/pii/S0749641904001664", driver_path="C:\\Users\\kyanj\\AppData\\Local\\Google\\Chrome\\User Data")
sel_scraper.parse()

In [15]:
sel_scraper.scrape_title()
sel_scraper.scrape_table()
print(sel_scraper.title)

p
[]
An investigation of the effects of solution heat treatment on mechanical properties for AA 6xxx alloys: experimentation and modelling


In [18]:
XX = 1.2
YY = 1.1
ZZ = 0.1

value = XX
value = value - YY
print(f"{value:.16f}")

0.0999999999999999
