In [108]:
from pathlib import Path
from copy import deepcopy

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from mersearch.helpers import smiles_to_mol_file_format

In [109]:
def scroll_shim(passed_in_driver, object):
    """Allows firefox driver to scroll to elements without throwing an error.

    Scrolls the window to the element's page coordinates and then scrolls
    back up by 120 pixels (presumably so a fixed navigation bar does not
    cover the element).

    Args:
        passed_in_driver: driver exposing ``execute_script(script)``.
        object: element exposing a ``location`` mapping with ``'x'`` and ``'y'``.
    """
    # Read the location once: on a live element every access is a separate
    # lookup, so a single read keeps x and y consistent with each other.
    location = object.location
    scroll_by_coord = f"window.scrollTo({location['x']},{location['y']});"
    scroll_nav_out_of_way = 'window.scrollBy(0, -120);'
    passed_in_driver.execute_script(scroll_by_coord)
    passed_in_driver.execute_script(scroll_nav_out_of_way)

In [110]:
# Firefox profile directory. Path.home() is needed here: pathlib never expands a
# literal '~', so Path('~') / ... would yield the relative path '~/Library/...'.
profile_path = str(Path.home() / 'Library' / 'Application Support' / 'Firefox' / 'Profiles' / 'pwofczxw.default')
PROXY_HOST = "12.12.12.123"
PROXY_PORT = "1234"
options = Options()
# NOTE(review): set_preference() writes a browser preference; 'profile' does not
# look like a real preference name, so this probably does not make Firefox load
# the profile at profile_path -- confirm and switch to a profile option if needed.
options.set_preference('profile', profile_path)
# Send HTTP traffic through PROXY_HOST:PROXY_PORT.
# NOTE(review): no network.proxy.ssl* preference is set, so HTTPS requests
# presumably bypass the proxy -- confirm this is intended.
options.set_preference("network.proxy.type", 1)
options.set_preference("network.proxy.http", PROXY_HOST)
# PROXY_PORT is kept as a string above, so convert it for the port preference.
options.set_preference("network.proxy.http_port", int(PROXY_PORT))
# Preferences meant to make the session look less like an automated browser.
# NOTE(review): 'useAutomationExtension' and 'excludeSwitches' look like Chrome
# options rather than Firefox preferences -- verify they have any effect here.
options.set_preference("dom.webdriver.enabled", False)
options.set_preference('useAutomationExtension', False)
options.set_preference("excludeSwitches", "enable-automation")
# Path of the geckodriver executable used to start Firefox.
service = Service(r'/usr/local/bin/geckodriver')

driver = webdriver.Firefox(service=service, options=options)

In [111]:
# Sanity check: show which WebDriver class was instantiated.
print(driver.__class__)

<class 'selenium.webdriver.firefox.webdriver.WebDriver'>


In [112]:
# Visit the VWR structure-search page first: loading it logs the session in
# automatically, which is required before a search can be performed.
url = "https://us.vwr.com/store/search/searchMol.jsp"
driver.get(url=url)

In [113]:
# Navigate to the actual search site (https://vwr.emolecules.com/index.php)
# by clicking the punchout button on the VWR page.
button_link = WebDriverWait(driver=driver, timeout=30).until(EC.presence_of_element_located((By.ID, "emolPunchout")))
button = button_link.find_element(By.TAG_NAME, 'input')
scroll_shim(driver, button)
actions = ActionChains(driver)
# scroll to the button
actions.move_to_element(button_link).perform()


def _click_until_stale(_driver):
    """Click the punchout button once; report True once the element has gone stale.

    A StaleElementReferenceException means the page holding the button has been
    replaced, i.e. the click finally registered and navigation started.
    """
    try:
        button.click()
    except StaleElementReferenceException:
        return True
    return False


# Keep clicking until the click is actually registered. WebDriverWait pauses
# between attempts and raises TimeoutException after 30 seconds, so a click that
# never takes effect can no longer spin in an endless loop.
WebDriverWait(driver=driver, timeout=30).until(_click_until_stale)

In [114]:
# Find the "Import Molfile" button and click it. Wait until it is clickable
# (displayed and enabled), not merely present in the DOM, so the click cannot
# fire before the element is interactable.
import_molfile_button = WebDriverWait(driver=driver, timeout=30).until(EC.element_to_be_clickable((By.XPATH, '//span[@title="Import Molfile"]')))
import_molfile_button.click()

In [115]:
# Locate the ChemWriter widget, then drill down through its content pane to the
# textarea that receives the molfile text.
chemwriter = WebDriverWait(driver, 30).until(
    EC.presence_of_element_located((By.CLASS_NAME, "chemwriter"))
)
molfile_pane = chemwriter.find_element(by=By.CLASS_NAME, value='content')
molfile_textarea = molfile_pane.find_element(by=By.TAG_NAME, value='textarea')

In [116]:
# SMILES of the structure to search for. Only one assignment is kept active;
# the previous version assigned `smiles` twice in a row, so the first value was
# silently discarded. Other structures that have been tried:
#   'O=C1CCCCCCCCCCCCCCO1'
#   'O=COC(=O)C1(C)CCCC2(C)C3CC(=O)OCC3CCC12'
smiles = 'CCN(C)S(=O)(=O)c1ccc(Cl)c([N+](=O)[O-])c1'
# Replace whatever the textarea holds with the molfile text for this structure.
molfile_textarea.clear()
molfile_textarea.send_keys(smiles_to_mol_file_format(smiles))

In [117]:
# The first <button> inside the ChemWriter widget writes the pasted molfile
# into the editor; locate it and press it.
button = chemwriter.find_element(by=By.TAG_NAME, value='button')
button.click()

In [118]:
# Run the exact structure search. Wait until the search button is clickable
# (displayed and enabled), not merely present in the DOM, before clicking it.
button = WebDriverWait(driver=driver, timeout=30).until(EC.element_to_be_clickable((By.NAME, "ex_button")))
button.click()

In [89]:
# Open the first structure in the hit list by clicking the image in the
# left-hand action column of its row.
try:
    hitlist = WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.CLASS_NAME, "hitlist-table"))
    )
    first_row = hitlist.find_element(By.ID, 'row_0.0')
    row_actions = first_row.find_element(By.CLASS_NAME, 'compound_actions_left')
    row_actions.find_element(By.TAG_NAME, 'img').click()
except NoSuchElementException:
    # One of the nested lookups found nothing for this structure.
    print("No element, smiles failed")

In [90]:
# Wait for the compound detail view to render before snapshotting the page:
# parsing right after the click can capture the page before the tables exist.
WebDriverWait(driver=driver, timeout=30).until(EC.presence_of_element_located((By.ID, "properties_table")))
soup = BeautifulSoup(driver.page_source, 'lxml')

In [92]:
# Parse the compound's property table into a list of
# {'property': ..., 'value': ...} records.
table = soup.find("table", attrs={"id": "properties_table"})
table_data = table.tbody.find_all("tr")
properties_data = []
# The first row is the "properties" header row, so start at index 1.
for tr in table_data[1:]:
    td = tr.find_all("td")
    # Skip rows that do not carry a property/value pair (e.g. spacer rows)
    # instead of failing with an IndexError.
    if len(td) < 2:
        continue
    properties_data.append({
        # get_text() rather than .string: .string is None whenever a cell holds
        # more than one child node, which would crash on .strip().
        'property': td[0].get_text(strip=True),
        'value': td[1].get_text(strip=True)
    })

In [93]:
# Parse the known-names table into a list of
# {'name_type': ..., 'name': ...} records.
name_data = []
table = soup.find("div", attrs={"id": "name_table"})
table = table.find("table", attrs={"class": "data_table"})
table_data = table.tbody.find_all("tr")
# The first row is the "known names" header row, so start at index 1.
for tr in table_data[1:]:
    td = tr.find_all("td")
    # Skip rows that do not carry a type/name pair instead of failing with
    # an IndexError.
    if len(td) < 2:
        continue
    name_data.append({
        # get_text() rather than .string: .string is None whenever a cell holds
        # more than one child node, which would crash on .strip().
        # The label cell ends with a colon (e.g. "CAS:"), which is dropped.
        'name_type': td[0].get_text(strip=True).strip(':'),
        'name': td[1].get_text(strip=True)
    })

In [94]:
# Parse the supplier table into a list of
# {'supplier': ..., 'supplier_id': ...} records.
supplier_data = []
table = soup.find("div", attrs={"id": "supplier_table"})
table = table.find("table", attrs={"class": "data_table"})
table_data = table.tbody.find_all("tr")
# The first row is the "source / compound id" header row, so start at index 1.
for tr in table_data[1:]:
    td = tr.find_all("td")
    # Skip rows that do not carry a supplier/id pair instead of failing with
    # an IndexError.
    if len(td) < 2:
        continue
    supplier_data.append({
        # get_text() rather than .string: .string is None whenever a cell holds
        # more than one child node, which would crash on .strip().
        'supplier': td[0].get_text(strip=True).strip(':'),
        'supplier_id': td[1].get_text(strip=True)
    })

In [95]:
# Find and click the "more info" button to reveal prices. Wait until it is
# clickable (displayed and enabled), not merely present in the DOM.
WebDriverWait(driver=driver, timeout=30).until(EC.element_to_be_clickable((By.ID, "add_item_0"))).click()

In [96]:
# Block until the price table is in the DOM, then parse the page once.
# WebDriverWait pauses between polls and raises TimeoutException after 30
# seconds, instead of re-parsing the whole page in a tight loop that never
# ends if the table does not appear.
WebDriverWait(driver=driver, timeout=30).until(EC.presence_of_element_located((By.CSS_SELECTOR, "table.bbpricetable")))
soup = BeautifulSoup(driver.page_source, 'lxml')

In [97]:
# Pull the price table out of the parsed page.
table = soup.find("table", class_="bbpricetable")

In [98]:
# Every <tr> under the price table's first <tbody>.
table_data = table.find("tbody").find_all("tr")

In [99]:
# Flatten the price table into one record per supplier / pack size:
# {'supplier', 'supplier_id', 'amount', 'units', 'price'}.
price_data = []
# Supplier fields from the most recent full (8-cell) row; the shorter
# continuation rows that follow belong to the same supplier and reuse them.
current_supplier = None
for tr in table_data:
    td = tr.find_all('td')
    if len(td) == 8:
        # First price row for a supplier: supplier info plus one pack size.
        current_supplier = {
            'supplier': td[0].text.strip(),
            'supplier_id': td[1].text.split('Name')[0].strip(),
        }
        amount_cell, units_cell, price_cell = td[5], td[6], td[7]
    elif len(td) == 5 and current_supplier is not None:
        # Additional pack size for the supplier seen in the last 8-cell row.
        amount_cell, units_cell, price_cell = td[2], td[3], td[4]
    else:
        # 1 cell = large section header, 7 cells = column headers; any other
        # shape (or a continuation row with no supplier yet) is not a price row.
        continue
    # A fresh dict per record, always with the same 'units' key. The previous
    # code wrote 'unit' for continuation rows, which left the preceding row's
    # stale 'units' value in the copied record.
    price_data.append({
        **current_supplier,
        'amount': amount_cell.text.strip(),
        'units': units_cell.text.strip(),
        'price': price_cell.text.strip(),
    })

In [102]:
# Go back to the eMolecules search start page.
driver.get(url='https://vwr.emolecules.com/index.php')

In [100]:
# Bundle independent copies of the scraped lists into a single record, so
# later changes to the working lists do not alter it.
data = dict(
    property_data=deepcopy(properties_data),
    name_data=deepcopy(name_data),
    supplier_data=deepcopy(supplier_data),
    price_data=deepcopy(price_data),
)

In [101]:
# Show the assembled record as this cell's output.
data

{'property_data': [{'property': 'MWt', 'value': '240.387'},
  {'property': 'MF', 'value': 'C15H28O2'}],
 'name_data': [{'name_type': 'SMILES', 'name': 'O=C1CCCCCCCCCCCCCCO1'},
  {'name_type': 'CAS', 'name': '106-02-5'},
  {'name_type': 'Name', 'name': '1-oxacyclohexadecan-2-one'},
  {'name_type': 'Name', 'name': '15-Pentadecanolactone'},
  {'name_type': 'Name', 'name': '?-Pentadecalactone'},
  {'name_type': 'Name', 'name': '??-Pentadecalactone'},
  {'name_type': 'Name', 'name': 'Cyclopentadecanolide'},
  {'name_type': 'Name', 'name': 'Oxacyclohexadecan-2-one'},
  {'name_type': 'Name', 'name': 'PENTALIDE'},
  {'name_type': 'Name', 'name': 'Pentadecanolide'},
  {'name_type': 'Name', 'name': 'w-Pentadecalactone'},
  {'name_type': 'Name', 'name': '34219'},
  {'name_type': 'Name', 'name': '34220'},
  {'name_type': 'Name', 'name': '96104'},
  {'name_type': 'Name', 'name': 'A11341'},
  {'name_type': 'Name', 'name': 'A553352'},
  {'name_type': 'Name', 'name': 'AA003TI0'},
  {'name_type': 'Name

In [103]:
# End the whole WebDriver session. close() only closes the current window and
# leaves the session and its geckodriver process running; quit() shuts both down.
driver.quit()

In [107]:
# Show the molfile text the helper produces for a larger polycyclic structure.
molfile_text = smiles_to_mol_file_format('CC1(C)C(=O)CCC2(C)C3CCC4(C)C(C5COC(=O)C5)CCC4(C)C3=CCC12')
print(molfile_text)


     RDKit          2D

 29 33  0  0  0  0  0  0  0  0999 V2000
   -5.2643   -4.8991    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -6.2284   -3.7500    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -7.1926   -4.8991    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -7.5275   -3.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -8.8265   -3.7500    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
   -7.5275   -1.5000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -6.2284   -0.7500    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -4.9294   -1.5000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -4.9294   -0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -3.6304   -0.7500    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -3.6304    0.7500    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -2.3313    1.5000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -1.0323    0.7500    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -0.8755    2.2418    0

In [119]:
# Show the molfile text the helper produces for the macrocyclic lactone.
smiles = 'O=C1CCCCCCCCCCCCCCO1'
molfile_text = smiles_to_mol_file_format(smiles)
print(molfile_text)


     RDKit          2D

 17 17  0  0  0  0  0  0  0  0999 V2000
    5.3444    0.0000    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
    3.8444    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    3.5517   -1.4712    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    2.7184   -2.7184    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    1.4712   -3.5517    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -0.0000   -3.8444    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -1.4712   -3.5517    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -2.7184   -2.7184    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -3.5517   -1.4712    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -3.8444    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -3.5517    1.4712    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -2.7184    2.7184    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -1.4712    3.5517    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    3.8444    0