# STFV - Web scraping the GRDB

## Useful links

1. Graded Ring Database - Smooth toric Fano varieties search: http://www.grdb.co.uk/search/toricsmooth

2. Stack Overflow - Selenium in Google Colab: https://stackoverflow.com/questions/51046454/how-can-we-use-selenium-webdriver-in-colab-research-google-com

3. Tutorials Point - Python web scraping dynamic websites: https://www.tutorialspoint.com/python_web_scraping/python_web_scraping_dynamic_websites.htm

4. Stack Overflow - Finding elements by class name (and CSS selector): https://stackoverflow.com/questions/30002313/selenium-finding-elements-by-class-name-in-python

5. Selenium Python Docs - Locating elements: https://selenium-python.readthedocs.io/locating-elements.html

## Install Selenium and import webdriver

Adapted from link 2 in the list above (Stack Overflow - Selenium in Google Colab).

In [None]:
!pip install selenium
!apt-get update # to update ubuntu to correctly run apt install
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
from selenium import webdriver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
wd = webdriver.Chrome('chromedriver',chrome_options=chrome_options)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Get:2 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:6 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:7 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Get:8 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:9 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:11 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Get:12 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [83.3 kB]
Hit:13 http://ppa.l

  if sys.path[0] == '':


## Web scraping

In [None]:
# Import libraries
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from time import sleep

# Function to pause and then expand every "More details" entry on the page
def sleep_and_click_more_details_buttons(t):
  """Sleep for t seconds, then click each "More details" button found.

  The buttons are looked up once, after the pause. Each one is clicked only
  after an explicit wait (up to 100 seconds) reports it as clickable, because
  clicking without that wait occasionally fails. Any exception raised by the
  lookup, the wait or the click propagates to the caller.
  """
  sleep(t)
  for button in wd.find_elements(By.CSS_SELECTOR, "li[title='More details']"):
    # Wait until the button can actually be clicked before clicking it
    clickable = WebDriverWait(wd, 100).until(EC.element_to_be_clickable(button))
    clickable.click()

# Main function
def main():
  """Scrape the GRDB smooth toric Fano search results into out.txt.

  For every result page, all "More details" buttons are clicked (retrying
  with increasing sleep times), the text of each row element "row<id>" is
  appended to out.txt followed by a blank line, and the next page is opened.

  Raises:
    Exception: whatever Selenium raises if the buttons still cannot be
      clicked on the final retry, or if a row / page button is not found.
  """
  # Layout of the search results: rows with ids 1..8635, ten per page
  n_polytopes = 8635
  page_size = 10
  n_pages = 864

  # Sleep times (in seconds) tried in order when clicking the buttons fails
  sleep_times = (0, 4, 16, 64)

  # Get website
  wd.get("http://www.grdb.co.uk/search/toricsmooth")
  wd.implicitly_wait(100)
  wd.maximize_window()

  # Open output file; the with-block closes it even if scraping fails midway
  with open("out.txt", "a") as f:
    for page in range(n_pages):
      # Try to click all "More details" buttons with increasing sleep times.
      # Only ordinary errors trigger a retry, so Ctrl-C still interrupts.
      for t in sleep_times[:-1]:
        try:
          sleep_and_click_more_details_buttons(t)
          break
        except Exception:
          continue
      else:
        # Last attempt: let any error propagate to the caller
        sleep_and_click_more_details_buttons(sleep_times[-1])

      # Find all polytopes on page and append data to file
      first_id = page * page_size + 1
      last_id = min(first_id + page_size - 1, n_polytopes)
      for polytope_id in range(first_id, last_id + 1):
        row = wd.find_element(By.ID, "row" + str(polytope_id))
        f.write(row.text)
        f.write("\n\n")

      # Go to next page (page numbers on the site are 1-based)
      if page < n_pages - 1:
        go_to_button = wd.find_element(By.CSS_SELECTOR, "li[title='Go to page " + str(page + 2) + "']")
        go_to_button.click()

# Run the scraper only when this code is executed directly (as a script or
# notebook cell), not when it is imported as a module
if __name__ == "__main__":
    main()


## Error correction

In [None]:
# Function to read a file of polytope strings separated by blank lines
def _read_polytope_strings(path):
  """Return the list of polytope strings stored in the file at path.

  Records are separated by "\n\n". If the file ends with "\n\n" (as a file
  written one record + "\n\n" at a time does), exactly one trailing
  terminator is dropped so that it does not produce a spurious empty record.
  Empty records elsewhere are kept, so record positions are preserved.
  """
  with open(path, "r") as f:
    text = f.read()
  if text.endswith("\n\n"):
    text = text[:-2]
  return text.split("\n\n")

# Open first input file and find initial corrupt polytopes
# (a polytope string is considered corrupt when it lacks its "Dual" line)
polytope_strings1 = _read_polytope_strings("in1.txt")
initial_corrupt_polytopes_indexes = [
  i for i, ps in enumerate(polytope_strings1) if "Dual" not in ps
]

print("# Corrupt polytopes in first input file before execution:", len(initial_corrupt_polytopes_indexes), "\n")

# Open second input file and replace corrupt polytopes with the string at the
# same position in the second file, when that one exists and is not corrupt
polytope_strings2 = _read_polytope_strings("in2.txt")

for i in initial_corrupt_polytopes_indexes:
  if i < len(polytope_strings2) and "Dual" in polytope_strings2[i]:
    polytope_strings1[i] = polytope_strings2[i]

# Open and write output file, also count and print final corrupt polytopes
print("Corrupt polytopes in output file after execution:", "\n")

final_corrupt_polytopes_count = 0

with open("out.txt", "w") as f3:
  for ps in polytope_strings1:
    if "Dual" not in ps:
      final_corrupt_polytopes_count += 1
      print(ps, "\n")
    f3.write(ps + "\n\n")

print("# Corrupt polytopes in output file after execution:", final_corrupt_polytopes_count)


# Corrupt polytopes in first input file before execution: 21 

Corrupt polytopes in output file after execution: 

Final corrupt polytope:
7675
Smooth toric Fano 6-fold X = X(Q) with degree (-KX)6 = 40320
Vol(Q): 80
#Vertices: 13
#Facets: 80 

# Corrupt polytopes in output file after execution:  1


## Parsing

In [None]:
# Import libraries
import re
import numpy as np

# Function to parse a polytope string and check its integrity
def parse_polytope(string, index):
  """Parse one polytope record and validate it against its position.

  Args:
    string: text of a single record in the format described by `pattern`
      (id line, description line, optional barycentre lines, Vol(Q),
      #Vertices, #Facets, Vertices and Dual lines).
    index: 0-based position of the record; its id must equal index + 1.

  Returns:
    The tuple (id, centrally_symmetric, dim, deg, zero_barycentre,
    zero_dual_barycentre, vol, n_facets, n_vertices, dual, vertices), where
    dim is the exponent printed after "(-KX)" and deg the value after "=".
    The two barycentre flags are False when their lines are absent.
    Note that the facet count / dual points are deliberately returned in
    the positions before the vertex count / vertices.

  Raises:
    ValueError: if the string does not match the expected format, if the id
      does not equal index + 1, or if the number of parsed points disagrees
      with the declared #Vertices / #Facets.
  """
  def parse_points(s):
    # "(a,b), (c,d)" -> integer array with one row per point
    return np.array([[int(c) for c in p[1:-1].split(",")] for p in s.split(", ")])

  match = prog.match(string)
  if match is None:
    raise ValueError(
      "Polytope string at index {} does not match the expected format: {!r}".format(index, string))
  groups = match.groups()

  polytope_id = int(groups[0])
  if polytope_id != index + 1:
    raise ValueError(
      "Polytope id {} does not match its position (expected {})".format(polytope_id, index + 1))

  centrally_symmetric = groups[1] == "centrally symmetric "
  dim, deg = map(int, groups[2:4])
  # The barycentre lines are optional; a missing line gives None, i.e. False
  zero_barycentre = groups[4] == "true"
  zero_dual_barycentre = groups[5] == "true"
  vol, n_vertices, n_facets = map(int, groups[6:9])

  vertices = parse_points(groups[9])
  if n_vertices != np.shape(vertices)[0]:
    raise ValueError(
      "Polytope {}: expected {} vertices, parsed {}".format(polytope_id, n_vertices, np.shape(vertices)[0]))
  dual = parse_points(groups[10])
  if n_facets != np.shape(dual)[0]:
    raise ValueError(
      "Polytope {}: expected {} dual points, parsed {}".format(polytope_id, n_facets, np.shape(dual)[0]))

  # n_vertices and vertices have been transposed with n_facets and dual, respectively
  return polytope_id, centrally_symmetric, dim, deg, zero_barycentre, zero_dual_barycentre, vol, n_facets, n_vertices, dual, vertices

# Define and compile regular expression used to parse a polytope string
pattern = r'''(\d+)
Smooth ([^\n]+)?toric Fano (?:[^\n]+) X = X\(Q\) with degree \(-KX\)(\d+) = (\d+)(?:
Zero barycentre: (\w+))?(?:
Zero dual barycentre: (\w+))?
Vol\(Q\): (\d+)
#Vertices: (\d+)
#Facets: (\d+)
Vertices: ([^\n]+)
Dual: ([^\n]+)'''
prog = re.compile(pattern)

# Read all polytope strings; the with-block makes sure the file is closed
with open("in.txt", "r") as f:
  text = f.read()

# Records are separated by "\n\n"; drop one trailing terminator, if present,
# so that it does not produce an empty record that cannot be parsed
if text.endswith("\n\n"):
  text = text[:-2]
polytope_strings = text.split("\n\n")

# Loop through all polytope strings and parse them; each record's position
# is passed along so that its id can be checked
polytopes = [parse_polytope(ps, i) for i, ps in enumerate(polytope_strings)]

# Save data as .npy binary file ("data.npy"). The array has dtype=object, so
# it is stored pickled and must be loaded with np.load(..., allow_pickle=True)
data = np.array(polytopes, dtype=object)
np.save("data", data)