In [None]:
#### Installs ####
# perform in terminal:
# pip install selenium

# ChromeDriver download (example link for Chrome 131 -- the driver version must match the installed Chrome version)
# https://storage.googleapis.com/chrome-for-testing-public/131.0.6778.264/win64/chromedriver-win64.zip

# Imports
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
import re
import sqlite3
from pathlib import Path
import string

In [None]:
# Set up the ChromeDriver service
# NOTE: errors are likely when the chromedriver and Chrome versions differ
# Chrome version at time of writing: 135.0.7049.42 (Official Build) (64-bit)
# Latest drivers are listed at: https://googlechromelabs.github.io/chrome-for-testing/
# Matching driver: https://storage.googleapis.com/chrome-for-testing-public/135.0.7049.42/win64/chromedriver-win64.zip

# Location of the chromedriver executable, relative to the notebook's working directory
driver_path = ".//chromedriver-win64//chromedriver-win64//chromedriver.exe"

# Wrap the executable in a Service object; the browser itself is launched later by the scraping cell
service = Service(executable_path=driver_path)

In [None]:
# Open the SQLite database used by this notebook, creating the file if it does not exist yet.

# Path to the database file (single source of truth for both the check and the connection)
db_path = Path("addgene_database.db")

# sqlite3.connect() creates a missing file by itself, so the existence check
# is only needed to report which of the two cases applies.
if db_path.is_file():
    print(f"File '{db_path}' already exists.")
else:
    print(f"File '{db_path}' does not exist. Creating database file...")

# Connect through db_path rather than a second hard-coded literal, so the
# file that was checked and the file that is opened can never diverge.
conn = sqlite3.connect(db_path)

# TODO: no schema is created in this cell yet -- add the CREATE TABLE / INSERT
#       statements for the scraped plasmid data here.
# NOTE: `conn` is left open for later cells; call conn.commit() and
#       conn.close() once all writes are finished.


In [None]:
#### Accumulator for the plasmid IDs (5-6 digit strings) gathered by the scraper
target_plasmid_IDs = list()

# Search alphabet: every ASCII letter, digit and punctuation mark, plus a single space
keyboard_chars = "".join((string.ascii_letters, string.digits, string.punctuation, ' '))

In [None]:
#### Loop through the addgene catalog ####
## NOTE: the addgene website only provides 600 pages of search results per query,
##       so a single query cannot enumerate everything; a workaround is still needed
## NOTE: a better workaround may be to search with a dictionary of molecular biology
##       terms, gene names, and species names instead of single characters

# Crawl settings
ADDGENE_HOME_URL = 'https://www.addgene.org/'
HOME_LOAD_WAIT_S = 2       # pause after opening the home page
RESULTS_LOAD_WAIT_S = 5    # pause after every search / filter / page change
MAX_RESULT_PAGES = 600     # the site stops serving results after this many pages

# Plasmid links look like <a href="/185404/">...</a> or <a href="/87906/">...</a>.
# Only the ID segment is captured, so digits elsewhere in the URL cannot leak into the ID.
# NOTE(review): only 5-6 digit IDs are matched (same range as before) -- confirm
#               whether shorter IDs exist and widen the pattern if they do.
PLASMID_HREF_RE = re.compile(r"addgene\.org/(\d{5,6})/")


def _extract_plasmid_ids(hrefs):
    """Return the plasmid ID strings found in an iterable of href values.

    Hrefs that are empty/None or that do not point at a plasmid page are
    skipped. Duplicates are kept; the caller de-duplicates.
    """
    found = []
    for href in hrefs:
        match = PLASMID_HREF_RE.search(href or "")
        if match:
            found.append(match.group(1))
    return found


def _collect_ids_on_page(driver):
    """Return the plasmid IDs linked from the page currently shown by `driver`."""
    anchors = driver.find_elements(By.XPATH, "//a[@href]")
    return _extract_plasmid_ids(anchor.get_attribute('href') for anchor in anchors)


# Start the browser BEFORE entering try/finally: if startup fails, the real error
# surfaces instead of a NameError from driver.quit() in the finally clause.
driver = webdriver.Chrome(service=service)

try:
    print("Scanning with the following list of characters:", keyboard_chars, "\n")

    #### Upper loop: one search per character
    for char in keyboard_chars:

        try:
            print("Scanning current character:", char, "\n")

            # Open the home page and wait for it to load
            driver.get(ADDGENE_HOME_URL)
            time.sleep(HOME_LOAD_WAIT_S)

            # Search bar: <input id="search-text-input" class="suggest-input" name="q" ...>
            search_bar = driver.find_element(By.ID, 'search-text-input')
            search_bar.send_keys(char)
            search_bar.send_keys(Keys.RETURN)
            time.sleep(RESULTS_LOAD_WAIT_S)

            # Restrict the results to the plasmids subcategory.
            # A search without results has no such element; find_element then
            # raises and the handler below moves on to the next character.
            inplasmids_bar = driver.find_element(By.XPATH, "//span[@class='leaf-label' and text()='in Plasmids']")
            inplasmids_bar.click()
            time.sleep(RESULTS_LOAD_WAIT_S)

            # First page of results
            page_ids = _collect_ids_on_page(driver)
            print("Current number of cleaned IDs:", len(page_ids), "\n")
            # dict.fromkeys removes duplicates while keeping first-seen order
            target_plasmid_IDs = list(dict.fromkeys(target_plasmid_IDs + page_ids))
            print("Current number of target plasmid IDs:", len(target_plasmid_IDs), "\n")

            #### Lower loop: follow "Next" through the remaining result pages
            # <a class="page-link" href="/search/catalog/plasmids/?q=gfp&amp;page_number=2">Next</a>
            for page_number in range(2, MAX_RESULT_PAGES + 1):

                # find_elements returns an empty list (instead of raising) when
                # there is no "Next" link, i.e. the last page has been reached
                next_links = driver.find_elements(By.LINK_TEXT, "Next")
                if not next_links:
                    break

                print(f"Char: {char}, Page Number: {page_number}")
                # Click exactly once per iteration so that no page is skipped
                next_links[0].click()
                time.sleep(RESULTS_LOAD_WAIT_S)

                page_ids = _collect_ids_on_page(driver)
                target_plasmid_IDs = list(dict.fromkeys(target_plasmid_IDs + page_ids))
                print("Number of unique IDs:", len(target_plasmid_IDs), "\n")

        except Exception as exc:
            # Best effort: report what went wrong for this character and continue.
            # KeyboardInterrupt is not an Exception, so the crawl can still be stopped by hand.
            print(f"Exception encountered ({type(exc).__name__}: {exc}), proceeding. \n")

####
finally:
    # Close the WebDriver
    print("Operation complete.")
    driver.quit()

