In [14]:
#### Imports
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import re
import sqlite3
from pathlib import Path
import string
import my_utils
import os
import requests


## NOTE: the "publication" hyperlink on the Addgene website only points to an internal Addgene record page, so only the citation text is pulled for now

#### Code to download plasmid details from the addgene website using the identified plasmid IDs
## Run parameters ##
plasmid_ID = "102679"
sleep_time = 5  # seconds to pause after each page navigation
url = 'https://www.addgene.org/'
plasmid_url = f"{url}{plasmid_ID}"
driver_path = ".//chromedriver-win64//chromedriver-win64//chromedriver.exe"
output_dir = f"../0.local/scrape-addgene/plasmids/{plasmid_ID}"

## Local folder that receives everything scraped for this plasmid ##
os.makedirs(output_dir, exist_ok=True)  # no error if the folder is already there
print(f"Created output directory for plasmid info at {output_dir}")

## Start a Chrome session driven by the chromedriver binary at driver_path ##
service = Service(driver_path)
driver = webdriver.Chrome(service=service)

## Load the plasmid page and give it a fixed pause to finish rendering ##
print(plasmid_url)
driver.get(plasmid_url)
time.sleep(sleep_time)

#### Instantiate the temporary storage
# Field names of one plasmid record, in the order they are written to the output file.
_record_fields = (
    "addgene_ID",
    "plasmid_name",
    "plasmid_purpose",
    "depositor",
    "vector_backbone",
    "backbone_size",
    "total_size",
    "insert_size",
    "vector_type",
    "selectable_markers",
    "bacterial_resistance",
    "growth_temperature",
    "growth_strain",
    "copy_number",
    "gene_insert_name",
    "species",
    "genbank_ID",
    "entrez_gene",
    "tag_fusion_protein",
    "cloning_method",
    "citation",
    "commonly_requested_with",
)

# Temporary record for the plasmid being scraped; every field starts out as None.
current_info = dict.fromkeys(_record_fields)

# Container meant to collect the records of several plasmids.
plasmid_info = []

#### Pull the info from the webpage  ####
# Record key -> label text handed to my_utils.parse_fields.
# The entries are looked up in this order.
_field_labels = {
    "vector_backbone": "Vector backbone",
    "backbone_size": "Backbone size",
    "total_size": "Total vector size",
    "vector_type": "Vector type",
    "selectable_markers": "Selectable markers",
    "bacterial_resistance": "Bacterial Resistance",
    "growth_temperature": "Growth Temperature",
    "growth_strain": "Growth Strain",
    "copy_number": "Copy number",
    "gene_insert_name": "Gene/Insert name",
    "species": "Species",
    "genbank_ID": "GenBank ID",
    "entrez_gene": "Entrez Gene",
    "tag_fusion_protein": "Fusion Protein",
    "cloning_method": "Cloning method",
}
# Fields whose parsed value is converted to int before it is stored.
_integer_fields = ("backbone_size", "total_size")

current_info["addgene_ID"] = plasmid_ID
for _record_key, _label_text in _field_labels.items():
    _parsed = my_utils.parse_fields(_label_text, driver)
    if _record_key in _integer_fields:
        _parsed = int(_parsed)
    current_info[_record_key] = _parsed

# The insert size is derived: total vector size minus backbone size.
current_info["insert_size"] = current_info["total_size"] - current_info["backbone_size"]

## plasmid name ##
# The name is taken from the title of the page currently loaded in the driver.
current_info["plasmid_name"] = plasmid_name = driver.title

## plasmid purpose ##
# Start from None so that a page without a "Purpose" field stores None instead of
# raising NameError (or silently reusing a value left over from an earlier run).
plasmid_purpose = None
labels = driver.find_elements(By.CLASS_NAME, "field-label")  # all field-label elements on the page
for label in labels:
    if label.text.strip() == "Purpose":
        # The value sits in the field-content div that follows the label as a sibling
        content = label.find_element(By.XPATH, 'following-sibling::div[@class="field-content"]')
        plasmid_purpose = content.text.strip()
        break
current_info["plasmid_purpose"] = plasmid_purpose

## depositing lab ##
# Start from None so that a page without a "Depositing Lab" field stores None instead
# of raising NameError (or silently reusing a value left over from an earlier run).
depositor = None
labels = driver.find_elements(By.CLASS_NAME, "field-label")
for label in labels:
    if label.text.strip() == "Depositing Lab":
        # The lab name is the text of the link inside the sibling field-content div
        content_div = label.find_element(By.XPATH, 'following-sibling::div[@class="field-content"]')
        link = content_div.find_element(By.TAG_NAME, "a")
        depositor = link.text.strip()
        break
print("depositor:", depositor, "\n")
current_info["depositor"] = depositor


## citation ##
# find_elements returns an empty list (instead of raising) when the page has no <cite> tag
cite_elements = driver.find_elements(By.TAG_NAME, "cite")
full_text = cite_elements[0].text if cite_elements else None

# Store the citation in the record; it was previously extracted but never saved.
# Whitespace is collapsed so the value stays on one line of the tab-delimited output file.
current_info["citation"] = " ".join(full_text.split()) if full_text else None

# Extract DOI: the first whitespace-delimited token that follows "doi:"
doi = None
if full_text and "doi:" in full_text:
    doi_tokens = full_text.split("doi:")[1].split()
    if doi_tokens:  # guards against a citation that ends right after "doi:"
        doi = doi_tokens[0]
# Display parsed info
print("DOI:", doi)

## commonly requested with ##
# Collect every <a class="material-name"> element that sits inside a .panel-body container
links = driver.find_elements(By.CSS_SELECTOR, ".panel-body a.material-name")
# A numeric ID is the run of digits enclosed by slashes in the link target
_id_pattern = re.compile(r'/(\d+)/')
_id_matches = (_id_pattern.search(anchor.get_attribute("href")) for anchor in links)
# Keep only the links whose href contained such an ID, converted to int
digits = [int(found.group(1)) for found in _id_matches if found]
current_info["commonly_requested_with"] = digits

## write the collected record to <output_dir>/<plasmid_ID>.txt
plasmid_info_file = f"{output_dir}/{plasmid_ID}.txt"

# One "field<TAB>value" line per record entry
rows = [f"{field}\t{entry}\n" for field, entry in current_info.items()]
with open(plasmid_info_file, "w", encoding="utf-8") as out_handle:
    out_handle.writelines(rows)

print(f"Dictionary written to {plasmid_info_file}")

#### Download the plasmid's GenBank sequence file, then close the driver ####
# try/finally guarantees the browser session is shut down even when the sequences
# page or the download fails, so no Chrome/chromedriver process is left behind.
try:
    # pause before leaving the plasmid page
    time.sleep(sleep_time)

    # The sequences page is the plasmid URL followed by "/sequences"
    sequences_url = plasmid_url + "/sequences"
    print(sequences_url)
    driver.get(sequences_url)
    time.sleep(sleep_time)  # Wait for the page to load

    # --- Find the GenBank download link ---
    genbank_link = driver.find_element(By.CSS_SELECTOR, "a.genbank-file-download")
    gbk_url = genbank_link.get_attribute("href")

    # --- Derive the local filename from the last path segment of the URL ---
    # Any query string or fragment is dropped first: "?" is not a legal character
    # in a Windows filename and is not part of the file's name anyway.
    filename = gbk_url.split("?", 1)[0].split("#", 1)[0].split("/")[-1]
    gbk_output_path = os.path.join(output_dir, filename)

    # --- Download the .gbk file using requests ---
    # The timeout stops a stalled connection from hanging the run forever;
    # raise_for_status stops an HTTP error page from being saved as the .gbk file.
    response = requests.get(gbk_url, timeout=60)
    response.raise_for_status()
    with open(gbk_output_path, "wb") as f:
        f.write(response.content)

    print(f"Downloaded .gbk file to: {gbk_output_path}")

    print(current_info)
    print("Completed scrape")
finally:
    #### Close the driver ####
    driver.quit()



Created output directory for plasmid info at ../0.local/scrape-addgene/plasmids/102679
https://www.addgene.org/102679
Called parse_fields function with target text: Vector backbone
Called parse_fields function with target text: Backbone size
Called parse_fields function with target text: Total vector size
Called parse_fields function with target text: Vector type
Called parse_fields function with target text: Selectable markers
Called parse_fields function with target text: Bacterial Resistance
Called parse_fields function with target text: Growth Temperature
Called parse_fields function with target text: Growth Strain
Called parse_fields function with target text: Copy number
Called parse_fields function with target text: Gene/Insert name
Called parse_fields function with target text: Species
Called parse_fields function with target text: GenBank ID
Called parse_fields function with target text: Entrez Gene
Called parse_fields function with target text: Fusion Protein
Called parse_fie