In [None]:
## Setting
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time, csv, os

# Chrome executable that the browser options point at via `binary_location`.
chrome_binary = "/home/jupyter/ADAPT_PCR_share/safe/tools/chrome_tmp/opt/google/chrome/google-chrome"
# chromedriver executable handed to the Selenium `Service`.
chromedriver_path = "/home/jupyter/ADAPT_PCR_share/safe/tools/chrome_tmp/chromedriver-linux64/chromedriver"
# Listing URL prefix; the crawl appends the page number after the trailing `p=`.
# NOTE(review): `species=124` presumably selects human primer pairs — confirm.
BASE_URL = "https://www.origene.com/catalog/gene-expression/qpcr-primer-pairs?species=124&p="
# CSV file that scraped rows (Gene, NM, Forward, Reverse, URL) are appended to.
OUTPUT_CSV = '/home/jupyter/ADAPT_PCR_share/safe/resources/origene_primers_0731.csv'
# Text file with one already-processed product link per line (resume state).
VISITED_FILE = '/home/jupyter/ADAPT_PCR_share/safe/resources/visited_links.txt'

In [None]:
# Build the Selenium Chrome driver shared by the scraping cells.
options = Options()
options.binary_location = chrome_binary

# Command-line switches passed to Chrome, applied in this order.
for chrome_flag in ("--headless=new", "--no-sandbox", "--disable-dev-shm-usage"):
    options.add_argument(chrome_flag)

service = Service(chromedriver_path)
driver = webdriver.Chrome(options=options, service=service)

In [None]:
# Resume state: links that a previous run already scraped, one per line in
# VISITED_FILE. A missing file simply means nothing has been processed yet.
visited = set()
if os.path.exists(VISITED_FILE):
    with open(VISITED_FILE, "r") as f:
        # Skip blank lines so they are neither stored as '' nor counted as links.
        visited = {line.strip() for line in f if line.strip()}
    print(f"[INFO] {len(visited)} links are already processed (skip)")

# Write the header row when the CSV is missing, or when it exists but is empty
# (e.g. it was created and the run stopped before any row was written).
if not os.path.exists(OUTPUT_CSV) or os.path.getsize(OUTPUT_CSV) == 0:
    with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Gene", "NM", "Forward", "Reverse", "URL"])

!head -5 $OUTPUT_CSV

In [None]:
def scrape_detail_page(url):
    """Load one product page and pull out the gene, accession and primer sequences.

    The page is opened with the module-level ``driver``; after a one-second
    pause its source is parsed with BeautifulSoup.

    Args:
        url: Address of the product detail page.

    Returns:
        A ``(gene_name, nm, forward, reverse)`` tuple of strings.

        * ``gene_name`` is the text before the first ``'Human'`` in the part
          of the ``<meta name="title">`` content that precedes the first ``|``.
        * ``nm`` is the last whitespace-separated token of that same part with
          its first and last characters removed (presumably the surrounding
          parentheses — verify against a live page).
        * ``forward`` / ``reverse`` are the stripped texts of the ``<td>``
          cells whose ``data-th`` is ``"Forward Sequence"`` /
          ``"Reverse Sequence"``, or ``""`` when a cell is absent.

        Four empty strings are returned when the page has no
        ``<meta name="title">`` tag carrying a ``content`` attribute.

    Raises:
        IndexError: If the title part before ``|`` contains no tokens.
    """
    driver.get(url)
    time.sleep(1)  # fixed pause before reading the page source
    page = BeautifulSoup(driver.page_source, "html.parser")

    title_tag = page.find("meta", {"name": "title"})
    if not title_tag or not title_tag.has_attr("content"):
        return '', '', '', ''

    headline = title_tag["content"].split('|')[0].strip()
    gene_name = headline.split('Human')[0].strip()
    nm = headline.split()[-1][1:-1]

    # Both primer sequences live in table cells identified by their data-th label.
    sequences = []
    for label in ("Forward Sequence", "Reverse Sequence"):
        cell = page.find("td", {"data-th": label})
        sequences.append(cell.get_text(strip=True) if cell else "")
    forward, reverse = sequences

    return gene_name, nm, forward, reverse

In [None]:
# Crawl the listing pages one by one, scrape every product page that has not
# been processed yet, and append each result to OUTPUT_CSV immediately so an
# interrupted run loses at most the page currently being scraped.
#
# `visited` is the resume set loaded from VISITED_FILE in an earlier cell; it
# is deliberately NOT reset here, otherwise every run would re-scrape all
# links and append duplicate rows to the CSV.
page_num = 1
previous_links = None  # links found on the previous listing page

while True:
    page_url = BASE_URL + str(page_num)
    print(f"[*] page {page_num} processing...")
    driver.get(page_url)
    time.sleep(2)  # fixed pause before reading the listing's page source

    soup = BeautifulSoup(driver.page_source, "html.parser")
    hrefs = [a['href'] for a in soup.select(".product-item a[href]")]
    # Make links absolute *before* de-duplicating so relative and absolute
    # forms of the same URL collapse; dict.fromkeys keeps the page order.
    product_links = list(dict.fromkeys(
        href if href.startswith("http") else "https://www.origene.com" + href
        for href in hrefs
        if not href.endswith('citations')
    ))

    # Stop conditions: without them the loop would request pages forever.
    if not product_links:
        print(f"[INFO] page {page_num} has no product links; stopping")
        break
    if product_links == previous_links:
        # NOTE(review): presumably the site serves the last page again for
        # out-of-range page numbers — confirm against the live site.
        print(f"[INFO] page {page_num} repeats the previous page; stopping")
        break
    previous_links = product_links

    for link in product_links:
        if link in visited:
            continue

        try:
            gene, nm, forward, reverse = scrape_detail_page(link)
            if not (gene or nm or forward or reverse):
                # Nothing could be extracted (e.g. the page had not loaded).
                # Do not write a blank row or mark the link as visited, so a
                # later run retries it.
                print(f"[!] No data found on {link}; not marked as visited")
                continue

            with open(OUTPUT_CSV, "a", newline="", encoding="utf-8") as f:
                writer = csv.writer(f)
                writer.writerow([gene, nm, forward, reverse, link])

            # Record the link only after its row is safely on disk.
            visited.add(link)
            with open(VISITED_FILE, "a") as f:
                f.write(link + "\n")

            print(f"[+] Saved: {gene}")

        except Exception as e:
            # Best effort: report the failure and move on; the link stays
            # unvisited and is retried on the next run.
            print(f"[!] Error parsing {link}: {e}")
            continue

    page_num += 1