In [1]:
import concurrent.futures
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Function to safely extract text from a BeautifulSoup Tag or return None if the Tag is None
def get_text_safe(tag, split_text=None, index=None, strip=True):
    if tag:
        text = tag.get_text(strip=strip)
        if split_text and index is not None:
            try:
                return text.split(split_text)[index]
            except IndexError:
                return None
        return text
    return None

# Function to extract details from a webpage using Selenium
def extract_details_from_webpage(url):
    with webdriver.Chrome(service=Service(ChromeDriverManager().install())) as driver:
        driver.get(url)
        wait = WebDriverWait(driver, 10)
        wait.until(EC.visibility_of_element_located((By.TAG_NAME, "body")))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        data = {
            'License Number': None,
            'Org/Last Name': None,
            'First Name': None,
            'Middle Name': None,
            'Address Line 1': None,
            'Address Line 2': None,
            'County': None,
            'Original Issue Date': None,
            'Expiration Date': None,
            'License Status': None,
            'URL': url  # Include the original URL
        }
        
        
        # Extract License Number
        lic_detail_elem = soup.find(id="licDetail")
        if lic_detail_elem:
            lic_detail_text = lic_detail_elem.get_text(strip=True)
            # Splitting the text at ':' and taking the part after it
            data['License Number'] = lic_detail_text.split(':')[1].strip() if ':' in lic_detail_text else lic_detail_text


        # Extract Org/Last Name, First Name, Middle Name
        name_elem = soup.find(id="name")
        if name_elem:
            name_parts = name_elem.get_text(strip=True).split('Name:')[-1].strip().split(',')
            data['Org/Last Name'] = name_parts[0].strip() if name_parts else None
            first_name_parts = name_parts[1].strip().split() if len(name_parts) > 1 else [None]
            data['First Name'] = first_name_parts[0] if first_name_parts else None
            data['Middle Name'] = ' '.join(first_name_parts[1:]) if len(first_name_parts) > 1 else None

        # Extract Address Line 1, City, County, State
        address_elem = soup.find(id="address")
        if address_elem:
            paragraphs = address_elem.find_all('p', class_='wrapWithSpace')
            if paragraphs and len(paragraphs) > 1:
                address_lines = paragraphs[1].get_text(separator='|').split('|')

                # Assign lines to respective fields
                data['Address Line 1'] = address_lines[0].strip() if len(address_lines) > 0 else None
                data['Address Line 2'] = address_lines[1].strip() if len(address_lines) > 1 else None
                # Remove 'county' from the third line and assign it to 'County'
                data['County'] = address_lines[2].replace('county', '').strip() if len(address_lines) > 2 else None



        # Extract Original Issue Date
        data['Original Issue Date'] = get_text_safe(soup.find(id="issueDate"))

        # Extract Expiration Date
        data['Expiration Date'] = get_text_safe(soup.find(id="expDate"))

        # Extract License Status
        status_elem = soup.find(id="primaryStatus")
        if status_elem:
            status_classes = ["status_1", "status_2", "status_3", "status_4", "status_5", "status_6", "status_7", "status_8", "status_9", "status_10", "status_11", "status_12", "status_13", "status_14", "status_15", "status_16", "status_17", "status_18", "status_19", "status_20", "status_21", "status_22", "status_23", "status_24", "status_25", "status_26", "status_27", "status_28", "status_29", "status_30", "status_31", "status_32", "status_33", "status_34", "status_35", "status_36", "status_37", "status_38", "status_39"]
            for status_class in status_classes:
                status = status_elem.find(class_=status_class)
                if status:
                    data['License Status'] = status.get_text(strip=True)
                    break

        return data


In [2]:
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument("--headless")  # Run Chrome in headless mode
chrome_options.add_argument('--disable-gpu')  # Disable GPU (optional, might improve performance)
chrome_options.add_argument("--disable-extensions")  # Disable extensions
chrome_options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems
chrome_options.add_argument("--no-sandbox")  # Bypass OS security model

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)


In [4]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
import time

# Assuming extract_details_from_webpage is defined elsewhere

# Function to process a batch of URLs
def process_batch(urls):
    with ThreadPoolExecutor(max_workers=7) as executor:
        future_to_url = {executor.submit(extract_details_from_webpage, url): url for url in urls}
        results = []
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
                results.append(data)
            except Exception as e:
                print(f"Error scraping {url}: {e}")
        return results

# Read URLs from CSV file
df_websites = pd.read_csv('missing_links_civil.csv')
base_url = "https://search.dca.ca.gov"

# Prepend base URL to each link in the DataFrame
urls_to_scrape = [base_url + link for link in df_websites['More Detail Link']]

# Define batch size and delay
batch_size = 200
delay_between_batches = 10  # seconds

# Initialize CSV file for all details
all_details_file = "details_civil.csv"
# Ensure the file is empty and write headers
pd.DataFrame().to_csv(all_details_file, index=False)

# Process URLs in batches with a delay between each batch
for i in range(0, len(urls_to_scrape), batch_size):
    batch_urls = urls_to_scrape[i:i + batch_size]
    batch_results = process_batch(batch_urls)
    
    # Append batch results to CSV, without headers if file is not empty
    pd.DataFrame(batch_results).to_csv(all_details_file, mode='a', header=not bool(i), index=False)
    
    # Remove processed links from df_websites and update CSV
    df_websites = df_websites[~df_websites['More Detail Link'].isin(batch_urls)]
    df_websites.to_csv('updated_details_civil_batch.csv', index=False)

    # Delay before processing the next batch
    if i + batch_size < len(urls_to_scrape):  # Skip delay after last batch
        time.sleep(delay_between_batches)


Error scraping https://search.dca.ca.govhttps://search.dca.ca.gov/details/7500/C/7208/c34a40a8b32008e62ff305515877d0a5: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=121.0.6167.162)
Stacktrace:
	GetHandleVerifier [0x004E1673+52979]
	(No symbol) [0x00467961]
	(No symbol) [0x0034DD3D]
	(No symbol) [0x0033239A]
	(No symbol) [0x003AA71B]
	(No symbol) [0x003BACB6]
	(No symbol) [0x003A4286]
	(No symbol) [0x0037C063]
	(No symbol) [0x0037CECD]
	GetHandleVerifier [0x007F8D83+3294723]
	GetHandleVerifier [0x00836CC2+3548482]
	GetHandleVerifier [0x00831C9C+3527964]
	GetHandleVerifier [0x0057870E+671630]
	(No symbol) [0x00471EB4]
	(No symbol) [0x0046D808]
	(No symbol) [0x0046D92D]
	(No symbol) [0x0045F7E0]
	BaseThreadInitThunk [0x766CFCC9+25]
	RtlGetAppContainerNamedObjectPath [0x77117C6E+286]
	RtlGetAppContainerNamedObjectPath [0x77117C3E+238]

Error scraping https://search.dca.ca.govhttps://search.dca.ca.gov/details/7500/C/710

In [3]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
import time

# Assuming extract_details_from_webpage is defined elsewhere

# Function to process a batch of URLs
def process_batch(urls):
    with ThreadPoolExecutor(max_workers=7) as executor:
        future_to_url = {executor.submit(extract_details_from_webpage, url): url for url in urls}
        results = []
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
                results.append(data)
            except Exception as e:
                print(f"Error scraping {url}: {e}")
        return results

# Read URLs from CSV file
df_websites = pd.read_csv('missing_links_civil.csv')
base_url = "https://search.dca.ca.gov"

# Prepend base URL to each link in the DataFrame
urls_to_scrape = [link for link in df_websites['More Detail Link']]

# Define batch size and delay
batch_size = 200
delay_between_batches = 10  # seconds

# Initialize CSV file for all details
all_details_file = "details_civil.csv"
# Ensure the file is empty and write headers
pd.DataFrame().to_csv(all_details_file, index=False)

# Process URLs in batches with a delay between each batch
for i in range(0, len(urls_to_scrape), batch_size):
    batch_urls = urls_to_scrape[i:i + batch_size]
    batch_results = process_batch(batch_urls)
    
    # Append batch results to CSV, without headers if file is not empty
    pd.DataFrame(batch_results).to_csv(all_details_file, mode='a', header=not bool(i), index=False)
    
    # Remove processed links from df_websites and update CSV
    df_websites = df_websites[~df_websites['More Detail Link'].isin(batch_urls)]
    df_websites.to_csv('updated_details_civil_batch.csv', index=False)

    # Delay before processing the next batch
    if i + batch_size < len(urls_to_scrape):  # Skip delay after last batch
        time.sleep(delay_between_batches)
