In [11]:
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import json

# Configure and launch the Edge browser session used by the scraper below.
edge_options = Options()
for _switch in (
    "--window-size=1920,1080",
    "--disable-extensions",     # extensions off, to improve performance
    "--disable-gpu",            # no GPU hardware acceleration
    "--disable-dev-shm-usage",  # overcome limited resource issues
    "--no-sandbox",             # bypass the OS security model
):
    edge_options.add_argument(_switch)
edge_options.page_load_strategy = 'normal'  # block until the page has fully loaded

# Driver binary location is machine-specific.
service = Service("C:/Users/Alex/Downloads/edgedriver_win64 (1)/msedgedriver.exe")
driver = webdriver.Edge(options=edge_options, service=service)

# Scrape the clinic table shown in a Vaadin grid on the Arkansas Department of
# Health page, then write the rows to CSV (plus a raw JSON backup).
#
# The grid is read by scrolling it in small steps from inside the browser and
# collecting whatever cells are rendered after each step; presumably the grid
# only keeps rows near the viewport in the DOM, which is why a single read is
# not enough (NOTE(review): confirm against the Vaadin grid in use).

PAGE_URL = (
    "https://healthy.arkansas.gov/programs-services/diseases-conditions/"
    "covid-19/covid-19-vaccines/covid-19-vaccination-clinics-locations/"
)

# Fallback row estimate used when the grid does not report a size.
DEFAULT_ROW_ESTIMATE = 500

# Number of consecutive chunks past the estimated end that may add no new rows
# before the scrape is considered finished.
MAX_STALE_CHUNKS = 2

# Returns the grid's reported row count, 0 when there is no grid, or the
# default estimate when the grid exposes no usable size property.
ROW_COUNT_SCRIPT = """
const grid = document.querySelector('vaadin-grid');
if (!grid) return 0;
if (grid._effectiveSize) return grid._effectiveSize;
if (grid.size) return grid.size;
return 500; // default estimate
"""

# Scrolls the grid one row at a time through [startIdx, startIdx + chunkSize)
# and returns the distinct rows seen, as a list of {header: cell text} objects,
# or {error: "..."} on failure.  The two indices are supplied by Selenium via
# ``arguments`` so this text is built once instead of once per chunk.  The
# script returns a promise; the driver is expected to resolve it before handing
# the value back to Python.
CHUNK_SCRIPT = """
return (async function (startIdx, chunkSize) {
    const grid = document.querySelector('vaadin-grid');
    if (!grid) return { error: "Grid not found" };

    // Group every currently rendered cell into row objects keyed by header.
    function extractVisibleData() {
        const headers = Array.from(document.querySelectorAll('vaadin-grid-sorter'))
            .map(h => h.textContent.trim())
            .filter(h => h);
        if (headers.length === 0) return [];

        const numCols = headers.length;
        const allCells = Array.from(document.querySelectorAll('vaadin-grid-cell-content'));
        const rows = [];
        // Only complete groups of numCols cells are turned into rows.
        for (let i = 0; i + numCols <= allCells.length; i += numCols) {
            const row = {};
            let hasContent = false;
            for (let j = 0; j < numCols; j++) {
                const cellContent = allCells[i + j].textContent.trim();
                row[headers[j] || `Column${j + 1}`] = cellContent;
                if (cellContent) hasContent = true;
            }
            if (hasContent) rows.push(row);
        }
        return rows;
    }

    try {
        grid.scrollToIndex(startIdx);
        await new Promise(resolve => setTimeout(resolve, 500));

        // Consecutive scroll steps see mostly the same rows, so de-duplicate
        // here rather than shipping every repeat back to Python.
        const seen = new Set();
        const collected = [];
        for (let i = 0; i < chunkSize; i++) {
            grid.scrollToIndex(startIdx + i);
            await new Promise(resolve => setTimeout(resolve, 100));
            for (const row of extractVisibleData()) {
                const key = JSON.stringify(row);
                if (!seen.has(key)) {
                    seen.add(key);
                    collected.push(row);
                }
            }
        }
        return collected;
    } catch (e) {
        return { error: e.toString() };
    }
})(arguments[0], arguments[1]);
"""

# Bound before the ``try`` so the ``finally`` clause can still persist whatever
# was collected if the run is interrupted part-way through the loop.
total_data = []
all_data = total_data  # same list; name kept for later notebook cells
saved_complete = False

try:
    # Navigate to the page
    driver.get(PAGE_URL)
    print("Navigated to the page. Waiting for it to load...")

    # Wait until at least one iframe exists, then enter the first one.
    # NOTE(review): assumes the grid lives in the first iframe on the page.
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.TAG_NAME, "iframe"))
    )
    iframes = driver.find_elements(By.TAG_NAME, "iframe")
    print(f"Found {len(iframes)} iframes on the page.")

    driver.switch_to.frame(iframes[0])
    print("Switched to iframe 1.")

    # Wait for the Vaadin grid to load
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.TAG_NAME, "vaadin-grid"))
    )
    print("Vaadin grid found.")

    # Column headers are reported for diagnostics only; a failure here does
    # not stop the scrape because the in-page script reads them itself.
    headers = []
    try:
        header_elements = WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "vaadin-grid-sorter"))
        )
        header_texts = (element.text.strip() for element in header_elements)
        headers = [text for text in header_texts if text]
        print(f"Found headers: {headers}")
    except Exception as e:
        print(f"Error getting headers: {e}")

    # Each chunk sleeps for several seconds inside the browser, so allow
    # scripts up to 5 minutes.
    driver.set_script_timeout(300)

    print("Starting data extraction...")

    # Ask the grid how many rows it holds; fall back to a guess on failure.
    try:
        estimated_rows = int(driver.execute_script(ROW_COUNT_SCRIPT))
        print(f"Estimated row count: {estimated_rows}")
    except Exception as e:
        print(f"Error estimating row count: {e}")
        estimated_rows = DEFAULT_ROW_ESTIMATE

    # Fetch in chunks so that no single script call runs into the timeout.
    chunk_size = 50
    seen_rows = set()
    stale_chunks = 0

    # Scan up to twice the estimate in case the reported size is too low.
    for start_idx in range(0, estimated_rows * 2, chunk_size):
        try:
            print(f"Fetching rows {start_idx} to {start_idx + chunk_size}...")
            chunk_data = driver.execute_script(CHUNK_SCRIPT, start_idx, chunk_size)
        except Exception as e:
            print(f"Error processing chunk starting at {start_idx}: {e}")
            # Tolerate failures inside the expected range; give up beyond it.
            if start_idx > estimated_rows:
                print("Already collected significant data, stopping due to errors")
                break
            continue

        if isinstance(chunk_data, dict) and 'error' in chunk_data:
            print(f"Error in chunk: {chunk_data['error']}")
            continue  # try the next chunk
        if not isinstance(chunk_data, list):
            print(f"Unexpected data format in chunk: {type(chunk_data)}")
            continue

        # Chunks overlap (a scroll step shows many rows), so keep only rows
        # whose full content has not been seen before.
        new_rows = 0
        for row in chunk_data:
            row_key = tuple(sorted(row.items()))
            if row_key not in seen_rows:
                seen_rows.add(row_key)
                total_data.append(row)
                new_rows += 1

        print(f"Collected {len(total_data)} unique rows so far")

        # Presumably scrolling beyond the last row just re-renders the tail of
        # the grid, so the chunk is never empty; detect the end by the absence
        # of *new* rows instead.
        if new_rows:
            stale_chunks = 0
        elif start_idx > estimated_rows:
            stale_chunks += 1
            if stale_chunks >= MAX_STALE_CHUNKS:
                print("No more data found, likely reached the end")
                break

    print(f"Extraction complete. Found {len(all_data)} unique items.")

    # Convert to DataFrame and save
    if all_data:
        # Remove duplicates (as a safeguard)
        df = pd.DataFrame(all_data).drop_duplicates()

        print(f"Data validation: DataFrame shape: {df.shape}")

        csv_filename = "vaccination_clinics_complete.csv"
        df.to_csv(csv_filename, index=False)
        saved_complete = True
        print(f"Data saved to {csv_filename}")
        print("\nSample data:")
        print(df.head())
    else:
        print("Failed to extract data or no data found.")

except Exception as e:
    print(f"An error occurred: {str(e)}")

finally:
    # Persist whatever was collected, even after an error or an interruption.
    if total_data:
        try:
            # The "partial" CSV is only meaningful when the complete one was
            # not written.
            if not saved_complete:
                csv_filename = "vaccination_clinics_partial.csv"
                pdf = pd.DataFrame(total_data).drop_duplicates()
                pdf.to_csv(csv_filename, index=False)
                print(f"Saved {len(pdf)} rows to {csv_filename} before quitting")

            # Also save raw JSON as backup
            with open("vaccination_clinics_raw.json", "w", encoding="utf-8") as f:
                json.dump(total_data, f)
            print("Saved raw data to vaccination_clinics_raw.json as backup")
        except Exception as save_error:
            print(f"Error saving partial data: {save_error}")

    # Close the driver
    print("Closing the browser...")
    driver.quit()

Navigated to the page. Waiting for it to load...
Found 3 iframes on the page.
Switched to iframe 1.
Vaadin grid found.
Found headers: ['Name', 'Street1', 'City', 'County', 'Phone']
Starting data extraction...
Estimated row count: 1671
Fetching rows 0 to 50...
Collected 58 unique rows so far
Fetching rows 50 to 100...
Collected 108 unique rows so far
Fetching rows 100 to 150...
Collected 158 unique rows so far
Fetching rows 150 to 200...
Collected 208 unique rows so far
Fetching rows 200 to 250...
Collected 258 unique rows so far
Fetching rows 250 to 300...
Collected 308 unique rows so far
Fetching rows 300 to 350...
Collected 358 unique rows so far
Fetching rows 350 to 400...
Collected 408 unique rows so far
Fetching rows 400 to 450...
Collected 458 unique rows so far
Fetching rows 450 to 500...
Collected 508 unique rows so far
Fetching rows 500 to 550...
Collected 558 unique rows so far
Fetching rows 550 to 600...
Collected 608 unique rows so far
Fetching rows 600 to 650...
Collected 