# This script is a LinkedIn employee scraper specifically designed to collect information about MSCHF employees.
- It uses Selenium WebDriver to automate browser interactions and navigate through LinkedIn search results.

# The script includes functionality for:
- Browser initialization with Chrome
- LinkedIn authentication
- Scraping employee data across multiple pages
- Data storage in a structured format


In [None]:
# selenium_mschf_employee_scraper.py

import os
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# === CONFIGURATION ===
# Credentials are read from the environment so that secrets never live in the
# source file. Export LINKEDIN_USERNAME / LINKEDIN_PASSWORD before running.
# NOTE(security): any credentials that were previously committed in this file
# should be considered compromised and rotated.
USERNAME = os.environ.get("LINKEDIN_USERNAME", "")  # ← LinkedIn email
PASSWORD = os.environ.get("LINKEDIN_PASSWORD", "")  # ← LinkedIn password
SEARCH_URL = (
    "https://www.linkedin.com/search/results/people/"
    "?currentCompany=%5B%2218430865%22%5D"
    "&heroEntityKey=urn%3Ali%3Aorganization%3A18430865"
    "&keywords=mschf&origin=FACETED_SEARCH"
    "&position=0&searchId=0c69a3cb-25e8-45a2-adf1-3e6c0b681c6c&sid=fC%40"
)
NUM_PAGES = 15      # number of search-result pages to walk
IMPLICIT_WAIT = 5   # seconds


def init_driver() -> webdriver.Chrome:
    """Start a maximized Chrome session and apply the global implicit wait."""
    options = Options()
    options.add_argument("--start-maximized")
    # Headless mode is intentionally left off (the original author notes it is
    # more detectable). Service() is given no path, so chromedriver is assumed
    # to be discoverable on PATH.
    browser = webdriver.Chrome(options=options, service=Service())
    browser.implicitly_wait(IMPLICIT_WAIT)
    return browser


def linkedin_login(driver: webdriver.Chrome):
    """Log into LinkedIn with USERNAME / PASSWORD and verify the result.

    The credentials are typed one character at a time with a short fixed
    pause, the form is submitted, and the login is then confirmed by waiting
    for any one of several elements that only exist on logged-in pages.

    Raises:
        ValueError: if USERNAME or PASSWORD is empty.
        Exception: if LinkedIn displays a login error, redirects to a security
            challenge, or none of the logged-in markers appear. Every failure
            is printed before being re-raised.
    """
    if not USERNAME or not PASSWORD:
        raise ValueError("LinkedIn credentials are not configured (USERNAME / PASSWORD are empty)")

    # Elements that indicate we landed on a logged-in page.
    login_success_selectors = [
        "nav[role='navigation']",
        "#global-nav-search",
        "input.search-global-typeahead__input",
        "div[data-test-id='nav-search-typeahead']",
    ]
    # Elements LinkedIn uses to show a login error message.
    error_conditions = [
        (By.CLASS_NAME, "alert-content"),
        (By.CLASS_NAME, "form__error"),
        (By.CLASS_NAME, "error-for-password"),
        (By.CLASS_NAME, "error-for-username"),
    ]

    try:
        driver.get("https://www.linkedin.com/login")
        time.sleep(3)  # Give the login page time to load

        # 1. Enter credentials
        username_field = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "username"))
        )
        password_field = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "password"))
        )

        # Type character by character with a small fixed delay between keystrokes
        for char in USERNAME:
            username_field.send_keys(char)
            time.sleep(0.1)

        time.sleep(0.5)  # Pause before password

        for char in PASSWORD:
            password_field.send_keys(char)
            time.sleep(0.1)

        time.sleep(1)  # Pause before submitting

        # 2. Submit form
        password_field.submit()

        # 3. Look for any logged-in marker; the first one found wins
        for selector in login_success_selectors:
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, selector))
                )
            except TimeoutException:
                continue
            print(f"Login successful! Found element: {selector}")
            break
        else:
            # No marker appeared: report the most specific reason we can find.
            # find_elements returns [] instead of raising, so the "Login failed"
            # exception below is never accidentally swallowed.
            for by, value in error_conditions:
                error_elements = driver.find_elements(by, value)
                if error_elements:
                    message = error_elements[0].text
                    print(f"Login failed: {message}")
                    raise Exception(f"Login failed: {message}")

            # If no specific error found, check for security challenges
            if "security" in driver.current_url.lower():
                print("Security challenge detected!")
                raise Exception("LinkedIn security challenge detected")

            print("Login failed: Could not detect navigation bar or error message")
            print("Current URL:", driver.current_url)
            print("Page title:", driver.title)
            raise Exception("Login verification failed")

        time.sleep(2)
    except Exception as e:
        print(f"Error during login: {str(e)}")
        raise


def scrape_page(driver: webdriver.Chrome) -> list[dict]:
    """
    Collect one record per search result on the page currently shown in `driver`.

    A result is any <li> containing a
    <div data-view-name="search-entity-result-universal-template">. Each record
    has the keys profile_url, image_url, name, title, location and badge_text;
    a field that cannot be located is stored as "". A result that raises while
    being parsed is printed (with its HTML) and skipped.
    """
    result_li_xpath = "//li[.//div[@data-view-name='search-entity-result-universal-template']]"
    card_xpath = ".//div[@data-view-name='search-entity-result-universal-template']"

    def lookup(card, by, value):
        # A missing card yields no matches rather than an error
        return card.find_elements(by, value) if card else []

    def first_text(elements):
        return elements[0].text.strip() if elements else ""

    # Block until at least one result has rendered
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, result_li_xpath))
    )

    collected = []
    for li in driver.find_elements(By.XPATH, result_li_xpath):
        try:
            cards = li.find_elements(By.XPATH, card_xpath)
            card = cards[0] if cards else None
            record = {}

            # Profile URL: first link in the card, query string removed
            links = lookup(card, By.TAG_NAME, "a")
            record["profile_url"] = links[0].get_attribute("href").split("?")[0] if links else ""

            # Image URL: first <img> in the card
            images = lookup(card, By.TAG_NAME, "img")
            record["image_url"] = images[0].get_attribute("src") if images else ""

            # Name: first <span aria-hidden="true">
            record["name"] = first_text(lookup(card, By.XPATH, './/span[@aria-hidden="true"]'))

            # Title: prefer divs carrying both classes, else either one
            titles = lookup(
                card, By.XPATH,
                './/div[contains(@class, "t-black") and contains(@class, "t-normal")]'
            )
            if not titles:
                titles = lookup(
                    card, By.XPATH,
                    './/div[contains(@class, "t-black") or contains(@class, "t-normal")]'
                )
            record["title"] = first_text(titles)

            # Location: strict selector first; widen it when fewer than two matches
            locations = lookup(
                card, By.XPATH,
                './/div[contains(@class, "t-14") and contains(@class, "t-normal")]'
            )
            if len(locations) < 2:
                locations = lookup(
                    card, By.XPATH,
                    './/div[contains(@class, "t-14") or contains(@class, "t-normal")]'
                )
            # Use the second match when there is one, otherwise the only match
            record["location"] = first_text(locations[1:2] or locations[:1])

            # Badge text (if present)
            record["badge_text"] = first_text(
                lookup(card, By.XPATH, './/span[contains(@class, "entity-result__badge")]')
            )

            collected.append(record)

        except Exception as e:
            print("Skipped one entry:", e)
            print("HTML for skipped entry:", li.get_attribute('outerHTML'))
            continue

    return collected


def click_next(driver: webdriver.Chrome) -> bool:
    """
    Advance the search results by pressing the "Next" pagination button.

    Returns True after a successful click (followed by a 3 second pause), and
    False when the button is missing, marked disabled, or the click fails.
    """
    next_button_xpath = (
        "//button[contains(@aria-label, 'Next') "
        "and contains(@class, 'artdeco-pagination__button--next')]"
    )
    try:
        button = driver.find_element(By.XPATH, next_button_xpath)
        css_classes = button.get_attribute("class")
        # A "disabled" class means there is no further page to visit
        if "disabled" not in css_classes:
            button.click()
            time.sleep(3)
            return True
        return False
    except Exception:
        return False



def main():
    """Log in, walk up to NUM_PAGES search-result pages and save the results to CSV.

    Rows scraped from each page are accumulated and written to
    'mschf_linkedin_employees.csv'. Scraping stops early when a page shows no
    results or the "Next" button is unavailable; rows collected up to that
    point are still saved. The browser is always closed on exit.
    """
    driver = init_driver()
    all_people = []

    try:
        linkedin_login(driver)

        # Extra sanity check that we are on a logged-in page; failure here is
        # only a warning, the search is attempted regardless.
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "input.search-global-typeahead__input"))
            )
            print("Search bar verified before proceeding with search")
        except TimeoutException:
            print("Warning: Could not verify search bar before proceeding")
            print("Current URL:", driver.current_url)
            print("Page title:", driver.title)

        driver.get(SEARCH_URL)
        time.sleep(3)

        for page_num in range(1, NUM_PAGES + 1):
            print(f"→ Scraping page {page_num} of {NUM_PAGES} …")
            try:
                page_data = scrape_page(driver)
            except TimeoutException:
                # Keep what we already have instead of losing it to a late timeout
                print(f"⚠️ No results appeared on page {page_num}; stopping early.")
                break
            all_people.extend(page_data)

            # Nothing left to load after the final page
            if page_num == NUM_PAGES:
                break

            # Attempt to click "Next"; if it fails, break early
            if not click_next(driver):
                print("⛔ No more pages or Next button disabled.")
                break

        # Build DataFrame
        df = pd.DataFrame(all_people)

        # Save to CSV
        df.to_csv("mschf_linkedin_employees.csv", index=False)
        print("✅ Finished. Saved to 'mschf_linkedin_employees.csv'.")
        print(df.head())

    finally:
        driver.quit()


if __name__ == "__main__":
    main()

In [None]:
import pandas as pd

# Load the search-result rows saved by the employee scraper
df = pd.read_csv("mschf_linkedin_employees.csv")

# Preview the first five rows (rendered as the cell's output)
df.head(n=5)


In [8]:
# profile_detail_scraper.py — visits each profile URL from the employee CSV and
# scrapes profile details, with error handling and CSS selector fallbacks.

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import pandas as pd
import time
import json  # Added import for json
import pickle
import os

# ─── CONFIG ─────────────────────────────────────────────────────
INPUT_CSV    = "mschf_linkedin_employees.csv"
OUTPUT_CSV   = "mschf_profiles_detailed_full.csv"  # Full results
PROGRESS_PICKLE = "scraping_progress.pkl"  # Temporary progress file
IMPLICIT_WAIT = 5    # seconds
PAGE_LOAD_WAIT = 3   # seconds to sleep after loading a profile page
# Credentials are read from the environment so that secrets never live in the
# source file. Export LINKEDIN_USERNAME / LINKEDIN_PASSWORD before running.
# NOTE(security): any credentials that were previously committed in this file
# should be considered compromised and rotated.
USERNAME = os.environ.get("LINKEDIN_USERNAME", "")  # ← LinkedIn email
PASSWORD = os.environ.get("LINKEDIN_PASSWORD", "")  # ← LinkedIn password

# ─── SETUP ──────────────────────────────────────────────────────
def init_driver():
    """Launch a maximized Chrome session with the configured implicit wait and return it."""
    print("Initializing Chrome driver...")
    chrome_options = Options()
    chrome_options.add_argument("--start-maximized")
    # Headless mode is optional and left disabled here.
    browser = webdriver.Chrome(options=chrome_options, service=Service())
    browser.implicitly_wait(IMPLICIT_WAIT)
    print("✅ Driver initialized successfully")
    return browser

def linkedin_login(driver):
    """Log into LinkedIn with USERNAME / PASSWORD.

    Types the credentials one character at a time with a short fixed pause,
    clicks the submit button, then waits up to 10 seconds for the browser to
    leave the login URL.

    Returns:
        True when the URL no longer contains "login", False when that wait
        fails. Errors while locating the form fields are not caught here.
    """
    print("\n🔑 Logging into LinkedIn...")
    driver.get("https://www.linkedin.com/login")
    time.sleep(2)

    # Enter credentials
    username = driver.find_element(By.ID, "username")
    password = driver.find_element(By.ID, "password")

    # Type character by character with a small fixed delay between keystrokes
    for char in USERNAME:
        username.send_keys(char)
        time.sleep(0.1)

    time.sleep(0.5)  # Pause before password

    for char in PASSWORD:
        password.send_keys(char)
        time.sleep(0.1)

    time.sleep(1)  # Pause before submitting

    # Click login button
    driver.find_element(By.CSS_SELECTOR, "button[type='submit']").click()

    # Wait for login to complete - just check if we're redirected away from login page
    try:
        WebDriverWait(driver, 10).until(
            lambda d: "login" not in d.current_url.lower()
        )
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate
        print("❌ Failed to log into LinkedIn")
        return False

    print("✅ Successfully logged into LinkedIn")
    return True

# Helper functions to try multiple selectors
def find_element_with_fallbacks(parent, selectors):
    """Return the first element under *parent* matched by any CSS selector.

    Selectors are tried in order; a selector whose lookup raises is skipped.

    Args:
        parent: Object exposing ``find_element(by, value)`` (driver or element).
        selectors: Iterable of CSS selector strings, most preferred first.

    Returns:
        The first element found, or None when no selector matches.
    """
    for selector in selectors:
        try:
            return parent.find_element(By.CSS_SELECTOR, selector)
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still propagate
            continue
    return None

def find_elements_with_fallbacks(parent, selectors):
    """Return the matches of the first CSS selector that finds anything under *parent*.

    Selectors are tried in order; a selector that raises or matches nothing is
    skipped.

    Args:
        parent: Object exposing ``find_elements(by, value)`` (driver or element).
        selectors: Iterable of CSS selector strings, most preferred first.

    Returns:
        A non-empty list of elements from the first productive selector, or []
        when none of the selectors match.
    """
    for selector in selectors:
        try:
            elements = parent.find_elements(By.CSS_SELECTOR, selector)
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still propagate
            continue
        if elements:
            return elements
    return []

# ─── SCRAPE ONE PROFILE ─────────────────────────────────────────
# Substrings that mark a skills-list entry as endorsement noise, not a skill.
_NON_SKILL_MARKERS = ("endorsement", "endorsing", "connection")
# Prefixes such as "3 " that indicate a count line rather than a skill name.
_COUNT_PREFIXES = tuple(f"{digit} " for digit in "123456789")
# Marks an <li> as a main entry in the experience / education lists.
_ENTITY_SELECTOR = "div[data-view-name='profile-component-entity']"


def _element_text(element):
    """Return the stripped text of *element*, or "" when it is None."""
    return element.text.strip() if element else ""


def _section_items(driver, section_id):
    """Scroll to the element with id *section_id* and return the <li> items of the list after it.

    Lets Selenium/JavaScript errors propagate when the anchor or its list is
    missing; callers treat that as "section not available".
    """
    driver.execute_script(
        f"window.scrollTo(0, document.getElementById('{section_id}').offsetTop);"
    )
    time.sleep(1)
    anchor = driver.find_element(By.ID, section_id)
    item_list = anchor.find_element(By.XPATH, "./following-sibling::div//ul")
    return item_list.find_elements(By.TAG_NAME, "li")


def _scrape_about(driver):
    """Return the About text, or "" when no known container / text selector matches."""
    about_selectors = [
        "div.display-flex.ph5.pv-top-card",
        "section[data-section='summary']",
        "div.pv-about-section",
        "section.artdeco-card.pv-profile-card",
        "div.ph5.pb5",
    ]
    about_text_selectors = [
        "div.inline-show-more-text--is-collapsed span[aria-hidden='true']",
        "div.pv-about__summary-text span[aria-hidden='true']",
        "div.inline-show-more-text span[aria-hidden='true']",
        "span.break-words",
        "div.display-flex span[aria-hidden='true']",
    ]

    for selector in about_selectors:
        try:
            about_div = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, selector))
            )
        except Exception:
            # This container is not on the page; try the next candidate
            continue
        # Only the first container that exists is searched for text
        return _element_text(find_element_with_fallbacks(about_div, about_text_selectors))
    return ""


def _scrape_followers(driver):
    """Return the follower text found via the known selectors, or ""."""
    follower_selectors = [
        "p.pvs-header__optional-link span[aria-hidden='true']",
        "span.t-14.t-normal.t-black--light",
        "span.pv-top-card--list-bullet",
        "li.pv-top-card--list-bullet span",
    ]
    return _element_text(find_element_with_fallbacks(driver, follower_selectors))


def _scrape_experience(driver):
    """Return a list of {"exp_title", "exp_company"} dicts from the Experience section."""
    title_selectors = [
        "div.mr1.hoverable-link-text.t-bold span[aria-hidden='true']",
        "div.display-flex.align-items-center.mr1.hoverable-link-text.t-bold span[aria-hidden='true']",
        "div.hoverable-link-text.t-bold span[aria-hidden='true']",
        "span.mr1.t-bold span[aria-hidden='true']",
    ]
    company_selectors = [
        "span.t-14.t-normal span[aria-hidden='true']",
        "span.t-14.t-normal",
        "a[data-field='experience_company_logo'] span.t-14.t-normal span[aria-hidden='true']",
    ]

    items = _section_items(driver, "experience")
    print(f"  → Found {len(items)} experience entries")

    experiences = []
    for i, li in enumerate(items, 1):
        try:
            containers = li.find_elements(By.CSS_SELECTOR, _ENTITY_SELECTOR)
            if not containers:
                # Sub-items carry dates/locations, which are not collected
                print(f"  ⏩ Skipping sub-item {i}/{len(items)}")
                continue

            container = containers[0]
            title = _element_text(find_element_with_fallbacks(container, title_selectors))
            company = _element_text(find_element_with_fallbacks(container, company_selectors))
            # Clean up company name ("Acme · Full-time" → "Acme")
            company = company.split('·')[0].strip()

            if title or company:
                experiences.append({"exp_title": title, "exp_company": company})
                print(f"  → Scraped experience {i}/{len(items)}: {title} at {company}")
            else:
                print(f"  ⚠️ Could not extract main data for experience {i}")
        except Exception as e:
            print(f"  ⚠️ Could not process item {i}: {str(e)}")
            continue
    return experiences


def _scrape_skills(driver):
    """Return the skill names from the Skills section, skipping endorsement/count lines."""
    skill_name_selectors = [
        "span[aria-hidden='true']",
        "div.hoverable-link-text span[aria-hidden='true']",
        "a span[aria-hidden='true']",
        ".pvs-entity__path-node span",
    ]

    items = _section_items(driver, "skills")
    print(f"  → Found {len(items)} skills")

    skills = []
    for i, item in enumerate(items, 1):
        try:
            name = _element_text(find_element_with_fallbacks(item, skill_name_selectors))
            if not name:
                continue
            if any(marker in name.lower() for marker in _NON_SKILL_MARKERS):
                print(f"  ⏩ Skipping endorsement item: {name}")
            elif name.replace(' ', '').isdigit() or name.startswith(_COUNT_PREFIXES):
                print(f"  ⏩ Skipping endorsement count: {name}")
            else:
                skills.append(name)  # Just store the skill name as a string
                print(f"  → Scraped skill {len(skills)}: {name}")
        except Exception as e:
            print(f"  ⚠️ Could not scrape skill {i}: {str(e)}")
            continue
    return skills


def _scrape_education(driver):
    """Return a list of {"school", "degree", "years"} dicts from the Education section."""
    school_selectors = [
        "div.mr1.hoverable-link-text.t-bold span[aria-hidden='true']",
        "div.hoverable-link-text.t-bold span[aria-hidden='true']",
        "div.display-flex.align-items-center.mr1.hoverable-link-text.t-bold span[aria-hidden='true']",
    ]
    degree_selectors = [
        "span.t-14.t-normal span[aria-hidden='true']",
        "span.t-14.t-normal",
    ]
    years_selectors = [
        "span.pvs-entity__caption-wrapper[aria-hidden='true']",
        "span.t-14.t-normal.t-black--light span.pvs-entity__caption-wrapper[aria-hidden='true']",
        "span.t-14.t-normal.t-black--light span[aria-hidden='true']",
    ]

    items = _section_items(driver, "education")
    print(f"  → Found {len(items)} education entries")

    education = []
    for i, li in enumerate(items, 1):
        try:
            containers = li.find_elements(By.CSS_SELECTOR, _ENTITY_SELECTOR)
            if not containers:
                # Skip sub-items for education (like activities)
                print(f"  ⏩ Skipping education sub-item {i}/{len(items)}")
                continue

            container = containers[0]
            school = _element_text(find_element_with_fallbacks(container, school_selectors))
            degree = _element_text(find_element_with_fallbacks(container, degree_selectors))
            years = _element_text(find_element_with_fallbacks(container, years_selectors))

            if school:  # Only add if we got at least a school name
                education.append({"school": school, "degree": degree, "years": years})
                print(f"  → Scraped education {i}/{len(items)}: {school} - {degree} ({years})")
            else:
                print(f"  ⚠️ Could not extract school name for education {i}")
        except Exception as e:
            print(f"  ⚠️ Could not process education item {i}: {str(e)}")
            continue
    return education


def scrape_profile(driver, url):
    """Load a LinkedIn profile and scrape its main sections.

    If loading the profile lands on a login URL, a login is attempted and the
    profile is loaded again. Each section is scraped independently: a failure
    in one section is printed and leaves that field at its empty default.

    Args:
        driver: Selenium WebDriver to use.
        url: Profile URL to open.

    Returns:
        dict with keys ``profile_url``, ``about`` (str), ``followers`` (str),
        ``experience`` (list of dicts), ``skills`` (list of str) and
        ``education`` (list of dicts).

    Raises:
        Exception: if a login wall is hit and logging in fails.
    """
    print(f"\n📄 Loading profile: {url}")
    driver.get(url)
    time.sleep(PAGE_LOAD_WAIT)

    # Check if we hit a login wall
    if "login" in driver.current_url.lower():
        print("⚠️ Hit login wall, attempting to log in...")
        if not linkedin_login(driver):
            raise Exception("Failed to log in when hitting login wall")
        # Retry loading profile
        driver.get(url)
        time.sleep(PAGE_LOAD_WAIT)

    data = {"profile_url": url}

    # — About —
    try:
        print("  → Scraping About section...")
        data["about"] = _scrape_about(driver)
        print("  ✅ About section scraped")
    except Exception as e:
        print(f"  ⚠️ Could not scrape About section: {str(e)}")
        data["about"] = ""

    # — Followers —
    try:
        print("  → Scraping Followers...")
        data["followers"] = _scrape_followers(driver)
        print("  ✅ Followers scraped")
    except Exception as e:
        print(f"  ⚠️ Could not scrape Followers: {str(e)}")
        data["followers"] = ""

    # — Experience —
    try:
        print("  → Scraping Experience section...")
        data["experience"] = _scrape_experience(driver)
        print("  ✅ Experience section scraped")
    except Exception as e:
        print(f"  ⚠️ Could not access Experience section: {str(e)}")
        data["experience"] = []

    # — Skills —
    try:
        print("  → Scraping Skills section...")
        data["skills"] = _scrape_skills(driver)
        print("  ✅ Skills section scraped")
    except Exception as e:
        print(f"  ⚠️ Could not access Skills section: {str(e)}")
        data["skills"] = []

    # — Education —
    try:
        print("  → Scraping Education section...")
        data["education"] = _scrape_education(driver)
        print("  ✅ Education section scraped")
    except Exception as e:
        print(f"  ⚠️ Could not access Education section: {str(e)}")
        data["education"] = []

    print("✅ Profile scraping completed")
    return data

def load_progress(path=None):
    """Load previously saved scraping progress from a pickle file.

    Args:
        path: Progress file to read. Defaults to PROGRESS_PICKLE.

    Returns:
        ``(results, processed_indices)``: the list of scraped profile dicts and
        the set of processed row indices. ``([], set())`` is returned when the
        file does not exist or cannot be read as a progress file.
    """
    path = PROGRESS_PICKLE if path is None else path
    if not os.path.exists(path):
        return [], set()

    try:
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # progress files that this script wrote itself.
        with open(path, 'rb') as f:
            progress = pickle.load(f)
        # Validate/coerce the shape before reporting success
        results = list(progress['results'])
        processed_indices = set(progress['processed_indices'])
    except Exception as e:
        print(f"⚠️ Could not load progress file: {e}")
        return [], set()

    print(f"📂 Loaded progress: {len(results)} profiles already scraped")
    return results, processed_indices

def save_progress(results, processed_indices, path=None):
    """Save current progress to a pickle file.

    The data is first written to ``<path>.tmp`` and then moved over the real
    file with os.replace, so an interrupted save cannot leave a truncated
    progress file behind. Failures are printed, not raised.

    Args:
        results: List of scraped profile dicts.
        processed_indices: Set of row indices already handled.
        path: Progress file to write. Defaults to PROGRESS_PICKLE.
    """
    path = PROGRESS_PICKLE if path is None else path
    tmp_path = f"{path}.tmp"
    progress = {
        'results': results,
        'processed_indices': processed_indices
    }
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(progress, f)
        os.replace(tmp_path, path)
    except Exception as e:
        print(f"⚠️ Could not save progress: {e}")
        # Best-effort cleanup of the partial temp file
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        return
    print(f"💾 Progress saved: {len(results)} profiles completed")

def main():
    """Scrape detailed data for every profile in INPUT_CSV and write OUTPUT_CSV.

    Resumes from any saved progress, logs in, skips rows without a usable
    profile URL (missing, "headless" placeholder, or already processed) and
    scrapes the rest. Progress is saved every 5 profiles and again when the
    scraping loop ends for any reason. The browser is always closed.

    Raises:
        Exception: if the initial LinkedIn login fails.
    """
    print("\n🚀 Starting LinkedIn profile scraper...")

    # Load existing progress
    results, processed_indices = load_progress()

    headless_marker = "https://www.linkedin.com/search/results/people/headless"
    headless_skipped = 0

    driver = init_driver()
    try:
        # Log in first before scraping
        if not linkedin_login(driver):
            raise Exception("Failed to log into LinkedIn. Please check credentials.")

        print(f"\n📖 Reading input CSV: {INPUT_CSV}")
        df_urls = pd.read_csv(INPUT_CSV)
        print(f"✅ Found {len(df_urls)} total profiles")

        # Filter out unusable URLs and already processed rows
        valid_profiles = []
        for idx, row in df_urls.iterrows():
            url = row["profile_url"]
            # Empty CSV cells are read back as NaN (a float), not a string
            if not isinstance(url, str) or not url.strip():
                print(f"⏩ Skipping row {idx+1}: missing profile URL")
                continue
            if headless_marker in url:
                print(f"⏩ Skipping headless URL: {url}")
                headless_skipped += 1
                continue
            if idx in processed_indices:
                print(f"⏩ Already processed profile {idx+1}")
                continue
            valid_profiles.append((idx, row))

        total_remaining = len(valid_profiles)
        print(f"📊 {len(processed_indices)} already completed, {total_remaining} remaining to process")

        try:
            for count, (idx, row) in enumerate(valid_profiles, 1):
                url = row["profile_url"]
                print(f"\n🔄 Processing profile {count}/{total_remaining} (Row {idx+1}): {url}")

                try:
                    profile_data = scrape_profile(driver, url)
                except Exception as e:
                    print(f"  ❌ Error on {url}: {e}")
                    processed_indices.add(idx)  # Mark as processed even if failed
                    continue

                results.append({**row.to_dict(), **profile_data})
                processed_indices.add(idx)

                # Save progress every 5 profiles
                if count % 5 == 0:
                    save_progress(results, processed_indices)
        finally:
            # Persist whatever was collected, even on Ctrl-C or an unexpected crash
            save_progress(results, processed_indices)
    finally:
        driver.quit()

    # Normalize lists for CSV (JSON-dump experience, skills, education)
    print("\n📊 Preparing final results...")
    for r in results:
        r["experience"] = json.dumps(r["experience"])  # List of {exp_title, exp_company}
        r["skills"]     = json.dumps(r["skills"])      # List of skill name strings
        r["education"]  = json.dumps(r["education"])   # List of {school, degree, years}

    df_prof = pd.DataFrame(results)
    df_prof.to_csv(OUTPUT_CSV, index=False)
    print(f"✅ Done! Saved {len(results)} profiles to {OUTPUT_CSV}")

    # Clean up progress file
    if os.path.exists(PROGRESS_PICKLE):
        os.remove(PROGRESS_PICKLE)
        print("🧹 Cleaned up temporary progress file")

    print(f"\n📈 Final Summary:")
    print(f"   • Total profiles in CSV: {len(df_urls)}")
    print(f"   • Successfully scraped: {len(results)}")
    print(f"   • Headless URLs skipped: {headless_skipped}")
    print(f"\nSample of scraped profiles:")
    print(df_prof.head())

if __name__ == "__main__":
    main()



🚀 Starting LinkedIn profile scraper...
Initializing Chrome driver...
✅ Driver initialized successfully

🔑 Logging into LinkedIn...
✅ Successfully logged into LinkedIn

📖 Reading input CSV: mschf_linkedin_employees.csv
✅ Found 150 total profiles
⏩ Skipping headless URL: https://www.linkedin.com/search/results/people/headless
⏩ Skipping headless URL: https://www.linkedin.com/search/results/people/headless
⏩ Skipping headless URL: https://www.linkedin.com/search/results/people/headless
⏩ Skipping headless URL: https://www.linkedin.com/search/results/people/headless
⏩ Skipping headless URL: https://www.linkedin.com/search/results/people/headless
⏩ Skipping headless URL: https://www.linkedin.com/search/results/people/headless
⏩ Skipping headless URL: https://www.linkedin.com/search/results/people/headless
⏩ Skipping headless URL: https://www.linkedin.com/search/results/people/headless
⏩ Skipping headless URL: https://www.linkedin.com/search/results/people/headless
⏩ Skipping headless URL: 

In [9]:
# Reload the detailed profile results written by the profile scraper for inspection
df_prof = pd.read_csv('mschf_profiles_detailed_full.csv')

