In [1]:
import os
import sys
import time
import random
import re
import logging
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.common.action_chains import ActionChains
from dotenv import load_dotenv
from uuid import uuid4

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Load environment variables (python-dotenv; presumably from a local .env file)
# so that the os.getenv() calls in LinkedInScraper.__init__ can read
# LINKEDIN_EMAIL and LINKEDIN_PASSWORD.
load_dotenv()

# Configure logging: INFO level, "timestamp - level - message" line format.
# `logger` is the module-level logger used by every LinkedInScraper method.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class LinkedInScraper:
    def __init__(self, headless=False, debug=True, max_posts=50):
        """Prepare credentials, output folders and a Chrome WebDriver session.

        Args:
            headless: when True, Chrome is started with ``--headless=new``.
            debug: when True, a ``debug`` directory is created; other methods
                save screenshots into it.
            max_posts: number of posts per profile the scraper aims for.
        """
        # Credentials are taken from the environment, never hard-coded
        self.email = os.getenv("LINKEDIN_EMAIL")
        self.password = os.getenv("LINKEDIN_PASSWORD")
        self.debug = debug
        self.max_posts = max_posts
        # Short random tag used as a prefix for this session's screenshot names
        self.session_id = str(uuid4())[:8]

        # Output directories: screenshots (debug only) and scraped data
        if self.debug:
            os.makedirs('debug', exist_ok=True)
        os.makedirs('data', exist_ok=True)

        # Chrome command-line switches, applied in this order
        chrome_flags = ['--headless=new'] if headless else []
        chrome_flags += [
            '--no-sandbox',
            '--disable-dev-shm-usage',
            '--disable-notifications',
            '--window-size=1920,1080',
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        ]
        options = webdriver.ChromeOptions()
        for flag in chrome_flags:
            options.add_argument(flag)

        # Browser session plus a shared 15-second explicit wait
        self.driver = webdriver.Chrome(options=options)
        self.wait = WebDriverWait(self.driver, 15)
        self.logged_in = False

        # Posts gathered so far while scraping the current profile
        self.accumulated_posts = []
    
    def login(self):
        """Log in to LinkedIn."""
        try:
            logger.info("Navigating to LinkedIn login page")
            self.driver.get('https://www.linkedin.com/login')
            time.sleep(3)
            
            # Take screenshot of login page
            if self.debug:
                self.driver.save_screenshot(f'debug/{self.session_id}_login_page.png')
            
            # Wait for login page to load
            self.wait.until(EC.presence_of_element_located((By.ID, 'username')))
            
            # Enter email
            username_field = self.driver.find_element(By.ID, 'username')
            username_field.clear()
            username_field.send_keys(self.email)
            
            # Enter password
            password_field = self.driver.find_element(By.ID, 'password')
            password_field.clear()
            password_field.send_keys(self.password)
            
            # Click the login button
            self.driver.find_element(By.CSS_SELECTOR, 'button[type="submit"]').click()
            
            # Wait for the homepage to load
            try:
                self.wait.until(EC.presence_of_element_located((By.ID, 'global-nav')))
                logger.info("Successfully logged in")
                self.logged_in = True
                
                # Take screenshot after login
                if self.debug:
                    self.driver.save_screenshot(f'debug/{self.session_id}_after_login.png')
                
                # Wait a bit after login
                time.sleep(5)
                
            except TimeoutException:
                # Check if we got a security verification page
                if "security verification" in self.driver.page_source.lower() or "challenge" in self.driver.page_source.lower():
                    logger.warning("Security verification detected. Please complete it manually.")
                    if self.debug:
                        self.driver.save_screenshot(f'debug/{self.session_id}_security_verification.png')
                    input("Complete the security verification and press Enter to continue...")
                    self.logged_in = True
                else:
                    logger.error("Login failed - couldn't detect navigation bar")
                    if self.debug:
                        self.driver.save_screenshot(f'debug/{self.session_id}_login_failure.png')
        
        except Exception as e:
            logger.error(f"Failed to login: {str(e)}")
            if self.debug:
                self.driver.save_screenshot(f'debug/{self.session_id}_login_error.png')
            raise e
    
    def navigate_to_profile(self, profile_url):
        """Navigate to a LinkedIn profile and ensure it's loaded."""
        if not self.logged_in:
            self.login()
        
        try:
            # Ensure URL is the recent-activity/all page
            if "recent-activity/all" not in profile_url:
                if not profile_url.endswith('/'):
                    profile_url = profile_url + '/'
                profile_url = profile_url + "recent-activity/all/"
            
            logger.info(f"Navigating to activity page: {profile_url}")
            self.driver.get(profile_url)
            
            # Wait for page to load
            time.sleep(5)
            
            # Take screenshot of profile page
            if self.debug:
                self.driver.save_screenshot(f'debug/{self.session_id}_profile_page.png')
            
            # Check if we're on the right page
            if "recent-activity" not in self.driver.current_url:
                logger.warning(f"Not on activity page. Current URL: {self.driver.current_url}")
                if self.debug:
                    self.driver.save_screenshot(f'debug/{self.session_id}_wrong_page.png')
                return False
            
            # Wait for content to load
            try:
                # Wait for any of these elements that indicate posts are loaded
                selectors = [
                    ".occludable-update",
                    ".feed-shared-update-v2",
                    ".profile-creator-shared-feed-update__container"
                ]
                
                for selector in selectors:
                    try:
                        self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
                        logger.info(f"Found posts with selector: {selector}")
                        return True
                    except:
                        continue
                
                logger.warning("Could not find any post elements")
                if self.debug:
                    self.driver.save_screenshot(f'debug/{self.session_id}_no_posts_found.png')
                return False
                
            except TimeoutException:
                logger.warning("Timeout waiting for posts to load")
                if self.debug:
                    self.driver.save_screenshot(f'debug/{self.session_id}_posts_timeout.png')
                return False
                
        except Exception as e:
            logger.error(f"Error navigating to profile: {str(e)}")
            if self.debug:
                self.driver.save_screenshot(f'debug/{self.session_id}_navigation_error.png')
            return False
    
    def scroll_to_top(self):
        """Scroll to the top of the page."""
        try:
            self.driver.execute_script("window.scrollTo(0, 0);")
            time.sleep(2)
            logger.info("Scrolled to top of page")
            return True
        except Exception as e:
            logger.error(f"Error scrolling to top: {str(e)}")
            return False
        
    def check_for_redirect(self, original_url):
        """Check if LinkedIn has redirected us to a different page."""
        try:
            current_url = self.driver.current_url
            
            # Extract the core profile identifier from both URLs
            def extract_profile_id(url):
                if '/in/' in url:
                    # Extract profile slug from URL like /in/profile-name/
                    profile_part = url.split('/in/')[1].split('/')[0]
                    return profile_part
                return None
            
            original_profile = extract_profile_id(original_url)
            current_profile = extract_profile_id(current_url)
            
            # Check if we're still on the same profile
            if original_profile and current_profile:
                if original_profile != current_profile:
                    logger.warning(f"Profile redirect detected: {original_profile} -> {current_profile}")
                    return True
            
            # Check for common LinkedIn redirect patterns
            redirect_indicators = [
                '/feed/',
                '/search/',
                '/company/',
                '/school/',
                '/checkpoint/',
                '/uas/login',
                '/authwall',
                '/login',
                'linkedin.com/404',
                'linkedin.com/error'
            ]
            
            for indicator in redirect_indicators:
                if indicator in current_url.lower():
                    logger.warning(f"LinkedIn redirect detected to: {current_url}")
                    return True
            
            # Check if we're no longer on a recent-activity page when we should be
            if 'recent-activity' in original_url and 'recent-activity' not in current_url:
                logger.warning(f"Redirected away from activity page: {current_url}")
                return True
            
            return False
            
        except Exception as e:
            logger.error(f"Error checking for redirect: {str(e)}")
            return False
    
    def scroll_and_extract_incrementally(self, category, original_url, max_scrolls=15):
        """Scroll the page and extract posts incrementally. Stop if redirected but keep accumulated posts."""
        try:
            # First scroll to top
            self.scroll_to_top()
            
            # Take screenshot before scrolling
            if self.debug:
                self.driver.save_screenshot(f'debug/{self.session_id}_before_scrolling.png')
            
            posts_loaded = set()  # Track unique posts to avoid counting duplicates
            self.accumulated_posts = []  # Reset accumulated posts for this profile
            processed_texts = set()  # Track processed posts to avoid duplicates
            
            # Get profile name once
            profile_name = self.extract_profile_name()
            
            # Scroll down gradually to load posts
            for i in range(max_scrolls):
                logger.info(f"Scroll {i+1}/{max_scrolls}")
                
                # Check for redirect before continuing
                if self.check_for_redirect(original_url):
                    logger.warning(f"Redirect detected during scroll {i+1}")
                    logger.info(f"Saving {len(self.accumulated_posts)} posts collected so far")
                    if self.debug:
                        self.driver.save_screenshot(f'debug/{self.session_id}_redirect_detected.png')
                    return self.accumulated_posts  # Return what we have so far
                
                # Find all "see more" links and expand them
                self.expand_all_see_more()
                
                # Extract posts from current view
                current_batch_posts = self.extract_current_posts(category, profile_name, processed_texts)
                
                # Add new posts to accumulated posts
                new_posts_count = 0
                for post in current_batch_posts:
                    if post['post_text'] not in processed_texts:
                        self.accumulated_posts.append(post)
                        processed_texts.add(post['post_text'])
                        new_posts_count += 1
                
                logger.info(f"Found {new_posts_count} new posts. Total accumulated: {len(self.accumulated_posts)}")
                
                # If we have enough posts, we can stop scrolling
                if len(self.accumulated_posts) >= self.max_posts:
                    logger.info(f"Reached target of {self.max_posts} posts")
                    break
                
                # Scroll down more aggressively for more posts
                self.driver.execute_script("window.scrollBy(0, 800);")
                time.sleep(3)
                
                # Check for redirect after scrolling
                if self.check_for_redirect(original_url):
                    logger.warning(f"Redirect detected after scroll {i+1}")
                    logger.info(f"Saving {len(self.accumulated_posts)} posts collected so far")
                    if self.debug:
                        self.driver.save_screenshot(f'debug/{self.session_id}_redirect_after_scroll.png')
                    return self.accumulated_posts  # Return what we have so far
                
                # Every 3 scrolls, take a screenshot and check URL
                if self.debug and i % 3 == 0:
                    self.driver.save_screenshot(f'debug/{self.session_id}_scrolling_{i+1}.png')
            
            # Final expansion of "see more" links
            self.expand_all_see_more()
            
            # Final redirect check
            if self.check_for_redirect(original_url):
                logger.warning("Redirect detected during final expansion")
                logger.info(f"Saving {len(self.accumulated_posts)} posts collected so far")
                return self.accumulated_posts
            
            # Final extraction
            final_batch_posts = self.extract_current_posts(category, profile_name, processed_texts)
            for post in final_batch_posts:
                if post['post_text'] not in processed_texts:
                    self.accumulated_posts.append(post)
                    processed_texts.add(post['post_text'])
            
            # Take screenshot after scrolling
            if self.debug:
                self.driver.save_screenshot(f'debug/{self.session_id}_after_scrolling.png')
            
            logger.info(f"Successfully completed scraping. Total posts: {len(self.accumulated_posts)}")
            return self.accumulated_posts
            
        except Exception as e:
            logger.error(f"Error during scrolling: {str(e)}")
            if self.debug:
                self.driver.save_screenshot(f'debug/{self.session_id}_scrolling_error.png')
            # Even on error, return what we have accumulated
            logger.info(f"Returning {len(self.accumulated_posts)} posts despite error")
            return self.accumulated_posts
    
    def extract_current_posts(self, category, profile_name, processed_texts):
        """Extract posts currently visible on the page."""
        try:
            # Find all post containers
            post_selectors = [
                ".feed-shared-update-v2",
                ".occludable-update",
                ".profile-creator-shared-feed-update__container"
            ]
            
            all_posts = []
            for selector in post_selectors:
                posts = self.driver.find_elements(By.CSS_SELECTOR, selector)
                if posts:
                    all_posts = posts
                    break
            
            if not all_posts:
                return []
            
            # Extract data from each post
            post_data = []
            
            for i, post in enumerate(all_posts):
                try:
                    # Check if this is an original post
                    if not self.is_original_post(post):
                        continue
                    
                    # Extract post data
                    post_text = self.extract_post_text(post)
                    
                    # Skip if we've already processed this text or it's too short
                    if post_text in processed_texts or len(post_text.strip()) < 10:
                        continue
                    
                    # Add to post data
                    post_data.append({
                        'profile_name': profile_name,
                        'post_text': post_text,
                        'category': category
                    })
                    
                except Exception as e:
                    logger.error(f"Error processing post {i+1}: {str(e)}")
                    continue
            
            return post_data
            
        except Exception as e:
            logger.error(f"Error extracting current posts: {str(e)}")
            return []
    
    def count_loaded_posts(self):
        """Count the number of posts currently loaded on the page."""
        try:
            post_selectors = [
                ".feed-shared-update-v2",
                ".occludable-update",
                ".profile-creator-shared-feed-update__container"
            ]
            
            max_count = 0
            for selector in post_selectors:
                posts = self.driver.find_elements(By.CSS_SELECTOR, selector)
                max_count = max(max_count, len(posts))
            
            return max_count
        except:
            return 0
    
    def expand_all_see_more(self):
        """Find and click all 'see more' links on the page."""
        try:
            # Find all elements that might be "see more" buttons
            see_more_selectors = [
                ".inline-show-more-text__button",
                ".feed-shared-inline-show-more-text__see-more",
                ".feed-shared-text-view__see-more",
                ".see-more",
                "span.lt-line-clamp__more"
            ]
            
            for selector in see_more_selectors:
                try:
                    see_more_buttons = self.driver.find_elements(By.CSS_SELECTOR, selector)
                    logger.info(f"Found {len(see_more_buttons)} potential 'see more' buttons with selector: {selector}")
                    
                    for button in see_more_buttons:
                        try:
                            if button.is_displayed():
                                # Try to scroll to the button
                                self.driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", button)
                                time.sleep(1)
                                
                                # Try JavaScript click
                                try:
                                    self.driver.execute_script("arguments[0].click();", button)
                                    logger.info("Expanded post with JS click")
                                    time.sleep(1)
                                except:
                                    # Try regular click
                                    try:
                                        button.click()
                                        logger.info("Expanded post with regular click")
                                        time.sleep(1)
                                    except:
                                        pass
                        except:
                            continue
                except:
                    continue
            
            # Also try a more aggressive approach with JavaScript
            try:
                expanded_count = self.driver.execute_script("""
                    const expandButtons = [];
                    
                    // Find all elements containing "...more" or "see more" text
                    const allElements = document.querySelectorAll('*');
                    for (const el of allElements) {
                        const text = el.textContent;
                        if ((text.includes('…more') || 
                             text.includes('...more') || 
                             text.toLowerCase().includes('see more')) && 
                            el.offsetWidth > 0 && 
                            el.offsetHeight > 0) {
                            
                            try {
                                el.click();
                                expandButtons.push(el);
                            } catch (e) {
                                // Try parent element
                                try {
                                    el.parentElement.click();
                                    expandButtons.push(el.parentElement);
                                } catch (e2) {
                                    // Ignore
                                }
                            }
                        }
                    }
                    
                    return expandButtons.length;
                """)
                
                if expanded_count > 0:
                    logger.info(f"Expanded {expanded_count} 'see more' buttons with JavaScript")
                    time.sleep(2)
            except:
                pass
                
            return True
            
        except Exception as e:
            logger.error(f"Error expanding 'see more' links: {str(e)}")
            return False
    
    def extract_posts(self, category):
        """Extract all original posts from the current page. Modified to handle 50 posts."""
        try:
            if self.debug:
                self.driver.save_screenshot(f'debug/{self.session_id}_before_extraction.png')
            
            # First, scroll to top to ensure we start from the top (most recent posts)
            self.scroll_to_top()
            time.sleep(3)  # Give page time to load top content
            
            # Find all post containers - increased limit to get more posts
            post_selectors = [
                ".feed-shared-update-v2",
                ".occludable-update",
                ".profile-creator-shared-feed-update__container"
            ]
            
            all_posts = []
            for selector in post_selectors:
                posts = self.driver.find_elements(By.CSS_SELECTOR, selector)
                if posts:
                    logger.info(f"Found {len(posts)} posts with selector: {selector}")
                    # Increased from 15 to handle more posts, but cap at reasonable number
                    all_posts = posts[:100]  # Take up to 100 posts to filter from
                    break
            
            if not all_posts:
                logger.warning("No posts found")
                if self.debug:
                    self.driver.save_screenshot(f'debug/{self.session_id}_no_posts.png')
                return []
            
            # Extract data from each post
            post_data = []
            post_count = 0
            processed_texts = set()  # Track processed posts to avoid duplicates
            
            for i, post in enumerate(all_posts):
                try:
                    # Stop if we've reached our target
                    if post_count >= self.max_posts:
                        logger.info(f"Reached target of {self.max_posts} posts")
                        break
                    
                    logger.info(f"Processing post {i+1}/{len(all_posts)} (extracted: {post_count})")
                    
                    # Scroll to the post to ensure it's in view
                    try:
                        self.driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", post)
                        time.sleep(1)
                    except:
                        logger.warning(f"Could not scroll to post {i+1}")
                    
                    # Debug screenshot every 10 posts
                    if self.debug and i % 10 == 0:
                        self.driver.save_screenshot(f'debug/{self.session_id}_post_{i+1}.png')
                    
                    # Check if this is an original post
                    if not self.is_original_post(post):
                        logger.info(f"Post {i+1} is not an original post, skipping")
                        continue
                    
                    # Extract post data
                    post_text = self.extract_post_text(post)
                    
                    # Skip if we've already processed this text (duplicate detection)
                    if post_text in processed_texts or len(post_text.strip()) < 10:
                        logger.info(f"Post {i+1} is duplicate or too short, skipping")
                        continue
                    
                    processed_texts.add(post_text)
                    
                    # Log the post data being extracted
                    logger.info(f"Post {i+1}: Text length={len(post_text)}")
                    
                    # Get profile name
                    profile_name = self.extract_profile_name()
                    
                    # Add to post data - only the required fields
                    post_data.append({
                        'profile_name': profile_name,
                        'post_text': post_text,
                        'category': category
                    })
                    
                    logger.info(f"Successfully extracted post {post_count + 1}")
                    
                    # Increment post count
                    post_count += 1
                    
                except Exception as e:
                    logger.error(f"Error processing post {i+1}: {str(e)}")
                    continue
            
            logger.info(f"Extracted {len(post_data)} posts")
            return post_data
            
        except Exception as e:
            logger.error(f"Error extracting posts: {str(e)}")
            if self.debug:
                self.driver.save_screenshot(f'debug/{self.session_id}_extraction_error.png')
            return []
    
    def is_original_post(self, post):
        """Check if a post is an original post (not a like, comment, etc.)."""
        try:
            # Check for activity indicators
            activity_texts = [
                "liked", "commented on", "replied", "reposted", 
                "shared", "celebrates", "mentioned in", "follows"
            ]
            
            post_text = post.text.lower()
            
            # If the post contains any activity indicators at the beginning, it's not original
            for activity in activity_texts:
                if post_text.startswith(activity) or f"\n{activity}" in post_text[:50]:
                    return False
            
            # Check for content indicators
            content_selectors = [
                ".feed-shared-update-v2__description",
                ".feed-shared-text",
                ".update-components-text",
                ".feed-shared-text-view",
                ".update-components-update-v2__commentary"
            ]
            
            for selector in content_selectors:
                content_elements = post.find_elements(By.CSS_SELECTOR, selector)
                if content_elements and any(el.text.strip() for el in content_elements):
                    return True
            
            # If we can't determine, assume it's not original
            return False
            
        except Exception as e:
            logger.error(f"Error checking if post is original: {str(e)}")
            return False
    
    def extract_post_text(self, post):
        """Extract the text content of a post."""
        try:
            # Try to expand "see more" links in this post
            self.expand_see_more_in_post(post)
            
            # Try different selectors for post content
            content_selectors = [
                ".feed-shared-update-v2__description",
                ".feed-shared-text",
                ".update-components-text",
                ".feed-shared-text-view"
            ]
            
            post_text = ""
            for selector in content_selectors:
                elements = post.find_elements(By.CSS_SELECTOR, selector)
                for element in elements:
                    text = element.text.strip()
                    if text and len(text) > len(post_text):
                        post_text = text
            
            # If no text found, try JavaScript
            if not post_text:
                post_text = self.driver.execute_script("""
                    const post = arguments[0];
                    
                    // Try to find the main text content
                    const contentElements = post.querySelectorAll('p, span.break-words, div.break-words');
                    let text = '';
                    
                    for (const el of contentElements) {
                        if (el.textContent.trim() && el.offsetWidth > 0 && el.offsetHeight > 0) {
                            text += el.textContent.trim() + '\\n';
                        }
                    }
                    
                    return text.trim();
                """, post)
            
            # Clean up the text
            post_text = re.sub(r'\n\s*\n', '\n\n', post_text)  # Remove extra newlines
            post_text = re.sub(r' +', ' ', post_text)  # Remove extra spaces
            
            return post_text.strip()
            
        except Exception as e:
            logger.error(f"Error extracting post text: {str(e)}")
            return "Error extracting text"
    
    def expand_see_more_in_post(self, post):
        """Expand truncated ("see more") text inside a single post element.

        Two passes are made:

        1. Every displayed element matching one of the known "see more" CSS
           selectors is clicked through JavaScript.
        2. A script executed in the page clicks the visible elements whose text
           contains "…more", "...more" or "see more".

        Args:
            post: Element object exposing ``find_elements``; it is also handed
                to the page script as ``arguments[0]`` (presumably a Selenium
                WebElement - the caller is outside this view).

        Returns:
            bool: True when both passes ran, False when an unexpected error
            was logged.
        """
        try:
            # Find all "see more" links in this post
            see_more_selectors = [
                ".inline-show-more-text__button",
                ".feed-shared-inline-show-more-text__see-more",
                ".feed-shared-text-view__see-more",
                ".see-more",
                "span.lt-line-clamp__more"
            ]

            for selector in see_more_selectors:
                try:
                    see_more_buttons = post.find_elements(By.CSS_SELECTOR, selector)
                except Exception as lookup_error:
                    # Best effort: a selector that cannot be evaluated is skipped.
                    # `except Exception` (not a bare except) lets Ctrl-C through.
                    logger.debug(f"Selector {selector} lookup failed: {lookup_error}")
                    continue

                for button in see_more_buttons:
                    try:
                        if button.is_displayed():
                            # Try JavaScript click
                            self.driver.execute_script("arguments[0].click();", button)
                            time.sleep(1)
                    except Exception as click_error:
                        # Stale or detached buttons are expected while the feed
                        # re-renders; move on to the next one.
                        logger.debug(f"Could not click 'see more' button: {click_error}")

            # Also try with JavaScript
            self.driver.execute_script("""
                const post = arguments[0];

                const mentionsMore = (text) => {
                    const value = text || '';
                    return value.includes('…more') ||
                           value.includes('...more') ||
                           value.toLowerCase().includes('see more');
                };

                // Visible elements whose text contains "...more" or "see more".
                // textContent includes descendant text, so every wrapper around
                // a matching control matches as well.
                const candidates = Array.from(post.querySelectorAll('*')).filter(
                    (el) => mentionsMore(el.textContent) &&
                            el.offsetWidth > 0 &&
                            el.offsetHeight > 0
                );

                for (const el of candidates) {
                    // Click only the innermost match: clicking each wrapper too
                    // would fire unrelated click handlers on the containers.
                    const wrapsAnotherMatch = candidates.some(
                        (other) => other !== el && el.contains(other)
                    );
                    if (wrapsAnotherMatch) {
                        continue;
                    }

                    try {
                        el.click();
                    } catch (e) {
                        // Try parent element
                        try {
                            el.parentElement.click();
                        } catch (e2) {
                            // Ignore
                        }
                    }
                }
            """, post)

            return True

        except Exception as e:
            logger.error(f"Error expanding 'see more' in post: {str(e)}")
            return False
    
    def extract_profile_name(self):
        """Return the profile name for the page currently loaded in the driver.

        The name is read from the profile heading in the page. When no heading
        is found, it is derived from the ``/in/<slug>`` segment of the URL path
        (hyphens become spaces, words are title-cased). Query strings and
        fragments are ignored, and percent-encoded slugs are decoded.

        Returns:
            str: The profile name, or "Unknown Profile" when neither source
            yields one or an error occurs.
        """
        # Local import so the method does not depend on a file-level import.
        from urllib.parse import unquote, urlsplit

        try:
            # Try to find the profile name
            profile_name = self.driver.execute_script("""
                // Try to find profile name
                const nameElement = document.querySelector('h1.text-heading-xlarge') || 
                                   document.querySelector('.pv-text-details__left-panel h1');
                return nameElement ? nameElement.textContent.trim() : null;
            """)
            if profile_name:
                return profile_name

            # Fall back to the URL. Only the path is inspected, so
            # "?trk=..." or "#..." never end up in the name.
            url_path = urlsplit(self.driver.current_url).path
            if '/in/' not in url_path:
                return "Unknown Profile"

            slug = unquote(url_path.split('/in/', 1)[1].split('/', 1)[0])
            derived_name = slug.replace('-', ' ').strip().title()

            # A bare ".../in/" URL has an empty slug; report it as unknown
            # instead of returning an empty string.
            return derived_name or "Unknown Profile"

        except Exception as e:
            logger.error(f"Error extracting profile name: {str(e)}")
            return "Unknown Profile"
    
    def scrape_profile(self, profile_url, category):
        """Scrape one LinkedIn profile and persist whatever posts were collected.

        Args:
            profile_url: URL handed to ``navigate_to_profile`` and to
                ``scroll_and_extract_incrementally``.
            category: Label forwarded to the extraction and CSV-saving helpers.

        Returns:
            list: The extracted posts. On an unexpected error, the posts held in
            ``self.accumulated_posts`` (if any) are saved and returned instead.
            An empty list is returned when navigation fails or nothing was
            collected.
        """
        try:
            # Navigate to the profile
            if not self.navigate_to_profile(profile_url):
                logger.error(f"Failed to navigate to profile: {profile_url}")
                return []

            # Use the incremental scrolling method that handles redirects
            posts = self.scroll_and_extract_incrementally(category, profile_url)

            # Save posts immediately, even if we were redirected
            if posts:
                # Only report success when the save helper confirms it; it logs
                # its own error otherwise.
                if self.save_posts_to_csv(posts, category):
                    logger.info(f"Successfully saved {len(posts)} posts from profile")
            else:
                logger.warning("No posts were extracted from the profile")

            # Callers iterate the result, so never hand back None.
            return posts or []

        except Exception as e:
            logger.error(f"Error scraping profile {profile_url}: {str(e)}")
            if self.debug:
                # The browser may be the very thing that failed; a broken
                # screenshot must not prevent salvaging the collected posts.
                try:
                    self.driver.save_screenshot(f'debug/{self.session_id}_scrape_profile_error.png')
                except Exception as screenshot_error:
                    logger.warning(f"Could not save debug screenshot: {screenshot_error}")

            # Even on error, try to save any accumulated posts
            # NOTE(review): assumes accumulated_posts belongs to the current
            # profile - where it is reset is outside this view; confirm.
            salvaged_posts = getattr(self, 'accumulated_posts', None)
            if salvaged_posts:
                logger.info(f"Saving {len(salvaged_posts)} posts despite error")
                self.save_posts_to_csv(salvaged_posts, category)
                return salvaged_posts

            return []
    
    def save_posts_to_csv(self, posts, category):
        """Write posts to a timestamped CSV plus a per-session backup CSV.

        Both files are created under ``data/``:
        ``linkedin_posts_<category>_<YYYYmmdd_HHMMSS>.csv`` and
        ``linkedin_posts_<category>_<session_id>.csv``. Characters in
        ``category`` that are not valid in a file name (path separators,
        ``: * ? " < > |`` and control characters) are replaced by ``_``.

        Args:
            posts: Records accepted by ``pandas.DataFrame`` (the caller treats
                them as a list of dicts).
            category: Label embedded in both file names.

        Returns:
            bool: True when both files were written, False when ``posts`` is
            empty or an error was logged.
        """
        try:
            if not posts:
                logger.warning("No posts to save")
                return False

            # Create DataFrame
            df = pd.DataFrame(posts)

            # A raw category such as "AI/ML" would otherwise point into a
            # directory that does not exist and the save would fail.
            safe_category = re.sub(r'[\\/:*?"<>|\x00-\x1f]+', '_', str(category))

            # Do not rely on the constructor having created the directory.
            os.makedirs('data', exist_ok=True)

            # Create filename with timestamp
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"data/linkedin_posts_{safe_category}_{timestamp}.csv"

            # Save to CSV
            df.to_csv(filename, index=False, encoding='utf-8')
            logger.info(f"Saved {len(posts)} posts to {filename}")

            # Also save a backup with session ID
            backup_filename = f"data/linkedin_posts_{safe_category}_{self.session_id}.csv"
            df.to_csv(backup_filename, index=False, encoding='utf-8')
            logger.info(f"Backup saved to {backup_filename}")

            return True

        except Exception as e:
            logger.error(f"Error saving posts to CSV: {str(e)}")
            return False
    
    def close(self):
        """Quit the browser session if this instance owns a driver.

        Errors raised while quitting are logged and never propagated.
        """
        try:
            # Nothing to tear down when no driver attribute was ever set.
            if not hasattr(self, 'driver'):
                return
            self.driver.quit()
            logger.info("Browser closed successfully")
        except Exception as exc:
            logger.error(f"Error closing browser: {str(exc)}")

def main(input_csv="build.csv", output_csv="scraped_posts.csv"):
    """Scrape every profile listed in a CSV and write all posts to one CSV.

    Args:
        input_csv: Path of the CSV listing the profiles; it must provide the
            columns ``LinkedIn_URL`` and ``Cat``.
        output_csv: Path of the combined CSV to write.

    Each collected post is tagged with ``profile_url`` and ``category`` before
    being written. The combined file is written in the ``finally`` block, so
    posts gathered before an error or an interrupt are still saved. The browser
    is always closed at the end.
    """
    MAX_POSTS_PER_PROFILE = 20  # Maximum posts to scrape per profile
    HEADLESS = False            # Set to True to run without browser window
    DEBUG = True                # Set to False to disable debug screenshots

    # Initialize scraper
    scraper = LinkedInScraper(
        headless=HEADLESS,
        debug=DEBUG,
        max_posts=MAX_POSTS_PER_PROFILE
    )

    # Prepare list to collect all posts (outside the try so `finally` sees it)
    all_posts = []

    try:
        data = pd.read_csv(input_csv)

        for _, row in data.iterrows():
            profile_url = row["LinkedIn_URL"]
            category = row["Cat"]

            # An empty cell is read as NaN; there is nothing to navigate to.
            if pd.isna(profile_url) or not str(profile_url).strip():
                logger.warning("Skipping row without a LinkedIn_URL")
                continue

            posts = scraper.scrape_profile(profile_url, category)

            # Assume each post is a dictionary
            for post in posts:
                post["profile_url"] = profile_url
                post["category"] = category
                all_posts.append(post)

    except Exception as e:
        logger.error(f"Error during scraping: {str(e)}")

    finally:
        try:
            # Save all collected posts to a new CSV file
            if all_posts:
                pd.DataFrame(all_posts).to_csv(output_csv, index=False)
                logger.info(f"Saved {len(all_posts)} posts to {output_csv}")
        except Exception as e:
            logger.error(f"Error saving combined posts: {str(e)}")
        finally:
            # Release the browser even when the final save failed.
            scraper.close()


# Entry point: run the scraper only when this code executes as the main
# program, not when it is imported as a module.
if __name__ == "__main__":
    main()

2025-06-20 13:26:00,327 - INFO - Navigating to LinkedIn login page
2025-06-20 13:26:15,049 - INFO - Successfully logged in
2025-06-20 13:26:20,265 - INFO - Navigating to activity page: https://www.linkedin.com/in/simonsinek/recent-activity/all/
2025-06-20 13:26:28,852 - INFO - Found posts with selector: .occludable-update
2025-06-20 13:26:30,864 - INFO - Scrolled to top of page
2025-06-20 13:26:31,087 - INFO - Scroll 1/15
2025-06-20 13:26:31,106 - INFO - Found 0 potential 'see more' buttons with selector: .inline-show-more-text__button
2025-06-20 13:26:31,116 - INFO - Found 0 potential 'see more' buttons with selector: .feed-shared-inline-show-more-text__see-more
2025-06-20 13:26:31,125 - INFO - Found 0 potential 'see more' buttons with selector: .feed-shared-text-view__see-more
2025-06-20 13:26:31,134 - INFO - Found 4 potential 'see more' buttons with selector: .see-more
2025-06-20 13:26:32,289 - INFO - Expanded post with JS click
2025-06-20 13:26:34,342 - INFO - Expanded post with JS

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings

# === Step 1: Load CSV ===
df = pd.read_csv("scraped_posts.csv")  # Replace with actual path
required_columns = {'profile_name', 'post_text', 'category', 'profile_url'}
missing_columns = required_columns - set(df.columns)
assert not missing_columns, f"scraped_posts.csv is missing columns: {sorted(missing_columns)}"

# Rows without text are read as NaN and cannot be embedded; drop them so the
# encoder only ever receives strings.
df = df.dropna(subset=['post_text']).reset_index(drop=True)
assert len(df) > 0, "scraped_posts.csv contains no posts to embed"

# === Step 2: Embed Text ===
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(df['post_text'].tolist(), show_progress_bar=True)

# === Step 3: Open (or create) the ChromaDB collection ===
# get_or_create_collection works on an empty database; get_collection raises
# when the collection does not exist yet.
client = chromadb.PersistentClient(path="chroma_db")
collection = client.get_or_create_collection(name="posts_collection")

# === Step 4: Insert Data ===
# NOTE(review): ids are positional ("post_<row index>"), so re-running this
# cell against an already populated collection reuses the same ids - confirm
# how the installed Chroma version treats duplicate ids in add().
collection.add(
    documents=df['post_text'].tolist(),
    embeddings=embeddings,
    ids=[f"post_{i}" for i in range(len(df))],
    metadatas=[
        {
            "profile_name": row["profile_name"],
            "category": row["category"]
        } for _, row in df.iterrows()
    ]
)

print("✅ New ChromaDB created and populated.")


  from .autonotebook import tqdm as notebook_tqdm
2025-06-20 13:30:25,231 - INFO - Use pytorch device_name: cpu
2025-06-20 13:30:25,233 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.73it/s]
2025-06-20 13:30:29,989 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


✅ New ChromaDB created and populated.


In [3]:
# Reconnect to the ChromaDB collection
client = chromadb.PersistentClient(path="chroma_db")
collection = client.get_collection(name="posts_collection")

# Number of entries to inspect; ids follow the "post_<i>" scheme.
PREVIEW_COUNT = 80  # Adjust as needed

# Only documents and metadata are printed below, so the embeddings are not
# requested.
results = collection.get(
    ids=[f"post_{i}" for i in range(PREVIEW_COUNT)],
    include=["documents", "metadatas"]
)

# Print them nicely
for post_id, document, metadata in zip(
    results["ids"], results["documents"], results["metadatas"]
):
    print(f"ID: {post_id}")
    print(f"Post Text: {document}")
    print(f"Metadata: {metadata}")
    print("-" * 40)


ID: post_0
Post Text: Curious to know what 1 day of ChatGPT costs OpenAI in terms of energy (in ovens 🔥) and water (in bathtubs 🛀)?

1 day of ChatGPT = ~1bn queries
That's roughly: 
💧🛀 1000 bathtubs of water
⚡🔥 6000 ovens running for a day

Here's Sam Altman's original quote:

"The average query uses about 0.34 watt-hours, about what an oven would use in a little over one second, or a high-efficiency lightbulb would use in a couple of minutes. It also uses about 0.000085 gallons of water; roughly one fifteenth of a teaspoon."
Metadata: {'profile_name': 'cassie-kozyrkov-9531919', 'category': 'AI'}
----------------------------------------
ID: post_1
Post Text: There are 2 kinds of AI-first memos:
1. The kind CEOs issue to sound bold and visionary.
2. The kind Cassie cassie-kozyrkov-9531919 sent to her team.

We saw Shopify, Duolingo, Fiverr and others try the first way.
Didn’t exactly go over well.

But Cassie’s version sets the gold standard.

Not just for “AI-firstitude” as she calls i

In [None]:
# Reconnect to ChromaDB collection
client = chromadb.PersistentClient(path="chroma_db")
collection = client.get_collection(name="posts_collection")

# Fetch a specific post with its embedding
result = collection.get(
    ids=["post_0"],
    include=["documents", "metadatas", "embeddings"]
)

# An unknown id yields an empty result, and indexing it would raise a bare
# IndexError. "ids" is tested rather than "embeddings" because the embeddings
# may come back as an array whose truth value is ambiguous.
if not result["ids"]:
    print('No entry with id "post_0" found in posts_collection.')
else:
    # Access the embedding
    embedding_vector = result["embeddings"][0]

    # Print part of the embedding for inspection
    print("Embedding vector (first 5 values):", embedding_vector[:5])
    print("Vector length:", len(embedding_vector))


Embedding vector (first 5 values): [ 1.70568079e-02  4.12511155e-02  8.53130594e-03  9.25791562e-02
 -3.10870651e-02 -1.37444437e-01  1.26389891e-01 -6.08916767e-03
  2.68113576e-02 -2.79910062e-02 -6.71681389e-02 -8.77571851e-02
 -3.70843858e-02  8.91789608e-03  4.78884056e-02 -5.89795075e-02
  1.12260051e-01 -6.53123707e-02 -6.84304461e-02 -5.67816980e-02
  1.50067871e-02 -2.03076079e-02  9.16799754e-02 -4.07517739e-02
  9.18676108e-02  2.31047999e-02  1.31657475e-03  1.83610134e-02
 -2.33126469e-02  5.43295220e-02 -2.41325628e-02  1.93827990e-02
 -3.57656479e-02 -3.79402265e-02  2.94638127e-02  4.52185003e-03
 -5.07431217e-02 -5.43532111e-02 -5.45531698e-02  4.07217890e-02
  5.64346323e-03 -7.22044706e-02  3.55672613e-02 -8.98056105e-03
 -2.72821505e-02 -4.65902546e-03 -4.00151834e-02  9.35490523e-03
 -2.43916586e-02  5.57625620e-03 -2.48581115e-02 -5.64693566e-03
 -1.16323624e-02  3.30885611e-02  7.04717170e-03 -6.66778116e-03
 -3.89769003e-02 -2.68542636e-02  3.29584405e-02  1.021

In [4]:
profile_name = "simonsinek"  # Replace with your desired creator

# Pull every stored post whose metadata carries this profile name.
results = collection.get(
    include=["documents", "metadatas"],
    where={"profile_name": profile_name},
)

# Show the matches as a 1-based numbered list with a blank line after each.
for position, document in enumerate(results["documents"], start=1):
    print(f"{position}. {document}\n")


In [8]:
# Read the metadata of every entry in the collection (consider batching if
# the collection is large).
results = collection.get(include=["metadatas"])

# One profile name per stored entry, in the order the collection returned them.
profile_names = list(map(lambda meta: meta['profile_name'], results['metadatas']))

# Collapse duplicates; a set carries no ordering guarantee.
unique_profile_names = [*set(profile_names)]

print(unique_profile_names)


['andrewyng', 'Saurabh Jain', 'simonsinek', 'cassie-kozyrkov-9531919', 'Archit Anand']


In [None]:
# NOTE(review): the old and the new name are identical here, which makes the
# rename a no-op - set OLD_PROFILE_NAME to the value currently stored.
OLD_PROFILE_NAME = "Saurabh Jain"
NEW_PROFILE_NAME = "Saurabh Jain"

# Step 1: Fetch the entries stored under the old profile_name
results = collection.get(
    where={"profile_name": OLD_PROFILE_NAME},
    include=["metadatas"]
)

if not results['ids']:
    # Nothing matched; do not call the collection with empty id lists.
    print(f"No documents found with profile_name {OLD_PROFILE_NAME!r}; nothing to update.")
else:
    # Step 2: Build new metadata dicts instead of mutating the fetched ones
    new_metadatas = [
        {**meta, 'profile_name': NEW_PROFILE_NAME}
        for meta in results['metadatas']
    ]

    # Step 3: Update the metadata in place. Deleting and re-adding the entries
    # would lose them if the re-insert failed after the delete.
    collection.update(
        ids=results['ids'],
        metadatas=new_metadatas
    )

    print("✅ profile_name updated successfully.")


✅ profile_name updated successfully.
