<a href="https://colab.research.google.com/github/hotsun1508/KISDI/blob/main/LinkedIn_Scraper_for_50_Profiles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import time
import csv
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException
from bs4 import BeautifulSoup as bs
import re

# --- 1. CONFIGURATION ---
# Path of the Netscape-formatted cookie file that load_cookies() reads in main()
# to restore a logged-in session.
COOKIES_FILE = "cookies.txt"
# Upper bound on how many times main() scrolls a profile's activity feed.
# Set to a very high number to capture posts from the last ~2.5 years.
# The script will stop automatically if it reaches the end of the profile's feed
# (i.e. when a scroll no longer increases the page height).
MAX_POST_SCROLLS = 250
SCROLL_PAUSE_TIME = 2.5 # Seconds slept after each scroll; increase if your internet connection is slow

# Profiles to scrape, processed in order by main(). The last path segment of each
# URL is used as the prefix of that profile's "<name>_posts.csv" and
# "<name>_comments.csv" output files, and "/recent-activity/all/" is appended to
# the URL to reach the activity feed.
profile_urls = [
    "https://www.linkedin.com/in/narendramodi/", "https://www.linkedin.com/in/williamhgates/",
    "https://www.linkedin.com/in/hamdan-bin-mohammed-al-maktoum-72761524a/", "https://www.linkedin.com/in/barackobama/",
    "https://www.linkedin.com/in/satyanadella/", "https://www.linkedin.com/in/sundarpichai/",
    "https://www.linkedin.com/in/mark-carney-5b9744205/", "https://www.linkedin.com/in/nithin-kamath-81136242/",
    "https://www.linkedin.com/in/andrewyng/", "https://www.linkedin.com/in/adammgrant/",
    "https://www.linkedin.com/in/ursula-von-der-leyen/", "https://www.linkedin.com/in/antonio-guterres/",
    "https://www.linkedin.com/in/hh-sheikh-mohamed-bin-zayed-al-nahyan/", "https://www.linkedin.com/in/giorgiameloni/",
    "https://www.linkedin.com/in/rafanadal/", "https://www.linkedin.com/in/alexxubyte/",
    "https://www.linkedin.com/in/aravind-srinivas-16051987/", "https://www.linkedin.com/in/piyushgoyalofficial/",
    "https://www.linkedin.com/in/shantanu-naidu/", "https://www.linkedin.com/in/mario-sergio-cortella/",
    "https://www.linkedin.com/in/nikhilkamathcio/", "https://www.linkedin.com/in/midudev/",
    "https://www.linkedin.com/in/andrew-huberman/", "https://www.linkedin.com/in/sel%C3%A7uk-bayraktar-a54bb619/",
    "https://www.linkedin.com/in/lawrence-wong-15728a18/", "https://www.linkedin.com/in/jean-marc-jancovici/",
    "https://www.linkedin.com/in/pauloguedeseconomista/", "https://www.linkedin.com/in/emmanuelmacron/",
    "https://www.linkedin.com/in/ekremimamoglu/", "https://www.linkedin.com/in/mattgarman/",
    "https://www.linkedin.com/in/sarablakely27/", "https://www.linkedin.com/in/emollick/",
    "https://www.linkedin.com/in/zak-brown-46b168104/", "https://www.linkedin.com/in/revant-himatsingka-food-pharmer-68326126/",
    "https://www.linkedin.com/in/aadit-palicha/", "https://www.linkedin.com/in/guillaume-pley-a1877731/",
    "https://www.linkedin.com/in/lexfridman/", "https://www.linkedin.com/in/raydalio/",
    "https://www.linkedin.com/in/antoniofilosa/", "https://www.linkedin.com/in/tonyelumelu/",
    "https://www.linkedin.com/in/bodour-al-qasimi-61a79b165/", "https://www.linkedin.com/in/mr-beast/",
    "https://www.linkedin.com/in/danielpink/", "https://www.linkedin.com/in/andersonrcorreia/",
    "https://www.linkedin.com/in/buschroland/", "https://www.linkedin.com/in/larrymadowo/",
    "https://www.linkedin.com/in/sirlewishamilton/", "https://www.linkedin.com/in/alicjasmin/",
    "https://www.linkedin.com/in/dharmesh/", "https://www.linkedin.com/in/aman-gupta-7217a515/"
]
# -------------------------------------------------------------------

# --- HELPER FUNCTIONS ---
def load_cookies(browser, file_path):
    """Load cookies from a Netscape-formatted file into the browser session.

    Each data line carries seven tab-separated fields: domain, subdomain
    flag, path, secure flag, expiry, name, value. Blank lines, comment lines
    and lines without exactly seven fields are skipped.

    Args:
        browser: Object exposing ``add_cookie(dict)`` (a Selenium WebDriver).
            It must already be on a page of the cookies' domain.
        file_path: Path of the cookie file, read as UTF-8.

    Raises:
        OSError: If the file cannot be opened.
    """
    # curl and common cookies.txt exporters mark HttpOnly cookies by prefixing
    # the domain with "#HttpOnly_". Such lines are real cookies (session
    # cookies are typically HttpOnly), not comments, so they must not be
    # dropped by the comment filter.
    http_only_prefix = '#HttpOnly_'

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue

            http_only = line.startswith(http_only_prefix)
            if http_only:
                line = line[len(http_only_prefix):]
            elif line.startswith('#'):
                continue

            fields = line.split('\t')
            if len(fields) != 7:
                continue
            domain, _include_subdomains, path, secure, expiry, name, value = fields

            cookie = {
                'name': name, 'value': value, 'domain': domain,
                'path': path, 'secure': secure == 'TRUE',
            }
            if http_only:
                cookie['httpOnly'] = True
            # An expiry of 0 (or a non-numeric field) denotes a session cookie.
            # Leave the key out instead of sending None or an epoch-0 timestamp,
            # which would be rejected or would expire the cookie immediately.
            if expiry.isdigit() and int(expiry) > 0:
                cookie['expiry'] = int(expiry)

            browser.add_cookie(cookie)

def convert_abbreviated_to_number(s):
    """Convert a count string such as '1K', '2.5M' or '1,234' to an integer.

    Args:
        s: The text to convert. Case, surrounding whitespace and thousands
            separators (commas) are ignored.

    Returns:
        The parsed integer, or 0 when ``s`` is not a string or cannot be
        interpreted as a (possibly K/M-abbreviated) number.
    """
    if not isinstance(s, str):
        return 0

    text = s.upper().strip().replace(',', '')
    multipliers = {'K': 1_000, 'M': 1_000_000}

    try:
        # Only a trailing K/M is a unit suffix. Testing "'K' in text" would
        # also fire on ordinary words such as "LIKE" and then fail in float().
        if text and text[-1] in multipliers:
            # round() rather than int() so binary float error cannot truncate
            # the scaled value one unit below the intended count.
            return round(float(text[:-1]) * multipliers[text[-1]])
        return int(text)
    except (ValueError, OverflowError):
        return 0

# --- COMMENT SCRAPING FUNCTIONS ---
def expand_all_comments(browser, max_rounds=None, max_stale_retries=5):
    """Click 'Show previous comments' / 'Load previous replies' until none remain.

    Each round re-queries the page for both kinds of button and clicks every
    one found via JavaScript, sleeping 1.5 s after each click. The loop ends
    when a round finds no button, on an unexpected error, or when a limit
    below is hit.

    Args:
        browser: Selenium WebDriver positioned on a post page.
        max_rounds: Optional cap on the number of query/click rounds.
            ``None`` (the default) means no cap.
        max_stale_retries: How many consecutive rounds may be interrupted by
            a stale element before expansion is abandoned.
    """
    button_xpaths = (
        "//button[span[text()='Show previous comments']]",
        "//button[span[text()='Load previous replies']]",
    )
    rounds_done = 0
    stale_streak = 0

    while max_rounds is None or rounds_done < max_rounds:
        rounds_done += 1
        buttons_found = False
        try:
            for xpath in button_xpaths:
                for button in browser.find_elements(By.XPATH, xpath):
                    browser.execute_script("arguments[0].click();", button)
                    buttons_found = True
                    time.sleep(1.5)
        except StaleElementReferenceException:
            # A click can re-render the comment list and invalidate the
            # remaining button handles. That does not mean everything is
            # expanded, so re-query instead of stopping, but only a bounded
            # number of times in a row.
            stale_streak += 1
            if stale_streak > max_stale_retries:
                print("[!] Comment buttons kept going stale; stopping expansion.")
                break
            time.sleep(1.5)
            continue
        except NoSuchElementException:
            print("[*] All comment expansion buttons handled.")
            break
        except Exception as e:
            print(f"[!] Error during comment expansion: {e}")
            break

        stale_streak = 0
        if not buttons_found:
            print("[*] No more 'show/load' buttons found.")
            break
    else:
        # Reached only when the max_rounds cap ended the loop.
        print(f"[!] Stopped comment expansion after {max_rounds} rounds.")

def parse_and_save_comments(html, post_id, comments_csv_path):
    """Extract comments and replies from a post page and append them to a CSV.

    Args:
        html: Page source of the post page.
        post_id: Identifier written to the ``Post_ID`` column of every row.
        comments_csv_path: CSV file to append to; no header row is written here.

    Returns:
        The number of comment rows appended.
    """
    columns = ["Post_ID", "Commenter_Name", "Comment_Text", "Comment_Likes", "Is_Reply"]

    def is_comment_article(css_class):
        # One <article> per comment or reply.
        return css_class and "comments-comment-entity" in css_class

    def is_reactions_button(css_class):
        return css_class and "reactions-count" in css_class

    document = bs(html, 'html.parser')
    rows = []
    for article in document.find_all("article", class_=is_comment_article):
        try:
            # Commenter's display name.
            name_span = article.find("span", class_="comments-comment-meta__description-title")
            if name_span:
                author = name_span.get_text(strip=True)
            else:
                author = "N/A"

            # Comment body.
            body_div = article.find("div", class_="update-components-text")
            if body_div:
                body = body_div.get_text(separator="\n", strip=True)
            else:
                body = ""

            # Like count; stays 0 when the button or its counter span is absent.
            likes = 0
            reactions_button = article.find("button", class_=is_reactions_button)
            if reactions_button:
                counter_span = reactions_button.find("span", class_="v-align-middle")
                if counter_span:
                    likes = convert_abbreviated_to_number(counter_span.get_text(strip=True))

            # Replies carry an extra modifier class on the article element.
            reply_flag = "comments-comment-entity--reply" in article.get('class', [])

            rows.append({
                "Post_ID": post_id,
                "Commenter_Name": author,
                "Comment_Text": body,
                "Comment_Likes": likes,
                "Is_Reply": reply_flag,
            })
        except Exception as e:
            print(f"[!] Error parsing a single comment: {e}")
            continue

    with open(comments_csv_path, mode='a', encoding='utf-8', newline='') as file:
        csv.DictWriter(file, fieldnames=columns).writerows(rows)

    return len(rows)

# --- MAIN SCRIPT LOGIC ---
def _scroll_feed_to_end(browser):
    """Scroll the current page to the bottom until its height stops growing or MAX_POST_SCROLLS is reached."""
    print(f"[*] Scrolling profile feed for up to {MAX_POST_SCROLLS} iterations...")
    last_height = browser.execute_script("return document.body.scrollHeight")
    for i in range(MAX_POST_SCROLLS):
        print(f"  > Scroll {i+1}/{MAX_POST_SCROLLS}")
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SCROLL_PAUSE_TIME)
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            print("[*] Reached the end of the page.")
            break
        last_height = new_height


def _span_text(wrapper, css_class):
    """Return the stripped text of the first <span> with *css_class* inside *wrapper*, or "N/A"."""
    span = wrapper.find("span", {"class": css_class})
    return span.get_text(strip=True) if span else "N/A"


def _parse_comment_count(wrapper):
    """Return the comment count shown for a post, or 0 when it is absent or unparseable."""
    comments_li = wrapper.find("li", {"class": "social-details-social-counts__comments"})
    if not comments_li:
        return 0
    # Leading count such as "12", "1,234" or "1.5K". The decimal point must be
    # part of the pattern or "1.5K" is read as "1". A missing match is handled
    # explicitly instead of raising AttributeError on ``None.group``.
    count_match = re.match(r"\d[\d,]*(?:\.\d+)?[KkMm]?", comments_li.get_text(strip=True))
    if not count_match:
        return 0
    return convert_abbreviated_to_number(count_match.group(0))


def _collect_posts(page_source, posts_csv_path):
    """Parse the feed HTML, append one CSV row per unique post, and return the posts to visit."""
    print("[*] Parsing post feed...")
    soup = bs(page_source, "html.parser")
    post_wrappers = soup.find_all("div", class_=lambda x: x and "feed-shared-update-v2" in x)

    posts_to_visit = []
    unique_post_ids = set()
    # Opened once for the whole feed rather than once per row.
    with open(posts_csv_path, mode='a', encoding='utf-8', newline='') as file:
        writer = csv.writer(file)
        for pw in post_wrappers:
            data_urn = pw.get("data-urn", "")
            if "urn:li:activity:" not in data_urn:
                continue

            post_id = data_urn.split(":")[-1]
            if post_id in unique_post_ids:
                continue
            unique_post_ids.add(post_id)

            post_url = f"https://www.linkedin.com/feed/update/{data_urn}/"
            author_name = _span_text(pw, "update-components-actor__name")
            post_time = _span_text(pw, "update-components-actor__sub-description")
            content_div = pw.find("div", {"class": "update-components-text"})
            post_content = content_div.get_text(separator="\n", strip=True) if content_div else ""
            reactions_span = pw.find("span", {"class": "social-details-social-counts__reactions-count"})
            post_reactions = convert_abbreviated_to_number(reactions_span.get_text(strip=True)) if reactions_span else 0
            comment_count = _parse_comment_count(pw)

            posts_to_visit.append({"id": post_id, "url": post_url})
            writer.writerow([post_id, post_url, author_name, post_time, post_content, post_reactions, comment_count])

    return posts_to_visit


def _scrape_profile(browser, profile_url):
    """Scrape one profile's posts and comments into <name>_posts.csv and <name>_comments.csv."""
    profile_name = profile_url.strip('/').split('/')[-1]
    posts_csv_path = f"{profile_name}_posts.csv"
    comments_csv_path = f"{profile_name}_comments.csv"

    print(f"\n--- Scraping profile: {profile_name} ---")

    # Start both output files fresh with their header rows.
    with open(posts_csv_path, mode='w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Post_ID", "Post_URL", "Author", "Post_Time", "Content", "Reactions", "Comment_Count"])
    with open(comments_csv_path, mode='w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Post_ID", "Commenter_Name", "Comment_Text", "Comment_Likes", "Is_Reply"])

    try:
        activity_url = f"{profile_url.strip('/')}/recent-activity/all/"
        browser.get(activity_url)
        time.sleep(5)

        _scroll_feed_to_end(browser)
        posts_to_visit = _collect_posts(browser.page_source, posts_csv_path)

        print(f"[*] Found {len(posts_to_visit)} unique posts. Now visiting each to scrape comments.")

        for i, post in enumerate(posts_to_visit):
            # A failure on one post must not stop the remaining posts.
            try:
                print(f"\n--- Visiting post {i+1}/{len(posts_to_visit)} (ID: {post['id']}) ---")
                browser.get(post['url'])
                time.sleep(5)

                print("[*] Expanding all comments...")
                expand_all_comments(browser)

                print("[*] Parsing and saving comments...")
                comment_count = parse_and_save_comments(browser.page_source, post['id'], comments_csv_path)
                print(f"[*] Saved {comment_count} comments for post {post['id']}.")
            except Exception as e:
                print(f"❌ FAILED to scrape post {post['id']}. Error: {e}. Skipping to next post.")

    except Exception as e:
        # A failure on one profile must not stop the remaining profiles.
        print(f"❌ A critical error occurred for profile {profile_name}: {e}")


def main():
    """Log in with saved cookies and scrape every profile in ``profile_urls``.

    Starts Chrome, loads the cookies from COOKIES_FILE and waits for the
    element with id "global-nav" as the sign of a successful login, then
    scrapes each profile in turn. The browser is always quit on the way out,
    including when an exception propagates.
    """
    chrome_options = Options()
    # chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')

    print("[*] Initializing Chrome driver...")
    browser = webdriver.Chrome(options=chrome_options)
    try:
        browser.set_window_size(1920, 1080)

        print(f"[*] Loading cookies from {COOKIES_FILE}...")
        # The domain must be open before its cookies can be added.
        browser.get('https://www.linkedin.com/')
        time.sleep(2)
        load_cookies(browser, COOKIES_FILE)
        browser.refresh()

        try:
            WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.ID, "global-nav")))
            print("[*] Login successful.")
        except TimeoutException:
            print("[!] Login failed. Check your cookies.txt file. Exiting.")
            return

        for profile_url in profile_urls:
            _scrape_profile(browser, profile_url)
    finally:
        # Runs on normal completion, on the early return above and on any
        # unexpected error (e.g. a missing cookie file), so no Chrome process
        # is left behind.
        browser.quit()

    print("\n[*] All profiles processed.")

if __name__ == "__main__":
    main()
