In [None]:
### Marketplace code 

########## Import required libraries ##########
import time
import random
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup

########## SETUP SELENIUM ##########
# Path to the chromedriver binary used to drive Chrome.
chrome_driver_path = "/Users/fadil/Downloads/chromedriver-mac-arm642/chromedriver"

# Launch Chrome against the local "Default" user profile (presumably so an
# existing Facebook login is reused - confirm) with a fixed window size.
chrome_options = webdriver.ChromeOptions()
for _chrome_flag in (
    "--user-data-dir=/Users/fadil/Library/Application Support/Google/Chrome",
    "--profile-directory=Default",
    "--disable-gpu",
    "--window-size=1920,1080",
):
    chrome_options.add_argument(_chrome_flag)

# ✅ Start Chrome Driver
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

########## Function to Mimic Human Scrolling ##########
def human_scroll(driver, times=5):
    """Scroll the window down in random steps, pausing randomly between steps.

    Args:
        driver: Object exposing ``execute_script`` (a Selenium WebDriver here).
        times: Number of scroll steps to perform.
    """
    for _step in range(times):
        # Step size and pause are both randomised on every iteration.
        pixels = random.randint(700, 900)
        driver.execute_script(f"window.scrollBy(0, {pixels});")
        pause_seconds = random.uniform(2, 4)
        time.sleep(pause_seconds)

########## Navigate to Facebook Marketplace ##########
# Open the "power bank" search results, wait a fixed 5 seconds for the first
# render, then scroll five times so more listings have a chance to load.
marketplace_url = "https://www.facebook.com/marketplace/search/?query=power%20bank"
driver.get(marketplace_url)
time.sleep(5)
human_scroll(driver, 5)

########## Extract Post Links from Marketplace ##########
def extract_marketplace_posts():
    """Collect the unique Marketplace listing URLs found in the current page.

    Parses ``driver.page_source`` and keeps every anchor whose ``href``
    contains ``/marketplace/item/``. The query string is dropped from each
    link and the result is made absolute against ``https://www.facebook.com``.

    Returns:
        list[str]: De-duplicated listing URLs in the order they first appear
        in the page.
    """
    from urllib.parse import urljoin  # local import keeps this edit self-contained

    soup = BeautifulSoup(driver.page_source, "html.parser")

    # A dict is used as an ordered set: keys keep first-seen order, so the
    # crawl order is stable between runs instead of depending on set hashing.
    unique_links = {}
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"]
        if "/marketplace/item/" not in href:
            continue  # not a marketplace listing
        path_only = href.split("?")[0]  # Remove tracking parameters
        # urljoin prefixes relative hrefs and leaves already-absolute hrefs
        # untouched, so the domain is never duplicated.
        unique_links[urljoin("https://www.facebook.com", path_only)] = None

    return list(unique_links)

# ✅ Get all Marketplace post links
data = []
csv_filename = "marketplace_powerbank_listings.csv"

try:
    marketplace_posts = extract_marketplace_posts()
    print(f"🔍 Found {len(marketplace_posts)} Marketplace Listings.")

    ########## Visit Each Marketplace Post and Extract Final Link ##########
    # NOTE(review): each listing is opened, but nothing is read from the loaded
    # page - only the listing URL itself is recorded.
    for index, post_url in enumerate(marketplace_posts, start=1):
        print(f"🔗 Visiting Post {index}/{len(marketplace_posts)}: {post_url}")

        driver.get(post_url)
        time.sleep(random.uniform(3, 6))  # Random delay for human-like interaction

        # Store post details
        data.append({"Post Link": post_url})
finally:
    # Runs on success AND on failure, so rows gathered before an error are
    # still written and the browser is always closed.
    try:
        ########## Save results to CSV ##########
        # Explicit columns keep the header row even when no listing was found.
        df = pd.DataFrame(data, columns=["Post Link"])
        df.to_csv(csv_filename, index=False)
    finally:
        ########## Close browser ##########
        driver.quit()

print(f"\n✅ Facebook Marketplace Crawling Completed! Data saved to '{csv_filename}'")


In [None]:
### Pages code
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
import time
import pandas as pd
import os
import sys
import json

# File locations
CHROME_DRIVER_PATH = "/Users/fadil/Downloads/chromedriver-mac-arm642/chromedriver"
PROGRESS_FILE = "fb_crawler_progress.json"          # JSON state used to resume a crawl
OUTPUT_FILE = "facebook_powerbank_posts_thai.csv"   # CSV of collected post URLs


# Terms submitted, one at a time, to Facebook's page search
SEARCH_KEYWORDS = [
    "พาวเวอร์แบงค์",
    "พาวเวอร์แบงก์",
    "เพาเวอร์แบงค์",
    "เพาเวอร์แบงก์",
    "แบตเตอรี่สำรอง",
    "Powerbank",
    "แบตสำรอง",
    "Eloop",
]


# A post link is kept only when its text contains one of these terms
FILTER_KEYWORDS = [
    "Power bank",
    "พาวเวอร์แบงค์",
    "พาวเวอร์แบงก์",
    "เพาเวอร์แบงค์",
    "เพาเวอร์แบงก์",
    "แบตเตอรี่สำรอง",
    "Powerbank",
    "แบตสำรอง",
    "Eloop",
]


# --- Page-search scrolling ---
PAGE_SEARCH_PAUSE_TIME = 5       # seconds to wait after each scroll of the search results
PAGE_SEARCH_MAX_SCROLLS = 50     # upper bound on scrolls of the search results

# --- Post scrolling on an individual page ---
POST_SCROLL_PAUSE_TIME = 5       # seconds to wait after each scroll
POST_SCROLL_MAX_ATTEMPTS = 50    # upper bound on scrolls per page
POST_NO_CHANGE_LIMIT = 10        # consecutive scrolls without new items before stopping

# --- Page load / checkpointing ---
PAGE_LOAD_WAIT_TIME = 5          # seconds to wait after navigating to a URL
SAVE_PROGRESS_INTERVAL = 5       # save progress after every X pages processed

#################################################
#           SCRIPT IMPLEMENTATION               #
#################################################

# Setup Chrome WebDriver with profile
chrome_options = webdriver.ChromeOptions()
for _chrome_flag in (
    "--user-data-dir=/Users/fadil/Library/Application Support/Google/Chrome",
    "--profile-directory=Default",
    "--disable-gpu",
    "--window-size=1920,1080",
    "--disable-notifications",  # Disable Facebook notifications
):
    chrome_options.add_argument(_chrome_flag)

# Initialize WebDriver
service = Service(CHROME_DRIVER_PATH)
driver = webdriver.Chrome(service=service, options=chrome_options)

# Module-level crawl state, read and updated by the functions of this script
all_post_urls = set()    # every post URL collected so far (set => no duplicates)
all_page_links = set()   # every page link seen so far, across ALL keywords
completed_keywords = []  # search keywords that have been fully processed

# Initialize or load progress tracking
def init_progress():
    """Load crawl state from PROGRESS_FILE, or start a fresh progress file.

    A fresh (empty) progress file is written when the file does not exist,
    when the module-level ``restart_mode`` flag is truthy, or when the
    existing file cannot be read or lacks an expected key. Otherwise the saved
    values replace the module-level ``all_post_urls``, ``all_page_links`` and
    ``completed_keywords``.

    Returns:
        dict: The progress mapping that was loaded or newly created.
    """
    global all_post_urls, all_page_links, completed_keywords

    def _write_fresh_progress():
        # Single definition of the on-disk layout of an empty run.
        fresh = {
            "completed_keywords": [],
            "page_links": [],
            "post_urls": []
        }
        with open(PROGRESS_FILE, 'w') as f:
            json.dump(fresh, f)
        return fresh

    # restart_mode is only assigned when the file runs as a script; treat it
    # as False when it has not been defined instead of raising NameError.
    if not os.path.exists(PROGRESS_FILE) or globals().get("restart_mode", False):
        return _write_fresh_progress()

    try:
        with open(PROGRESS_FILE, 'r') as f:
            progress = json.load(f)
        # Read every field into locals first: if any key is missing or
        # malformed, none of the globals has been touched yet.
        loaded_posts = set(progress["post_urls"])
        loaded_pages = set(progress["page_links"])
        loaded_keywords = list(progress["completed_keywords"])
    except Exception as e:
        # Deliberately broad: any unreadable progress file means "start over".
        print(f"⚠️ Error loading progress file: {e}")
        print("Creating new progress file...")
        return _write_fresh_progress()

    all_post_urls = loaded_posts
    all_page_links = loaded_pages
    completed_keywords = loaded_keywords
    return progress

# Save progress
def save_progress():
    """Write the crawl state to PROGRESS_FILE and the post URLs to OUTPUT_FILE.

    Reads the module-level ``completed_keywords``, ``all_page_links`` and
    ``all_post_urls``. Any failure is reported with a printed warning and is
    not raised, so a failed save never aborts the crawl.
    """
    try:
        progress = {
            "completed_keywords": completed_keywords,
            # Sorted so the files are identical between runs for the same
            # data (set iteration order is not stable).
            "page_links": sorted(all_page_links),
            "post_urls": sorted(all_post_urls)
        }
        # Write to a temporary file and swap it in with os.replace: an
        # interruption mid-write then cannot leave a truncated progress file
        # (which would make the next run discard all progress).
        tmp_path = f"{PROGRESS_FILE}.tmp"
        with open(tmp_path, 'w') as f:
            json.dump(progress, f)
        os.replace(tmp_path, PROGRESS_FILE)

        # Also save posts to CSV
        df = pd.DataFrame(progress["post_urls"], columns=["Post URL"])
        df.to_csv(OUTPUT_FILE, index=False)
        print(f"💾 Progress saved - {len(all_post_urls)} posts collected so far.")
    except Exception as e:
        print(f"⚠️ Error saving progress: {e}")

# Function to extract pages and posts
def extract_pages_and_posts(search_query):
    """Search Facebook pages for a keyword and harvest posts from each new page.

    Opens the page-search results for ``search_query``, scrolls until no new
    links appear (or the scroll limit is reached), records links not already
    in ``all_page_links``, visits each of them and calls
    ``scroll_page_and_extract_posts``. Progress is saved every
    ``SAVE_PROGRESS_INTERVAL`` pages and once more at the end, after the
    keyword has been added to ``completed_keywords``.

    Args:
        search_query: Keyword to search for.
    """
    global all_post_urls, all_page_links, completed_keywords
    from urllib.parse import quote  # local import keeps this edit self-contained

    print(f"\n🔍 Searching: {search_query}")

    # Percent-encode the whole keyword (not only spaces) so characters such as
    # '&', '#' or '+' cannot break the query string.
    search_url = f"https://www.facebook.com/search/pages/?q={quote(search_query, safe='')}"
    driver.get(search_url)
    time.sleep(PAGE_LOAD_WAIT_TIME)  # Allow time for the page to load

    # Scroll to load more pages (Auto-Stop if No New Content Appears)
    previous_count = 0  # Track loaded pages count
    stop_scroll_attempts = 0  # Count how many times no new pages load
    pages = []  # stays empty (instead of undefined) if the scroll limit is 0

    # New pages found in this specific search
    new_page_links = set()

    print("📜 Scrolling to find pages...")
    for scroll_num in range(1, PAGE_SEARCH_MAX_SCROLLS + 1):  # Configurable max scroll attempts
        pages = driver.find_elements(By.XPATH, "//a[contains(@href, 'facebook.com/') and not(contains(@href, 'profile.php'))]")
        current_count = len(pages)

        # Show scrolling progress
        print(f"📜 Page search scroll #{scroll_num} - Found {current_count} potential pages")

        if current_count == previous_count:
            stop_scroll_attempts += 1
            print(f"⚠️ No new pages found ({stop_scroll_attempts}/{POST_NO_CHANGE_LIMIT})")
        else:
            stop_scroll_attempts = 0  # Reset if new pages appear
            print(f"✅ Found {current_count - previous_count} new potential pages")

        if stop_scroll_attempts >= POST_NO_CHANGE_LIMIT:
            print("✅ No more new results. Stopping scroll.")
            break

        previous_count = current_count
        driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
        time.sleep(PAGE_SEARCH_PAUSE_TIME)  # Configurable pause time

    # Extract only NEW page links that haven't been processed before
    duplicate_count = 0  # counted directly so failed/invalid elements are not reported as duplicates
    for page in pages:
        try:
            link = page.get_attribute("href")
            if not link or "facebook.com" not in link:
                continue  # not a usable Facebook link

            if link in all_page_links:
                duplicate_count += 1
                print(f"🔄 DUPLICATE PAGE: {link}")
            else:
                print(f"🆕 NEW PAGE: {link}")
                new_page_links.add(link)
                all_page_links.add(link)  # Add to global tracking set

        except Exception as e:
            # Best effort: one unreadable element must not stop the harvest.
            print(f"⚠️ Error extracting page link: {e}")

    print(f"🔹 Found {len(new_page_links)} NEW Facebook pages for this keyword.")
    print(f"🔸 Skipped {duplicate_count} DUPLICATE pages.")
    print(f"🔹 Total unique pages collected so far: {len(all_page_links)}")

    # Visit each NEW Facebook page found in this search
    for position, page_link in enumerate(new_page_links, start=1):
        try:
            print(f"🔹 Visiting NEW Page {position}/{len(new_page_links)}: {page_link}")

            driver.get(page_link)  # Open the page link directly
            time.sleep(PAGE_LOAD_WAIT_TIME)  # Configurable wait time

            scroll_page_and_extract_posts(page_link)

            # Save progress after configured interval
            if position % SAVE_PROGRESS_INTERVAL == 0:
                save_progress()

        except Exception as e:
            # Best effort: a failing page is reported and the crawl moves on.
            print(f"⚠️ Error accessing page: {e}")

    # Mark this keyword as completed
    if search_query not in completed_keywords:
        completed_keywords.append(search_query)

    # Save progress after each keyword
    save_progress()

def scroll_page_and_extract_posts(page_link):
    """Scroll the page currently open in the driver and collect matching post URLs.

    Presses END repeatedly (up to ``POST_SCROLL_MAX_ATTEMPTS`` times), stopping
    early once the number of post links has not changed for
    ``POST_NO_CHANGE_LIMIT`` consecutive scrolls. Links found by the last scan
    whose text contains any ``FILTER_KEYWORDS`` entry (case-insensitive) are
    added to the module-level ``all_post_urls``.

    Args:
        page_link: URL of the page being processed. It is not used by the
            body; the caller is expected to have loaded it already.
    """
    global all_post_urls

    primary_xpath = "//a[contains(@href, '/posts/') or contains(@href, '/photos/')]"
    fallback_xpath = "//div[contains(@class, 'userContentWrapper')]//a[contains(@href, '/')]"

    print("📜 Starting to scroll page to load ALL posts...")
    posts = []
    seen_so_far = 0
    stalled_scrolls = 0

    # Opening bracket of the textual progress bar; "=" marks follow every 5 scrolls
    print("📊 Scroll progress: [", end="")

    for scroll_attempt in range(1, POST_SCROLL_MAX_ATTEMPTS + 1):
        driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
        time.sleep(POST_SCROLL_PAUSE_TIME)

        # Height is only reported in the log line below
        current_height = driver.execute_script("return document.body.scrollHeight")

        posts = driver.find_elements(By.XPATH, primary_xpath)
        if len(posts) <= 1:
            # Primary pattern found (almost) nothing: use the fallback if it finds more
            fallback_posts = driver.find_elements(By.XPATH, fallback_xpath)
            if len(fallback_posts) > len(posts):
                posts = fallback_posts

        found_now = len(posts)
        gained = found_now - seen_so_far
        on_fifth_scroll = scroll_attempt % 5 == 0

        if on_fifth_scroll:
            print("=", end="", flush=True)

        # Log every fifth scroll and every scroll on which the count changed
        if on_fifth_scroll or gained != 0:
            gain_note = f" (+{gained} new posts)" if gained > 0 else ""
            print(f"\n📜 Scroll #{scroll_attempt} - Found {found_now} posts (Height: {current_height}px){gain_note}")

        if gained == 0:
            stalled_scrolls += 1
            if on_fifth_scroll:
                print(f"⚠️ No new posts for {stalled_scrolls} consecutive scrolls")
            if stalled_scrolls >= POST_NO_CHANGE_LIMIT:
                print(f"\n✅ No new posts after {POST_NO_CHANGE_LIMIT} consecutive scrolls. Reached end of page.")
                break
        else:
            stalled_scrolls = 0

        seen_so_far = found_now

    print("] Complete")
    print(f"✅ Scrolling complete. Found {len(posts)} total posts on this page.")

    # Keep only links whose visible text mentions one of the filter keywords
    lowered_keywords = [keyword.lower() for keyword in FILTER_KEYWORDS]
    post_count = 0
    for post in posts:
        try:
            post_url = post.get_attribute("href")
            raw_text = post.text
            post_text = raw_text.lower() if raw_text else ""

            if not post_url:
                continue
            if not any(term in post_text for term in lowered_keywords):
                continue
            if post_url in all_post_urls:
                continue

            all_post_urls.add(post_url)
            post_count += 1
            print(f"✅ Found Post: {post_url}")  # Print URL in real-time

        except Exception as e:
            print(f"⚠️ Error extracting post: {e}")

    print(f"✅ Extracted {post_count} new posts from this page. Total posts so far: {len(all_post_urls)}")

# Main execution
if __name__ == "__main__":
    # Dictionary to store stats for each keyword
    keyword_stats = {}
    progress_loaded = False   # True once init_progress() has returned
    crawl_finished = False    # True only if every keyword was handled without error

    try:
        # "--restart" as first CLI argument discards saved progress; anything
        # else resumes from PROGRESS_FILE.
        restart_mode = len(sys.argv) > 1 and sys.argv[1] == "--restart"
        if restart_mode:
            print("\n🔄 RESTART MODE: Starting fresh and clearing progress...")
            if os.path.exists(PROGRESS_FILE):
                os.remove(PROGRESS_FILE)
            if os.path.exists(OUTPUT_FILE):
                # Keep exactly one backup of the previous results.
                backup_file = f"{OUTPUT_FILE}.bak"
                if os.path.exists(backup_file):
                    os.remove(backup_file)
                os.rename(OUTPUT_FILE, backup_file)
                print(f"Previous results backed up to {backup_file}")
        else:
            print("\n▶️ RESUME MODE: Continuing from previous run...")

        # Initialize/load progress
        progress = init_progress()
        progress_loaded = True
        print(f"📊 Starting with {len(all_page_links)} known pages and {len(all_post_urls)} posts")
        print(f"✅ Already completed keywords: {completed_keywords}")

        # Setup the initial output file if it doesn't exist
        if not os.path.exists(OUTPUT_FILE):
            with open(OUTPUT_FILE, 'w') as f:
                f.write("Post URL\n")

        # Print configuration for user reference
        print("\n⚙️ CURRENT CONFIGURATION:")
        print(f"- Page search scroll pause time: {PAGE_SEARCH_PAUSE_TIME} seconds")
        print(f"- Page search max scrolls: {PAGE_SEARCH_MAX_SCROLLS}")
        print(f"- Post scroll pause time: {POST_SCROLL_PAUSE_TIME} seconds")
        print(f"- Post scroll max attempts: {POST_SCROLL_MAX_ATTEMPTS}")
        print(f"- Post no-change limit: {POST_NO_CHANGE_LIMIT}")
        print(f"- Page load wait time: {PAGE_LOAD_WAIT_TIME} seconds")
        print(f"- Save progress interval: Every {SAVE_PROGRESS_INTERVAL} pages")

        # Process each keyword
        for i, keyword in enumerate(SEARCH_KEYWORDS):
            # Skip already completed keywords unless in restart mode
            if keyword in completed_keywords and not restart_mode:
                print(f"\n⏭️ Skipping already completed keyword: {keyword}")
                continue

            print(f"\n📌 Processing keyword {i+1}/{len(SEARCH_KEYWORDS)}: {keyword}")

            # Snapshot the totals so this keyword's contribution can be derived
            pages_before = len(all_page_links)
            posts_before = len(all_post_urls)

            # Process the keyword (saves progress itself when done)
            extract_pages_and_posts(keyword)

            keyword_stats[keyword] = {
                "new_pages": len(all_page_links) - pages_before,
                "total_pages_so_far": len(all_page_links),
                "new_posts": len(all_post_urls) - posts_before,
                "posts_so_far": len(all_post_urls)
            }

        crawl_finished = True

    except Exception as e:
        print(f"❌ Error during execution: {e}")
    finally:
        try:
            # Save whatever was collected when the run stopped early (error or
            # Ctrl-C). Skipped if progress was never loaded, so empty state
            # cannot overwrite an existing progress file.
            if progress_loaded and not crawl_finished:
                save_progress()

            # Print summary statistics
            print("\n📈 CRAWLING RESULTS 📈")
            print("-" * 85)
            print(f"{'KEYWORD':<25} {'NEW PAGES':<12} {'NEW POSTS':<12} {'TOTAL PAGES':<12} {'TOTAL POSTS':<12}")
            print("-" * 85)

            # TOTAL columns use the totals recorded right after each keyword,
            # so they include data restored from a previous run.
            for keyword, stats in keyword_stats.items():
                print(f"{keyword:<25} {stats['new_pages']:<12} {stats['new_posts']:<12} {stats['total_pages_so_far']:<12} {stats['posts_so_far']:<12}")

            print("-" * 85)
            # Grand totals sit under the TOTAL PAGES / TOTAL POSTS columns
            print(f"{'GRAND TOTAL':<25} {'':<12} {'':<12} {len(all_page_links):<12} {len(all_post_urls):<12}")

            if crawl_finished:
                print(f"\n🎉 Crawling completed! Data saved to '{OUTPUT_FILE}'.")
            else:
                print(f"\n⚠️ Crawling stopped early. Partial data saved to '{OUTPUT_FILE}'.")
            print(f"📊 Final Statistics: Processed {len(all_page_links)} unique pages and found {len(all_post_urls)} relevant posts.")
        finally:
            # Close the browser no matter what happened above
            driver.quit()

In [None]:
##Pages and Groups
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
import time
import pandas as pd
import os
import sys
import json

# File locations
CHROME_DRIVER_PATH = "/Users/fadil/Downloads/chromedriver-mac-arm642/chromedriver"
# NOTE(review): this is the same file name the pages-only script in this file
# uses for its PROGRESS_FILE; if both run in the same directory they will load
# each other's completed keywords and post URLs - confirm this is intended.
PROGRESS_FILE = "fb_crawler_progress.json"
OUTPUT_FILE = "facebook_powerbank_posts_groups_thai.csv"   # CSV of collected post URLs


# Terms submitted, one at a time, to Facebook's page and group search
SEARCH_KEYWORDS = [
    "Power Bank",
    "พาวเวอร์แบงค์",
    "พาวเวอร์แบงก์",
    "เพาเวอร์แบงค์",
    "เพาเวอร์แบงก์",
    "แบตเตอรี่สำรอง",
    "Powerbank",
    "แบตสำรอง",
    "Eloop",
]


# Keywords for filtering posts
FILTER_KEYWORDS = [
    "Power Bank",
    "พาวเวอร์แบงค์",
    "พาวเวอร์แบงก์",
    "เพาเวอร์แบงค์",
    "เพาเวอร์แบงก์",
    "แบตเตอรี่สำรอง",
    "Powerbank",
    "แบตสำรอง",
    "Eloop",
    "แบตฯ",
    "พาวเวอร์",
    "พาวเวอร์แบ็งค์",
    "พาเวอร์แบงค์",
]


# --- Page-search scrolling ---
PAGE_SEARCH_PAUSE_TIME = 10      # seconds to wait after each scroll of the page results
PAGE_SEARCH_MAX_SCROLLS = 50     # upper bound on scrolls of the page results

# --- Group-search scrolling ---
GROUP_SEARCH_PAUSE_TIME = 10     # seconds to wait after each scroll of the group results
GROUP_SEARCH_MAX_SCROLLS = 50    # upper bound on scrolls of the group results

# --- Post scrolling on an individual page or group ---
POST_SCROLL_PAUSE_TIME = 10      # seconds to wait after each scroll
POST_SCROLL_MAX_ATTEMPTS = 50    # upper bound on scrolls per page
POST_NO_CHANGE_LIMIT = 10        # consecutive scrolls without new items before stopping

# --- Page load / checkpointing ---
PAGE_LOAD_WAIT_TIME = 10         # seconds to wait after navigating to a URL
SAVE_PROGRESS_INTERVAL = 5       # save progress after every X pages processed

#################################################
#           SCRIPT IMPLEMENTATION               #
#################################################

# Setup Chrome WebDriver with profile
chrome_options = webdriver.ChromeOptions()
for _chrome_flag in (
    "--user-data-dir=/Users/fadil/Library/Application Support/Google/Chrome",
    "--profile-directory=Default",
    "--disable-gpu",
    "--window-size=1920,1080",
    "--disable-notifications",  # Disable Facebook notifications
):
    chrome_options.add_argument(_chrome_flag)

# Initialize WebDriver
service = Service(CHROME_DRIVER_PATH)
driver = webdriver.Chrome(service=service, options=chrome_options)

# Module-level crawl state, read and updated by the functions of this script
all_post_urls = set()        # every post URL collected so far (set => no duplicates)
all_page_links = set()       # every page link seen so far, across ALL keywords
all_group_links = set()      # every group link seen so far, across ALL keywords
completed_keywords = []      # search keywords that have been fully processed

# Initialize or load progress tracking
def init_progress():
    """Load crawl state from PROGRESS_FILE, or start a fresh progress file.

    A fresh (empty) progress file is written when the file does not exist,
    when the module-level ``restart_mode`` flag is truthy, or when the
    existing file cannot be read or lacks a required key. Otherwise the saved
    values replace the module-level ``all_post_urls``, ``all_page_links``,
    ``all_group_links`` and ``completed_keywords``. A missing ``group_links``
    key is treated as an empty list.

    Returns:
        dict: The progress mapping that was loaded or newly created.
    """
    global all_post_urls, all_page_links, all_group_links, completed_keywords

    def _write_fresh_progress():
        # Single definition of the on-disk layout of an empty run.
        fresh = {
            "completed_keywords": [],
            "page_links": [],
            "group_links": [],
            "post_urls": []
        }
        with open(PROGRESS_FILE, 'w') as f:
            json.dump(fresh, f)
        return fresh

    # restart_mode is not defined anywhere above this function; treat it as
    # False when it has not been assigned instead of raising NameError.
    if not os.path.exists(PROGRESS_FILE) or globals().get("restart_mode", False):
        return _write_fresh_progress()

    try:
        with open(PROGRESS_FILE, 'r') as f:
            progress = json.load(f)
        # Read every field into locals first: if any key is missing or
        # malformed, none of the globals has been touched yet.
        loaded_posts = set(progress["post_urls"])
        loaded_pages = set(progress["page_links"])
        loaded_groups = set(progress.get("group_links", []))  # Handle older progress files
        loaded_keywords = list(progress["completed_keywords"])
    except Exception as e:
        # Deliberately broad: any unreadable progress file means "start over".
        print(f"⚠️ Error loading progress file: {e}")
        print("Creating new progress file...")
        return _write_fresh_progress()

    all_post_urls = loaded_posts
    all_page_links = loaded_pages
    all_group_links = loaded_groups
    completed_keywords = loaded_keywords
    return progress

# Save progress
def save_progress():
    """Write the crawl state to PROGRESS_FILE and the post URLs to OUTPUT_FILE.

    Reads the module-level ``completed_keywords``, ``all_page_links``,
    ``all_group_links`` and ``all_post_urls``. Any failure is reported with a
    printed warning and is not raised, so a failed save never aborts the crawl.
    """
    try:
        progress = {
            "completed_keywords": completed_keywords,
            # Sorted so the files are identical between runs for the same
            # data (set iteration order is not stable).
            "page_links": sorted(all_page_links),
            "group_links": sorted(all_group_links),
            "post_urls": sorted(all_post_urls)
        }
        # Write to a temporary file and swap it in with os.replace: an
        # interruption mid-write then cannot leave a truncated progress file
        # (which would make the next run discard all progress).
        tmp_path = f"{PROGRESS_FILE}.tmp"
        with open(tmp_path, 'w') as f:
            json.dump(progress, f)
        os.replace(tmp_path, PROGRESS_FILE)

        # Also save posts to CSV
        df = pd.DataFrame(progress["post_urls"], columns=["Post URL"])
        df.to_csv(OUTPUT_FILE, index=False)
        print(f"💾 Progress saved - {len(all_post_urls)} posts collected so far.")
    except Exception as e:
        print(f"⚠️ Error saving progress: {e}")

# Function to search for FB pages
def find_pages_for_keyword(search_query):
    """Search Facebook pages for a keyword and return the links not seen before.

    Opens the page-search results, scrolls until no new links appear (or the
    scroll limit is reached) and records every link that is not a group link
    and is not yet in the module-level ``all_page_links``.

    Args:
        search_query: Keyword to search for.

    Returns:
        set[str]: Page links first seen during this search.
    """
    global all_page_links
    from urllib.parse import quote  # local import keeps this edit self-contained

    print(f"\n🔍 Searching for PAGES with keyword: {search_query}")

    # Percent-encode the whole keyword (not only spaces) so characters such as
    # '&', '#' or '+' cannot break the query string.
    search_url = f"https://www.facebook.com/search/pages/?q={quote(search_query, safe='')}"
    driver.get(search_url)
    time.sleep(PAGE_LOAD_WAIT_TIME)  # Allow time for the page to load

    # Scroll to load more pages (Auto-Stop if No New Content Appears)
    previous_count = 0  # Track loaded pages count
    stop_scroll_attempts = 0  # Count how many times no new pages load
    pages = []  # stays empty (instead of undefined) if the scroll limit is 0

    # New pages found in this specific search
    new_page_links = set()

    print("📜 Scrolling to find pages...")
    for scroll_num in range(1, PAGE_SEARCH_MAX_SCROLLS + 1):  # Configurable max scroll attempts
        pages = driver.find_elements(By.XPATH, "//a[contains(@href, 'facebook.com/') and not(contains(@href, 'profile.php'))]")
        current_count = len(pages)

        # Show scrolling progress
        print(f"📜 Page search scroll #{scroll_num} - Found {current_count} potential pages")

        if current_count == previous_count:
            stop_scroll_attempts += 1
            print(f"⚠️ No new pages found ({stop_scroll_attempts}/{POST_NO_CHANGE_LIMIT})")
        else:
            stop_scroll_attempts = 0  # Reset if new pages appear
            print(f"✅ Found {current_count - previous_count} new potential pages")

        if stop_scroll_attempts >= POST_NO_CHANGE_LIMIT:
            print("✅ No more new results. Stopping scroll.")
            break

        previous_count = current_count
        driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
        time.sleep(PAGE_SEARCH_PAUSE_TIME)  # Configurable pause time

    # Extract only NEW page links that haven't been processed before
    duplicate_count = 0  # counted directly so skipped group links are not reported as duplicates
    for page in pages:
        try:
            link = page.get_attribute("href")
            if not link or "facebook.com" not in link:
                continue  # not a usable Facebook link
            if "facebook.com/groups" in link:
                continue  # Skip groups here, we'll handle them separately

            if link in all_page_links:
                duplicate_count += 1
                print(f"🔄 DUPLICATE PAGE: {link}")
            else:
                print(f"🆕 NEW PAGE: {link}")
                new_page_links.add(link)
                all_page_links.add(link)  # Add to global tracking set

        except Exception as e:
            # Best effort: one unreadable element must not stop the harvest.
            print(f"⚠️ Error extracting page link: {e}")

    print(f"🔹 Found {len(new_page_links)} NEW Facebook pages for this keyword.")
    print(f"🔸 Skipped {duplicate_count} DUPLICATE pages.")
    print(f"🔹 Total unique pages collected so far: {len(all_page_links)}")

    return new_page_links

# Function to search for FB groups
def find_groups_for_keyword(search_query):
    """Search Facebook groups for a keyword and return the links not seen before.

    Opens the group-search results, scrolls until no new links appear (or the
    scroll limit is reached) and records every ``facebook.com/groups/`` link
    that is not yet in the module-level ``all_group_links``.

    Args:
        search_query: Keyword to search for.

    Returns:
        set[str]: Group links first seen during this search.
    """
    global all_group_links
    from urllib.parse import quote  # local import keeps this edit self-contained

    print(f"\n🔍 Searching for GROUPS with keyword: {search_query}")

    # Percent-encode the whole keyword (not only spaces) so characters such as
    # '&', '#' or '+' cannot break the query string.
    search_url = f"https://www.facebook.com/search/groups/?q={quote(search_query, safe='')}"
    driver.get(search_url)
    time.sleep(PAGE_LOAD_WAIT_TIME)  # Allow time for the page to load

    # Scroll to load more groups
    previous_count = 0
    stop_scroll_attempts = 0
    new_group_links = set()
    groups = []  # stays empty (instead of undefined) if the scroll limit is 0

    print("📜 Scrolling to find groups...")
    for scroll_num in range(1, GROUP_SEARCH_MAX_SCROLLS + 1):
        # Find all group links
        groups = driver.find_elements(By.XPATH, "//a[contains(@href, 'facebook.com/groups/')]")
        current_count = len(groups)

        # Show scrolling progress
        print(f"📜 Group search scroll #{scroll_num} - Found {current_count} potential groups")

        if current_count == previous_count:
            stop_scroll_attempts += 1
            print(f"⚠️ No new groups found ({stop_scroll_attempts}/{POST_NO_CHANGE_LIMIT})")
        else:
            stop_scroll_attempts = 0
            print(f"✅ Found {current_count - previous_count} new potential groups")

        if stop_scroll_attempts >= POST_NO_CHANGE_LIMIT:
            print("✅ No more new group results. Stopping scroll.")
            break

        previous_count = current_count
        driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
        time.sleep(GROUP_SEARCH_PAUSE_TIME)

    # Extract only NEW group links
    duplicate_count = 0  # counted directly so failed/invalid elements are not reported as duplicates
    for group in groups:
        try:
            link = group.get_attribute("href")
            if not link or "facebook.com/groups/" not in link:
                continue  # not a group link

            if link in all_group_links:
                duplicate_count += 1
                print(f"🔄 DUPLICATE GROUP: {link}")
            else:
                print(f"🆕 NEW GROUP: {link}")
                new_group_links.add(link)
                all_group_links.add(link)
        except Exception as e:
            # Best effort: one unreadable element must not stop the harvest.
            print(f"⚠️ Error extracting group link: {e}")

    print(f"🔹 Found {len(new_group_links)} NEW Facebook groups for this keyword.")
    print(f"🔸 Skipped {duplicate_count} DUPLICATE groups.")
    print(f"🔹 Total unique groups collected so far: {len(all_group_links)}")

    return new_group_links

# Function to check if a group is public
def is_group_public(driver, group_link):
    """Load a group and report whether its posts can be viewed.

    Returns False when a "Join group" element and a "Private group" label are
    both present, True as soon as any post/photo link is found (immediately or
    after one scroll to the bottom), and False otherwise or on any error.
    NOTE(review): the text checks are English-only; presumably they will not
    match when Facebook is displayed in another language - confirm.

    Args:
        driver: WebDriver used to load and inspect the group.
        group_link: URL of the group.

    Returns:
        bool: True if post links are visible, False otherwise.
    """
    post_link_xpath = "//a[contains(@href, '/posts/') or contains(@href, '/photos/')]"

    try:
        driver.get(group_link)
        time.sleep(PAGE_LOAD_WAIT_TIME)

        # The "Private group" label is only looked up when a join prompt exists
        has_join_prompt = bool(driver.find_elements(By.XPATH, "//div[contains(text(), 'Join group')]"))
        if has_join_prompt and driver.find_elements(By.XPATH, "//span[contains(text(), 'Private group')]"):
            print(f"🔒 Private group detected: {group_link}")
            return False

        # Visible posts mean the group is public or the account is a member
        if driver.find_elements(By.XPATH, post_link_xpath):
            print(f"🌐 Public group or member of group: {group_link}")
            return True

        # Nothing visible yet: scroll once and look again
        driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
        time.sleep(3)
        if driver.find_elements(By.XPATH, post_link_xpath):
            print(f"🌐 Confirmed public or member group: {group_link}")
            return True

        print(f"🔒 Likely private group (cannot view content): {group_link}")
        return False

    except Exception as e:
        print(f"⚠️ Error checking group type: {e}")
        return False  # Assume private/inaccessible if there's an error

def scroll_page_and_extract_posts(source_link, source_type="page"):
    """Scroll the currently loaded page/group to its end and collect matching post URLs.

    The function does not navigate: it scrolls whatever the module-level
    ``driver`` is currently showing, so the caller must have opened the page
    or group beforehand.

    Scrolling stops after ``POST_SCROLL_MAX_ATTEMPTS`` scrolls, or earlier once
    the number of post links has not changed for ``POST_NO_CHANGE_LIMIT``
    consecutive scrolls. Each link found is then checked against
    ``FILTER_KEYWORDS`` (case-insensitively) using the link's text, its
    parent's text and its HTML; if that extraction fails, the URL itself is
    checked instead. Matching URLs not seen before are added to the
    module-level ``all_post_urls`` set.

    Args:
        source_link: URL of the page/group. Not read by this function; kept
            so existing callers keep working.
        source_type: Label used in log messages (e.g. "page" or "group").

    Returns:
        The number of new URLs added to ``all_post_urls`` by this call.
    """
    post_xpath = "//a[contains(@href, '/posts/') or contains(@href, '/photos/')]"
    fallback_xpath = "//div[contains(@class, 'userContentWrapper')]//a[contains(@href, '/')]"

    print(f"📜 Starting to scroll {source_type} to load ALL posts...")
    previous_post_count = 0
    consecutive_no_change = 0
    posts = []

    # Add visual scroll progress indicator
    print("📊 Scroll progress: [", end="")

    for scroll_attempt in range(1, POST_SCROLL_MAX_ATTEMPTS + 1):
        # Scroll down
        driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
        time.sleep(POST_SCROLL_PAUSE_TIME)

        # Page height is only reported in the log line below
        current_height = driver.execute_script("return document.body.scrollHeight")

        posts = driver.find_elements(By.XPATH, post_xpath)

        # If the main pattern found (almost) nothing, try the alternative
        # pattern and keep whichever yields more elements
        if len(posts) <= 1:
            alt_posts = driver.find_elements(By.XPATH, fallback_xpath)
            if len(alt_posts) > len(posts):
                posts = alt_posts

        current_post_count = len(posts)
        is_report_step = scroll_attempt % 5 == 0

        # Show scrolling progress
        if is_report_step:
            print("=", end="", flush=True)

        # Detailed log
        if is_report_step or current_post_count != previous_post_count:
            print(f"\n📜 Scroll #{scroll_attempt} - Found {current_post_count} posts (Height: {current_height}px)", end="")
            if current_post_count > previous_post_count:
                print(f" (+{current_post_count - previous_post_count} new posts)")
            else:
                print("")

        # Check if we've reached the end (no new posts after multiple scrolls)
        if current_post_count == previous_post_count:
            consecutive_no_change += 1
            if is_report_step:
                print(f"⚠️ No new posts for {consecutive_no_change} consecutive scrolls")

            if consecutive_no_change >= POST_NO_CHANGE_LIMIT:
                print(f"\n✅ No new posts after {POST_NO_CHANGE_LIMIT} consecutive scrolls. Reached end of {source_type}.")
                break
        else:
            consecutive_no_change = 0  # Reset counter if we found new posts

        previous_post_count = current_post_count

    print("] Complete")
    print(f"✅ Scrolling complete. Found {len(posts)} total posts on this {source_type}.")

    # Lowercase every keyword once instead of once per post
    keyword_pairs = [(keyword, keyword.lower()) for keyword in FILTER_KEYWORDS]

    post_count = 0
    for post in posts:
        try:
            post_url = post.get_attribute("href")

            # Nothing to record for empty or already-known URLs, so skip the
            # extra browser round trips needed to read text and HTML
            if not post_url or post_url in all_post_urls:
                continue

            try:
                # Each attribute/text read is a WebDriver call, so read once
                post_html = post.get_attribute("outerHTML") or ""
                post_text = post.text or ""

                # Parent container gives more context than the link alone
                parent = driver.execute_script("return arguments[0].parentNode;", post)
                parent_text = (parent.text or "") if parent else ""

                haystacks = (f"{post_text} {parent_text}".lower(), post_html.lower())
                url_only_match = False
            except Exception:
                # Fallback when text/HTML cannot be read: match on the URL
                # itself. `except Exception` (not a bare except) so Ctrl-C and
                # SystemExit still propagate.
                haystacks = (post_url.lower(),)
                url_only_match = True

            matched_keywords = [
                keyword
                for keyword, lowered in keyword_pairs
                if any(lowered in haystack for haystack in haystacks)
            ]
            if not matched_keywords:
                continue

            all_post_urls.add(post_url)
            post_count += 1
            if url_only_match:
                print(f"✅ Found Post (URL match): {post_url}")
            else:
                print(f"✅ Found Post: {post_url}")
                # Debug info to see what was matched
                print(f"   ↪ Matched keywords: {', '.join(matched_keywords)}")

        except Exception as e:
            # Typically an element that went stale while the page kept loading
            print(f"⚠️ Error extracting post: {e}")

    return post_count
# Process a keyword
def process_keyword(keyword):
    """Crawl every page and readable group found for one search keyword.

    Looks up pages and groups for the keyword, visits each page, visits each
    group that ``is_group_public`` accepts, and extracts posts from them.
    Progress is saved every ``SAVE_PROGRESS_INTERVAL`` processed items and
    once more when the keyword is finished, at which point the keyword is
    recorded in ``completed_keywords``.

    Args:
        keyword: Search term to process.

    Returns:
        A dict with the keys ``new_pages``, ``total_pages``, ``new_groups``,
        ``total_groups``, ``public_groups``, ``new_posts`` and
        ``total_posts``. The ``new_*`` values are how much the corresponding
        global collection grew during this call; ``total_*`` are the
        collection sizes at the end.
    """
    global all_post_urls, all_page_links, all_group_links, completed_keywords

    print(f"\n📌 Processing keyword: {keyword}")

    # Collection sizes before this keyword, used to compute the deltas below
    baseline_pages, baseline_groups, baseline_posts = (
        len(all_page_links),
        len(all_group_links),
        len(all_post_urls),
    )

    # Steps 1 and 2: discover pages, then groups
    discovered_pages = find_pages_for_keyword(keyword)
    discovered_groups = find_groups_for_keyword(keyword)

    # Step 3: visit every discovered page
    print(f"\n🔍 Processing {len(discovered_pages)} pages for keyword: {keyword}")
    pages_done = 0

    for position, page_link in enumerate(discovered_pages, start=1):
        try:
            print(f"🔹 Visiting PAGE {position}/{len(discovered_pages)}: {page_link}")
            driver.get(page_link)
            time.sleep(PAGE_LOAD_WAIT_TIME)

            scroll_page_and_extract_posts(page_link, "page")
            pages_done += 1

            # Periodic checkpoint
            if pages_done % SAVE_PROGRESS_INTERVAL == 0:
                save_progress()

        except Exception as err:
            print(f"⚠️ Error processing page {page_link}: {err}")

    # Step 4: visit every discovered group that can be read
    print(f"\n🔍 Processing {len(discovered_groups)} groups for keyword: {keyword}")
    groups_done = 0
    readable_groups = 0

    for position, group_link in enumerate(discovered_groups, start=1):
        try:
            print(f"🔹 Checking GROUP {position}/{len(discovered_groups)}: {group_link}")

            # is_group_public() also navigates the browser to the group
            if not is_group_public(driver, group_link):
                print(f"🔒 Skipping PRIVATE group: {group_link}")
            else:
                print(f"🌐 Processing PUBLIC group: {group_link}")
                scroll_page_and_extract_posts(group_link, "group")
                readable_groups += 1

            groups_done += 1

            # Periodic checkpoint
            if groups_done % SAVE_PROGRESS_INTERVAL == 0:
                save_progress()

        except Exception as err:
            print(f"⚠️ Error processing group {group_link}: {err}")

    # How much each collection grew while handling this keyword
    added_pages = len(all_page_links) - baseline_pages
    added_groups = len(all_group_links) - baseline_groups
    added_posts = len(all_post_urls) - baseline_posts

    print(f"\n✅ Keyword '{keyword}' complete:")
    print(f"  - Pages: {added_pages} new (processed {pages_done})")
    print(f"  - Groups: {added_groups} new (processed {groups_done}, {readable_groups} public)")
    print(f"  - Posts: {added_posts} new")

    # Record completion once, then checkpoint
    if keyword not in completed_keywords:
        completed_keywords.append(keyword)

    save_progress()

    return {
        "new_pages": added_pages,
        "total_pages": len(all_page_links),
        "new_groups": added_groups,
        "total_groups": len(all_group_links),
        "public_groups": readable_groups,
        "new_posts": added_posts,
        "total_posts": len(all_post_urls)
    }

# Main execution
if __name__ == "__main__":
    # "--restart" as the first argument discards saved progress and backs up
    # the previous results; anything else resumes from the saved progress.
    restart_mode = len(sys.argv) > 1 and sys.argv[1] == "--restart"
    if restart_mode:
        print("\n🔄 RESTART MODE: Starting fresh and clearing progress...")
        if os.path.exists(PROGRESS_FILE):
            os.remove(PROGRESS_FILE)
        if os.path.exists(OUTPUT_FILE):
            backup_file = f"{OUTPUT_FILE}.bak"
            # os.replace overwrites an existing backup in a single step
            os.replace(OUTPUT_FILE, backup_file)
            print(f"Previous results backed up to {backup_file}")
    else:
        print("\n▶️ RESUME MODE: Continuing from previous run...")

    # Dictionary to store stats for each keyword
    keyword_stats = {}
    # Set only when every keyword was handled without an error or interrupt
    crawl_finished = False

    try:
        # Initialize/load progress
        progress = init_progress()
        print(f"📊 Starting with {len(all_page_links)} pages, {len(all_group_links)} groups, {len(all_post_urls)} posts")
        print(f"✅ Already completed keywords: {completed_keywords}")

        # Setup the initial output file if it doesn't exist
        if not os.path.exists(OUTPUT_FILE):
            with open(OUTPUT_FILE, 'w', encoding="utf-8") as f:
                f.write("Post URL\n")

        # Print configuration for user reference
        print("\n⚙️ CURRENT CONFIGURATION:")
        print(f"- Page search pause time: {PAGE_SEARCH_PAUSE_TIME} seconds")
        print(f"- Page search max scrolls: {PAGE_SEARCH_MAX_SCROLLS}")
        print(f"- Group search pause time: {GROUP_SEARCH_PAUSE_TIME} seconds")
        print(f"- Group search max scrolls: {GROUP_SEARCH_MAX_SCROLLS}")
        print(f"- Post scroll pause time: {POST_SCROLL_PAUSE_TIME} seconds")
        print(f"- Post scroll max attempts: {POST_SCROLL_MAX_ATTEMPTS}")
        print(f"- Post no-change limit: {POST_NO_CHANGE_LIMIT}")
        print(f"- Page load wait time: {PAGE_LOAD_WAIT_TIME} seconds")
        print(f"- Save progress interval: Every {SAVE_PROGRESS_INTERVAL} items")

        # Process each keyword
        for i, keyword in enumerate(SEARCH_KEYWORDS):
            # Skip already completed keywords unless in restart mode
            if keyword in completed_keywords and not restart_mode:
                print(f"\n⏭️ Skipping already completed keyword: {keyword}")
                continue

            print(f"\n📌 Processing keyword {i+1}/{len(SEARCH_KEYWORDS)}: {keyword}")

            # Process the keyword (find and crawl pages and groups)
            stats = process_keyword(keyword)
            keyword_stats[keyword] = stats

        crawl_finished = True

    except KeyboardInterrupt:
        # Ctrl-C is not an Exception subclass; save what was collected, then
        # let the interrupt propagate as it did before
        print("\n⏹️ Interrupted by user. Saving progress...")
        save_progress()
        raise
    except Exception as e:
        print(f"❌ Error during execution: {e}")
        # Save progress even if there's an error
        save_progress()
    finally:
        try:
            # Print summary statistics
            col_width = 12  # wide enough for the longest header, 'TOTAL GROUPS'
            column_titles = (
                'NEW PAGES', 'NEW GROUPS', 'PUBLIC GRPS', 'NEW POSTS',
                'TOTAL PAGES', 'TOTAL GROUPS', 'TOTAL POSTS',
            )

            print("\n📈 CRAWLING RESULTS 📈")
            print("-" * 120)
            print(f"{'KEYWORD':<20} " + " ".join(f"{title:<{col_width}}" for title in column_titles))
            print("-" * 120)

            for keyword, stats in keyword_stats.items():
                # TOTAL columns use the collection sizes recorded when the
                # keyword finished, so they include items loaded from a
                # previous run and agree with the GRAND TOTAL row
                row_values = (
                    stats["new_pages"],
                    stats["new_groups"],
                    stats.get("public_groups", 0),
                    stats["new_posts"],
                    stats["total_pages"],
                    stats["total_groups"],
                    stats["total_posts"],
                )
                print(f"{keyword:<20} " + " ".join(f"{value:<{col_width}}" for value in row_values))

            print("-" * 120)
            # Collection sizes are totals, so place them under the TOTAL columns
            grand_total_values = ('', '', '', '', len(all_page_links), len(all_group_links), len(all_post_urls))
            print(f"{'GRAND TOTAL':<20} " + " ".join(f"{value:<{col_width}}" for value in grand_total_values))

            if crawl_finished:
                print(f"\n🎉 Crawling completed! Data saved to '{OUTPUT_FILE}'.")
            else:
                print(f"\n⚠️ Crawling stopped before all keywords were processed. Partial data saved to '{OUTPUT_FILE}'.")
            print(f"📊 Final Statistics: Processed {len(all_page_links)} pages, {len(all_group_links)} groups, and found {len(all_post_urls)} relevant posts.")
        finally:
            # Close the browser even if printing the summary fails
            driver.quit()