In [10]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import time
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import os
import pandas as pd
from dotenv import load_dotenv

In [12]:
# Workbook that lists the Facebook groups; later cells read its
# 'FACEBOOK ID' and 'PAGES' columns.
file_path = 'fb_groups_data.xlsx'

# Skip the first row of the sheet, then use the next row as the column header.
df1 = pd.read_excel(io=file_path, header=0, skiprows=1)

In [13]:
# Load variables from a .env file (if one is found) into the process
# environment, then read the Facebook credentials from there so they never
# appear in the notebook itself. Either value is None when the variable is unset.
load_dotenv()
FB_EMAIL = os.environ.get('FB_EMAIL')
FB_PASSWORD = os.environ.get('FB_PASSWORD')

In [14]:
def login_to_facebook(driver):
    """Submit the Facebook login form using FB_EMAIL / FB_PASSWORD.

    Args:
        driver: Selenium WebDriver instance used to load and fill the form.

    Raises:
        RuntimeError: if FB_EMAIL or FB_PASSWORD is missing or empty.
        selenium.common.exceptions.TimeoutException: if the email field does
            not appear within 10 seconds of loading the login page.

    Note:
        The function does not verify that the login succeeded; it waits a
        fixed 5 seconds after clicking and then prints a confirmation.
    """
    # Fail fast with a clear message instead of letting send_keys(None)
    # raise an obscure TypeError deep inside Selenium.
    if not FB_EMAIL or not FB_PASSWORD:
        raise RuntimeError(
            "FB_EMAIL and FB_PASSWORD must be set (environment or .env file)."
        )

    driver.get('https://www.facebook.com/login')

    # Wait for the form to be present before touching it, rather than
    # assuming it exists the instant driver.get() returns.
    email_field = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "email"))
    )
    email_field.send_keys(FB_EMAIL)
    driver.find_element(By.ID, "pass").send_keys(FB_PASSWORD)
    driver.find_element(By.NAME, "login").click()

    time.sleep(5)  # Wait for login to complete
    print("Logged in to Facebook.")

def optimized_scroll_and_collect_urls(driver, max_wait_time=10):
    """Scroll the current page to the bottom until it stops growing, then
    return every link href found in the final page source.

    Args:
        driver: Selenium WebDriver already showing the page to scroll.
        max_wait_time: seconds the page height must stay unchanged before
            scrolling stops.

    Returns:
        list of href strings produced by extract_urls_from_html.
    """
    pause_seconds = 0.5  # delay after each scroll so new content can load
    height_script = "return document.body.scrollHeight"

    previous_height = driver.execute_script(height_script)
    last_growth_at = time.time()

    while True:
        # Jump to the bottom and give the page a moment to react.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause_seconds)

        current_height = driver.execute_script(height_script)

        if current_height != previous_height:
            # The page grew: restart the idle timer.
            last_growth_at = time.time()
        elif time.time() - last_growth_at > max_wait_time:
            # Height has been stable for longer than max_wait_time: done.
            break

        previous_height = current_height

    # Parse the fully scrolled page.
    urls = extract_urls_from_html(driver.page_source)

    print(f"Collected {len(urls)} URLs from the People section.")
    return urls

def extract_urls_from_html(members_html):
    """Return the href of every <a> tag that has one, in document order.

    Args:
        members_html: HTML markup as a string.

    Returns:
        list of href attribute values (duplicates are kept).
    """
    parsed = BeautifulSoup(members_html, 'html.parser')

    hrefs = []
    for anchor in parsed.find_all('a', href=True):
        hrefs.append(anchor['href'])
    return hrefs

def click_members_link(driver):
    """Find the group's "People"/"Members" tab and click it.

    Waits up to 10 seconds for `a[role='tab']` anchors, then clicks the first
    one whose visible text contains "people" or "members", or whose href
    contains "/members/" or "/people/" (all compared case-insensitively).

    Args:
        driver: Selenium WebDriver currently showing a group page.

    Returns:
        True if a matching tab was clicked; False if none matched or if any
        exception was raised while searching/clicking (the error is printed).
    """
    try:
        # Wait for the tab buttons to appear and collect all the potential links
        tab_links = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a[role='tab']"))
        )

        for tab_link in tab_links:
            # Each `.text` / `get_attribute` access is a separate WebDriver
            # call, so read each value exactly once and fall back to "".
            tab_text = (tab_link.text or "").strip().lower()
            tab_href = (tab_link.get_attribute('href') or "").lower()

            text_matches = 'people' in tab_text or 'members' in tab_text
            href_matches = '/members/' in tab_href or '/people/' in tab_href

            if text_matches or href_matches:
                driver.execute_script("arguments[0].scrollIntoView(true);", tab_link)
                time.sleep(1)  # Give some time for scrolling
                # Click through JavaScript rather than WebElement.click().
                driver.execute_script("arguments[0].click();", tab_link)
                print(f"Clicked on the '{tab_text.capitalize()}' link.")
                return True

        print("Members/People section not found.")
        return False

    except Exception as e:
        # Best-effort: report the problem and let the caller handle False.
        print(f"An error occurred while trying to click the 'People' or 'Members' link: {e}")
        return False

def visit_public_group_and_get_all_member_urls(driver, group_url):
    """Open a group page, switch to its People/Members tab and collect links.

    Args:
        driver: Selenium WebDriver to drive.
        group_url: address of the group page to open.

    Returns:
        The list returned by optimized_scroll_and_collect_urls, or an empty
        list if the tab could not be clicked or any exception was raised
        (a message is printed in both failure cases).
    """
    try:
        driver.get(group_url)
        time.sleep(3)  # fixed pause to let the group page load

        # Bail out early when the People/Members tab cannot be opened.
        if not click_members_link(driver):
            print("Failed to navigate to the 'People' section.")
            return []

        # Tab is open: scroll it to the end and harvest every link.
        return optimized_scroll_and_collect_urls(driver)

    except Exception as e:
        print(f"Error visiting group: {e}")
        return []

def extract_member_urls(urls):
    """Keep only the URLs that contain the "/user/" path marker.

    Args:
        urls: iterable of URL strings.

    Returns:
        list of the matching URLs, in their original order.
    """
    marker = "/user/"
    kept = []
    for candidate in urls:
        if marker in candidate:
            kept.append(candidate)
    return kept

def convert_to_proper_url(broken_url):
    """Build a profile.php URL from a group-member link.

    The member id is taken as the path segment that follows the last
    "/user/" in the link, ignoring any query string or fragment. This works
    whether or not the link ends with a slash and even when the query string
    itself contains "/" characters.

    Args:
        broken_url: link such as "/groups/<gid>/user/<id>/?...".

    Returns:
        "https://www.facebook.com/profile.php?id=<id>".

    Raises:
        IndexError: only on the fallback path, when the link has no "/user/"
            segment and contains fewer than two "/"-separated parts.
    """
    # Strip query string and fragment so "/" inside them cannot shift segments.
    path = broken_url.split('?', 1)[0].split('#', 1)[0]

    _, marker, tail = path.rpartition('/user/')
    numeric_id = tail.split('/', 1)[0] if marker else ''

    if not numeric_id:
        # No usable "/user/<id>" segment: fall back to the original positional
        # rule (second-to-last "/"-separated part) for backward compatibility.
        numeric_id = broken_url.split('/')[-2]

    return f"https://www.facebook.com/profile.php?id={numeric_id}"

def process_member_urls(urls):
    """Turn raw page links into a de-duplicated list of profile URLs.

    Links are filtered with extract_member_urls, converted with
    convert_to_proper_url, and duplicates are dropped.

    Args:
        urls: iterable of href strings collected from the page.

    Returns:
        list of unique profile URLs, in order of first appearance.
    """
    profile_urls = (
        convert_to_proper_url(url) for url in extract_member_urls(urls)
    )
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is reproducible; iterating a set of strings can give a different
    # order on every run.
    return list(dict.fromkeys(profile_urls))

def retrieve_and_process_member_urls(df1, index=52, driver_path=None):
    """Scrape one group's member list and return it as a DataFrame.

    Args:
        df1: DataFrame with 'FACEBOOK ID' (group URL) and 'PAGES' (group
            name) columns.
        index: row label in df1 identifying the group to scrape.
        driver_path: optional path to the chromedriver executable. When
            omitted, the CHROMEDRIVER_PATH environment variable is used; if
            that is unset too, the path this notebook was originally written
            with is used when it exists on disk, and otherwise Selenium is
            left to locate a driver itself (presumably via Selenium Manager
            on recent Selenium versions — confirm for the installed version).

    Returns:
        DataFrame with columns 'Group Name' and 'Member URL', one row per
        unique member profile URL (empty if nothing was collected).

    Raises:
        KeyError: if `index` or the expected columns are missing from df1.
        Any exception raised by login_to_facebook propagates after the
        browser has been closed.
    """
    group_url = df1.loc[index, 'FACEBOOK ID']
    group_name = df1.loc[index, 'PAGES']

    # Resolve the chromedriver location without tying the notebook to a single
    # machine: explicit argument > environment variable > legacy local path.
    legacy_driver_path = r'C:\\Users\\hp\\Downloads\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe'
    driver_path = driver_path or os.getenv('CHROMEDRIVER_PATH')
    if not driver_path and os.path.exists(legacy_driver_path):
        driver_path = legacy_driver_path

    # Set up the WebDriver
    service = Service(driver_path) if driver_path else Service()
    driver = webdriver.Chrome(service=service)

    try:
        # Step 1: Log in to Facebook
        login_to_facebook(driver)

        # Step 2: Visit the group and get all member URLs
        all_member_urls = visit_public_group_and_get_all_member_urls(driver, group_url)

        # Step 3: Process the URLs to get the final list of profile URLs
        final_urls = process_member_urls(all_member_urls)

        # Step 4: Create a DataFrame with the required columns
        result_df = pd.DataFrame({
            'Group Name': [group_name] * len(final_urls),
            'Member URL': final_urls
        })

        return result_df

    finally:
        # Always close the browser, even when a step above raised.
        driver.quit()

In [16]:
# Scrape a single group, selected by its row label in df1. Choose a group with
# few members (e.g. row 52); member_counts.ipynb can be used to find one.
result_df = retrieve_and_process_member_urls(df1, index=52)

Logged in to Facebook.
Clicked on the 'People' link.
Collected 51 URLs from the People section.


In [17]:
# Bare expression: the notebook renders the resulting DataFrame as a table.
# NOTE(review): the rendered rows contain member profile URLs — consider
# clearing this cell's output before sharing or committing the notebook.
result_df

Unnamed: 0,Group Name,Member URL
0,PAKISTAN PROPERTY COUNCIL (0332-2638077),https://www.facebook.com/profile.php?id=100046...
1,PAKISTAN PROPERTY COUNCIL (0332-2638077),https://www.facebook.com/profile.php?id=100054...
2,PAKISTAN PROPERTY COUNCIL (0332-2638077),https://www.facebook.com/profile.php?id=100042...
3,PAKISTAN PROPERTY COUNCIL (0332-2638077),https://www.facebook.com/profile.php?id=100073...
4,PAKISTAN PROPERTY COUNCIL (0332-2638077),https://www.facebook.com/profile.php?id=100055...
5,PAKISTAN PROPERTY COUNCIL (0332-2638077),https://www.facebook.com/profile.php?id=100027...
6,PAKISTAN PROPERTY COUNCIL (0332-2638077),https://www.facebook.com/profile.php?id=100041...
7,PAKISTAN PROPERTY COUNCIL (0332-2638077),https://www.facebook.com/profile.php?id=100082...
8,PAKISTAN PROPERTY COUNCIL (0332-2638077),https://www.facebook.com/profile.php?id=100083...
9,PAKISTAN PROPERTY COUNCIL (0332-2638077),https://www.facebook.com/profile.php?id=100039...
