In [29]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse

chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

service = Service()
driver = webdriver.Chrome(service=service, options=chrome_options)

def login_and_scrape(root_url, username, password):
    visited_urls = set()
    urls_to_scrape = set([root_url])
    scraped_texts = []

    driver.get(root_url)

    # Example login mechanism
    # WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.NAME, 'username'))).send_keys(username)
    # driver.find_element(By.NAME, 'password').send_keys(password)
    # driver.find_element(By.NAME, 'password').send_keys(Keys.RETURN)
    
    while urls_to_scrape:
        url = urls_to_scrape.pop()
        normalized_url = normalize_url(url, root_url)
        if normalized_url in visited_urls:
            continue

        print("processing url: " + url)
        driver.get(url)
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        text_content = ' '.join([p.get_text() for p in soup.find_all('p')])
        text_content = ' '.join(text_content.split())
    
        scraped_texts.append(clean_text(text_content))
        visited_urls.add(normalized_url)

        # Find and filter links, ensuring only URLs within the root domain are added
        links = [normalize_url(urljoin(url, a['href']), root_url) for a in soup.find_all('a', href=True)]
        valid_links = [link for link in links if link.startswith(root_url) and '?' not in link and '#' not in link]
        urls_to_scrape.update(valid_links)

    return scraped_texts

def normalize_url(url, root_url):
    # Normalize the URL by joining with the root URL
    return urljoin(root_url, url)

def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Remove excessive whitespace
    return re.sub(r'[^a-zA-Z0-9,.!? ]+', '', text).strip()  # Remove special characters

scraped_data = login_and_scrape("https://en.wikipedia.org/wiki/Cognitive_bias", "hissain", "1234")
#print(scraped_data[:100])

#driver.quit()


processing url: https://en.wikipedia.org/wiki/Cognitive_bias
processing url: https://en.wikipedia.org/wiki/Cognitive_bias_in_animals
processing url: https://en.wikipedia.org/wiki/Cognitive_bias_modification
processing url: https://en.wikipedia.org/wiki/Cognitive_bias_mitigation


In [31]:
len(scraped_data[0])

14284

In [30]:
scraped_data[0]

'The Bangla Wikivoyage Article Contest 2024 is now underway. Participate for a chance to win prizes! A cognitive bias is a systematic pattern of deviation from norm or rationality in judgment.1 Individuals create their own subjective reality from their perception of the input. An individuals construction of reality, not the objective input, may dictate their behavior in the world. Thus, cognitive biases may sometimes lead to perceptual distortion, inaccurate judgment, illogical interpretation, and irrationality.234 While cognitive biases may initially appear to be negative, some are adaptive. They may lead to more effective actions in a given context.5 Furthermore, allowing cognitive biases enables faster decisions which can be desirable when timeliness is more valuable than accuracy, as illustrated in heuristics.6 Other cognitive biases are a byproduct of human processing limitations,1 resulting from a lack of appropriate mental mechanisms bounded rationality, the impact of an individ