### Web Scraping: Forms and Authentication


#### Part 1: Submitting Search Form Using Code

In [1]:
import os

import requests

In [2]:
# Endpoint that the search form is submitted to, assembled from its path segments
form_url = "/".join(("https://bulbapedia.bulbagarden.net", "w", "index.php"))

In [3]:
# Term to look up through the search form
search_query = "Pikachu"

# Query-string fields sent along with the search request, in the same order
# as before: the search term, the target special page, and the "Go" action.
form_data = dict(search=search_query, title="Special:Search", go="Go")

In [5]:
# Send the GET request to submit the form.
# requests has no default timeout, so without one a stalled server would hang
# this cell forever; give up after 10 seconds instead.
response = requests.get(form_url, params=form_data, timeout=10)

# Check if the request was successful
if response.status_code == 200:
    print("Search completed successfully!")
    # response.text holds the entire HTML of the result page and can be
    # printed or parsed further from here.
else:
    print(f"Failed to retrieve search results. Status code: {response.status_code}")

Search completed successfully!


#### Part 2: Handling Login Pages and Sessions

In [33]:
from bs4 import BeautifulSoup

In [None]:
# Start a session so cookies set by the login page are reused by later requests
session = requests.Session()

# Get the login page to extract hidden fields.
# The timeout stops the cell from hanging indefinitely if the server never
# responds (requests waits forever by default).
login_page_url = 'https://bulbapedia.bulbagarden.net/w/index.php?title=Special:UserLogin&returnto=MainPage'
login_page = session.get(login_page_url, timeout=10)

In [None]:
# Parse the login page
soup = BeautifulSoup(login_page.content, 'html.parser')

# Extract any hidden input fields, including wpEditToken, authAction, and force.
# Inputs without a name attribute are skipped: a browser would not submit them,
# and keeping them would put a None key into the data that is POSTed later.
hidden_inputs = soup.find_all("input", type="hidden")
form_data = {
    input_tag["name"]: input_tag.get("value", "")
    for input_tag in hidden_inputs
    if input_tag.get("name")
}

In [None]:
# Add username and password to form data.
# Credentials are read from the BULBAPEDIA_USERNAME / BULBAPEDIA_PASSWORD
# environment variables so real values never have to be typed into (and saved
# with) the notebook. When a variable is unset the original 'placeholder'
# value is used, so the cell still runs unchanged.
form_data['wpName1'] = os.environ.get('BULBAPEDIA_USERNAME', 'placeholder')
form_data['wpPassword1'] = os.environ.get('BULBAPEDIA_PASSWORD', 'placeholder')

In [None]:
# Send the login POST request. The timeout keeps a stalled server from
# hanging the cell (requests waits indefinitely by default).
login_url = 'https://bulbapedia.bulbagarden.net/w/index.php?title=Special:UserLogin&returnto=MainPage'
response = session.post(login_url, data=form_data, timeout=10)

# Check for login success using the redirect: the final URL must differ from
# the login page. The response must also carry a non-error status, otherwise
# a redirect to an error page would be reported as a successful login.
if response.ok and response.url != login_url:
    print("Login successful!")
else:
    print("Login failed.")

# response.text contains the returned HTML and can be printed to help
# diagnose login issues.

##### **Handling CSRF Tokens and Anti-Scraping Measures on Bulbapedia**

Based on the error message and response, it seems that Bulbapedia may be using additional anti-scraping techniques or expecting certain tokens or headers that are not being sent.

```
mw.user.tokens.set({"patrolToken":"...", "watchToken":"...", "csrfToken":"..."});
```
It looks like CSRF tokens are being *dynamically* generated and are required for a successful login.

The most reliable approach is to use **Selenium**: it drives a real browser, so JavaScript-generated tokens are handled automatically.


#### Part 3: Scraping Data Behind a Login

In [26]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time


In [29]:
# Address of the Bulbapedia login page
login_url = (
    'https://bulbapedia.bulbagarden.net/w/index.php'
    '?title=Special:UserLogin&returnto=MainPage'
)

# Start a Chrome browser through WebDriver and load the login page in it
driver = webdriver.Chrome()
driver.get(login_url)

In [30]:
# Explicit wait reused for every element lookup below (up to 20 seconds each)
wait = WebDriverWait(driver, 20)

try:
    # Wait until the fields are actually visible. Mere presence in the DOM is
    # not enough, because typing into a hidden element fails.
    username_field = wait.until(EC.visibility_of_element_located((By.ID, 'wpName1')))
    password_field = wait.until(EC.visibility_of_element_located((By.ID, 'wpPassword1')))

    # Fill in the username and password fields.
    # Credentials come from the BULBAPEDIA_USERNAME / BULBAPEDIA_PASSWORD
    # environment variables so real values are never saved in the notebook;
    # 'placeholder' is used when a variable is unset.
    username_field.send_keys(os.environ.get('BULBAPEDIA_USERNAME', 'placeholder'))
    password_field.send_keys(os.environ.get('BULBAPEDIA_PASSWORD', 'placeholder'))

    # Click the "Log in" button once it is visible and enabled
    login_button = wait.until(EC.element_to_be_clickable((By.ID, 'wpLoginAttempt')))
    login_button.click()

    # Wait for a few seconds to ensure the login completes
    time.sleep(5)

    # Check if login is successful by looking for the "Log out" text on the page
    if "Log out" in driver.page_source:
        print("Login successful!")
    else:
        print("Login failed. Please check your credentials.")

except Exception as e:
    # Covers wait timeouts and other WebDriver errors; report instead of crashing the cell
    print(f"An error occurred: {e}")

Login successful!
