## Using the `requests` library

In [16]:
import requests
from tqdm.notebook import tqdm
import pickle

def fetch_html_content(urls):
    """Download each URL with ``requests`` and collect the page bodies.

    Args:
        urls: Iterable of URL strings to fetch.

    Returns:
        A list of ``(url, html)`` tuples in input order. ``html`` is the
        response text when the status code is 200, and ``None`` when the
        status is anything else or a ``requests.RequestException`` is raised.
    """
    pages = []
    for url in tqdm(urls, desc="Scraping HTML content"):
        try:
            reply = requests.get(url, timeout=10)
            # Only a 200 response counts as a usable page.
            body = reply.text if reply.status_code == 200 else None
        except requests.RequestException:
            # Request-level failures are recorded as a missing page.
            body = None
        pages.append((url, body))
    return pages

# Wikipedia "list of wars" pages to download.
URLs = [
    "https://en.wikipedia.org/wiki/List_of_wars_by_death_toll",
    "https://en.wikipedia.org/wiki/List_of_wars:_1990%E2%80%932002",
    "https://en.wikipedia.org/wiki/List_of_wars:_1945%E2%80%931989",
    "https://en.wikipedia.org/wiki/List_of_wars:_1900%E2%80%931944",
    "https://en.wikipedia.org/wiki/List_of_wars:_2003%E2%80%93present",
    "https://en.wikipedia.org/wiki/List_of_wars:_1800%E2%80%931899",
    "https://en.wikipedia.org/wiki/List_of_wars:_1500%E2%80%931799",
    "https://en.wikipedia.org/wiki/List_of_wars:_1000%E2%80%931499",
    "https://en.wikipedia.org/wiki/List_of_wars:_before_1000",
]

html_contents = fetch_html_content(URLs)

# Save the list of (url, html-or-None) tuples to a pickle file.
with open("html_contents.pkl", "wb") as f:
    pickle.dump(html_contents, f)

# Count the list returned above; the scraper's internal accumulator is a
# function-local name and is not defined at cell scope.
print(f"Done scraping {len(html_contents)} URLs")

# Load the list back from the pickle file written just above.
# NOTE: pickle.load executes arbitrary code from the file; only load files
# this notebook produced itself.
with open("html_contents.pkl", "rb") as f:
    loaded_html_contents = pickle.load(f)

print(f"Loaded {len(loaded_html_contents)} URLs from pickle file")


Scraping HTML content:   0%|          | 0/9 [00:00<?, ?it/s]

Done scraping 9 URLs
Loaded 9 URLs from pickle file


In [19]:
# List the URL half of every (url, html) pair, one per line.
print("\n".join(url for url, _ in loaded_html_contents))

https://en.wikipedia.org/wiki/List_of_wars_by_death_toll
https://en.wikipedia.org/wiki/List_of_wars:_1990%E2%80%932002
https://en.wikipedia.org/wiki/List_of_wars:_1945%E2%80%931989
https://en.wikipedia.org/wiki/List_of_wars:_1900%E2%80%931944
https://en.wikipedia.org/wiki/List_of_wars:_2003%E2%80%93present
https://en.wikipedia.org/wiki/List_of_wars:_1800%E2%80%931899
https://en.wikipedia.org/wiki/List_of_wars:_1500%E2%80%931799
https://en.wikipedia.org/wiki/List_of_wars:_1000%E2%80%931499
https://en.wikipedia.org/wiki/List_of_wars:_before_1000


## Using Selenium with ChromeDriver

In [22]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from tqdm.notebook import tqdm
import pickle
import time

def init_driver():
    """Create a Chrome WebDriver configured with the scraping flags.

    Returns:
        A ``webdriver.Chrome`` instance started with the ``--headless``,
        ``--no-sandbox`` and ``--disable-dev-shm-usage`` arguments and a
        default-constructed ``Service`` (no explicit driver path is given).
    """
    chrome_options = Options()
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        chrome_options.add_argument(flag)
    return webdriver.Chrome(service=Service(), options=chrome_options)

def fetch_html_content_with_selenium(urls):
    """Load each URL in a Chrome WebDriver and collect the page source.

    A single driver from ``init_driver()`` is reused for every URL and is
    always shut down before returning or propagating an error.

    Args:
        urls: Iterable of URL strings to visit.

    Returns:
        A list of ``(url, html)`` tuples in input order. ``html`` is
        ``driver.page_source`` after the page was requested, or ``None`` when
        loading that URL raised an ``Exception`` (the error is printed).
    """
    html_content_list = []
    driver = init_driver()

    # try/finally guarantees the browser process is released even when the
    # loop is aborted by something the inner handler does not catch, such as
    # a KeyboardInterrupt from interrupting the cell.
    try:
        for url in tqdm(urls, desc="Scraping HTML content with Selenium"):
            try:
                driver.get(url)
                time.sleep(1)  # Optional: wait for page to load fully
                html_content = driver.page_source
                html_content_list.append((url, html_content))
            except Exception as e:
                # Best effort: report the failure and keep going with the rest.
                print(f"Failed to retrieve {url}: {e}")
                html_content_list.append((url, None))
    finally:
        driver.quit()

    return html_content_list

# Wikipedia "list of wars" pages to load through the Selenium driver.
URLs = [
    "https://en.wikipedia.org/wiki/List_of_wars_by_death_toll",
    "https://en.wikipedia.org/wiki/List_of_wars:_1990%E2%80%932002",
    "https://en.wikipedia.org/wiki/List_of_wars:_1945%E2%80%931989",
    "https://en.wikipedia.org/wiki/List_of_wars:_1900%E2%80%931944",
    "https://en.wikipedia.org/wiki/List_of_wars:_2003%E2%80%93present",
    "https://en.wikipedia.org/wiki/List_of_wars:_1800%E2%80%931899",
    "https://en.wikipedia.org/wiki/List_of_wars:_1500%E2%80%931799",
    "https://en.wikipedia.org/wiki/List_of_wars:_1000%E2%80%931499",
    "https://en.wikipedia.org/wiki/List_of_wars:_before_1000",
]

# List of (url, page_source-or-None) tuples, one per entry in URLs.
html_contents = fetch_html_content_with_selenium(URLs)

# Opened in "wb" mode, so any existing html_contents.pkl is replaced.
with open("html_contents.pkl", "wb") as f:
    pickle.dump(html_contents, f)

# Counts every entry, including URLs whose content is None.
print(f"Done scraping {len(html_contents)} URLs")

# Read the list back from the file written just above.
with open("html_contents.pkl", "rb") as f:
    loaded_html_contents = pickle.load(f)

print(f"Loaded {len(loaded_html_contents)} URLs from pickle file")


Scraping HTML content with Selenium:   0%|          | 0/9 [00:00<?, ?it/s]

Done scraping 9 URLs
Loaded 9 URLs from pickle file


In [23]:
# Print the URL of each reloaded (url, html) pair on its own line.
print("\n".join(url for url, _ in loaded_html_contents))

https://en.wikipedia.org/wiki/List_of_wars_by_death_toll
https://en.wikipedia.org/wiki/List_of_wars:_1990%E2%80%932002
https://en.wikipedia.org/wiki/List_of_wars:_1945%E2%80%931989
https://en.wikipedia.org/wiki/List_of_wars:_1900%E2%80%931944
https://en.wikipedia.org/wiki/List_of_wars:_2003%E2%80%93present
https://en.wikipedia.org/wiki/List_of_wars:_1800%E2%80%931899
https://en.wikipedia.org/wiki/List_of_wars:_1500%E2%80%931799
https://en.wikipedia.org/wiki/List_of_wars:_1000%E2%80%931499
https://en.wikipedia.org/wiki/List_of_wars:_before_1000
