# Accessing Web Data with Data Scraping

In [1]:
# Importing libraries
import pandas as pd
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import matplotlib.pyplot as plt
import os
import logging
import requests
import bs4

In [2]:
# Build the Chrome option set used when launching the browser.
chrome_options = Options()

# "--headless" keeps the GUI off; "--no-sandbox" is passed through to Chrome
# unchanged. The flags are added in the same order as before.
for flag in ("--headless", "--no-sandbox"):
    chrome_options.add_argument(flag)

In [3]:
# Location of the ChromeDriver executable on this machine. The raw string
# keeps the Windows backslashes literal.
driver_path = r'C:\Program Files\Google\Chrome\chromedriver.exe'

# Service object that tells Selenium which driver binary to launch.
service = Service(driver_path)

In [4]:
# Initialize WebDriver with the Service and Options objects built above.
# The `executable_path` keyword of webdriver.Chrome was deprecated in
# Selenium 4 and later removed; the driver location is now supplied through
# the Service object. Passing `options` is what actually applies the
# headless / no-sandbox flags, which were previously configured but unused.
driver = webdriver.Chrome(service=service, options=chrome_options)

# Scraping "Key Events of the 20th Century" wiki page with Requests and BeautifulSoup

In [5]:
# Import BeautifulSoup
from bs4 import BeautifulSoup

In [6]:
# Get web page contents.
# A timeout keeps the notebook from hanging forever if the server does not
# respond (requests has no timeout by default).
page = requests.get(
    "https://en.wikipedia.org/wiki/Key_events_of_the_20th_century",
    timeout=30,
)

# Stop here on an HTTP error (4xx/5xx) instead of parsing an error page as if
# it were the article.
page.raise_for_status()

In [7]:
# Create soup and get title
soup = BeautifulSoup(page.text, 'html.parser')

# Remove every <header> and <footer> element so that only the article body
# is left. Each tag name is searched separately, after the previous removals.
for tag_name in ('header', 'footer'):
    for element in soup.find_all(tag_name):
        element.decompose()  # removes the element (and its children) from the soup

# Extract main content: the first <div> carrying the 'mw-body-content' class
main_content = soup.find('div', class_='mw-body-content')

# Show the page title as a quick check that the expected page was fetched
print(soup.title)

# Fail here with a clear message if the container is missing. Previously
# `text` was silently left undefined in that case and the following cells
# failed later with an unrelated-looking AttributeError / NameError.
if main_content is None:
    raise ValueError(
        "Could not find a <div class='mw-body-content'> element in the page"
    )

# Clean the extracted text: text fragments are stripped of surrounding
# whitespace and joined with single spaces
text = main_content.get_text(separator=' ', strip=True)

<title>Key events of the 20th century - Wikipedia</title>


In [9]:
# Keep the main content's text (all text nodes concatenated, no separator and
# no stripping) in its own variable for saving.
# NOTE(review): this does not reuse the whitespace-cleaned `text` built in
# the previous cell - confirm that the raw text is what should be saved.
text_clean = main_content.text

In [10]:
# Convert the text to UTF-8 bytes so it can be written in binary mode.
text_clean = bytes(text_clean, encoding='utf-8')

In [11]:
# Write the UTF-8 bytes to a file in the current working folder. Binary mode
# is used because `text_clean` is already encoded.
with open('20th_Century_Wiki_refined.txt', mode='wb') as out_file:
    out_file.write(text_clean)