### Wikipedia scraper


First, import the modules and packages we will need to scrape a website.


In [4]:
import csv
import time

from bs4 import BeautifulSoup
import requests


### Open the website with `requests`

In [11]:
# Category page whose member list is scraped in the cells below.
url = "https://en.wikipedia.org/wiki/Category:Women_computer_scientists"
# A timeout keeps the cell from hanging forever if the server never answers.
page = requests.get(url, timeout=30)
# Fail here on a 4xx/5xx answer instead of parsing an error page further down.
page.raise_for_status()
# Raw response body (bytes), handed to BeautifulSoup in the next step.
page_content = page.content


### Parse the page with the BeautifulSoup library

In [16]:
# Build the parse tree from the downloaded bytes using Python's built-in HTML parser.
soup = BeautifulSoup(page_content, features="html.parser")

# First <div class="mw-category"> on the page; every group searched below lives inside it.
container = soup.find(name="div", class_="mw-category")

# One <div class="mw-category-group"> per heading/list pair inside that container.
letter_containers = container.find_all(name="div", class_="mw-category-group")

# Sanity check: how many groups were found on this page.
print(len(letter_containers))

11


In [34]:
# Collect one dict per <li> entry found in the groups parsed above.
rows = []
for group in letter_containers:
    # Text of the group's <h3>, stored with every entry of the group as "category".
    heading = group.find("h3").text
    rows.extend(
        {
            "scientist": item.text,
            "category": heading,
            # The href taken from the page is prefixed with the site root to form the link.
            "link": "https://en.wikipedia.org" + item.find("a", href=True)["href"],
        }
        for item in group.find_all("li")
    )



In [33]:
# Write the collected rows to a CSV file (the ../output directory must already exist).
# newline="" lets the csv module control line endings; without it, platforms that
# translate "\n" (e.g. Windows) end up with a blank line after every record.
# An explicit UTF-8 encoding keeps names with non-ASCII characters from failing on
# systems whose default encoding cannot represent them.
with open("../output/all-computer-scientists.csv", "w", newline="", encoding="utf-8") as csvfile:
    headers = ["scientist", "category", "link"]
    writer = csv.DictWriter(csvfile, fieldnames=headers)
    writer.writeheader()
    # Each dict in rows becomes one line, in list order.
    writer.writerows(rows)

### Isolate all groupings by letter and add them to a list

### Loop through the groupings and store each name as a dictionary

### Write each row to a CSV file

## Alternative scraper — making parts of the script reusable and more responsible

- announcing who you are via `requests` headers (polite scraping!)
- making variables that hold headers and links
- turning repetitive steps into functions
- making your scraper pause between requests (mindful scraping without overloading the server!)

In [42]:
# Module-level accumulator: every call to scrape_content adds its rows here.
rows = []


def scrape_content(url, headers=None, delay=2, timeout=30):
    """Scrape one category page and add its entries to the global ``rows``.

    The function sleeps for ``delay`` seconds, downloads ``url`` and walks
    every ``div.mw-category-group`` inside the first ``div.mw-category``.
    Each ``<li>`` found there becomes a dict with the keys ``"scientist"``
    (the item's text), ``"category"`` (the text of the group's ``<h3>``) and
    ``"link"`` (the ``href`` of the item's first link, prefixed with
    ``https://en.wikipedia.org``).

    Args:
        url: Address of the category page to download.
        headers: Optional dict of HTTP headers passed to ``requests.get``,
            for example a ``User-Agent`` that says who is scraping.
            ``None`` (the default) sends the library's default headers.
        delay: Seconds to sleep before the request so that consecutive
            calls do not hit the server back to back. Defaults to 2.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list of row dicts scraped from this page. The same dicts are
        also appended to the module-level ``rows`` list, and only after the
        whole page was parsed, so a failure never leaves a half-scraped page
        in ``rows``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        ValueError: If the page has no ``div`` with class ``mw-category``.
    """
    time.sleep(delay)
    page = requests.get(url, headers=headers, timeout=timeout)
    # Do not parse error pages: surface HTTP failures immediately.
    page.raise_for_status()

    soup = BeautifulSoup(page.content, "html.parser")
    container = soup.find("div", class_="mw-category")
    if container is None:
        # Without this guard the next line fails with an opaque AttributeError.
        raise ValueError("no 'mw-category' listing found at {}".format(url))

    page_rows = []
    for letter_container in container.find_all("div", class_="mw-category-group"):
        category = letter_container.find("h3").text
        for name in letter_container.find_all("li"):
            page_rows.append({
                "scientist": name.text,
                "category": category,
                "link": "https://en.wikipedia.org" + name.find("a", href=True)["href"],
            })

    rows.extend(page_rows)
    return page_rows


In [43]:
# Pages to scrape: the category's first page, then the page that continues the
# listing from "Keller, Mary Kenneth" (see the pagefrom query parameter).
urls = [
    "https://en.wikipedia.org/wiki/Category:Women_computer_scientists",
    "https://en.wikipedia.org/w/index.php?title=Category:Women_computer_scientists&pagefrom=Keller%2C+Mary+Kenneth%0AMary+Kenneth+Keller#mw-pages",
]

# scrape_content adds what it finds to the module-level rows list, so nothing
# needs to be collected from the call itself.
for url in urls:
    scrape_content(url)

In [44]:
# Dump every collected row dict to check what the scraping loop gathered.
print(rows)

[{'scientist': 'Karen Aardal', 'category': 'A', 'link': 'https://en.wikipedia.org/wiki/Karen_Aardal'}, {'scientist': 'Janet Abbate', 'category': 'A', 'link': 'https://en.wikipedia.org/wiki/Janet_Abbate'}, {'scientist': 'Maya Ackerman', 'category': 'A', 'link': 'https://en.wikipedia.org/wiki/Maya_Ackerman'}, {'scientist': 'Sibel Adalı', 'category': 'A', 'link': 'https://en.wikipedia.org/wiki/Sibel_Adal%C4%B1'}, {'scientist': 'Tülay Adalı', 'category': 'A', 'link': 'https://en.wikipedia.org/wiki/T%C3%BClay_Adal%C4%B1'}, {'scientist': 'Sarita Adve', 'category': 'A', 'link': 'https://en.wikipedia.org/wiki/Sarita_Adve'}, {'scientist': 'Frances E. Allen', 'category': 'A', 'link': 'https://en.wikipedia.org/wiki/Frances_E._Allen'}, {'scientist': 'Sarah Allen (software developer)', 'category': 'A', 'link': 'https://en.wikipedia.org/wiki/Sarah_Allen_(software_developer)'}, {'scientist': 'Nancy M. Amato', 'category': 'A', 'link': 'https://en.wikipedia.org/wiki/Nancy_M._Amato'}, {'scientist': 'Nin