### Wikipedia scraper


First, import the modules and packages we will need to scrape a website.


In [1]:
import csv 
import time

from bs4 import BeautifulSoup
import requests


### open the web site with requests

In [2]:
url = "https://en.wikipedia.org/wiki/Category:Women_computer_scientists"
# a timeout keeps this cell from hanging forever if the server never answers
page = requests.get(url, timeout=30)
# stop right here on an error status (404, 429, ...) instead of parsing an error page
page.raise_for_status()
# the raw bytes of the HTML document
page_content = page.content

### parse the page through the BeautifulSoup library

In [3]:
# turn the downloaded HTML into a searchable tree, using Python's built-in parser
soup = BeautifulSoup(page_content, features="html.parser")
# grab the first <div> on the page whose class is "mw-category"
all_groupings = soup.find("div", attrs={"class": "mw-category"})
print(all_groupings)

<div class="mw-category"><div class="mw-category-group"><h3> </h3>
<ul><li><div class="CategoryTreeSection"><div class="CategoryTreeItem"><span class="CategoryTreeBullet"><span class="CategoryTreeToggle" data-ct-state="collapsed" data-ct-title="Women_computer_scientists_by_nationality"></span> </span> <a href="/wiki/Category:Women_computer_scientists_by_nationality" title="Category:Women computer scientists by nationality">Women computer scientists by nationality</a>‎ <span dir="ltr" title="Contains 58 subcategories, 0 pages, and 0 files">(58 C)</span></div><div class="CategoryTreeChildren" style="display:none"></div></div></li></ul></div><div class="mw-category-group"><h3>B</h3>
<ul><li><div class="CategoryTreeSection"><div class="CategoryTreeItem"><span class="CategoryTreeEmptyBullet"></span> <a href="/wiki/Category:Women_bioinformaticians" title="Category:Women bioinformaticians">Women bioinformaticians</a>‎ <span dir="ltr" title="Contains 0 subcategories, 9 pages, and 0 files">(9 P

In [4]:
# how many direct child nodes does the "mw-category" div have?
len(all_groupings.contents)

3

In [5]:
# look at every direct child of the "mw-category" div, one at a time
for grouping in all_groupings.children:
    print(grouping)


<div class="mw-category-group"><h3> </h3>
<ul><li><div class="CategoryTreeSection"><div class="CategoryTreeItem"><span class="CategoryTreeBullet"><span class="CategoryTreeToggle" data-ct-state="collapsed" data-ct-title="Women_computer_scientists_by_nationality"></span> </span> <a href="/wiki/Category:Women_computer_scientists_by_nationality" title="Category:Women computer scientists by nationality">Women computer scientists by nationality</a>‎ <span dir="ltr" title="Contains 58 subcategories, 0 pages, and 0 files">(58 C)</span></div><div class="CategoryTreeChildren" style="display:none"></div></div></li></ul></div>
<div class="mw-category-group"><h3>B</h3>
<ul><li><div class="CategoryTreeSection"><div class="CategoryTreeItem"><span class="CategoryTreeEmptyBullet"></span> <a href="/wiki/Category:Women_bioinformaticians" title="Category:Women bioinformaticians">Women bioinformaticians</a>‎ <span dir="ltr" title="Contains 0 subcategories, 9 pages, and 0 files">(9 P)</span></div><div class

### isolate each grouping's letter and list of names (tried out on the first grouping only)

In [6]:
for grouping in all_groupings.children:
    # the <ul> inside the grouping holds the listed entries
    names_list = grouping.find("ul")
    # the <h3> heading of the grouping (a letter, or blank)
    category = grouping.find("h3").get_text()
    # every <li> inside that list
    alphabetical_names = names_list.find_all("li")
    print(grouping)
    # one grouping is enough to check that the pieces can be found
    break

<div class="mw-category-group"><h3> </h3>
<ul><li><div class="CategoryTreeSection"><div class="CategoryTreeItem"><span class="CategoryTreeBullet"><span class="CategoryTreeToggle" data-ct-state="collapsed" data-ct-title="Women_computer_scientists_by_nationality"></span> </span> <a href="/wiki/Category:Women_computer_scientists_by_nationality" title="Category:Women computer scientists by nationality">Women computer scientists by nationality</a>‎ <span dir="ltr" title="Contains 58 subcategories, 0 pages, and 0 files">(58 C)</span></div><div class="CategoryTreeChildren" style="display:none"></div></div></li></ul></div>


### loop through the groupings and store each entry as a dictionary in a list

In [7]:
# make an empty list for your data; each entry becomes one row of the csv
rows = []
# loop through each per-letter grouping.
# Looping over `all_groupings` directly would walk through *every* child node,
# including plain text (whitespace) between the <div>s, and text nodes cannot be
# searched with .find('ul') -- so ask explicitly for the grouping <div>s.
for grouping in all_groupings.find_all('div', class_='mw-category-group'):
    heading = grouping.find('h3')
    names_list = grouping.find('ul')
    # skip a grouping that has no heading or no list instead of crashing on None
    if heading is None or names_list is None:
        continue
    # get the letter
    letter_name = heading.get_text()
    for item in names_list.find_all('li'):
        # get the link; skip list items that do not contain one
        anchortag = item.find('a', href=True)
        if anchortag is None:
            continue
        # the href on the page is site-relative, so prefix the domain
        link = "https://en.wikipedia.org" + anchortag['href']
        # get the name (all the text inside the list item)
        name = item.text
        # make a data dictionary that will be written into the csv
        rows.append({'name': name,
                     'link': link,
                     'letter_name': letter_name})

### write each row into a spreadsheet (csv file)

In [8]:
# make a new csv into which we will write all the rows
# - newline='' is what the csv module asks for; without it, blank lines can
#   appear between rows on Windows
# - encoding='utf-8' makes sure non-English characters in the scraped text are
#   written the same way on every computer
with open('../output/all-women-computer-scientists.csv', 'w', newline='', encoding='utf-8') as csvfile:
    # these are the header names:
    fieldnames = ['name', 'link', 'letter_name']
    # this creates your csv
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    # this writes in the first row, which are the headers
    writer.writeheader()

    # this writes every row (the list you set at the beginning and have updated throughout)
    writer.writerows(rows)


## Alternative scraper — making parts of the script reusable and more responsible

- announce who you are using `requests` headers (polite scraping!)
- make variables that hold headers and links
- turn repetitive steps into functions
- make your scraper take some time between requests (mindful scraping without overloading the server!)

In [9]:
# Your identification, sent along with every request.
# Replace the "from" placeholder with your own name and e-mail address.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36(KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36;",
    "from": "Your name example@domain.com",
}

In [10]:
# the pages to scrape: the category page itself, and the same listing
# picked up again from a later entry (the "pagefrom" part of the address)
urls = [
    "https://en.wikipedia.org/wiki/Category:Women_computer_scientists",
    "https://en.wikipedia.org/w/index.php?title=Category:Women_computer_scientists&pagefrom=Lin%2C+Ming+C.%0AMing+C.+Lin#mw-pages",
]

# start with an empty list; every scraped entry gets appended to it
rows = list()


In [11]:
def scrape_content(url):
    """Scrape one Wikipedia category page and append its entries to ``rows``.

    Waits two seconds, downloads ``url`` using the notebook's ``headers``,
    and appends one dictionary per list item (keys ``name``, ``link`` and
    ``letter_name``) to the notebook-level ``rows`` list.

    Parameters
    ----------
    url : str
        Address of the category page to scrape.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (4xx / 5xx).

    Returns nothing; if the page has no category listing, ``rows`` is left
    untouched.
    """
    # pause before every request so we do not overload the server
    time.sleep(2)
    # add headers; the timeout keeps the notebook from hanging on a dead connection
    page = requests.get(url, headers=headers, timeout=30)
    # fail loudly on an error page instead of trying to parse it
    page.raise_for_status()
    # parse the page through the BeautifulSoup library
    soup = BeautifulSoup(page.content, "html.parser")

    # NOTE(review): the first "mw-category" div on this page is the list of
    # *subcategories*, not the list of people. The article listing is expected
    # to sit inside the element with id "mw-pages" (the anchor the second url
    # points at) -- confirm against the live page. If that section is missing,
    # fall back to searching the whole document as before.
    pages_section = soup.find(id="mw-pages")
    search_root = pages_section if pages_section is not None else soup
    content = search_root.find("div", class_="mw-category")
    if content is None:
        # nothing to scrape on this page
        return

    for grouping in content.find_all("div", class_="mw-category-group"):
        heading = grouping.find("h3")
        names_list = grouping.find("ul")
        # skip a grouping without heading or list instead of crashing on None
        if heading is None or names_list is None:
            continue
        # get the letter
        letter_name = heading.get_text()
        for item in names_list.find_all("li"):
            # get the link; skip list items that do not contain one
            anchortag = item.find("a", href=True)
            if anchortag is None:
                continue
            # hrefs on the page are site-relative; prefix the domain so the
            # links in the csv can be opened directly
            link = "https://en.wikipedia.org" + anchortag["href"]
            # make a data dictionary that will be written into the csv
            rows.append({"name": item.text,
                         "link": link,
                         "letter_name": letter_name})

In [12]:
# empty the list first, so running this cell a second time
# does not add every row to the csv twice
rows.clear()
# scrape each page in turn; every call adds its entries to `rows`
for url in urls:
    scrape_content(url)

In [13]:
# make a new csv into which we will write all the rows
# (newline='' is required by the csv module to avoid blank lines between rows
# on Windows; utf-8 keeps accented and non-Latin characters in names intact)
with open('../output/all-women-computer-scientists.csv', 'w', newline='', encoding='utf-8') as csvfile:
    # these are the header names:
    fieldnames = ['name', 'link', 'letter_name']
    # this creates your csv
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    # this writes in the first row, which are the headers
    writer.writeheader()

    # this writes every row (the list that the scraping function has filled) into your csv
    writer.writerows(rows)
