# Web Scraping 101

In [None]:
import requests
from bs4 import BeautifulSoup
from collections import defaultdict

## Step 1: Grab content from the web

In [None]:
#!pip install beautifulsoup4
#!pip install html5lib

In [None]:
# Address of the Wikipedia article that is downloaded and parsed below.
URL = "https://en.wikipedia.org/wiki/List_of_sovereign_states"

In [None]:
# Download the page. requests applies no timeout by default, so pass one
# explicitly to keep this cell from hanging forever if the server stalls.
resp = requests.get(URL, timeout=10)

# Only a plain 200 response is parsed; anything else stops the notebook here
# so that later cells never run without a freshly built `soup`.
if resp.status_code != 200:
    raise RuntimeError(
        'Request unsuccessful: HTTP {} for {}'.format(resp.status_code, URL)
    )

# Parse the HTML with Python's built-in parser (no extra dependency needed).
soup = BeautifulSoup(resp.text, 'html.parser')

In [95]:
# Quick sanity check that parsing worked: display the document's <title> tag.
soup.title

<title>List of sovereign states - Wikipedia</title>

In [96]:
# Navigate upward in the parse tree: .parent is the element enclosing <title>.
soup.title.parent

<head>
<meta charset="utf-8"/>
<title>List of sovereign states - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available";var cookie=document.cookie.match(/(?:^|; )enwikimwclientpreferences=([^;]+)/);if(cookie){cookie[1].split('%2C').forEach(function(pref){className=className.replace(new RegExp('(^| )'+pref.replace(/-clientpref-\w+$|[^\w-]+/g,'')+'-clientpref-\\w+( |$)'),'$1'+pref+'$2');});}document.documentElement.className=className;}());RLCONF={"wgBreakFrames":false,"wgSeparatorTr

## Step 2: Extract Table

We are looking for the *second* table on the page: the first one is the legend that explains the color coding, which is also formatted as a table.

In [None]:
# Position of the data table among the page's <table> elements: index 0 is the
# color-coding legend, so the table we want sits at index 1.
TARGET_TABLE_INDEX = 1

tables = soup.find_all('table')

# Fail with a readable message if the page layout changed and the expected
# table is missing, instead of a bare "list index out of range".
if len(tables) <= TARGET_TABLE_INDEX:
    raise IndexError(
        'Expected at least {} tables on the page, found {}'.format(
            TARGET_TABLE_INDEX + 1, len(tables)
        )
    )
target_table = tables[TARGET_TABLE_INDEX]

# All <th> cells and all <tr> rows found anywhere inside the target table.
headers = target_table.find_all('th')
rows = target_table.find_all('tr')

In [None]:
# One label per header cell: take the text of each direct child node of the
# cell and join those pieces with a single space.
header_text = [
    " ".join(child.text for child in cell.contents)
    for cell in headers
]

In [None]:
# Build one list of cleaned cell strings per table row. rows[0] is skipped and
# the slice stops before index 50, so at most 49 rows are collected.
# Cleaning: strip leading/trailing newlines and delete non-breaking spaces.
countries_data = [
    [cell.text.strip('\n').replace('\xa0', '') for cell in table_row.find_all('td')]
    for table_row in rows[1:50]
]

## Step 3: Find Elements with Selectors

Use the `class` attribute to select all the flag images in the table.

In [None]:
# CSS class name used to pick out <img> elements when searching the table.
IMG_CLASS = "mw-file-element"

In [None]:
# Collect every <img> inside the target table that carries the IMG_CLASS class.
flags = target_table.find_all('img', attrs={'class': IMG_CLASS})

In [None]:
# Keep the `src` value of each flag image; images without a `src` are skipped.
flag_urls = [img['src'] for img in flags if img.has_attr('src')]