## Web scraping
1. grab raw HTML page from the web
2. identify the tables that contain the data
3. process each table to extract the data

### Libraries for fetching and parsing HTML
- requests
- bs4
- lxml
- Scrapy
- gazpacho

In [1]:
# install `gazpacho` lib
%pip install gazpacho --upgrade

Collecting gazpacho
  Downloading gazpacho-1.1.tar.gz (7.9 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: gazpacho
  Building wheel for gazpacho (pyproject.toml): started
  Building wheel for gazpacho (pyproject.toml): finished with status 'done'
  Created wheel for gazpacho: filename=gazpacho-1.1-py3-none-any.whl size=7487 sha256=40fb13a52d4608c81d3925cb4989438a730730f69dadd50dfa86c95d9ebc4610
  Stored in directory: c:\users\trucl\appdata\local\pip\cache\wheels\f4\e6\e3\d9a9b3f1dc045ee40c507cd630daa6cdd41b239633e92d5457
Successfully built gazpacho
Installing collected packages: gazpacho
Successfully installed gazpacho-1.1
Note: you may need to restart the kernel to use updated packages.

### grab raw HTML page from the web

In [2]:
# web address of the Wikipedia page that lists the swimming world records;
# the raw HTML is downloaded from this address
URL = "https://en.wikipedia.org/wiki/List_of_world_records_in_swimming"

In [3]:
# use `gazpacho` to retrieve raw HTML from URL web address
import gazpacho

# `get` downloads the page at URL; the result is used as one (large)
# Python string by the cells that follow
html = gazpacho.get(URL)

In [4]:
# number of characters in the downloaded page
len(html)

553435

#### Start-Stop-Step

In [7]:
# sample string used to demonstrate indexing and slicing
fav = "Life, the Universe and Everything."

In [8]:
# a single index in square brackets picks one character; counting starts at 0
fav[0], fav[1]

('L', 'i')

In [9]:
# negative indexes count back from the end: -1 is the last character
fav[-1], fav[-2]

('.', 'g')

In [10]:
# [START:STOP] slices from index START up to, but not including, index STOP
fav[0:4]

'Life'

In [11]:
# the characters at indexes 10 through 17
fav[10:18]

'Universe'

In [12]:
# indexes 23 through 32: stops just short of the trailing full stop
fav[23:33]

'Everything'

In [13]:
# STOP may be negative too: -1 ends the slice just before the last character
fav[23:-1]

'Everything'

In [14]:
# leaving out START begins the slice at the start of the string
fav[:4]

'Life'

In [15]:
# leaving out STOP runs the slice through to the end of the string
fav[23:]

'Everything.'

In [17]:
# a negative START counts from the end: the last 11 characters
fav[-11:]

'Everything.'

In [18]:
# the last 11 characters, minus the final one
fav[-11:-1]

'Everything'

In [19]:
# second sample string, used to demonstrate the STEP part of a slice
msg = "My name is James, and I like you much."

In [21]:
# [START:STOP:STEP] with START and STOP left out and STEP set to 2:
# take every second character, beginning with the first
msg[::2]

'M aei ae,adIlk o uh'

In [22]:
# STEP set to 3: take every third character, beginning with the first
msg[::3]

'Mnesasa leomh'

In [23]:
# a STEP of -1 walks the string backwards, producing a reversed copy
msg[::-1]

'.hcum uoy ekil I dna ,semaJ si eman yM'

In [24]:
# slice the first 200 characters from the `html` string
# (START left out, so the slice begins at the start of the page)
html[:200]

'<!DOCTYPE html>\n<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-'

In [25]:
# slice the last 200 characters from the `html` string
# (negative START, STOP left out, so the slice runs to the end of the page)
html[-200:]

'2:32:43Z","image":"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/a\\/ae\\/Caeleb_Dressel_before_winning_100_fly_%2842769914221%29.jpg","headline":"Wikimedia list article"}</script>\n</body>\n</html>'

In [26]:
# show the first 500 characters of the first HTML table in the `html` string:
# `str.find` gives the index where the first "<table" begins
# (it returns -1 when the text does not occur at all)
from_where = html.find("<table")
to_where = from_where + 500
html[from_where:to_where]

'<table class="wikitable sortable" style="font-size: 95%;">\n<tbody><tr>\n<th>Event\n</th>\n<th style="width:4em" class="unsortable">Time\n</th>\n<th class="unsortable">\n</th>\n<th>Name</th>\n<th>Nationality</th>\n<th>Date</th>\n<th>Meet</th>\n<th>Location\n</th>\n<th style="width:2em" class="unsortable">Ref\n</th></tr>\n\n<tr>\n<td><span data-sort-value="01 &#160;!"> <a href="/wiki/World_record_progression_50_metres_freestyle" title="World record progression 50 metres freestyle">50m freestyle</a> </span>\n</td>\n<'

### identify the tables that contain the data

In [27]:
# passing the raw HTML string into the `Soup` constructor creates a parsed
# representation of the HTML that can be searched by tag name
soup = gazpacho.Soup(html)

In [28]:
# confirm what kind of object the constructor handed back
type(soup)

gazpacho.soup.Soup

In [29]:
# list the names (attributes and methods) available on the Soup object
print(dir(soup))

['attrs', 'find', 'get', 'html', 'strip', 'tag', 'text']


In [30]:
# search for tags of interest
# `find`: locate + extract the HTML elements that match the given tag name
tables = soup.find("table")

# what `find` returns (when no `mode` is given) depends on the number of matches:
# - the HTML tag is not found: `None`
# - a single tag is found: a single Soup object
# - more than one tag is found: a list of Soup objects

In [31]:
# a list here means that more than one <table> tag was found
type(tables)

list

In [32]:
# how many <table> tags the page contains
len(tables)

12

In [33]:
# pass the `mode="all"` argument to `find()` to always get a list back,
# whatever the number of matches:
# - no match: an empty list
# - one match: a one-slot list
# - more than one match: a list of Soup objects
tables = soup.find("table", mode="all")

In [34]:
# grab all rows from the first table (slot #0)
# the "tr" value refers to HTML's table row tag, <tr>
# `mode="all"` guarantees that a list is returned
rows = tables[0].find("tr", mode="all")

In [35]:
# with `mode="all"` the result is always a list
type(rows)

list

In [36]:
# number of <tr> rows in the first table (the header row is included)
len(rows)

22

In [37]:
# the first row holds the column headings (<th> tags), not data
rows[0]

<tr>
  <th>Event
</th>
  <th style="width:4em" class="unsortable">Time
</th>
  <th class="unsortable">
</th>
  <th>Name</th>
  <th>Nationality</th>
  <th>Date</th>
  <th>Meet</th>
  <th>Location
</th>
  <th style="width:2em" class="unsortable">Ref
</th>
</tr>

In [39]:
# survey every table found in the HTML soup:
# `enumerate` supplies a running table number alongside each table;
# for each one, count its <tr> rows and display "table number -> row count"

for table_no, tbl in enumerate(tables):
    row_count = len(tbl.find("tr", mode="all"))
    print(f"{table_no} -> {row_count} rows")

0 -> 22 rows
1 -> 1 rows
2 -> 21 rows
3 -> 3 rows
4 -> 27 rows
5 -> 26 rows
6 -> 3 rows
7 -> 25 rows
8 -> 1 rows
9 -> 7 rows
10 -> 5 rows
11 -> 8 rows


In [40]:
for table_no, tbl in enumerate(tables):
    table_rows = tbl.find("tr", mode="all")
    # count the <td> cells in the table's last row
    # (only <td> tags are counted, so <th> cells do not contribute)
    last_row_cells = table_rows[-1].find("td", mode="all")
    print(f"{table_no} -> {len(table_rows)} rows, {len(last_row_cells)} columns")

0 -> 22 rows, 9 columns
1 -> 1 rows, 2 columns
2 -> 21 rows, 9 columns
3 -> 3 rows, 9 columns
4 -> 27 rows, 9 columns
5 -> 26 rows, 9 columns
6 -> 3 rows, 9 columns
7 -> 25 rows, 11 columns
8 -> 1 rows, 2 columns
9 -> 7 rows, 1 columns
10 -> 5 rows, 1 columns
11 -> 8 rows, 1 columns


### extract the data

In [41]:
# list the names available on the Soup object once more
print(dir(soup))

# the `text` attribute returns any textual data associated with an identified HTML tag

['attrs', 'find', 'get', 'html', 'strip', 'tag', 'text']


In [43]:
# work with the first table in the `tables` list
table = tables[0]

# every <tr> row of the table except the first, which holds the headings
data_rows = table.find("tr", mode="all")[1:]

for row in data_rows:
    # the <td> cells of the current row: slot 0 is the event, slot 1 the time
    cells = row.find("td", mode="all")
    event, time = cells[0].text, cells[1].text
    print(f"{event} -> {time}")

50m freestyle -> 20.91
100m freestyle -> 46.86
200m freestyle -> 1:42.00
400m freestyle -> 3:40.07
800m freestyle -> 7:32.12
1500m freestyle -> 14:31.02
50m backstroke -> 23.71
50m backstroke -> 23.55
100m backstroke -> 51.60
200m backstroke -> 1:51.92
50m breaststroke -> 25.95
100m breaststroke -> 56.88
200m breaststroke -> 2:05.48
50m butterfly -> 22.27
100m butterfly -> 49.45
200m butterfly -> 1:50.34
200m individual medley -> 1:54.00
400m individual medley -> 4:02.50
4 × 100 m freestyle relay -> 3:08.24
4 × 200 m freestyle relay -> 6:58.55
4 × 100 m medley relay -> 3:26.78


#### extract data from all tables

In [45]:
# slot numbers, within the `tables` list, of the tables that hold the records
# NOTE(review): these positions depend on the current layout of the web page;
# re-check them if the page changes
RECORDS = (0, 2, 4, 5)

# one label per entry in RECORDS, in the same order
# LC = Long Course (i.e., 50m pool)
# SC = Short Course (i.e., 25m pool)
COURSES = ("LC Men", "LC Women", "SC Men", "SC Women")

In [46]:
# `zip()`: associate the slot numbers with the course names, pairing them up by position
# (wrapped in `list()` so that the pairs are displayed)
list(zip(RECORDS, COURSES))

[(0, 'LC Men'), (2, 'LC Women'), (4, 'SC Men'), (5, 'SC Women')]

In [47]:
# walk the slot numbers and the course names in lock-step;
# the loop variable is called `table_index` because RECORDS holds integer
# positions within `tables`, not table objects, and reusing the name `table`
# would overwrite the Soup object stored under that name
for table_index, course in zip(RECORDS, COURSES):
    print(f"{course}:")

LC Men:
LC Women:
SC Men:
SC Women:


In [48]:
# slot numbers (within `tables`) of the four record tables, and the label
# to display for each one, in matching order
RECORDS = (0, 2, 4, 5)
COURSES = ("LC Men", "LC Women", "SC Men", "SC Women")

for table_index, course in zip(RECORDS, COURSES):
    print(f"{course}:")
    # skip the first row of each table (header information)
    for row in tables[table_index].find("tr", mode="all")[1:]:
        columns = row.find("td", mode="all")
        # a row needs both an event cell and a time cell to be reported;
        # skip anything shorter instead of failing with an IndexError
        if len(columns) < 2:
            continue
        event = columns[0].text
        record_time = columns[1].text
        print(f"\t{event} -> {record_time}")
    print()

LC Men:
	50m freestyle -> 20.91
	100m freestyle -> 46.86
	200m freestyle -> 1:42.00
	400m freestyle -> 3:40.07
	800m freestyle -> 7:32.12
	1500m freestyle -> 14:31.02
	50m backstroke -> 23.71
	50m backstroke -> 23.55
	100m backstroke -> 51.60
	200m backstroke -> 1:51.92
	50m breaststroke -> 25.95
	100m breaststroke -> 56.88
	200m breaststroke -> 2:05.48
	50m butterfly -> 22.27
	100m butterfly -> 49.45
	200m butterfly -> 1:50.34
	200m individual medley -> 1:54.00
	400m individual medley -> 4:02.50
	4 × 100 m freestyle relay -> 3:08.24
	4 × 200 m freestyle relay -> 6:58.55
	4 × 100 m medley relay -> 3:26.78

LC Women:
	50m freestyle -> 23.61
	100m freestyle -> 51.71
	200m freestyle -> 1:52.85
	400m freestyle -> 3:55.38
	800m freestyle -> 8:04.79
	1500m freestyle -> 15:20.48
	50m backstroke -> 26.98
	100m backstroke -> 57.45
	200m backstroke -> 2:03.14
	50m breaststroke -> 29.16
	100m breaststroke -> 1:04.13
	200m breaststroke -> 2:17.55
	50m butterfly -> 24.43
	100m butterfly -> 55.48
	2