# First webscraping task

In [38]:
# Load BeautifulSoup, and pandas to store in a dataframe
from bs4 import BeautifulSoup
import pandas as pd

Simple example of an HTML website

HTML tags (h1, h2, ..., a, b, img) and attributes (href, id, class)

In [8]:
example_html = """
<!DOCTYPE html>
<html>
<head>
<title>Sample web page</title>
</head>
<body>
<h1>h1 Header #1</h1>
<p>This is a paragraph tag</p>
<h2>h2 Sub-header</h2>
<p>A new paragraph, now in the <b>sub-header</b></p>
<h1>h1 Header #2</h1>
<p>This other paragraph has two  hyperlinks, one to <a href="https://carpentries.org/">The Carpentries homepage</a>, and another to the <a href="https://carpentries.org/past_workshops/">past workshops</a>.</p>
</body>
</html>
"""

Create soup object, and see tree structure

In [62]:
soup = BeautifulSoup(example_html, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   Sample web page
  </title>
 </head>
 <body>
  <h1>
   h1 Header #1
  </h1>
  <p>
   This is a paragraph tag
  </p>
  <h2>
   h2 Sub-header
  </h2>
  <p>
   A new paragraph, now in the
   <b>
    sub-header
   </b>
  </p>
  <h1>
   h1 Header #2
  </h1>
  <p>
   This other paragraph has two  hyperlinks, one to
   <a href="https://carpentries.org/">
    The Carpentries homepage
   </a>
   , and another to the
   <a href="https://carpentries.org/past_workshops/">
    past workshops
   </a>
   .
  </p>
 </body>
</html>



In [63]:
# Accessing different parts of the website
print("1.", soup.title)
print("2.", soup.title.string)
print("3.", soup.h1.string)
print("4.", soup.find_all('h1'))
print("5.", soup.find_all('a'))
print("6.", soup.get_text())

1. <title>Sample web page</title>
2. Sample web page
3. h1 Header #1
4. [<h1>h1 Header #1</h1>, <h1>h1 Header #2</h1>]
5. [<a href="https://carpentries.org/">The Carpentries homepage</a>, <a href="https://carpentries.org/past_workshops/">past workshops</a>]
6. 



Sample web page


h1 Header #1
This is a paragraph tag
h2 Sub-header
A new paragraph, now in the sub-header
h1 Header #2
This other paragraph has two  hyperlinks, one to The Carpentries homepage, and another to the past workshops.





In [114]:
# How many elements with the 'a' tag did we find?
# find_all returns a list with all the tags that we specified. There could be 0, 1 or multiple items in the list
links = soup.find_all('a')
len(links)

131

In [65]:
# Now create a dataframe with all hyperlinks and the text each is referencing
for item in links:
    print(item)

<a href="https://carpentries.org/">The Carpentries homepage</a>
<a href="https://carpentries.org/past_workshops/">past workshops</a>


In [42]:
type(soup.find_all('a')[0].string)

bs4.element.NavigableString

In [35]:
# We can access the attribute like treating them as a dictionary
# And we can access the text using .text
for item in soup.find_all('a'):
    print(item, " - ", item['href'], " - ", item.text)

<a href="https://carpentries.org/">The Carpentries homepage</a>  -  https://carpentries.org/  -  The Carpentries homepage
<a href="https://carpentries.org/past_workshops/">past workshops</a>  -  https://carpentries.org/past_workshops/  -  past workshops


Maybe a refresher on dictionaries?

In [45]:
# Storing all as a list of dictionaries to create a table

# Creating empty dictionary
list_of_dicts = []
for item in soup.find_all('a'):
    dict_a = {}
    dict_a['tag'] = str(item)
    dict_a['link'] = item['href']
    dict_a['text'] = item.text
    list_of_dicts.append(dict_a)

In [46]:
pd.DataFrame(list_of_dicts)

Unnamed: 0,tag,link,text
0,"<a href=""https://carpentries.org/"">The Carpent...",https://carpentries.org/,The Carpentries homepage
1,"<a href=""https://carpentries.org/past_workshop...",https://carpentries.org/past_workshops/,past workshops


## A more real example, actually getting information from a website

In [124]:
# Check the website, see that we want to obtain the data from the upcoming workshops table
url = 'https://carpentries.org/'

In [125]:
# We'll use the requests package to get the HTML from a website, the Carpentries website
import requests
req = requests.get(url)

In [126]:
# See what we got
print(req.text)

<!doctype html>
<html class="no-js" lang="en">
<head>
	<meta charset="utf-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0">
	<title>The Carpentries</title>

    <link rel="stylesheet" type="text/css" href="https://carpentries.org/assets/css/styles_feeling_responsive.css">

  

	<script src="https://carpentries.org/assets/js/modernizr.min.js"></script>

        <!-- matomo -->
        <script src="https://carpentries.org/assets/js/matomo-analytics.js"></script>

        <link href="https://fonts.googleapis.com/css?family=Lato:400,400i,700,700i|Roboto:400,400i,700,700i&display=swap" rel="stylesheet">

	<!-- Search Engine Optimization -->
	<meta name="description" content="The Carpentries is a fiscally sponsored project of Community Initiatives, a registered 501(c)3 non-profit organisation based in California, USA. We are a global community teaching foundational computational and data science skills to researchers in academia, industry and government.">
	
	
	
	
	

- Go to The Carpentries website, and compare the previous result to the 'View source' in Chrome

In [127]:
# We'll use BeautifulSoup to parse the HTML,
# as it has useful functions and tools to access the data in the HTML
soup = BeautifulSoup(req.text, 'html.parser')

In [128]:
print(soup.prettify())

<!DOCTYPE html>
<html class="no-js" lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <title>
   The Carpentries
  </title>
  <link href="https://carpentries.org/assets/css/styles_feeling_responsive.css" rel="stylesheet" type="text/css"/>
  <script src="https://carpentries.org/assets/js/modernizr.min.js">
  </script>
  <!-- matomo -->
  <script src="https://carpentries.org/assets/js/matomo-analytics.js">
  </script>
  <link href="https://fonts.googleapis.com/css?family=Lato:400,400i,700,700i|Roboto:400,400i,700,700i&amp;display=swap" rel="stylesheet"/>
  <!-- Search Engine Optimization -->
  <meta content="The Carpentries is a fiscally sponsored project of Community Initiatives, a registered 501(c)3 non-profit organisation based in California, USA. We are a global community teaching foundational computational and data science skills to researchers in academia, industry and government." name="description"/>
  <link h

In 'View source', try to find 'Upcoming Carpentries Workshops'.
Or alternatively, select an element of the table and click 'Inspect'

In [129]:
tables = soup.find_all('table')

In [130]:
# How many tables (elements with the table tag) did we find?
len(tables)

1

In [131]:
# Access the only element, the only table
upcoming_workshops = tables[0]

Using the 'Inspect' feature, we can explore the structure of the table element
We could see that it has multiple child tags, called 'tr' for 'table row', and each 'tr' has a 'td' for table data

In [132]:
# Let's get all rows and count how many there are
rows = upcoming_workshops.find_all('tr')
len(rows)

23

In [133]:
# Check first row
print(rows[0])

<tr>
<td>
<img alt="cp logo" class="flags" height="24" src="https://carpentries.org/assets/img/logos/cp.svg" title="cp workshop" width="24">
</img></td>
<td>
<img alt="us" class="flags" src="https://carpentries.org/assets/img/flags/24/us.png" title="US">
<a href="https://ucsbcarpentry.github.io/2024-10-08-ucsb/">UC Santa Barbara Library **</a>
<br>
<b>Instructors:</b> Jose Nino Muriel, Greg JanÃ©e, Julien Brun, Renata Curty
      
      
          <br>
<b>Helpers:</b> Jay Chi, Seth Erickson, Nandini Iyer, Sigrid Van Den Abbeele
      
	</br></br></img></td>
<td>
		Oct 8 - Oct 15, 2024
	</td>
</tr>


In [134]:
columns_first_row = rows[0].find_all('td')
# 3 columns:
#   0 is workshop type
#   1 is host, link, instructors and helpers
#   2 is dates

In [135]:
print("type: ", columns_first_row[0].find('img')['title']) # Getting the title attribute of the first img tag
print("location: ", columns_first_row[1].find('img')['title'])
print("link: ", columns_first_row[1].find('a')['href'])
print("host: ", columns_first_row[1].find('a').text)
print("all_text: ", columns_first_row[1].get_text(strip=True))
print("date: ", columns_first_row[2].get_text(strip=True))

type:  cp workshop
location:  US
link:  https://ucsbcarpentry.github.io/2024-10-08-ucsb/
host:  UC Santa Barbara Library **
all_text:  UC Santa Barbara Library **Instructors:Jose Nino Muriel, Greg JanÃ©e, Julien Brun, Renata CurtyHelpers:Jay Chi, Seth Erickson, Nandini Iyer, Sigrid Van Den Abbeele
date:  Oct 8 - Oct 15, 2024


In [136]:
# Doing it for all the rows in the 'table' tag
list_of_workshops = []
for row in rows:
    dict_data = {}
    columns_row = row.find_all('td')
    dict_data["type"] = columns_row[0].find('img')['title']
    dict_data["location"] =  columns_row[1].find('img')['title']
    dict_data["link"] = columns_row[1].find('a')['href']
    dict_data["host"] = columns_row[1].find('a').text
    dict_data["all_text"] = columns_row[1].get_text(strip=True)
    dict_data["date"] =  columns_row[2].get_text(strip=True)
    list_of_workshops.append(dict_data)

In [137]:
df_workshops = pd.DataFrame(list_of_workshops)
print(df_workshops.shape)
print(df_workshops.columns)

(23, 6)
Index(['type', 'location', 'link', 'host', 'all_text', 'date'], dtype='object')


## Challenge
Now do this same process for the entire list of workshops in https://carpentries.org/past_workshops/. What is the top 5 number of countries with more workshops and how many have each? 

In [139]:
url = 'https://carpentries.org/past_workshops/'
req_past = requests.get(url)
soup_past = BeautifulSoup(req_past.text, 'html.parser')

In [140]:
table = soup_past.find('table')
list_of_workshops = []
for row in table.find_all('tr'):
    dict_data = {}
    columns_row = row.find_all('td')
    dict_data["type"] = columns_row[0].find('img')['title']
    dict_data["location"] =  columns_row[1].find('img')['title']
    dict_data["link"] = columns_row[1].find('a')['href']
    dict_data["host"] = columns_row[1].find('a').text
    dict_data["all_text"] = columns_row[1].get_text(strip=True)
    dict_data["date"] =  columns_row[2].get_text(strip=True)
    list_of_workshops.append(dict_data)

In [143]:
df_past_workshops = pd.DataFrame(list_of_workshops)
print(df_past_workshops.shape)
print(df_past_workshops.columns)

(3816, 6)
Index(['type', 'location', 'link', 'host', 'all_text', 'date'], dtype='object')


In [144]:
df_past_workshops['location'].value_counts().head()

location
US    1831
GB     466
AU     333
CA     225
DE     172
Name: count, dtype: int64

# Challenge 2
You can see that there is also a little world icon for thos workshops that are online. What would your approach be to have in your dataframe a variable to distinguish if the workshops was online or not? Write code according to your approach


In [None]:
# To do

## Creating a web of scrapers
Now, we'll go into each link and get additional information from those pages. However, when doing so, we have to be mindful about the consumption of the servers we are doing

In [150]:
first_5_websites = df_past_workshops['link'].head(5)
first_5_websites

0           https://icr-sd-cc.github.io/2024-10-09-ICR
1    https://nclrse-training.github.io/2024-10-08-NCL/
2    https://brian-maass-unmc.github.io/2024-10-07-...
3    https://tajuakins.github.io/2024-10-01-nau-onl...
4       https://librarylady1.github.io/2024-09-28-UMD/
Name: link, dtype: object

In [152]:
first_5_websites = list(first_5_websites)
first_5_websites

['https://icr-sd-cc.github.io/2024-10-09-ICR',
 'https://nclrse-training.github.io/2024-10-08-NCL/',
 'https://brian-maass-unmc.github.io/2024-10-07-UNMC/',
 'https://tajuakins.github.io/2024-10-01-nau-online/',
 'https://librarylady1.github.io/2024-09-28-UMD/']

In [171]:
first_test = first_5_websites[0]
req = requests.get(first_test)
soup = BeautifulSoup(req.text, 'html.parser')

In [172]:
soup


<!DOCTYPE html>

<html lang="en">
<head>
<meta content="2024-10-09-ICR" name="slug"/>
<meta content="2024-10-09" name="startdate"/>
<meta content="2024-10-09" name="enddate"/>
<meta content="Wed Oct 09, 2024" name="humandate"/>
<meta content="uk" name="country"/>
<meta content="ICR Sutton" name="venue"/>
<meta content="Sutton: SRD Ground Floor" name="address"/>
<meta content="51.34489470306327,-0.1888057498478897" name="latlng"/>
<meta content="en" name="language"/>
<meta content="" name="eventbrite"/>
<meta content="Rachel Alcraft" name="instructor"/>
<meta content="tbc" name="helper"/>
<meta content="rachel.alcraft@icr.ac.uk" name="contact"/>
<meta content="swc" name="carpentry"/>
<!-- meta "search-domain" used for google site search function google_search() -->
<meta name="search-domain" value="https://icr-sd-cc.github.io/2024-10-09-ICR"/>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="2024-07-26 12:43:51 +0000" http-equiv="last-modifi

In [182]:
# We looking at the HTML, we find some useful information hidden that we'd like to scrape
# We could find a tag with a specific attribute and attribute value, with the 'attrs' parameter
print(soup.find('meta', attrs ={'name': 'latlng'})['content'])
print(soup.find('meta', attrs ={'name': 'instructor'})['content'])
print(soup.find('meta', attrs ={'name': 'helper'})['content'])
print(soup.get_text(strip=True))

51.34489470306327,-0.1888057498478897


In [190]:
import time
from tqdm import tqdm

In [191]:
list_of_dicts = []
for item in tqdm(list(first_5_websites)):
    req = requests.get(item)
    soup = BeautifulSoup(req.text, 'html.parser')
    dict_new_data = {}
    dict_new_data['link'] = item
    dict_new_data['latlng'] = soup.find('meta', attrs ={'name': 'latlng'})['content']
    dict_new_data['instructor'] = soup.find('meta', attrs ={'name': 'instructor'})['content']
    dict_new_data['helper'] = soup.find('meta', attrs ={'name': 'helper'})['content']
    dict_new_data['text'] = soup.get_text(strip=True)
    list_of_dicts.append(dict_new_data)
    time.sleep(3)

100%|██████████| 5/5 [00:16<00:00,  3.27s/it]


In [192]:
new_data_df = pd.DataFrame(list_of_dicts)
new_data_df

Unnamed: 0,link,latlng,instructor,helper,text
0,https://icr-sd-cc.github.io/2024-10-09-ICR,"51.34489470306327,-0.1888057498478897",Rachel Alcraft,tbc,"ICR Sutton: Wed Oct 09, 2024Toggle navigationH..."
1,https://nclrse-training.github.io/2024-10-08-NCL/,"54.980639,-1.6143042",Dr Jannetta Steyn|Imre Draskovits,Ruxandra Neatu|Steve Debski,"Newcastle University, Henry Daysh Building 6.1..."
2,https://brian-maass-unmc.github.io/2024-10-07-...,"41.255885,-95.974518",Brian Maass|Lisa Chinn|Caughlin Bohn,Natasha Pavlovikj|Emily Nimsakont,University of Nebraska Medical Center: October...
3,https://tajuakins.github.io/2024-10-01-nau-onl...,"45,-1",Salena Torres Ashton|Aristotelis Misios|Lyrric...,helper one|helper two,"North Arizona University, Institute for Tribal..."
4,https://librarylady1.github.io/2024-09-28-UMD/,"41.633331,-71.006882",Krishna Bijjam|Zebulun Arendsee,Andy Jones|Catalina Roma,University of Massachusetts Dartmouth: Sep 28-...


If you do this for all rows without waiting, you could send 3.816 requests to the web server in a couple of seconds! This is rude, as you are taking too many resources from the server, potentially inhibiting access to other users, or maybe causing it to crash. Your IP could be identified and you could get blocked, to say the least!