As we mentioned last class, we can find the information we need to scrape by specifying a tag and a property of that tag. In the case above, the tag is 'td' and the property is the class 'column-#'.

In [7]:
import requests
from bs4 import BeautifulSoup

url = 'https://thefactfile.org/countries-currencies-symbols/'
response = requests.get(url)  # download the page
page_text = response.text  # get all page text (the HTML as one string)
# Turn the page text into a form we can manipulate. The parser is named
# explicitly so bs4 does not have to guess one (which triggers a warning and
# can differ between machines depending on which parsers are installed).
soup = BeautifulSoup(page_text, 'html.parser')
all_rows = soup.find_all('tr')  # find all table rows (tr)

all_rows[1]  # sample of table row contents

<tr class="row-2 even">
<td class="column-1">1.</td><td class="column-2"><a href="https://thefactfile.org/afghanistan-facts/" rel="noopener noreferrer" target="_blank">Afghanistan</a></td><td class="column-3">Afghanistani Afghani</td><td class="column-4">AFN</td><td class="column-5">؋</td>
</tr>

In [6]:
# get_text() returns the text of every cell in the row joined into one string,
# with nothing between the cells, so the columns run together
all_rows[1].get_text() # get the text

'\n1.AfghanistanAfghanistani AfghaniAFN؋\n'

In [8]:
# the row text is still hard to read because the columns run together;
# selecting the row's individual 'td' (table data) tags lets us read one column at a time
all_tds = all_rows[1].find_all('td')
print(all_tds[2].get_text())  # third cell (class "column-3"): currency name
print(all_tds[3].get_text())  # fourth cell (class "column-4"): currency code

Afghanistani Afghani
AFN


In [10]:
# notice the code below generates an error: the first row (all_rows[0]) does not
# contain enough 'td' cells for these indexes, so the lookup raises an IndexError
# (presumably this is the header row, whose cells are not 'td' tags -- TODO confirm)
all_tds = all_rows[0].find_all('td')
print(all_tds[2].get_text())
print(all_tds[3].get_text())

IndexError: list index out of range

In [13]:
# notice that we get an exception (IndexError); an uncaught exception terminates
# the code at the line where the script encountered the error
# to prevent one short row from halting the loop we wrap the indexing in 'try' and
# skip the row in 'except'; we catch only IndexError so that any other, unexpected
# problem is still reported instead of being silently ignored

# we also use the method strip, which removes leading and trailing whitespace
# (including newlines, '\n') from strings
import requests
from bs4 import BeautifulSoup

url = 'https://thefactfile.org/countries-currencies-symbols/'
response = requests.get(url)
page_text = response.text
# name the parser explicitly so bs4 does not have to guess one
soup = BeautifulSoup(page_text, 'html.parser')
body = soup.find('tbody')  # first 'tbody' tag; not used below (rows are searched in the whole page)
all_rows = soup.find_all('tr')  # tr = table row
for row in all_rows:
    try:
        all_tds = row.find_all('td')
        print(all_tds[2].get_text().strip())
        print(all_tds[3].get_text().strip())
    except IndexError:
        # this row has no third/fourth 'td' cell -- move on to the next row
        continue


Afghanistani Afghani
AFN
Armenian Dram
AMD
Azerbaijani Manat
AZN
Bahraini Dinar
BHD
Bangladeshi Taka
BDT
Bhutanese Ngultrum
BTN
Brunei Dollar
BND
Cambodian Riel
KHR
Chinese Yuan Renminbi
CNY
Cypriot Pound
CYP
Georgian Lari
GEL
Indian Rupee
INR
Indonesian Rupiah
IDR
Iranian Rial
IRR
Iraqi Dinar
IQD
Israeli New Sheqel
ILS
Japanese Yen
JPY
Jordanian Dinar
JOD
Kazakhstani Tenge
KZT
Kuwaiti Dinar
KWD
Kyrgyzstani Som
KGS
Lao Kip
LAK
Lebanese Pound
LBP
Malaysian Ringgit
MYR
Maldives Rufiyaa
MVR
Mongolian Tugrik
MNT
Myanmar Kyat
MMK
Nepalese Rupee
NPR
North Korean Won
KPW
Omani Rial
OMR
Pakistan Rupee
PKR
Jordanian Dinar
JOD
Philippine Peso
PHP
Qatari Riyal
QAR
Russian Ruble
RUB
Saudi Arabian Riyal
SAR
Singapore Dollar
SGD
Korean Won
KRW
Sri Lankan Rupee
LKR
Syrian Pound
SYP
New Taiwan Dollar
TWD
Tajikistan Somoni
TJS
Thai Baht
THB
United States Dollar
USD
Turkish New Lira
TRY
Turkmenistani Manat
TMM
United Arab Emirates Dirham
AED
Uzbekistani Som
UZS
Viet Nam Dong
VND
Yemeni Rial
YER
Currency

Putting this into a function: 

In [21]:
def get_currency_list():
    """Scrape currency names and codes from thefactfile.org.

    Downloads the countries/currencies page, looks at every table row
    ('tr') and collects the stripped text of the row's third and fourth
    'td' cells. Rows with fewer than four 'td' cells are skipped.

    Returns:
        A list of ``(currency_name, currency_code)`` tuples, for example
        ``('Afghanistani Afghani', 'AFN')``. The list is empty when the
        server does not answer with status code 200.

    Errors raised by ``requests.get`` itself are not caught here.
    """
    # Imported inside the function so it works on its own, without relying
    # on an earlier notebook cell having imported these names.
    import requests
    from bs4 import BeautifulSoup

    currency_list = list()
    url = 'https://thefactfile.org/countries-currencies-symbols/'
    # requests will run the required protocols to get the url's html
    response = requests.get(url)
    # check if we successfully received the data:
    # a response code of 200 signifies that we successfully contacted and
    # received data from the url's server
    if response.status_code != 200:
        return currency_list

    # name the parser explicitly so bs4 does not have to guess one
    soup = BeautifulSoup(response.text, 'html.parser')
    for row in soup.find_all('tr'):  # tr = table row
        all_tds = row.find_all('td')
        # Skip rows without a fourth cell. This is the only failure the
        # original bare 'except' was there for (IndexError on all_tds[2] /
        # all_tds[3]); checking the length keeps other errors visible.
        if len(all_tds) < 4:
            continue
        currency_list.append((all_tds[2].get_text().strip(), all_tds[3].get_text().strip()))
    return currency_list

In [22]:
# run the scraper; the result is a list of (currency name, currency code) tuples
get_currency_list()

[('Afghanistani Afghani', 'AFN'),
 ('Armenian Dram', 'AMD'),
 ('Azerbaijani Manat', 'AZN'),
 ('Bahraini Dinar', 'BHD'),
 ('Bangladeshi Taka', 'BDT'),
 ('Bhutanese Ngultrum', 'BTN'),
 ('Brunei Dollar', 'BND'),
 ('Cambodian Riel', 'KHR'),
 ('Chinese Yuan Renminbi', 'CNY'),
 ('Cypriot Pound', 'CYP'),
 ('Georgian Lari', 'GEL'),
 ('Indian Rupee', 'INR'),
 ('Indonesian Rupiah', 'IDR'),
 ('Iranian Rial', 'IRR'),
 ('Iraqi Dinar', 'IQD'),
 ('Israeli New Sheqel', 'ILS'),
 ('Japanese Yen', 'JPY'),
 ('Jordanian Dinar', 'JOD'),
 ('Kazakhstani Tenge', 'KZT'),
 ('Kuwaiti Dinar', 'KWD'),
 ('Kyrgyzstani Som', 'KGS'),
 ('Lao Kip', 'LAK'),
 ('Lebanese Pound', 'LBP'),
 ('Malaysian Ringgit', 'MYR'),
 ('Maldives Rufiyaa', 'MVR'),
 ('Mongolian Tugrik', 'MNT'),
 ('Myanmar Kyat', 'MMK'),
 ('Nepalese Rupee', 'NPR'),
 ('North Korean Won', 'KPW'),
 ('Omani Rial', 'OMR'),
 ('Pakistan Rupee', 'PKR'),
 ('Jordanian Dinar', 'JOD'),
 ('Philippine Peso', 'PHP'),
 ('Qatari Riyal', 'QAR'),
 ('Russian Ruble', 'RUB'),
 ('Sa