In [6]:
from bs4 import BeautifulSoup
import requests

- Retrieve an arbitrary Wikipedia page of "Python" and create a list of links on that page: `url ='https://en.wikipedia.org/wiki/Python'`

In [11]:
url ='https://en.wikipedia.org/wiki/Python_(programming_language)'

response = requests.get(url)
response.status_code

soup = BeautifulSoup(response.content, 'html.parser')

reflist = soup.select('span.reference-text > cite > a:nth-child(1)')

links = []

for i in reflist:
    str_url = str(i['href'])
    if str_url[:4] != 'http':
        str_url = 'https://en.wikipedia.org' + str_url
    links.append(str_url)

#links ##uncomment to see the list of links

- Find the number of titles that have changed in the United States Code since its last release point: `url = 'http://uscode.house.gov/download/download.shtml'`

In [12]:
url ='http://uscode.house.gov/download/download.shtml'

response = requests.get(url)
response.status_code

soup = BeautifulSoup(response.content, 'html.parser')

changed = soup.find_all('div', {'class':'usctitlechanged'})

print('Changed titles:')

for i in changed:
    print(i.get_text(strip=True))

print('Total changed: ', len(changed))

Changed titles:
Title 1 - General Provisions٭
Title 2 - The Congress
Title 5 - Government Organization and Employees٭
Title 6 - Domestic Security
Title 7 - Agriculture
Title 12 - Banks and Banking
Title 15 - Commerce and Trade
Title 16 - Conservation
Title 19 - Customs Duties
Title 23 - Highways٭
Title 25 - Indians
Title 26 - Internal Revenue Code
Title 29 - Labor
Title 30 - Mineral Lands and Mining
Title 33 - Navigation and Navigable Waters
Title 40 - Public Buildings, Property, and Works٭
Title 41 - Public Contracts٭
Title 42 - The Public Health and Welfare
Title 43 - Public Lands
Title 45 - Railroads
Title 46 - Shipping٭
Title 47 - Telecommunications
Title 49 - Transportation٭
Title 54 - National Park Service and Related Programs٭
Total changed:  24


- Create a Python list with the top ten FBI's Most Wanted names: `url = 'https://www.fbi.gov/wanted/topten'`

In [13]:
url ='https://www.fbi.gov/wanted/topten'

response = requests.get(url)
response.status_code

soup = BeautifulSoup(response.content, 'html.parser')


# Copy CSS selector for the element from the page:
# (li.portal-type-person:nth-child(1) > h3:nth-child(2) > a:nth-child(1))
# and remove ":nth-child(1)" to select all "li.portal-type-person" items.

most_wanted = soup.select('li.portal-type-person > h3:nth-child(2) > a:nth-child(1)')

most_wanted_list = []

for i in most_wanted:
    most_wanted_list.append(i.get_text(strip=True))

print(most_wanted_list)

['JASON DEREK BROWN', 'ALEXIS FLORES', 'JOSE RODOLFO VILLARREAL-HERNANDEZ', 'RAFAEL CARO-QUINTERO', 'YULAN ADONAY ARCHAGA CARIAS', 'EUGENE PALMER', 'BHADRESHKUMAR CHETANBHAI PATEL', 'ALEJANDRO ROSALES CASTILLO', 'ARNOLDO JIMENEZ', 'OCTAVIANO JUAREZ-CORRO']


- Display the 20 latest earthquakes info (date, time, latitude, longitude and region name) by the EMSC as a pandas dataframe: `url = 'https://www.emsc-csem.org/Earthquake/'`

In [14]:
import pandas as pd

url ='https://www.emsc-csem.org/Earthquake/'

response = requests.get(url)
response.status_code

soup = BeautifulSoup(response.content, 'html.parser')

# Select first 20 table rows:
rows = soup.select('#tbody > tr')

date = []
time = []
latitude = []
longitude = []
region_name = []

# print(rows[0].find_all('td')[3])

for row in rows:
    # Find all columns in a row
    columns = row.find_all('td')

    # Find date and time (4th column), find "a" tag and get text. Then split date and time to the new list.
    # Then append date to "date" list and time to "time" list.
    date_time_list = columns[3].find('a').get_text(strip=True).split()
    date.append(date_time_list[0])
    time.append(date_time_list[1])

    # Get latitude - join text in columns 5 and 6. Append to "latitude" list.
    la1 = columns[4].get_text(strip=True)
    la2 = columns[5].get_text(strip=True)
    latitude.append(' '.join([la1, la2]))

    # Get longitude - join text in columns 7 and 8. Append to "longitude" list.
    lo1 = columns[6].get_text(strip=True)
    lo2 = columns[7].get_text(strip=True)
    longitude.append(' '.join([lo1, lo2]))

    # Get region from column 12 and append to "region_name" list.
    region_name.append(columns[11].get_text(strip=True))

earthquakes = pd.DataFrame({'date':date, 'time':time, 'latitude':latitude, 'longitude':longitude, 'region_name':region_name})

earthquakes.head(20)

# print(date)
# print(time)
# print(latitude)
# print(longitude)
# print(region_name)

Unnamed: 0,date,time,latitude,longitude,region_name
0,2022-03-02,18:52:47.7,35.69 N,121.13 W,CENTRAL CALIFORNIA
1,2022-03-02,18:03:42.0,13.94 N,124.29 E,"CATANDUANES, PHILIPPINES"
2,2022-03-02,17:37:43.0,0.19 N,100.01 E,"NORTHERN SUMATRA, INDONESIA"
3,2022-03-02,17:32:58.0,0.21 N,100.07 E,"NORTHERN SUMATRA, INDONESIA"
4,2022-03-02,17:03:51.0,19.22 N,121.32 E,"BABUYAN ISL REGION, PHILIPPINES"
5,2022-03-02,17:02:34.1,49.33 N,155.92 E,KURIL ISLANDS
6,2022-03-02,16:57:48.3,35.21 N,25.32 E,"CRETE, GREECE"
7,2022-03-02,16:56:06.6,39.24 N,20.59 E,GREECE
8,2022-03-02,16:45:40.0,36.37 N,8.92 W,WEST OF GIBRALTAR
9,2022-03-02,16:37:50.2,40.07 N,22.32 E,GREECE


- List all language names and number of related articles in the order they appear in [wikipedia.org](wikipedia.org): `url = 'https://www.wikipedia.org/'`

In [15]:
url ='https://www.wikipedia.org'

response = requests.get(url)
response.status_code

soup = BeautifulSoup(response.content, 'html.parser')


# Copy CSS selector for the element from the page:
# (div.central-featured-lang:nth-child(1))
# and remove ":nth-child(1)" to select all "div.central-featured-lang" items.

lang_list = soup.select('div.central-featured-lang')

for i in lang_list:
    # Find language code
    code = i['lang']
    # Find language name
    name = i.find('strong').get_text()
    # Find the number of articles
    number = i.find('bdi').get_text()
    
    print(' - '.join([code, name, number]))

en - English - 6 383 000+
ja - 日本語 - 1 292 000+
ru - Русский - 1 756 000+
de - Deutsch - 2 617 000+
es - Español - 1 717 000+
fr - Français - 2 362 000+
zh - 中文 - 1 231 000+
it - Italiano - 1 718 000+
pl - Polski - 1 490 000+
pt - Português - 1 074 000+


- A list with the different kind of datasets available in [data.gov.uk](data.gov.uk): `url = 'https://data.gov.uk/'`

In [16]:
url = 'https://data.gov.uk'

response = requests.get(url)
response.status_code

soup = BeautifulSoup(response.content, 'html.parser')


# Copy CSS selector for the element from the page:
# (.govuk-list > li:nth-child(1) > h3:nth-child(1) > a:nth-child(1))
# and remove ":nth-child(1)" from "li:nth-child(1)" to select all items.

datasets = soup.select('.govuk-list > li > h3:nth-child(1) > a:nth-child(1)')

dataset_list = []

for i in datasets:
    dataset_list.append(i.get_text(strip=True))

print(dataset_list)

['Business and economy', 'Crime and justice', 'Defence', 'Education', 'Environment', 'Government', 'Government spending', 'Health', 'Mapping', 'Society', 'Towns and cities', 'Transport', 'Digital service performance', 'Government reference data']


- Display the top 10 languages by number of native speakers stored in a pandas dataframe: `url = 'https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers'`

In [17]:
url = 'https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers'

response = requests.get(url)
response.status_code

soup = BeautifulSoup(response.content, 'html.parser')

# Copy CSS selector for the element from the page:
# (table.wikitable:nth-child(17) > tbody:nth-child(3) > tr:nth-child(1))
# and remove ":nth-child(1)" from "tbody" and "tr" to select all items.
# Add "not(:first-child)" to "tr" to ignore header row.

languages = soup.select('table.wikitable:nth-child(17) > tbody > tr:not(:first-child)')

language = []
number = []

for i in languages:
    # Find all columns in a row
    columns = i.find_all('td')

    # Get language from column 2 and append to "language" list.
    language.append(columns[1].get_text(strip=True))

    # Get the number of native speakers from column 3 and append to "number" list.
    number.append(columns[2].get_text(strip=True))

languages_list = pd.DataFrame({'language':language, 'number':number})
languages_list.head(10)

# print(language)
# print(number)

Unnamed: 0,language,number
0,Mandarin Chinese,918.0
1,Spanish,480.0
2,English,379.0
3,Hindi(sanskritisedHindustani)[11],341.0
4,Bengali,300.0
5,Portuguese,221.0
6,Russian,154.0
7,Japanese,128.0
8,Western Punjabi[12],92.7
9,Marathi,83.1
