In [None]:
'''Practice web scraping
As you've seen, scraping the internet is a skill that can get you all sorts of information. Here are some little challenges that you can try to gain more experience in the field:

Retrieve an arbitrary Wikipedia page of "Python" and create a list of links on that page: url ='https://en.wikipedia.org/wiki/Python'

Find the number of titles that have changed in the United States Code since its last release point: url = 'http://uscode.house.gov/download/download.shtml'

Create a Python list with the top ten FBI's Most Wanted names: url = 'https://www.fbi.gov/wanted/topten'

Display the 20 latest earthquakes info (date, time, latitude, longitude and region name) by the EMSC as a pandas dataframe: url = 'https://www.emsc-csem.org/Earthquake/'

List all language names and number of related articles in the order they appear in wikipedia.org: url = 'https://www.wikipedia.org/'

A list with the different kind of datasets available in data.gov.uk: url = 'https://data.gov.uk/'

Display the top 10 languages by number of native speakers stored in a pandas dataframe: url = 'https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers''''

In [15]:
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

response = requests.get('https://www.fbi.gov/wanted/topten', headers=headers, verify=False)
response



<Response [200]>

In [16]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(response.content, 'html.parser')

In [27]:
names = soup.find_all('h3')

In [30]:
target = [name.text.strip() for name in names]


['WILVER VILLEGAS-PALOMINO',
 "VITEL'HOMME INNOCENT",
 'ARNOLDO JIMENEZ',
 'ALEXIS FLORES',
 'OMAR ALEXANDER CARDENAS',
 'YULAN ADONAY ARCHAGA CARIAS',
 'BHADRESHKUMAR CHETANBHAI PATEL',
 'DONALD EUGENE FIELDS II',
 'RUJA IGNATOVA',
 'ALEJANDRO ROSALES CASTILLO',
 'federal bureau of investigation',
 'FBI.gov Contact Center']

In [32]:
import pandas as pd

df = pd.DataFrame(target)
df

Unnamed: 0,0
0,WILVER VILLEGAS-PALOMINO
1,VITEL'HOMME INNOCENT
2,ARNOLDO JIMENEZ
3,ALEXIS FLORES
4,OMAR ALEXANDER CARDENAS
5,YULAN ADONAY ARCHAGA CARIAS
6,BHADRESHKUMAR CHETANBHAI PATEL
7,DONALD EUGENE FIELDS II
8,RUJA IGNATOVA
9,ALEJANDRO ROSALES CASTILLO


In [33]:
top10 = df.head(10)
top10

Unnamed: 0,0
0,WILVER VILLEGAS-PALOMINO
1,VITEL'HOMME INNOCENT
2,ARNOLDO JIMENEZ
3,ALEXIS FLORES
4,OMAR ALEXANDER CARDENAS
5,YULAN ADONAY ARCHAGA CARIAS
6,BHADRESHKUMAR CHETANBHAI PATEL
7,DONALD EUGENE FIELDS II
8,RUJA IGNATOVA
9,ALEJANDRO ROSALES CASTILLO


In [34]:
top10.index = top10.index + 1
top10

Unnamed: 0,0
1,WILVER VILLEGAS-PALOMINO
2,VITEL'HOMME INNOCENT
3,ARNOLDO JIMENEZ
4,ALEXIS FLORES
5,OMAR ALEXANDER CARDENAS
6,YULAN ADONAY ARCHAGA CARIAS
7,BHADRESHKUMAR CHETANBHAI PATEL
8,DONALD EUGENE FIELDS II
9,RUJA IGNATOVA
10,ALEJANDRO ROSALES CASTILLO


In [61]:
import requests

response = requests.get('https://www.data.gov.uk/', verify=False)
response



<Response [200]>

In [62]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(response.content, 'html.parser')

In [65]:
data = soup.find_all( 'h3', class_ = 'govuk-heading-s dgu-topics__heading')
data


[<h3 class="govuk-heading-s dgu-topics__heading"><a class="govuk-link" href="/search?filters%5Btopic%5D=Business+and+economy">Business and economy</a></h3>,
 <h3 class="govuk-heading-s dgu-topics__heading"><a class="govuk-link" href="/search?filters%5Btopic%5D=Crime+and+justice">Crime and justice</a></h3>,
 <h3 class="govuk-heading-s dgu-topics__heading"><a class="govuk-link" href="/search?filters%5Btopic%5D=Defence">Defence</a></h3>,
 <h3 class="govuk-heading-s dgu-topics__heading"><a class="govuk-link" href="/search?filters%5Btopic%5D=Education">Education</a></h3>,
 <h3 class="govuk-heading-s dgu-topics__heading"><a class="govuk-link" href="/search?filters%5Btopic%5D=Environment">Environment</a></h3>,
 <h3 class="govuk-heading-s dgu-topics__heading"><a class="govuk-link" href="/search?filters%5Btopic%5D=Government">Government</a></h3>,
 <h3 class="govuk-heading-s dgu-topics__heading"><a class="govuk-link" href="/search?filters%5Btopic%5D=Government+spending">Government spending</a></

In [67]:
target = [name.text.strip() for name in data]
target

['Business and economy',
 'Crime and justice',
 'Defence',
 'Education',
 'Environment',
 'Government',
 'Government spending',
 'Health',
 'Mapping',
 'Society',
 'Towns and cities',
 'Transport',
 'Digital service performance',
 'Government reference data']

In [68]:
import pandas as pd

df2 = pd.DataFrame(target)
df2

Unnamed: 0,0
0,Business and economy
1,Crime and justice
2,Defence
3,Education
4,Environment
5,Government
6,Government spending
7,Health
8,Mapping
9,Society


In [69]:
import requests

response = requests.get('https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers', verify=False)
response



<Response [200]>

In [70]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(response.content, 'html.parser')

In [115]:
lang = soup.find_all( 'a', class_ = 'mw-redirect')
lang


[<a class="mw-redirect" href="/wiki/Native_speaker" title="Native speaker">native speakers</a>,
 <a class="mw-redirect" href="/wiki/Mutually_intelligible" title="Mutually intelligible">mutually intelligible</a>,
 <a class="mw-redirect" href="/wiki/Arabic_language" title="Arabic language">Arabic</a>,
 <a class="mw-redirect" href="/wiki/ISO_639:cmn" title="ISO 639:cmn">Mandarin Chinese</a>,
 <a class="mw-redirect" href="/wiki/ISO_639:spa" title="ISO 639:spa">Spanish</a>,
 <a class="mw-redirect" href="/wiki/ISO_639:eng" title="ISO 639:eng">English</a>,
 <a class="mw-redirect" href="/wiki/ISO_639:hin" title="ISO 639:hin">Hindi</a>,
 <a class="mw-redirect" href="/wiki/ISO_639:por" title="ISO 639:por">Portuguese</a>,
 <a class="mw-redirect" href="/wiki/ISO_639:ben" title="ISO 639:ben">Bengali</a>,
 <a class="mw-redirect" href="/wiki/ISO_639:rus" title="ISO 639:rus">Russian</a>,
 <a class="mw-redirect" href="/wiki/ISO_639:jpn" title="ISO 639:jpn">Japanese</a>,
 <a class="mw-redirect" href="/w

In [116]:
target = [name.text.strip() for name in lang]
target

['native speakers',
 'mutually intelligible',
 'Arabic',
 'Mandarin Chinese',
 'Spanish',
 'English',
 'Hindi',
 'Portuguese',
 'Bengali',
 'Russian',
 'Japanese',
 'Yue Chinese',
 'Vietnamese',
 'Turkish',
 'Wu Chinese',
 'Marathi',
 'Telugu',
 'Korean',
 'French',
 'Tamil',
 'Egyptian Arabic',
 'Standard German',
 'Urdu',
 'Javanese',
 'Western Punjabi',
 'Italian',
 'Gujarati',
 'Iranian Persian',
 'Bhojpuri',
 'Hausa',
 'CIA World Factbook',
 'Arabic',
 'Hindi',
 'List of languages by the number of countries in which they are recognized as an official language',
 'ISBN',
 'ISBN',
 'ISBN',
 'ISBN',
 'ISBN',
 'Arabic',
 'Spanish',
 'Europe',
 'List of Afro-Asiatic languages',
 'List of Austronesian languages',
 'List of Tungusic languages']

In [127]:
import pandas as pd

lang = pd.DataFrame(target)

lang = lang_subset = lang.iloc[3:13]

print(lang_subset)

lang_subset = pd.DataFrame(lang_subset)
lang_subset

                   0
3   Mandarin Chinese
4            Spanish
5            English
6              Hindi
7         Portuguese
8            Bengali
9            Russian
10          Japanese
11       Yue Chinese
12        Vietnamese


Unnamed: 0,0
3,Mandarin Chinese
4,Spanish
5,English
6,Hindi
7,Portuguese
8,Bengali
9,Russian
10,Japanese
11,Yue Chinese
12,Vietnamese


In [108]:
num = soup.find_all( 'td')
num

[<td><a class="mw-redirect" href="/wiki/ISO_639:cmn" title="ISO 639:cmn">Mandarin Chinese</a>
 </td>,
 <td>939
 </td>,
 <td><a href="/wiki/Sino-Tibetan_languages" title="Sino-Tibetan languages">Sino-Tibetan</a>
 </td>,
 <td><a href="/wiki/Sinitic_languages" title="Sinitic languages">Sinitic</a>
 </td>,
 <td><a class="mw-redirect" href="/wiki/ISO_639:spa" title="ISO 639:spa">Spanish</a>
 </td>,
 <td>485
 </td>,
 <td><a href="/wiki/Indo-European_languages" title="Indo-European languages">Indo-European</a>
 </td>,
 <td><a href="/wiki/Romance_languages" title="Romance languages">Romance</a>
 </td>,
 <td><a class="mw-redirect" href="/wiki/ISO_639:eng" title="ISO 639:eng">English</a>
 </td>,
 <td>380
 </td>,
 <td><a href="/wiki/Indo-European_languages" title="Indo-European languages">Indo-European</a>
 </td>,
 <td><a href="/wiki/Germanic_languages" title="Germanic languages">Germanic</a>
 </td>,
 <td><a class="mw-redirect" href="/wiki/ISO_639:hin" title="ISO 639:hin">Hindi</a>
 </td>,
 <td>3

In [109]:
target = [number.text.strip() for number in num]
target

['Mandarin Chinese',
 '939',
 'Sino-Tibetan',
 'Sinitic',
 'Spanish',
 '485',
 'Indo-European',
 'Romance',
 'English',
 '380',
 'Indo-European',
 'Germanic',
 'Hindi',
 '345',
 'Indo-European',
 'Indo-Aryan',
 'Portuguese',
 '236',
 'Indo-European',
 'Romance',
 'Bengali',
 '234',
 'Indo-European',
 'Indo-Aryan',
 'Russian',
 '147',
 'Indo-European',
 'Balto-Slavic',
 'Japanese',
 '123',
 'Japonic',
 'Japanese',
 'Yue Chinese',
 '86.1',
 'Sino-Tibetan',
 'Sinitic',
 'Vietnamese',
 '85.0',
 'Austroasiatic',
 'Vietic',
 'Turkish',
 '84.0',
 'Turkic',
 'Oghuz',
 'Wu Chinese',
 '83.4',
 'Sino-Tibetan',
 'Sinitic',
 'Marathi',
 '83.2',
 'Indo-European',
 'Indo-Aryan',
 'Telugu',
 '83.0',
 'Dravidian',
 'South-Central',
 'Korean',
 '81.7',
 'Koreanic',
 '—',
 'French',
 '80.8',
 'Indo-European',
 'Romance',
 'Tamil',
 '78.6',
 'Dravidian',
 'South',
 'Egyptian Arabic',
 '77.4',
 'Afroasiatic',
 'Semitic',
 'Standard German',
 '75.3',
 'Indo-European',
 'Germanic',
 'Urdu',
 '70.6',
 'Indo-E

In [110]:
def is_numeric(value):
    try:
        float_value = float(value)
        return True
    except ValueError:
        return False

numeric_values = [value for value in target if is_numeric(value)]

print("Numeric Values:", numeric_values)



Numeric Values: ['939', '485', '380', '345', '236', '234', '147', '123', '86.1', '85.0', '84.0', '83.4', '83.2', '83.0', '81.7', '80.8', '78.6', '77.4', '75.3', '70.6', '68.3', '66.7', '64.6', '57.1', '57.2', '52.3', '51.7', '1', '2', '3', '3', '5', '6', '7', '8', '9', '10', '11']


In [111]:
number = numeric_values[:10]
number

['939', '485', '380', '345', '236', '234', '147', '123', '86.1', '85.0']

In [112]:
import pandas as pd

number = pd.DataFrame(number)

In [128]:
number = number.reset_index(drop = True) 
lang_subset = lang_subset.reset_index(drop = True) 


In [129]:
final = pd.concat([lang_subset, number], axis = 1)
final

Unnamed: 0,0,0.1
0,Mandarin Chinese,939.0
1,Spanish,485.0
2,English,380.0
3,Hindi,345.0
4,Portuguese,236.0
5,Bengali,234.0
6,Russian,147.0
7,Japanese,123.0
8,Yue Chinese,86.1
9,Vietnamese,85.0
