In [13]:
import pandas as pd
from bs4 import BeautifulSoup
import re

import requests

# Wikipedia page that the following cells scrape for languages and their
# native-speaker counts.
url = 'https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers'

# requests has no default timeout, so without one a stalled connection would
# block the kernel indefinitely.
response = requests.get(url, timeout=10)
# Fail here on a 4xx/5xx instead of letting later cells parse an error page.
response.raise_for_status()
response

<Response [200]>

In [2]:
# Parse the downloaded page with Python's built-in HTML parser.
soup = BeautifulSoup(markup=response.content, features='html.parser')

In [3]:
# Visible text of every <a class="mw-redirect"> anchor, in document order.
languages = [
    anchor.getText().strip()
    for anchor in soup.find_all('a', class_='mw-redirect')
]

# Keep only positions 3..31 (29 entries). This is a positional slice, so it
# silently selects the wrong anchors if the page layout changes.
# NOTE(review): re-check the bounds against the live page before relying on it.
languages = languages[3:32]
languages

['Mandarin Chinese',
 'Spanish',
 'English',
 'Hindi',
 'Bengali',
 'Portuguese',
 'Russian',
 'Japanese',
 'Yue Chinese',
 'Vietnamese',
 'Turkish',
 'Wu Chinese',
 'Marathi',
 'Telugu',
 'Western Punjabi',
 'Korean',
 'Tamil',
 'Egyptian Arabic',
 'Standard German',
 'French',
 'Urdu',
 'Javanese',
 'Italian',
 'Iranian Persian',
 'Gujarati',
 'Hausa',
 'Bhojpuri',
 'Levantine Arabic',
 'Southern Min']

In [4]:
# Stripped text of every <td> cell on the page, in document order.
speakers = [cell.getText().strip() for cell in soup.find_all('td')]

# Take every fourth cell starting at index 1, then keep the first 29 so the
# list lines up with the 29 entries kept in `languages`.
# NOTE(review): presumably the table has four <td> cells per row with the
# speaker count second; confirm against the page, since this is positional.
speakers = speakers[1::4][:29]
speakers

['941',
 '486',
 '380',
 '345',
 '237',
 '236',
 '148',
 '123',
 '86',
 '85',
 '84',
 '83',
 '83',
 '83',
 '82',
 '81',
 '79',
 '78',
 '76',
 '74',
 '70',
 '68',
 '64',
 '62',
 '58',
 '54',
 '53',
 '51',
 '51']

In [7]:
# Pair each scraped language with its speaker figure. Both lists must have
# the same length or the DataFrame constructor raises.
speaker_columns = {
    'Language': languages,
    'Speakers(in millions)': speakers,
}
df = pd.DataFrame(speaker_columns)

In [8]:
# Display the language/speaker frame built in the previous cell.
df

Unnamed: 0,Language,Speakers(in millions)
0,Mandarin Chinese,941
1,Spanish,486
2,English,380
3,Hindi,345
4,Bengali,237
5,Portuguese,236
6,Russian,148
7,Japanese,123
8,Yue Chinese,86
9,Vietnamese,85


In [9]:
# Show only the first ten rows of the language/speaker frame.
df.head(10)

Unnamed: 0,Language,Speakers(in millions)
0,Mandarin Chinese,941
1,Spanish,486
2,English,380
3,Hindi,345
4,Bengali,237
5,Portuguese,236
6,Russian,148
7,Japanese,123
8,Yue Chinese,86
9,Vietnamese,85


In [10]:
# Cross-check the manual scrape: let pandas parse the HTML tables of the same
# page and take the first one. Reuses `url` from the first cell instead of
# repeating the literal, so both approaches always target the same page.
df2 = pd.read_html(url)[0]
df2[['Language', 'Native speakers (in millions)']].head(10)

Unnamed: 0,Language,Native speakers (in millions)
0,Mandarin Chinese,941
1,Spanish,486
2,English,380
3,Hindi,345
4,Bengali,237
5,Portuguese,236
6,Russian,148
7,Japanese,123
8,Yue Chinese,86
9,Vietnamese,85


In [11]:
# Wikipedia portal page; the cells below read its language link boxes.
url2 = 'https://www.wikipedia.org/'
# requests has no default timeout, so set one to avoid hanging the kernel.
response2 = requests.get(url2, timeout=10)
# Fail here on a 4xx/5xx instead of letting later cells parse an error page.
response2.raise_for_status()

In [12]:
# Parse the portal page with Python's built-in HTML parser.
soup2 = BeautifulSoup(markup=response2.content, features='html.parser')

In [14]:
# Each link box's text is matched from its start for a run of non-digit
# characters; that run is kept as the language name. Boxes whose text starts
# with a digit yield no match and are skipped.
leading_non_digits = re.compile(r'^\D+')

box_texts = (
    anchor.get_text(strip = True)
    for anchor in soup2.find_all('a', class_ = 'link-box')
)
name_matches = (leading_non_digits.match(box_text) for box_text in box_texts)

languages = [found.group() for found in name_matches if found]
languages

['English',
 '日本語',
 'Deutsch',
 'Русский',
 'Español',
 'Français',
 '中文',
 'Italiano',
 'فارسی',
 'Português']

In [15]:
# Stripped text of every <small> element, keeping only the first ten.
small_texts = [tag.getText().strip() for tag in soup2.find_all('small')]

article_counts = small_texts[:10]
article_counts

['6,847,000+ articles',
 '1,421,000+ 記事',
 '2.924.000+ Artikel',
 '1\xa0987\xa0000+ статей',
 '1.965.000+ artículos',
 '2\u202f621\u202f000+ articles',
 '1,429,000+ 条目 / 條目',
 '1.871.000+ voci',
 '۱٬۰۰۶٬۰۰۰+ مقاله',
 '1.128.000+ artigos']

In [16]:
# One row per language edition, alongside its raw (still textual) article count.
article_columns = {
    'Language': languages,
    'Number of articles published': article_counts,
}
df2 = pd.DataFrame(article_columns)
df2

Unnamed: 0,Language,Number of articles published
0,English,"6,847,000+ articles"
1,日本語,"1,421,000+ 記事"
2,Deutsch,2.924.000+ Artikel
3,Русский,1 987 000+ статей
4,Español,1.965.000+ artículos
5,Français,2 621 000+ articles
6,中文,"1,429,000+ 条目 / 條目"
7,Italiano,1.871.000+ voci
8,فارسی,۱٬۰۰۶٬۰۰۰+ مقاله
9,Português,1.128.000+ artigos


In [19]:
count_column = 'Number of articles published'

# Drop every non-digit character (separators, "+", the localized word for
# "articles") and convert what remains to integers. For str patterns `\d` is
# Unicode-aware, so digits written in other scripts are kept.
digits_only = (
    df2[count_column]
    .astype(str)
    .str.replace(r'[^\d]', '', regex=True)
)
df2[count_column] = digits_only.astype(int)

# Largest edition first, with a fresh 0..n-1 index.
df2 = df2.sort_values(by=count_column, ascending=False).reset_index(drop=True)
df2

Unnamed: 0,Language,Number of articles published
0,English,6847000
1,Deutsch,2924000
2,Français,2621000
3,Русский,1987000
4,Español,1965000
5,Italiano,1871000
6,中文,1429000
7,日本語,1421000
8,Português,1128000
9,فارسی,1006000


In [18]:
# Wikipedia "Python" page; the cells below collect its links.
url3 = 'https://en.wikipedia.org/wiki/Python'
# requests has no default timeout, so set one to avoid hanging the kernel.
response3 = requests.get(url3, timeout=10)
# Fail here on a 4xx/5xx instead of parsing an error page.
response3.raise_for_status()

soup3 = BeautifulSoup(response3.content, 'html.parser')

In [20]:
# Every anchor on the page; keep the href of each one that has a non-empty href.
wiki_links = soup3.find_all('a')

all_hrefs = (anchor.get('href') for anchor in wiki_links)
links_url = [href for href in all_hrefs if href]

links_url

['#bodyContent',
 '/wiki/Main_Page',
 '/wiki/Wikipedia:Contents',
 '/wiki/Portal:Current_events',
 '/wiki/Special:Random',
 '/wiki/Wikipedia:About',
 '//en.wikipedia.org/wiki/Wikipedia:Contact_us',
 'https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en',
 '/wiki/Help:Contents',
 '/wiki/Help:Introduction',
 '/wiki/Wikipedia:Community_portal',
 '/wiki/Special:RecentChanges',
 '/wiki/Wikipedia:File_upload_wizard',
 '/wiki/Main_Page',
 '/wiki/Special:Search',
 '/w/index.php?title=Special:CreateAccount&returnto=Python',
 '/w/index.php?title=Special:UserLogin&returnto=Python',
 '/w/index.php?title=Special:CreateAccount&returnto=Python',
 '/w/index.php?title=Special:UserLogin&returnto=Python',
 '/wiki/Help:Introduction',
 '/wiki/Special:MyContributions',
 '/wiki/Special:MyTalk',
 '#',
 '#Snakes',
 '#Computing',
 '#People',
 '#Roller_coasters',
 '#Vehicles',
 '#Weaponry',
 '#Other_uses',
 '#See_also',
 

In [21]:
# Every anchor on the page; keep the title of each one that has a non-empty title.
wiki_titles = soup3.find_all('a')

all_titles = (anchor.get('title') for anchor in wiki_titles)
links_titles = [title for title in all_titles if title]

links_titles

['Visit the main page [z]',
 'Guides to browsing Wikipedia',
 'Articles related to current events',
 'Visit a randomly selected article [x]',
 'Learn about Wikipedia and how it works',
 'How to contact Wikipedia',
 'Support us by donating to the Wikimedia Foundation',
 'Guidance on how to use and edit Wikipedia',
 'Learn how to edit Wikipedia',
 'The hub for editors',
 'A list of recent changes to Wikipedia [r]',
 'Add images or other media for use on Wikipedia',
 'Search Wikipedia [f]',
 'You are encouraged to create an account and log in; however, it is not mandatory',
 "You're encouraged to log in; however, it's not mandatory. [o]",
 'You are encouraged to create an account and log in; however, it is not mandatory',
 "You're encouraged to log in; however, it's not mandatory. [o]",
 'A list of edits made from this IP address [y]',
 'Discussion about edits from this IP address [n]',
 'Python – Afrikaans',
 'Python – Alemannic',
 'بايثون (توضيح) – Arabic',
 'Python (dəqiqləşdirmə) – Az

In [22]:
wiki_links = soup3.find_all('a')

# Matches paths that begin with /wiki/Py or /wiki/Pi.
pattern = re.compile(r'^/wiki/P[yi]')

# (href, title) for every anchor; the title is kept as-is, even when missing,
# so the two result lists stay aligned index-for-index.
href_title_pairs = [
    (anchor.get('href'), anchor.get('title'))
    for anchor in wiki_links
]

# Keep anchors whose href matches the pattern, excluding the page's own path.
kept_pairs = [
    (href, title)
    for href, title in href_title_pairs
    if href and href != '/wiki/Python' and pattern.match(href)
]

links_url = [href for href, title in kept_pairs]
links_titles = [title for href, title in kept_pairs]

print(links_url)
print(links_titles)

['/wiki/Pythonidae', '/wiki/Python_(genus)', '/wiki/Python_(mythology)', '/wiki/Python_(programming_language)', '/wiki/Python_of_Aenus', '/wiki/Python_(painter)', '/wiki/Python_of_Byzantium', '/wiki/Python_of_Catana', '/wiki/Python_Anghelo', '/wiki/Python_(Efteling)', '/wiki/Python_(Busch_Gardens_Tampa_Bay)', '/wiki/Python_(Coney_Island,_Cincinnati,_Ohio)', '/wiki/Python_(automobile_maker)', '/wiki/Python_(Ford_prototype)', '/wiki/Python_(missile)', '/wiki/Python_(nuclear_primary)', '/wiki/Python_(codename)', '/wiki/Python_(film)', '/wiki/Python_(Monty)_Pictures', '/wiki/Pyton', '/wiki/Pithon']
['Pythonidae', 'Python (genus)', 'Python (mythology)', 'Python (programming language)', 'Python of Aenus', 'Python (painter)', 'Python of Byzantium', 'Python of Catana', 'Python Anghelo', 'Python (Efteling)', 'Python (Busch Gardens Tampa Bay)', 'Python (Coney Island, Cincinnati, Ohio)', 'Python (automobile maker)', 'Python (Ford prototype)', 'Python (missile)', 'Python (nuclear primary)', 'Pytho

In [23]:
# Tabulate the filtered links: one row per anchor, title beside its URL path.
link_columns = {
    'Title': links_titles,
    'url path': links_url,
}
df3 = pd.DataFrame(link_columns)
df3

Unnamed: 0,Title,url path
0,Pythonidae,/wiki/Pythonidae
1,Python (genus),/wiki/Python_(genus)
2,Python (mythology),/wiki/Python_(mythology)
3,Python (programming language),/wiki/Python_(programming_language)
4,Python of Aenus,/wiki/Python_of_Aenus
5,Python (painter),/wiki/Python_(painter)
6,Python of Byzantium,/wiki/Python_of_Byzantium
7,Python of Catana,/wiki/Python_of_Catana
8,Python Anghelo,/wiki/Python_Anghelo
9,Python (Efteling),/wiki/Python_(Efteling)
