In [18]:
import requests
from bs4 import BeautifulSoup
import unicodedata
import pandas as pd

#### Using the Web scraping, read the list of the Sovereign States from https://en.wikipedia.org/wiki/List_of_sovereign_states

In [19]:
response = requests.get("https://en.wikipedia.org/wiki/List_of_sovereign_states")
soup = BeautifulSoup(response.content, "html.parser")
all_table = soup.find('tbody')
countries = all_table.find_all('b')

#### Get the list of the countries. When navigating to each of them (ex. Andorra), get the main information about the country.
#### Write the program which gathers the main information about each country and writes them all (for all the countries) into the same file

In [20]:
countries_info = dict()
for country in countries:
    specific_country = country.text.replace('\xa0', '')

    for url in country.find_all('a', href=True):
        full_url = 'https://en.wikipedia.org/' + url['href']
        countries_info[specific_country] = full_url

In [21]:
#countries_info

In [22]:
file_text = open("countries_descriptions.txt", "w", encoding="utf8")

In [23]:
for country_name, country_url in countries_info.items():
    file_text.write(country_name)
    resp = requests.get(country_url)
    asoup = BeautifulSoup(resp.content, "html.parser")
    bsoup = asoup.find(property="mw:PageProp/toc").find_all_previous("p")

    description = []
    for item in bsoup:
        item = str(item.get_text())
        description.append(item)

    desc = reversed(description)
    for paragraph in desc:
        file_text.write(paragraph)
    # I used break for run the cicle once, otherwise it takes time for scraping over 200 countries
    break 

In [24]:
file_text.close()

#### Also from Wikipedia, get as much data about each country and summarize it into the dataframe and export to csv or excel file

In [25]:
some_country = requests.get("https://en.wikipedia.org/wiki/Afghanistan")
soup = BeautifulSoup(some_country.content, "html.parser")
table = soup.table

# I got only part of headers in this part. Also it is possible to get all of them if there would be enough time for this task
headers = []

label = table.find_all(class_="infobox-label")
for item in label:
    if u'\xa0' in item.text:
        unnecessary = item
    else:
        clean_text = unicodedata.normalize("NFKD", item.text)
        headers.append(clean_text)

In [26]:
headers[1] = 'Capital'
headers[2] = 'Religion'
headers.insert(0, 'State')
#headers.insert(1, 'Status')
headers.remove('')

In [27]:
headers

['State',
 'Status',
 'Capital',
 'Religion',
 'Demonym(s)',
 'Government',
 'Legislature',
 'Currency',
 'Time zone',
 'Driving side',
 'Calling code',
 'ISO 3166 code',
 'Internet TLD']

In [28]:
#headers = ['State', 'Status', 'Capital', 'Religion', 'Demonym(s)', 'Government', 'Legislature',
#          'Currency', 'Time zone', 'Driving side', 'Calling code', 'ISO 3166 code', 'Internet TLD']

In [29]:
table_dict = dict()
for header in headers:
    table_dict[header] = 'NaN'

In [30]:
df = pd.DataFrame(columns=headers)

In [31]:
# I used counter and break to run the cycle once, otherwise, it takes time to parse over 200 countries
counter = 0

for state, country_url in countries_info.items():
    
    table_dict['State'] = state
    res = requests.get(country_url)
    soup = BeautifulSoup(res.content, "html.parser")
    country_soup = soup.table
    rows = country_soup.find_all('tr')
    for row in rows:
        lines = []
        label = row.find(class_="infobox-label")
        for header in headers[1:]:
            if label is not None and header in label.text:
                clean_text = unicodedata.normalize("NFKD", row.find(class_="infobox-data").text)
                table_dict[header] = clean_text
    new_row = pd.Series(table_dict)
    df = pd.concat([df, new_row.to_frame().T], ignore_index=True)  

    counter += 1
    if counter > 5:
        break

In [32]:
df.head(5)

Unnamed: 0,State,Status,Capital,Religion,Demonym(s),Government,Legislature,Currency,Time zone,Driving side,Calling code,ISO 3166 code,Internet TLD
0,Afghanistan,UN member state under an unrecognized government,Kabul34°31′N 69°11′E﻿ / ﻿34.517°N 69.183°E﻿ / ...,99.7% Islam (official)0.3% Others,Afghan[b][11][12],Unitary totalitarian[13] provisional theocrati...,Leadership Council[15],Afghani (افغانی) (AFN),UTC+4:30Lunar Calendar[21] (Afghanistan Time),right,93,AF,.afافغانستان.
1,Albania,UN member state under an unrecognized government,Tirana41°19′N 19°49′E﻿ / ﻿41.317°N 19.817°E﻿ /...,59% Islam17% Christianity9% No religion15% Und...,Albanian,Unitary parliamentary republic,Kuvendi,Lek (ALL),UTC+1 (CET),right,355,AL,.al
2,Algeria,UN member state under an unrecognized government,Algiers36°42′N 3°13′E﻿ / ﻿36.700°N 3.217°E﻿ / ...,99% Sunni Islam (official)1% other (inc. Chris...,Algerian,Unitary semi-presidential republic,Parliament,Algerian dinar (DZD),UTC+1 (CET),right,213,DZ,.dzالجزائر.
3,Andorra,UN member state under an unrecognized government,Andorra la Vella42°30′23′′N 1°31′17′′E﻿ / ﻿42....,Christianity (Catholicism),Andorran,Unitary parliamentary constitutional elective ...,General Council,Euro (€)[d] (EUR),UTC+01 (CET),right[13],376,AD,.ad[e]
4,Angola,UN member state under an unrecognized government,Luanda8°50′S 13°20′E﻿ / ﻿8.833°S 13.333°E﻿ / -...,92.9% Christianity—53.9% Roman Catholic—27.4% ...,Angolan,Unitary dominant-party presidential republic,National Assembly,Angolan kwanza (AOA),UTC+1 (WAT),right,244,AO,.ao


In [33]:
df.to_csv('countries_summary.csv', index=False)