In [107]:
import requests
from bs4 import BeautifulSoup
import unicodedata
import pandas as pd

#### Using web scraping, read the list of sovereign states from https://en.wikipedia.org/wiki/List_of_sovereign_states

In [108]:
# Download the Wikipedia list of sovereign states and collect every bold
# (<b>) element inside the first <tbody> of the page.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_sovereign_states",
    timeout=30,  # never let a stalled connection hang the notebook
)
response.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page
soup = BeautifulSoup(response.content, "html.parser")
all_table = soup.find('tbody')
if all_table is None:
    # find() returns None when nothing matches; report that explicitly
    # rather than failing with an AttributeError on the next line.
    raise ValueError("No <tbody> found on the sovereign-states page; the page layout may have changed")
countries = all_table.find_all('b')

#### Get the list of countries. Navigate to each country's page (e.g. Andorra) and get the main information about the country.
#### Write a program that gathers the main information about each country and writes it all (for all the countries) into the same file

In [109]:
# Map each country name to the URL of its Wikipedia article.
countries_info = dict()
for country in countries:
    # Drop non-breaking spaces from the displayed name so the keys are clean.
    specific_country = country.text.replace('\xa0', '')

    for url in country.find_all('a', href=True):
        # Strip the href's leading slash before joining; concatenating it
        # directly after the trailing slash produced '...org//wiki/...'.
        # If a bold element holds several links, the last one wins.
        full_url = 'https://en.wikipedia.org/' + url['href'].lstrip('/')
        countries_info[specific_country] = full_url

In [110]:
# Write the lead section of every country page (the paragraphs that come
# before the table-of-contents marker) into a single text file.
# The file is managed by a `with` block so it is flushed and closed even
# when this long-running loop is interrupted part-way through.
max_countries = None  # set to a small int for a quick trial run; None scrapes every country

with open("countries_descriptions.txt", "w", encoding="utf8") as file_text:
    for country_name, country_url in list(countries_info.items())[:max_countries]:
        # Put the name on its own line so it is not glued to the first paragraph.
        file_text.write(country_name + "\n")

        resp = requests.get(country_url, timeout=30)
        resp.raise_for_status()
        asoup = BeautifulSoup(resp.content, "html.parser")

        toc_marker = asoup.find(property="mw:PageProp/toc")
        if toc_marker is None:
            # No table-of-contents marker on this page: keep the name,
            # skip the description instead of crashing on None.
            file_text.write("\n")
            continue

        # find_all_previous() yields the nearest paragraph first, so reverse
        # the result to restore reading order.
        for paragraph in reversed(toc_marker.find_all_previous("p")):
            file_text.write(paragraph.get_text())

        # Blank line between consecutive countries.
        file_text.write("\n")

#### Also from Wikipedia, get as much data as possible about each country, summarize it in a DataFrame, and export it to a CSV or Excel file

In [None]:
# Use one country page (Afghanistan) as the template for the column headers:
# the text of the "infobox-label" cells in the first table on the page.
some_country = requests.get("https://en.wikipedia.org/wiki/Afghanistan", timeout=30)
some_country.raise_for_status()  # do not parse an HTTP error page
soup = BeautifulSoup(some_country.content, "html.parser")
table = soup.table
if table is None:
    raise ValueError("No <table> found on the Afghanistan page; cannot derive headers")

# Only part of the headers is collected here: labels that contain a
# non-breaking space are skipped. All of them could be gathered given more
# time for this task.
headers = [
    unicodedata.normalize("NFKD", item.text)
    for item in table.find_all(class_="infobox-label")
    if u'\xa0' not in item.text
]
print(headers)

In [None]:
# Turn the scraped labels into the final column names.
# Both steps are guarded so that re-running this cell is a no-op; without
# the guards a second run would insert 'State' again and remove('') would
# raise ValueError because the empty label is already gone.
if 'State' not in headers:
    headers[1] = 'Capital'
    headers[2] = 'Religion'
    headers.insert(0, 'State')
    #headers.insert(1, 'Status')
if '' in headers:
    headers.remove('')

In [None]:
# Show the current list of column headers.
headers

In [None]:
#headers = ['State', 'Status', 'Capital', 'Religion', 'Demonym(s)', 'Government', 'Legislature',
#          'Currency', 'Time zone', 'Driving side', 'Calling code', 'ISO 3166 code', 'Internet TLD']

In [None]:
# Template record: one key per header, each pre-filled with the 'NaN' placeholder.
table_dict = dict.fromkeys(headers, 'NaN')

In [None]:
# Start with an empty DataFrame that has one column per header.
df = pd.DataFrame(columns=headers)

In [None]:
# Scrape the first table of every country page and build one row per country.
# Parsing 200+ pages takes a while: set max_countries to a small int for a
# quick trial run, or leave it as None to process every country.
max_countries = None

records = []
for state, country_url in list(countries_info.items())[:max_countries]:
    # Fresh record per country, so a value found for one state can never
    # leak into a later state whose page lacks that field.
    record = dict.fromkeys(headers, 'NaN')
    record['State'] = state

    res = requests.get(country_url, timeout=30)
    res.raise_for_status()
    soup = BeautifulSoup(res.content, "html.parser")
    country_soup = soup.table

    # A page without any table keeps the 'NaN' placeholders.
    rows = country_soup.find_all('tr') if country_soup is not None else []
    for row in rows:
        label = row.find(class_="infobox-label")
        data = row.find(class_="infobox-data")
        if label is None or data is None:
            # Not a label/value row; nothing to extract.
            continue
        # Skip headers[0] ('State'): it is filled from the loop variable above.
        for header in headers[1:]:
            # Substring match; when several rows match a header the last one wins.
            if header in label.text:
                record[header] = unicodedata.normalize("NFKD", data.text)

    records.append(record)

# Build the frame once from all records; growing it with pd.concat inside
# the loop copies the whole frame on every iteration.
df = pd.DataFrame(records, columns=headers)

In [None]:
# Preview the first five rows.
df.head(5)

In [None]:
# Display the scraped table.
df

In [None]:
# Export the table to CSV, leaving out the index column.
df.to_csv('countries_summary.csv', index=False)