# TASK: From Wikipedia, collect as much data as possible about each country, summarize it in a DataFrame, and export it to a CSV or Excel file.

In [1]:
from bs4 import BeautifulSoup

In [2]:
import pandas

In [3]:
# Load the saved HTML page from disk into a single string.
with open(
    "List of sovereign states - Wikipedia.html",
    mode="r",
    encoding="utf8",
) as file:
    data_html = file.read()

In [4]:
import requests

In [5]:
import re

In [6]:
# Parse the loaded HTML text using the "html.parser" backend.
soup = BeautifulSoup(markup=data_html, features="html.parser")

In [7]:
# First element in the page whose class attribute is this exact string.
countries_table_class = "sortable wikitable jquery-tablesorter"
countries_table = soup.find(class_=countries_table_class)

In [8]:
# Narrow the search to the body of the table found above.
countries_list = countries_table.find(name="tbody")

In [9]:
# Every <b> element inside the table body; the scraping loop reads the
# first link inside each of them.
countries = countries_list.find_all(name="b")

In [10]:
# Output columns. Every processed country appends exactly one value to each
# list, so position i in every list describes the same country.
name_rows = []
capital_rows = []
legislature_rows = []
demonym_rows = []
area_rows = []

# Reference markers such as "[a]" or "[12]" that appear in the scraped text.
FOOTNOTE_PATTERN = re.compile(r"\[[a-zA-Z]\]|\[\d*\]")

# Seconds to wait for a page before giving up, so that one stalled request
# cannot hang the whole run.
REQUEST_TIMEOUT = 30

for country in countries:

    # A bold element without a hyperlink gives us no page to fetch.
    link = country.find("a")
    if link is None or not link.get("href"):
        continue

    # Download the country's own page and locate its infobox table.
    url = str(link["href"])
    country_page = requests.get(url, timeout=REQUEST_TIMEOUT)
    country_soup = BeautifulSoup(country_page.content, "html.parser")
    infobox = country_soup.find("table", class_="infobox ib-country vcard")

    if infobox is None:
        # Keep a (mostly blank) row instead of crashing, so that the columns
        # stay aligned and the remaining countries are still processed.
        print(
            f"No country infobox found at {url} "
            f"(HTTP {country_page.status_code}); leaving fields blank"
        )
        labels = []
        headers = []
        name_tag = None
    else:
        # Row labels and section headers of the infobox.
        labels = infobox.find_all("th", class_="infobox-label")
        headers = infobox.find_all("th", class_="infobox-header")
        name_tag = infobox.find("div", class_="fn org country-name")

    # Country name: taken from the infobox, falling back to the bold text of
    # the list entry when the infobox (or its name element) is missing.
    name = name_tag.text if name_tag is not None else country.text
    name_rows.append(FOOTNOTE_PATTERN.sub("", name))

    capital = ""
    legislature = ""
    demonym = ""
    area = ""

    # Getting label values
    for label in labels:
        # The value lives in the element that follows the label in the row.
        value_cell = label.find_next_sibling()
        if value_cell is None:
            continue

        # `in` on a Tag tests its direct children, so this is true only when
        # one child is exactly the string "Capital" (unlike the substring
        # checks on `label.text` used for the other fields).
        if "Capital" in label:
            # Keep the leading run of non-digit characters of the value.
            capital_match = re.match(r"\D+", value_cell.text)
            if capital_match:
                capital = re.sub(r"\[\w\]\[", "", capital_match.group())

        if "Legislature" in label.text:
            legislature = FOOTNOTE_PATTERN.sub("", value_cell.text.strip())

        if "Demonym(s)" in label.text:
            if demonyms_list := value_cell.find("ul"):
                # Several demonyms given as a bullet list.
                demonym = ", ".join(
                    FOOTNOTE_PATTERN.sub("", item.text.strip())
                    for item in demonyms_list.find_all("li")
                )
            elif line_breaks := value_cell.find_all("br"):
                # Several demonyms separated by <br>.
                # NOTE(review): only the node directly before each <br> is
                # collected; text after the final <br> is not. Confirm that
                # this is intended.
                pieces = [
                    FOOTNOTE_PATTERN.sub("", br.previous_sibling.text.strip())
                    for br in line_breaks
                    if br.previous_sibling is not None
                ]
                # A lone ")" is a leftover fragment, not a demonym.
                demonym = ", ".join(piece for piece in pieces if piece != ")")
            else:
                demonym = FOOTNOTE_PATTERN.sub("", value_cell.text.strip())

        # Getting area values from labels
        if ("area" in label.text) and ("metropolitan" not in label.text):
            area_match = re.match(r"[0-9,\.]+", value_cell.text.strip())
            if area_match:
                area = area_match.group()

    # Getting area values from headers: used only when no label provided one.
    if area == "":
        for header in headers:
            if "Area" in header.text:
                header_row = header.parent
                data_row = (
                    header_row.find_next_sibling()
                    if header_row is not None
                    else None
                )
                if data_row is None:
                    continue
                # Drop the leading "• <description>" prefix so that the
                # number is at the start of the string.
                row_without_prefix = re.sub(
                    r"•\s+([a-zA-Z \[\]]+(\[c\])?)", "", data_row.text
                )
                area_match = re.match(r"[0-9,\.]+", row_without_prefix)
                if area_match:
                    # NOTE(review): "." is rewritten to "," only on this
                    # path; presumably some pages use "." as a thousands
                    # separator. Confirm.
                    area = area_match.group().replace(".", ",")

    # Appending values to the columns
    capital_rows.append(capital)
    legislature_rows.append(legislature)
    demonym_rows.append(demonym)
    area_rows.append(area)

In [11]:
## Creating a table

# Pair each output column name with the list of values collected for it.
column_names = ("Country", "Capital", "Legislature", "Demonym(s)", "Area")
column_values = (name_rows, capital_rows, legislature_rows, demonym_rows, area_rows)
table = dict(zip(column_names, column_values))

In [12]:
## Creating pandas DataFrame from table

# Each key of the dictionary becomes a column of the DataFrame.
df = pandas.DataFrame(data=table)
df

Unnamed: 0,Country,Capital,Legislature,Demonym(s),Area
0,Islamic Emirate of Afghanistan,Kabul,Leadership Council,Afghan,652867
1,Republic of Albania,Tirana,Kuvendi,Albanian,28748
2,People's Democratic Republic of Algeria,Algiers,Parliament,Algerian,2381741
3,Principality of Andorra,Andorra la Vella,General Council,Andorran,46763
4,Republic of Angola,Luanda,National Assembly,Angolan,1246700
...,...,...,...,...,...
201,Sahrawi Arab Democratic Republic,El Aaiúna (de jure)Tifariti (de facto),Sahrawi National Council,"Sahrawi, Saharawi, Western Saharan",266000
202,Republic of Somaliland,Hargeisa,Parliament,Somalilander,177000
203,Republic of South Ossetia – the State of Alania,Tskhinvali,Parliament,,3900
204,Republic of China,Taipei,Legislative Yuan,Taiwanese,36197


In [13]:
## Exporting DataFrame to excel file

# Write the DataFrame (including its index column) to an Excel workbook.
df.to_excel(excel_writer='States.xlsx')