# Exercício

Faça a extração de alguma tabela da Wikipedia utilizando a biblioteca BeautifulSoup

In [1]:
!pip install jupysql
!pip install beautifulsoup4

Collecting jupysql
  Downloading jupysql-0.10.4-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.1/91.1 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting ploomber-core>=0.2.7 (from jupysql)
  Downloading ploomber_core-0.2.18-py3-none-any.whl (22 kB)
Collecting jupysql-plugin (from jupysql)
  Downloading jupysql_plugin-0.2.7-py3-none-any.whl (376 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m376.7/376.7 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting posthog (from ploomber-core>=0.2.7->jupysql)
  Downloading posthog-3.1.0-py2.py3-none-any.whl (37 kB)
Collecting monotonic>=1.5 (from posthog->ploomber-core>=0.2.7->jupysql)
  Downloading monotonic-1.6-py2.py3-none-any.whl (8.2 kB)
Collecting backoff>=1.10.0 (from posthog->ploomber-core>=0.2.7->jupysql)
  Downloading backoff-2.2.1-py3-none-any.whl (15 kB)
Installing collected packages: monotonic, backoff, posthog, ploomber-core, jupysql-plugin, jupysql
Successf

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3

In [4]:
# Fazer requisição GET para a página da Wikipedia e obter o conteúdo HTML
url = "https://en.wikipedia.org/wiki/List_of_countries_by_beer_consumption_per_capita"
response = requests.get(url)
html = response.content

# Criar o objeto Beautiful Soup
soup = BeautifulSoup(html, "html.parser")

# Encontrar a tabela desejada na página (pode variar dependendo da página e dados que você deseja extrair)
table = soup.find_all("table", class_="wikitable")[0]

# Extrair os dados da tabela para uma lista
data = []
for row in table.find_all("tr"):
    cols = row.find_all("td")
    if cols:
        values = [col.text.strip() for col in cols]
        data.append(values)

In [5]:
data

[['Czech Republic\u202f*', '140.0', '', '1,498', '2020', '[2]'],
 ['Austria\u202f*', '107.8', '0.2', '949', '2019', '[1]'],
 ['Romania\u202f*', '100.3', '1.4', '1,956', '2019', '[1]'],
 ['Germany\u202f*', '99.0', '-2.1', '8,160', '2019', '[1]'],
 ['Poland\u202f*', '97.7', '-0.5', '3,713', '2019', '[1]'],
 ['Namibia', '95.5', '14.2', '248', '2019', '[1]'],
 ['Ireland', '92.9', '-2.9', '446', '2019', '[1]'],
 ['Spain\u202f*', '88.8', '1.5', '4,119', '2019', '[1]'],
 ['Croatia\u202f*', '85.5', '5.0', '351', '2019', '[1]'],
 ['Latvia', '81.4', '4.6', '155', '2019', '[1]'],
 ['Estonia\u202f*', '80.5', '7.9', '105', '2019', '[1]'],
 ['Slovenia\u202f*', '80.0', '-0.2', '168', '2019', '[1]'],
 ['Netherlands\u202f*', '79.3', '1.2', '1,357', '2019', '[1]'],
 ['Bulgaria\u202f*', '78.7', '2.3', '551', '2019', '[1]'],
 ['Panama\u202f*', '78.3', '1.5', '329', '2019', '[1]'],
 ['Slovakia\u202f*', '76.1', '-0.9', '418', '2019', '[1]'],
 ['Australia\u202f*', '75.1', '-1.2', '1,885', '2019', '[1]'],
 ['

#Transformação

In [13]:
for i in range(len(data)):
    # Remover caracteres indesejados no nome do país
    data[i][0] = data[i][0].replace('\u202f*', '').strip()

    # Substituir strings vazias por None (ou outro valor de sua escolha)
    data[i] = [None if x == '' else x for x in data[i]]


In [14]:
df = pd.DataFrame(data)

In [15]:
df

Unnamed: 0,0,1,2,3,4,5
0,Czech Republic,140.0,,1498,2020,[2]
1,Austria,107.8,0.2,949,2019,[1]
2,Romania,100.3,1.4,1956,2019,[1]
3,Germany,99.0,-2.1,8160,2019,[1]
4,Poland,97.7,-0.5,3713,2019,[1]
...,...,...,...,...,...,...
57,Uganda,6,,,,[13]
58,Malaysia,5.8,,,2015,[6]
59,Sri Lanka,2,,50,,[15]
60,India,2,,,2015,[6]


In [16]:
df.columns = ["Country", "Consumption per capita (litres per year)", "Total national consumption (million litres per year)", "2018 change (litres per year)", "Year", "Sources"]

In [17]:
df

Unnamed: 0,Country,Consumption per capita (litres per year),Total national consumption (million litres per year),2018 change (litres per year),Year,Sources
0,Czech Republic,140.0,,1498,2020,[2]
1,Austria,107.8,0.2,949,2019,[1]
2,Romania,100.3,1.4,1956,2019,[1]
3,Germany,99.0,-2.1,8160,2019,[1]
4,Poland,97.7,-0.5,3713,2019,[1]
...,...,...,...,...,...,...
57,Uganda,6,,,,[13]
58,Malaysia,5.8,,,2015,[6]
59,Sri Lanka,2,,50,,[15]
60,India,2,,,2015,[6]


In [18]:
df.set_index('Country', inplace=True)

In [19]:
df

Unnamed: 0_level_0,Consumption per capita (litres per year),Total national consumption (million litres per year),2018 change (litres per year),Year,Sources
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Czech Republic,140.0,,1498,2020,[2]
Austria,107.8,0.2,949,2019,[1]
Romania,100.3,1.4,1956,2019,[1]
Germany,99.0,-2.1,8160,2019,[1]
Poland,97.7,-0.5,3713,2019,[1]
...,...,...,...,...,...
Uganda,6,,,,[13]
Malaysia,5.8,,,2015,[6]
Sri Lanka,2,,50,,[15]
India,2,,,2015,[6]


# Carregamento

In [20]:
%load_ext sql

[32mDeploy AI and data apps for free on Ploomber Cloud! Learn more: https://docs.cloud.ploomber.io/en/latest/quickstart/signup.html[0m


In [21]:
%%sql sqlite://

In [22]:
%sql --persist df

In [23]:
%%sql

select * from df

Country,Consumption per capita (litres per year),Total national consumption (million litres per year),2018 change (litres per year),Year,Sources
Czech Republic,140.0,,1498,2020,[2]
Austria,107.8,0.2,949,2019,[1]
Romania,100.3,1.4,1956,2019,[1]
Germany,99.0,-2.1,8160,2019,[1]
Poland,97.7,-0.5,3713,2019,[1]
Namibia,95.5,14.2,248,2019,[1]
Ireland,92.9,-2.9,446,2019,[1]
Spain,88.8,1.5,4119,2019,[1]
Croatia,85.5,5.0,351,2019,[1]
Latvia,81.4,4.6,155,2019,[1]
