In [2]:
!pip install beautifulsoup4 requests pandas


Collecting beautifulsoup4
  Obtaining dependency information for beautifulsoup4 from https://files.pythonhosted.org/packages/b1/fe/e8c672695b37eecc5cbf43e1d0638d88d66ba3a44c4d321c796f4e59167f/beautifulsoup4-4.12.3-py3-none-any.whl.metadata
  Downloading beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Obtaining dependency information for soupsieve>1.2 from https://files.pythonhosted.org/packages/4c/f3/038b302fdfbe3be7da016777069f26ceefe11a681055ea1f7817546508e3/soupsieve-2.5-py3-none-any.whl.metadata
  Downloading soupsieve-2.5-py3-none-any.whl.metadata (4.7 kB)
Downloading beautifulsoup4-4.12.3-py3-none-any.whl (147 kB)
   ---------------------------------------- 0.0/147.9 kB ? eta -:--:--
   -- ------------------------------------- 10.2/147.9 kB ? eta -:--:--
   -------- ------------------------------ 30.7/147.9 kB 435.7 kB/s eta 0:00:01
   ---------------------------- --------- 112.6/147.9 kB 939.4 kB/s eta 0:00:01
   --------


[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
# Import the modules used throughout the notebook
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Fetch the HTML of the page to be scraped and report whether it succeeded.
url = "https://www.petsecure.com.au/pet-care/a-guide-to-worldwide-pet-ownership/"
# requests applies no timeout by default; without one an unresponsive server
# would block this cell indefinitely.
response = requests.get(url, timeout=30)
if response.status_code == 200:
    print("Successfully accessed the webpage")
else:
    print(f"Failed to access the webpage. Status code: {response.status_code}")

Successfully accessed the webpage


In [3]:
# Parse the downloaded HTML with BeautifulSoup and print a short preview of it
PREVIEW_CHARS = 1000  # how many leading characters of the prettified HTML to show
soup = BeautifulSoup(response.content, features='html.parser')
print(soup.prettify()[:PREVIEW_CHARS])


<!DOCTYPE html>
<html class="avada-html-layout-wide avada-html-header-position-top" lang="en-US">
 <head>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <meta content="index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1" name="robots"/>
  <!-- This site is optimized with the Yoast SEO plugin v22.7 - https://yoast.com/wordpress/plugins/seo/ -->
  <title>
   Worldwide Pet Ownership Statistics | Most Common Pets Around the World - PetSecure
  </title>
  <meta content="Wondering what the most common pets are around the world? Find out interesting pet facts and statistics with our comprehensive guide." name="description"/>
  <link href="https://www.petsecure.com.au/pet-care/a-guide-to-worldwide-pet-ownership/" rel="canonical"/>
  <meta content="en_US" property="og:locale"/>
  <meta content="article" property="og:t

In [4]:
# Extract the table data with BeautifulSoup and explain how it is retrieved.
# Process:
# 1. Identify the table by examining the HTML structure.
# 2. Extract the table headers.
# 3. Extract each row of the table.
# 4. Store the extracted data in a structured format.

# Identify the table to extract: the first <table> element in the document.
table = soup.find('table')
if table is None:
    # Fail with an explicit message instead of an opaque IndexError.
    raise ValueError("No <table> element found in the parsed page")

# Extract the table headers: the text of every <th> element in the table.
headers = [header.text.strip() for header in table.find_all('th')]

# Extract rows
rows = []
for row in table.find_all('tr')[1:]:  # Skip the header row
    cells = [cell.text.strip() for cell in row.find_all('td')]
    if not cells:
        # A <tr> without any <td> holds no data; appending its empty list
        # would later become an all-null row in the DataFrame.
        continue
    rows.append(cells)

# Show the result
print("Headers:", headers)
print("First 5 rows:", rows[:5])


Headers: ['TOP 20\nDOG POPULATIONS', '']
First 5 rows: [['USA', '69,929,000'], ['China', '27,400,000'], ['Russia', '12,520,000'], ['Japan', '12,000,000'], ['Philippines', '11,600,000']]


In [5]:
# Combine the extracted headers and rows into a single DataFrame and export it as a .csv file

# Build the DataFrame: one row per extracted table row, columns named after the headers
df = pd.DataFrame(data=rows, columns=headers)

# Write the table to disk without the index column
df.to_csv(path_or_buf='dog_table.csv', index=False)
print("Data saved to 'dog_table.csv'")


Data saved to 'dog_table.csv'


In [6]:
# Optional: data cleaning / transformation.

# Show the column names the DataFrame actually has before renaming
print("Actual column names in the DataFrame:", df.columns.tolist())

# Rename the columns so they are more meaningful
df.columns = ["Country", "Total number of dogs"]

# Identify the numeric columns: every column whose name contains one of the
# known numeric labels.
numeric_labels = ('Dogs per 1000 people', 'Total number of dogs')
numeric_columns = [col for col in df.columns if any(label in col for label in numeric_labels)]
print("Identified numeric columns:", numeric_columns)

# Clean the numeric columns by removing the comma separators and converting to float
for column in numeric_columns:
    values = df[column]
    if not pd.api.types.is_numeric_dtype(values):
        # Only text columns need the commas stripped. On a re-run the column
        # is already numeric and the .str accessor would raise AttributeError.
        values = values.str.replace(',', '', regex=False)
    df[column] = values.astype(float)

# Save the cleaned data to a CSV file
df.to_csv('dog_table_cleaned.csv', index=False)
print("Cleaned data saved to 'dog_table_cleaned.csv'")

Actual column names in the DataFrame: ['TOP 20\nDOG POPULATIONS', '']
Identified numeric columns: ['Total number of dogs']
Cleaned data saved to 'dog_table_cleaned.csv'


In [38]:
from IPython.display import display

# Load the raw CSV export back from disk
df = pd.read_csv(filepath_or_buffer='dog_table.csv')

# Render the DataFrame with the notebook's rich display
display(df)

Unnamed: 0,TOP 20\nDOG POPULATIONS,Unnamed: 1
0,USA,69929000
1,China,27400000
2,Russia,12520000
3,Japan,12000000
4,Philippines,11600000
5,India,10200000
6,Argentina,9200000
7,UK,9000000
8,France,7570000
9,South Africa,7400000


In [7]:
from IPython.display import display

# Load the cleaned CSV export back from disk
df = pd.read_csv(filepath_or_buffer='dog_table_cleaned.csv')

# Render the DataFrame with the notebook's rich display
display(df)


Unnamed: 0,Country,Total number of dogs
0,USA,69929000.0
1,China,27400000.0
2,Russia,12520000.0
3,Japan,12000000.0
4,Philippines,11600000.0
5,India,10200000.0
6,Argentina,9200000.0
7,UK,9000000.0
8,France,7570000.0
9,South Africa,7400000.0
