### INSTALL LIBRARIES

| Requirements | Purpose |
| :--- | :--- | 
| Requests | Make Web requests |
| BeautifulSoup | Parse response |

In [2]:
! pip install requests     
! pip install bs4          



### Import Dependencies

In [3]:
from bs4 import BeautifulSoup
import requests

### Set up URL

In [4]:
url = "https://example.com"

### Scrape page

In [5]:
# Pass a timeout: without one, requests can wait indefinitely on an unresponsive server.
r = requests.get(url, timeout=10)

### Explore response

In [6]:
r.status_code

200

In [11]:
r.content

b'<!doctype html>\n<html>\n<head>\n    <title>Example Domain</title>\n\n    <meta charset="utf-8" />\n    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n    <meta name="viewport" content="width=device-width, initial-scale=1" />\n    <style type="text/css">\n    body {\n        background-color: #f0f0f2;\n        margin: 0;\n        padding: 0;\n        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;\n        \n    }\n    div {\n        width: 600px;\n        margin: 5em auto;\n        padding: 2em;\n        background-color: #fdfdff;\n        border-radius: 0.5em;\n        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);\n    }\n    a:link, a:visited {\n        color: #38488f;\n        text-decoration: none;\n    }\n    @media (max-width: 700px) {\n        div {\n            margin: 0 auto;\n            width: auto;\n        }\n    }\n    </style>    \n</head>\n\n<body>\n<div>\n    

In [12]:
r.text

'<!doctype html>\n<html>\n<head>\n    <title>Example Domain</title>\n\n    <meta charset="utf-8" />\n    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n    <meta name="viewport" content="width=device-width, initial-scale=1" />\n    <style type="text/css">\n    body {\n        background-color: #f0f0f2;\n        margin: 0;\n        padding: 0;\n        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;\n        \n    }\n    div {\n        width: 600px;\n        margin: 5em auto;\n        padding: 2em;\n        background-color: #fdfdff;\n        border-radius: 0.5em;\n        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);\n    }\n    a:link, a:visited {\n        color: #38488f;\n        text-decoration: none;\n    }\n    @media (max-width: 700px) {\n        div {\n            margin: 0 auto;\n            width: auto;\n        }\n    }\n    </style>    \n</head>\n\n<body>\n<div>\n    <

### Parse with Beautiful Soup

In [13]:
# Parse the raw response bytes with Python's built-in HTML parser,
# then display the parsed document.
soup = BeautifulSoup(markup=r.content, features="html.parser")
soup

<!DOCTYPE html>

<html>
<head>
<title>Example Domain</title>
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
        
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 2em;
        background-color: #fdfdff;
        border-radius: 0.5em;
        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        div {
            margin: 0 auto;
            width: auto;
        }
    }
    </style>
</head>
<body>
<div>
<h1>Example Domain</h1>
<p>This domain is for use in illustrative example

### Traverse HTML and find required items

In [15]:
soup.body

<body>
<div>
<h1>Example Domain</h1>
<p>This domain is for use in illustrative examples in documents. You may use this
    domain in literature without prior coordination or asking for permission.</p>
<p><a href="https://www.iana.org/domains/example">More information...</a></p>
</div>
</body>

In [16]:
body = soup.body

### Find

In [18]:
body.find('p')

<p>This domain is for use in illustrative examples in documents. You may use this
    domain in literature without prior coordination or asking for permission.</p>

### Get text

In [21]:
body.text

'\n\nExample Domain\nThis domain is for use in illustrative examples in documents. You may use this\n    domain in literature without prior coordination or asking for permission.\nMore information...\n\n'

### FindAll

In [22]:
# find_all is the current Beautiful Soup name; findAll is a deprecated alias.
paras = body.find_all('p')

In [26]:
# Collect every <div> under <body> (find_all replaces the deprecated findAll)
# and display the first match.
divs = body.find_all('div')
divs[0]

<div>
<h1>Example Domain</h1>
<p>This domain is for use in illustrative examples in documents. You may use this
    domain in literature without prior coordination or asking for permission.</p>
<p><a href="https://www.iana.org/domains/example">More information...</a></p>
</div>

In [27]:
paras

[<p>This domain is for use in illustrative examples in documents. You may use this
     domain in literature without prior coordination or asking for permission.</p>,
 <p><a href="https://www.iana.org/domains/example">More information...</a></p>]

In [28]:
# Name the first two paragraphs so they can be inspected individually below.
p0, p1 = paras[0], paras[1]

### Get href

In [32]:
p1.find('a')['href']

'https://www.iana.org/domains/example'

In [33]:
p1.a['href']

'https://www.iana.org/domains/example'

In [35]:
p1.a.get_attribute_list('href')[0]

'https://www.iana.org/domains/example'

In [1]:
%%html
<style>
table {float:left}
</style>

In [None]:
# Formula 1 2024 results. base_url is also used further down to build the
# driver and team links, so it must be defined before the table is scraped.
base_url = "https://www.formula1.com/en/results/2024"
url = base_url + "/drivers"

# Re-fetch and re-parse so that `soup` refers to this page rather than example.com.
r = requests.get(url, timeout=10)
soup = BeautifulSoup(r.content, "html.parser")

### Formula One example

In [None]:
# List every <h1> on the page (find_all replaces the deprecated findAll alias).
soup.find_all('h1')

In [None]:
soup.h1.text

### Navigation Bar 

In [None]:
# Count the <nav> elements on the page (find_all replaces the deprecated findAll alias).
len(soup.find_all('nav'))

In [None]:
# Select the <nav> whose aria-label attribute is "Primary Navigation".
nav = soup.find("nav", attrs={"aria-label": "Primary Navigation"})

In [None]:
# Text of the <span> inside the first <li> of the navigation bar
# (find_all replaces the deprecated findAll alias).
nav.find_all("li")[0].span.text

In [None]:
# Print the <span> text of every <li> in the navigation bar
# (find_all replaces the deprecated findAll alias).
items = nav.find_all("li")
for item in items:
    print(item.span.text)

### Scrape table

In [None]:
# Count the <table> elements on the page (find_all replaces the deprecated findAll alias).
len(soup.find_all('table'))

In [None]:
table = soup.find('table')

### Headers

In [None]:
# The <th> cells of the table's first row
# (find_all replaces the deprecated findAll alias).
headers = table.find_all('tr')[0].find_all('th')

In [None]:
# Keep the visible text of each header cell rather than the Tag object itself,
# so the list can be used directly as column labels.
headers_list = [header.text.strip() for header in headers]

### Data

In [None]:
# Every <tr> after the first one, which is the row the headers were read from
# (find_all replaces the deprecated findAll alias).
rows = table.find_all('tr')[1:]

In [None]:
# Number of <td> cells in the first data row
# (find_all replaces the deprecated findAll alias).
len(rows[0].find_all('td'))

In [None]:
# Build one record per table row.
# The <td> cells (and the <span>s of the driver cell) are looked up once per row
# instead of being re-searched for every field; find_all replaces the deprecated
# findAll alias.
data = []

for row in rows:
    cells = row.find_all('td')
    driver_cell = cells[1]
    team_cell = cells[3]
    name_spans = driver_cell.find_all('span')

    pos = cells[0].text
    f_name = name_spans[0].text
    l_name = name_spans[1].text
    abbr = name_spans[2].text
    # Links are built by appending the anchor's href to base_url.
    dr_url = base_url + driver_cell.a['href']
    nat = cells[2].text
    team_name = team_cell.text
    team_link = base_url + team_cell.a['href']
    # int() raises ValueError if the cell text is not a whole number.
    pts = int(cells[4].text)

    data.append([pos, f_name, l_name, abbr, dr_url, nat, team_name, team_link, pts])

In [None]:
    pos
    f_name
    l_name
    abbr
    dr_url
    nat
    team_name
    team_link
    pts

In [None]:
data

In [None]:
col_names = ["pos", "f_name", "l_name", "abbr", "dr_url", "nationality", "team_name", "team_link", "pts"]

In [None]:
import pandas as pd 

In [None]:
df = pd.DataFrame(data = data, columns=col_names)
df.head()

In [None]:
df.to_json("f1.json")

In [None]:
# Write the table to CSV without the row index; `index` is documented as a boolean.
df.to_csv("f1.csv", index=False)

In [None]:
url = "https://reg.githubuniverse.com/flow/github/universe24/attendee-portal/page/sessioncatalog?tab.day=20241029"

In [None]:
# Pass a timeout so an unresponsive server cannot hang the notebook,
# then show the HTTP status of the response.
r = requests.get(url, timeout=10)
r.status_code

In [None]:
# Name the parser explicitly, as in the earlier parse cell: without it Beautiful Soup
# picks whichever parser is installed and warns, so results can differ between machines.
soup = BeautifulSoup(r.content, "html.parser")
soup.body

In [None]:
# List every <h2> on the page (find_all replaces the deprecated findAll alias).
soup.find_all('h2')