# Web scraping - Introduction


## 1) Sending a request


In [1]:
# Import before use
import requests

## Running a GET request


In [None]:
# Build the target URL and send a GET request.
# requests applies no timeout by default, so set one explicitly; otherwise
# this cell can hang indefinitely when the server does not answer.
url = 'https://hkuspace.hku.hk/eddie'
response = requests.get(url, timeout=10)

In [3]:
# HTTP status code of the response as an int (e.g. 200 = OK, 404 = Not Found)
response.status_code

404

## `response.content` is returned as `bytes`; use the `.decode()` method to convert it to plain text (`str`)


In [4]:
# Show the type of the raw body next to the type of its decoded form
print(type(response.content), type(response.content.decode()), sep='\n')

<class 'bytes'>
<class 'str'>


In [5]:
# Decode the raw bytes (bytes.decode() defaults to UTF-8) and print the HTML as text
print(response.content.decode())

<!DOCTYPE html>
<html class="no-js lang-en" lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=0">
	<title>Page Not Found</title>
  <link rel="stylesheet" href="/assets/css/style.css">
</head>
<body>
  <div class="wrapper not-found">
    <header class="page-head" role="banner">
      <div class="container">	
        <a class="logo" href="/">
          <img class="hidden-xs" src="/assets/img/logo.png" alt="HKU SPACE">
          <img class="visible-xs-inline-block" src="/assets/img/m_logo.png" alt="HKU SPACE">
        </a>
      </div>   
    </header> 

    <div class="has-bg">  
      <div class="container">
        <div class="content">	
          <h1>ERROR 404</h1>
          <p>Sorry, the page you are looking for does not exist. <br>
          You may wish to check these links below :<br>
          此網頁不存在。請用以下連結找尋所需資料：</p>
          <ul class="list-unstyled">
            <li><a href="/">Homepage</a></li>
    

## Extra: POST Method


In [6]:
# Submit a login form with a POST request; the `data=` dict is sent form-encoded.
# NOTE(review): these look like demo-site credentials — confirm, and never
# hard-code real secrets in a notebook (use os.environ or getpass instead).
post_url = 'https://authenticationtest.com//login/?mode=simpleFormAuth'
email = 'simpleForm@authenticationtest.com'
password = 'pa$$w0rd'
# Explicit timeout: requests would otherwise wait forever on an unresponsive server
post_response = requests.post(
    post_url,
    data={'email': email, 'password': password},
    timeout=10,
)
post_response.status_code

200

In [7]:
# Decode the POST response body (UTF-8 by default) and print the returned HTML
print(post_response.content.decode())

<!DOCTYPE html>
<html lang="en">
	<head>
		<meta charset="utf-8">
		<meta name="viewport" content="width=device-width, initial-scale=1">
		<meta name="description" content="This site contains some examples of the ways web applications accept authentication. You can use these to test your scripts ability to authentication to them.">
		<title>Authentication Test</title>
		<link href="https://authenticationtest.com/assets/bootstrap.min.css" rel="stylesheet" crossorigin="anonymous">
		<link rel="icon" href="https://authenticationtest.com/assets/favicon.png">
		
	</head>
	<body>
		<nav class="navbar navbar-expand-lg navbar-dark bg-primary">
			<a class="navbar-brand" href="https://authenticationtest.com/">
				<img src="https://authenticationtest.com/assets/favicon.png" width="30" height="30" class="d-inline-block align-top" alt="Authentication Test Icon">Authentication Test</a>
			<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarNavAltMarkup" aria-con

## 2) Locating Resources


In [8]:
import json

# Fetch a JSON endpoint. The timeout prevents the cell from hanging forever,
# since requests applies no timeout by default.
json_response = requests.get(
    'https://www.twse.com.tw/rwd/en/marginTrading/TWT93U?response=json',
    timeout=10,
)
# Surface HTTP errors (4xx/5xx) directly instead of failing later with a
# confusing JSON decode error on an HTML error page.
json_response.raise_for_status()
# json.loads accepts the raw bytes body and returns plain Python objects
json_data = json.loads(json_response.content)
json_data['title']

'2024/09/20 Daily Short Sale Balances'

## 3) Reading Webpages with Beautiful Soup

Reference: https://pypi.org/project/beautifulsoup4/


### Creating a `BeautifulSoup` object with HTML content and a parser

Parser choices: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser


In [9]:
from bs4 import BeautifulSoup
# Parse the raw bytes of the earlier GET response with Python's built-in HTML parser
soup = BeautifulSoup(response.content, 'html.parser')
# Display the class of the parsed-document object
type(soup)

## `Tag` objects


To find a tag named `xyz`, call `soup.find('xyz')`; it returns the first matching element (or `None` if there is no match).


Use `<a>` as an example for hyperlinks


In [10]:
# First <a> element in the document
soup.find('a')

<a class="logo" href="/">
<img alt="HKU SPACE" class="hidden-xs" src="/assets/img/logo.png"/>
<img alt="HKU SPACE" class="visible-xs-inline-block" src="/assets/img/m_logo.png"/>
</a>

In [11]:
# Class of the object that find() returns for a matched element
type(soup.find('a'))

In [12]:
# HTML attributes of the tag as a dict (note that `class` maps to a list of values)
soup.find('a').attrs

{'class': ['logo'], 'href': '/'}

In [13]:
# Direct children of the tag as a list: nested tags plus the whitespace strings between them
soup.find('a').contents

['\n',
 <img alt="HKU SPACE" class="hidden-xs" src="/assets/img/logo.png"/>,
 '\n',
 <img alt="HKU SPACE" class="visible-xs-inline-block" src="/assets/img/m_logo.png"/>,
 '\n']

In [14]:
# Walk the direct children of the first <a> tag and print each child's type and text
for item in soup.find('a').contents:
    print(f'{type(item)}: "{item}"')

<class 'bs4.element.NavigableString'>: "
"
<class 'bs4.element.Tag'>: "<img alt="HKU SPACE" class="hidden-xs" src="/assets/img/logo.png"/>"
<class 'bs4.element.NavigableString'>: "
"
<class 'bs4.element.Tag'>: "<img alt="HKU SPACE" class="visible-xs-inline-block" src="/assets/img/m_logo.png"/>"
<class 'bs4.element.NavigableString'>: "
"


Use `find_all()` to iterate through all tags with that name


In [15]:
# Image => Homepage => Study => Discover
for i, tag in enumerate(soup.find_all('a')):
  print(f"[{i+1}] Display: {tag.string}, url: {tag.attrs['href']}")

[1] Display: None, url: /
[2] Display: Homepage, url: /
[3] Display: Study, url: /programme
[4] Display: Discover, url: /discover


In [16]:
# Every <img> tag, followed by the first <img> carrying the CSS class "hidden-xs"
print(soup.find_all('img'))
print(soup.find('img', class_='hidden-xs'))

[<img alt="HKU SPACE" class="hidden-xs" src="/assets/img/logo.png"/>, <img alt="HKU SPACE" class="visible-xs-inline-block" src="/assets/img/m_logo.png"/>]
<img alt="HKU SPACE" class="hidden-xs" src="/assets/img/logo.png"/>


## Tag navigation


In [17]:
# Set a reference point
tag_p_first = soup.find('p')
tag_p_first

<p>Sorry, the page you are looking for does not exist. <br/>
          You may wish to check these links below :<br/>
          此網頁不存在。請用以下連結找尋所需資料：</p>

In [18]:
# Find parent
# Tag that directly encloses the reference <p> (here the <div class="content">)
tag_p_first.find_parent()

<div class="content">
<h1>ERROR 404</h1>
<p>Sorry, the page you are looking for does not exist. <br/>
          You may wish to check these links below :<br/>
          此網頁不存在。請用以下連結找尋所需資料：</p>
<ul class="list-unstyled">
<li><a href="/">Homepage</a></li>
<li><a href="/programme">Study</a></li>
<li><a href="/discover">Discover</a></li>
</ul>
<p class="copyright">Copyright © 2024 HKU SPACE. All rights reserved.</p>
</div>

In [19]:
# Find previous
# Closest tag appearing before the reference <p> in the document (here the <h1>)
tag_p_first.find_previous()

<h1>ERROR 404</h1>

In [20]:
# Find next
# Next tag in document order; this is the first <br/> nested inside the <p>,
# not the next sibling of the <p>
tag_p_first.find_next()

<br/>

In [21]:
# Find next x 2
# Two steps forward in document order: the second <br/> inside the <p>
tag_p_first.find_next().find_next()

<br/>

In [22]:
# Find next x 3
# Three steps forward: past both <br/> tags, reaching the <ul> that follows the <p>
tag_p_first.find_next().find_next().find_next()

<ul class="list-unstyled">
<li><a href="/">Homepage</a></li>
<li><a href="/programme">Study</a></li>
<li><a href="/discover">Discover</a></li>
</ul>

In [23]:
# Find Children
# All tags nested inside the reference <p>.
# find_all() is the current name; findChildren() is its legacy camelCase alias.
tag_p_first.find_all()

[<br/>, <br/>]

In [24]:
# "Find" functions give you `Tag` objects
# The "find" functions return `Tag` objects.
# find_all() replaces the legacy camelCase alias findChildren().
[type(c) for c in tag_p_first.find_all()]

[bs4.element.Tag, bs4.element.Tag]

In [25]:
# Find next siblings
# Tags after the reference <p> that share its parent (here the <ul> and the copyright <p>)
tag_p_first.find_next_siblings()

[<ul class="list-unstyled">
 <li><a href="/">Homepage</a></li>
 <li><a href="/programme">Study</a></li>
 <li><a href="/discover">Discover</a></li>
 </ul>,
 <p class="copyright">Copyright © 2024 HKU SPACE. All rights reserved.</p>]

In [26]:
# Find previous siblings
# Tags before the reference <p> that share its parent (here only the <h1>)
tag_p_first.find_previous_siblings()

[<h1>ERROR 404</h1>]

## 4) Practice


In [27]:
import pandas as pd

In [28]:
# Download the practice data set and parse it.
practice_url = 'http://www3.kits-tutor.com/2015/phpgetadvcontentbig5.php'
# Explicit timeout: requests would otherwise wait forever on an unresponsive server
practice_response = requests.get(practice_url, timeout=10)
# Stop here on an HTTP error rather than silently parsing an error page
practice_response.raise_for_status()
# NOTE(review): the endpoint name suggests Big5-encoded content — confirm that the
# parser's encoding detection decodes every character correctly.
practice_soup = BeautifulSoup(practice_response.content, 'html.parser')

In [29]:
# Collect every <marker> element; the fields of interest live in its attributes
markers = practice_soup.find_all('marker')

# Build one list per attribute, keeping the order of the markers
address_list = [marker['job_address'] for marker in markers]
subject_list = [marker['job_stu_subject'] for marker in markers]
rate_list = [marker['job_hourrate'] for marker in markers]

In [30]:
# Create a dataframe
pd.DataFrame({'Address': address_list, 'Subject': subject_list, 'Rate': rate_list})

Unnamed: 0,Address,Subject,Rate
0,雅景臺,"補經濟,(英文卷)",200
1,海怡半島,補數學,160
2,翠怡花園,補英文,220
3,廣明苑,補英文,210
4,堅尼地城,補英文,250
...,...,...,...
472,亞洲大?,"補數學,SCIENCE,(英文卷)",200
473,凱匯,補全科,160
474,興民街,補英文,160
475,"火炭,銀禧花園(近港鐵站)","補SCIENCE,(英文卷)",220
