### Install the packages

In [1]:
#Install the third-party packages used below (requests for HTTP, bs4 for HTML parsing)
#NOTE(review): versions are not pinned, so a fresh run may pick up newer releases
!pip install requests
!pip install bs4




### Import packages

In [2]:
import requests #Make http request
from bs4 import BeautifulSoup #Parse html (Understand the html script and convert them into python object)
import pandas as pd #Popular data manipulation tool


### HTTP request
Send an HTTP request to retrieve the data from the target webpage


In [15]:
#Send http request to price.com.hk with the query parameters of the listing (first page)
#A timeout is passed explicitly: without one, requests waits indefinitely if the server never answers
page = requests.get("https://www.price.com.hk/category.php?c=100005&gp=10&page=1", timeout=30)

#Print the response status code (200 - Successful request)
print("Status code: " + str(page.status_code))


Status code: 200


### Information extraction
Extract the targeted information from the HTML

In [27]:
#Use beautifulsoup package to extract targeted information
soup = BeautifulSoup(page.content, "html.parser")

#Find a tag <ul> with attribute "tag" valued "product-list"
#Find all tags <li> within <ul>
#find() returns None when nothing matches (e.g. error page or changed layout),
#so fall back to an empty list instead of crashing on None.find_all(...)
product_list = soup.find("ul", attrs={'tag':'product-list' })
results = product_list.find_all("li") if product_list is not None else []

#Create an empty list to store data
output = []

#For each <li> tag, extract data and store into a dictionary (key-value pair)
for element in results:

  #Get all product attributes
  ##Get the attribute names
  attribute_names = element.find_all("td", class_="attr-label") #Return a list of beautifulsoup object

  ##Skill - List comprehension (Ref: https://www.w3schools.com/python/python_lists_comprehension.asp)
  ##Extract text from beautiful soup object using .text
  attribute_names = [attr_name.text.replace(":", "") for attr_name in attribute_names]

  ##Get the attribute values
  attribute_values = element.find_all("td", class_="attr-info") #Return a list of beautifulsoup object
  attribute_values = [attr_value.text for attr_value in attribute_values] #Skill - List comprehension

  #Combine lists of keys and values of same length into a dictionary
  attributes = dict(zip(attribute_names, attribute_values))

  #Look up each node once, one step at a time
  #Every step is checked for None so an <li> with partial markup yields "N/A" instead of raising AttributeError
  title_div = element.find("div", class_="line-01")
  title_link = title_div.find("a") if title_div is not None else None

  date_div = element.find("div", class_="item-attr item-attr-00")
  date_cell = date_div.find("td", class_="attr-info") if date_div is not None else None
  date_span = date_cell.find("span") if date_cell is not None else None

  price_span = element.find("span", class_="text-price-number")

  #Store information in a dictionary called info
  info = {
      "Product": title_link.text.strip() if title_link is not None else "N/A",
      "SaleDate": date_span.text.strip() if date_span is not None else "N/A",
      #A missing data-price attribute also becomes "N/A", matching the other fields
      "Price": price_span.get("data-price", "N/A") if price_span is not None else "N/A"
  }

  #Combine the attributes dictionary with info dictionary
  info.update(attributes)

  #Append the info dictionary into output list which we previously created
  output.append(info)


Preview the extracted information

In [29]:
#Print the second extracted record (index 1) to check the parsed fields
print(output[1])

{'Product': 'Xiaomi 小米 Redmi K40 Pro 5G (8+256GB)', 'SaleDate': '2021年3月', 'Price': '3480.0', '上市日期': '2021年3月', '制式': '5G/4G/3G/2G', '作業系統': 'Android', '處理器': 'Qualcomm Snapdragon 888', '顯示屏': '6.67吋', '解像度': '2400 x 1080', '螢幕刷新率': '120Hz', '前鏡頭': '2000萬像素', '後鏡頭': '6400+800+500萬像素', '記憶體': '8GB', '容量': '256GB', '功能': '指紋解鎖,NFC,快速充電', 'Wi-Fi 制式': 'Wi-Fi 6 (802.11ax)', '藍牙版本': '5.2', '重量': '196g', '尺寸': '163.7 x 76.4 x 7.8mm', '電池容量': '4520mAh'}


Convert the list of dictionaries to a pandas DataFrame

In [30]:
#Convert the output list (one dictionary per <li>) into a dataframe
#NOTE(review): this rebinds `output` from a list to a DataFrame, so cells that index the list
#(e.g. output[1]) will fail if they are re-run after this one - consider a separate name
output = pd.DataFrame(output)


In [31]:
#Display the dataframe as the cell output
#NOTE(review): the earlier comment here spoke of enabling Google Colab's interactive table,
#but this cell contains no such call - presumably it was removed; confirm whether it should be restored
output

Unnamed: 0,Product,SaleDate,Price,上市日期,制式,作業系統,處理器,顯示屏,解像度,螢幕刷新率,...,後鏡頭,記憶體,容量,功能,快充功率,Wi-Fi 制式,藍牙版本,重量,尺寸,電池容量
0,Samsung Galaxy A52 5G (8+256GB),2021年4月,3610.0,2021年4月,5G/4G/3G/2G,"Android 11, One UI 3.1",Qualcomm SM7225 Snapdragon 750G 5G (8 nm),6.5吋,1080 x 2400,120Hz,...,6400+1200+500+500萬像素,8GB,256GB,"雙卡,指紋解鎖,SD卡槽,NFC,快速充電,3.5mm插頭,立體聲喇叭",25W,Wi-Fi 5 (802.11ac),5.0,189g,159.9 x 75.1 x 8.4mm,4500mAh
1,Xiaomi 小米 Redmi K40 Pro 5G (8+256GB),2021年3月,3480.0,2021年3月,5G/4G/3G/2G,Android,Qualcomm Snapdragon 888,6.67吋,2400 x 1080,120Hz,...,6400+800+500萬像素,8GB,256GB,"指紋解鎖,NFC,快速充電",,Wi-Fi 6 (802.11ax),5.2,196g,163.7 x 76.4 x 7.8mm,4520mAh
2,ASUS ROG Phone 5 5G (8+128GB),2021年3月,5280.0,2021年3月,5G/4G+/4G/3G/2G,Android 11,Qualcomm Snapdragon 888,6.78吋,2448 x 1080,144Hz,...,6400+1300+500萬像素,8GB,128GB,"雙卡,NFC,快速充電,3.5mm插頭",,Wi-Fi 6 (802.11ax),5.2,239g,173 x 77 x 9.9mm,6000mAh
3,Samsung Galaxy A52 5G (6+128GB),2021年4月,3398.0,2021年4月,5G/4G/3G/2G,"Android 11, One UI 3.1",Qualcomm SM7225 Snapdragon 750G 5G (8 nm),6.5吋,1080 x 2400,120Hz,...,6400+1200+500+500萬像素,6GB,128GB,"雙卡,指紋解鎖,SD卡槽,NFC,快速充電,3.5mm插頭,立體聲喇叭",25W,Wi-Fi 5 (802.11ac),5.0,189g,159.9 x 75.1 x 8.4mm,4500mAh
4,Xiaomi 小米 10S 5G (8+128GB),2021年3月,3380.0,2021年3月,5G/4G+/4G/3G/2G,Android 11,Qualcomm Snapdragon 870,6.67吋,2340 x 1080,90Hz,...,1億+1300+200+200萬像素,8GB,128GB,"雙卡,快速充電,無線充電,立體聲喇叭",33W,Wi-Fi 6 (802.11ax),5.1,208g,162.6 x 74.8 x 8.96mm,4780mAh
5,,,,,,,,,,,...,,,,,,,,,,
6,realme GT 5G (12+256GB),2021年3月,3699.0,2021年3月,5G/4G+/4G/3G/2G,Android 11,Qualcomm Snapdragon 888,6.43吋,2400 x 1080,120Hz,...,6400+800+200萬像素,12GB,256GB,"雙卡,NFC,快速充電,3.5mm插頭,立體聲喇叭",65W,Wi-Fi 6 (802.11ax),5.2,186.5g,158.5 x 73.3 x 9.1mm,4400mAh
7,HUAWEI Mate X2 5G (8+256GB),2021年3月,26298.0,2021年3月,5G/4G+/4G/3G/2G,Android 10,麒麟 9000,6.45吋,2480 x 2200,,...,5000+1600+1200+800萬像素,8GB,256GB,"雙卡,SD卡槽,NFC,快速充電",,Wi-Fi 6 (802.11ax),5.2,295g,161.8 x 展開態: 145.8 / 折疊: 74.6 x 展開態: 4.4-8.2 /...,4500mAh
8,Nubia 紅魔 6 Pro 電競遊戲手機 (12+256GB),2021年3月,5958.0,2021年3月,5G/4G/3G/2G,Android 11,snapdragon 888,6.8吋,2400 x 1080,165Hz,...,6400+800+200萬像素,12GB,256GB,"雙卡,3.5mm插頭",120W,Wi-Fi 6 (802.11ax),5.0,,,4500mAh
9,UleFone 歐樂風 Armor 11 5G (8+256GB),2021年3月,3499.0,2021年3月,5G/4G/3G/2G,Android 10,MediaTek Dimensity 800 with 5G,6.1吋,1560 x 720,,...,4800+2000+200+200萬像素,8GB,256GB,"雙卡,指紋解鎖,面部解鎖,NFC,快速充電,無線充電",18W,,5.0,295g,163.8 x 81.6 x 14.2mm,5200mAh


### Wrap the single-page extraction in a function

In [10]:
#Make the previous steps as a function with the target url as input
#Output as a panda dataframe
def _find_chain(node, *steps):
  """Follow a chain of .find() lookups, stopping with None as soon as one step finds nothing.

  Each step is a (tag_name, css_class) pair; css_class may be None to match on the tag name only.
  """
  for tag_name, css_class in steps:
    if node is None:
      return None
    node = node.find(tag_name) if css_class is None else node.find(tag_name, class_=css_class)
  return node


def _text_or_na(node):
  """Return node.text stripped of surrounding whitespace, or "N/A" when node is None."""
  return "N/A" if node is None else node.text.strip()


def _extract_item(element):
  """Build the record (a dict) for one <li> element of the product list."""
  #Attribute table: label cells become keys (colons removed), info cells become values
  labels = [cell.text.replace(":", "") for cell in element.find_all("td", class_="attr-label")]
  values = [cell.text for cell in element.find_all("td", class_="attr-info")]

  price_tag = element.find("span", class_="text-price-number")
  info = {
    "Product": _text_or_na(_find_chain(element, ("div", "line-01"), ("a", None))),
    "SaleDate": _text_or_na(_find_chain(element, ("div", "item-attr item-attr-00"), ("td", "attr-info"), ("span", None))),
    #A missing data-price attribute becomes "N/A" too, matching the other fields
    "Price": "N/A" if price_tag is None else price_tag.get("data-price", "N/A"),
  }
  #Attributes are merged last, so the three fixed keys come first in the record
  info.update(dict(zip(labels, values)))
  return info


def extract_page(url, timeout=30):
  """Download one listing page and return its products as a pandas DataFrame.

  Parameters:
    url: address of the listing page to download.
    timeout: seconds passed to requests.get as its timeout (default 30), so an
      unresponsive server cannot block the notebook forever.

  Returns:
    A DataFrame with one row per <li> found inside the <ul tag="product-list"> element.
    Every row has the keys "Product", "SaleDate" and "Price" ("N/A" when the
    corresponding markup is absent) plus one column per attribute label found.
    An empty DataFrame is returned when the page has no product list.

  Raises:
    Whatever requests.get raises (for example on connection failure or timeout).
  """
  page = requests.get(url, timeout=timeout)
  print("Status code: " + str(page.status_code))
  soup = BeautifulSoup(page.content, "html.parser")

  product_list = soup.find("ul", attrs={'tag':'product-list' })
  if product_list is None:
    #find() returns None when nothing matches; report it instead of failing on None.find_all(...)
    print("No product list found at " + str(url))
    return pd.DataFrame()

  output = [_extract_item(element) for element in product_list.find_all("li")]
  return pd.DataFrame(output)


In [33]:
#Try extract_page() on page 2 of the same listing; the returned dataframe is shown as the cell output
extract_page("https://www.price.com.hk/category.php?c=100005&gp=10&page=2")

Status code: 200


Unnamed: 0,Product,SaleDate,Price,制式,作業系統,處理器,顯示屏,解像度,前鏡頭,後鏡頭,...,容量,功能,快充功率,藍牙版本,重量,尺寸,電池容量,上市日期,螢幕刷新率,Wi-Fi 制式
0,Samsung Galaxy M62 4G (8+128GB),,2809.0,4G/3G/2G,Android,Exynos 9825 (7 nm),6.7吋,1080 x 2400,3200萬像素,6400+1200+500+500萬像素,...,128GB,"雙卡,指紋解鎖,快速充電,3.5mm插頭",25W,5.0,218g,163.9 x 76.3 x 9.5mm,7000mAh,,,
1,realme 7 5G (8+128GB),2021年2月,2480.0,5G/4G/3G/2G,Android 10,MediaTek Dimensity 800U,6.5吋,2400 x 1080,1600萬像素,B＆W人像鏡頭+4800+800+200萬像素,...,128GB,"雙卡,指紋解鎖,SD卡槽,NFC,快速充電,3.5mm插頭",30W,5.1,195g,162.2 x 9.1 x 75.1mm,5000mAh,2021年2月,120Hz,Wi-Fi 5 (802.11ac)
2,Xiaomi 小米 Redmi K40 5G (8+128GB),,2780.0,5G/4G/3G/2G,Android 11,驍龍870,6.67吋,2400 x 1080,2000萬像素,4800+800+500萬像素,...,128GB,雙卡,,5.1,196g,163.7 x 76.4 x 7.8mm,4520mAh,,120Hz,Wi-Fi 6 (802.11ax)
3,Xiaomi 小米 紅米 Redmi Note 9T 5G (6+128GB),2021年1月,1790.0,5G/4G/3G/2G,Android,MediaTek 天璣 800U,6.53吋,2340 x 1080,1300萬像素,4800+800+200萬像素,...,128GB,"雙卡,指紋解鎖,面部解鎖,SD卡槽",18W,5.1,,,,2021年1月,60Hz,
4,Xiaomi 小米 11 5G (8+128GB),2021年1月,4599.0,5G/4G+/4G/3G/2G,Android 11,Qualcomm Snapdragon 888,6.81吋,3200 x 1440,2000萬像素,1.08億+1300+500萬像素,...,128GB,"雙卡,指紋解鎖,NFC,快速充電,無線充電,立體聲喇叭",55W,,,,4600mAh,2021年1月,120Hz,Wi-Fi 6 (802.11ax)
5,,,,,,,,,,,...,,,,,,,,,,
6,Samsung Galaxy S21+ 5G (8+256GB),2021年1月,6290.0,5G/4G/3G/2G,"Android 11, One UI 3.1",Snapdragon 888,6.7吋,2400 x 1080,1000萬像素,1200萬+1200萬+6400萬像素,...,256GB,"指紋解鎖,NFC,快速充電,無線充電,立體聲喇叭",25W,5.0,"(mmW) 202g, (sub6) 200g",75.6 x 161.5 x 7.8mm,4800mAh,2021年1月,120Hz,Wi-Fi 6 (802.11ax)
7,Xiaomi 小米 紅米 Redmi Note 9T 5G (4+128GB),2021年1月,1630.0,5G/4G/3G/2G,Android 10,MediaTek 天璣 800U,6.53吋,2340 x 1080,1300萬像素,4800+200+200萬像素,...,128GB,"雙卡,指紋解鎖,面部解鎖,SD卡槽,NFC,快速充電",18W,5.1,199g,161.9 x 77.3 x 9.05mm,5000mAh,2021年1月,,
8,Samsung Galaxy A71 (8+128GB),,2578.0,4G/3G/2G,Android 10,Dual 2.2GHz + Hexa 1.8GHz,6.7吋,2400 x 1080,3200萬像素,6400+1200+500+500萬像素,...,128GB,"雙卡,指紋解鎖,SD卡槽,NFC,快速充電,3.5mm插頭",25W,5.0,179g,163.6 x 76.0 x 7.7mm,4500mAh,,,Wi-Fi 5 (802.11ac)
9,Samsung Galaxy S20 FE 5G (8+128GB),2020年10月,3960.0,5G/4G/3G/2G,Android 10,Qualcomm Snapdragon 865,6.5吋,1080 x 2400,3200萬像素,1200+1200+800萬像素,...,128GB,"指紋解鎖,快速充電,無線充電,立體聲喇叭",25W,5.0,190g,159.8 x 74.5 x 8.4mm,4500mAh,2020年10月,120Hz,Wi-Fi 6 (802.11ax)


### Extract multiple pages
Extract multiple pages and merge them into a single DataFrame

In [37]:
#Collect one dataframe per result page in this list
pages = []
#Number of the last page to scrape (pages are numbered from 1)
#NOTE(review): an earlier comment here mentioned 270 pages while the value is 277 - confirm against the site
end_page = 277

#Everything in the page url except the page number itself
base_url = "https://www.price.com.hk/category.php?c=100005&gp=10&page="
page_numbers = range(1, end_page+1)

#Fetch the pages in order and keep the dataframe returned for each one
for num in page_numbers:
  print("Extracting page " + str(num))
  url = base_url + str(num)
  page_frame = extract_page(url)
  pages.append(page_frame)

#Stack the per-page dataframes into one, renumbering the rows from 0
final_data = pd.concat(pages, ignore_index=True)

Extracting page 1
Status code: 200
Extracting page 2
Status code: 200
Extracting page 3
Status code: 200
Extracting page 4
Status code: 200
Extracting page 5
Status code: 200
Extracting page 6
Status code: 200
Extracting page 7
Status code: 200
Extracting page 8
Status code: 200
Extracting page 9
Status code: 200
Extracting page 10
Status code: 200
Extracting page 11
Status code: 200
Extracting page 12
Status code: 200
Extracting page 13
Status code: 200
Extracting page 14
Status code: 200
Extracting page 15
Status code: 200
Extracting page 16
Status code: 200
Extracting page 17
Status code: 200
Extracting page 18
Status code: 200
Extracting page 19
Status code: 200
Extracting page 20
Status code: 200
Extracting page 21
Status code: 200
Extracting page 22
Status code: 200
Extracting page 23
Status code: 200
Extracting page 24
Status code: 200
Extracting page 25
Status code: 200
Extracting page 26
Status code: 200
Extracting page 27
Status code: 200
Extracting page 28
Status code: 200
E

In [38]:
#Display the combined dataframe built from all pages
final_data

Unnamed: 0,Product,SaleDate,Price,上市日期,制式,作業系統,處理器,顯示屏,解像度,螢幕刷新率,...,後鏡頭,記憶體,容量,功能,快充功率,Wi-Fi 制式,藍牙版本,重量,尺寸,電池容量
0,Samsung Galaxy A52 5G (8+256GB),2021年4月,3610.0,2021年4月,5G/4G/3G/2G,"Android 11, One UI 3.1",Qualcomm SM7225 Snapdragon 750G 5G (8 nm),6.5吋,1080 x 2400,120Hz,...,6400+1200+500+500萬像素,8GB,256GB,"雙卡,指紋解鎖,SD卡槽,NFC,快速充電,3.5mm插頭,立體聲喇叭",25W,Wi-Fi 5 (802.11ac),5.0,189g,159.9 x 75.1 x 8.4mm,4500mAh
1,Xiaomi 小米 Redmi K40 Pro 5G (8+256GB),2021年3月,3480.0,2021年3月,5G/4G/3G/2G,Android,Qualcomm Snapdragon 888,6.67吋,2400 x 1080,120Hz,...,6400+800+500萬像素,8GB,256GB,"指紋解鎖,NFC,快速充電",,Wi-Fi 6 (802.11ax),5.2,196g,163.7 x 76.4 x 7.8mm,4520mAh
2,ASUS ROG Phone 5 5G (8+128GB),2021年3月,5280.0,2021年3月,5G/4G+/4G/3G/2G,Android 11,Qualcomm Snapdragon 888,6.78吋,2448 x 1080,144Hz,...,6400+1300+500萬像素,8GB,128GB,"雙卡,NFC,快速充電,3.5mm插頭",,Wi-Fi 6 (802.11ax),5.2,239g,173 x 77 x 9.9mm,6000mAh
3,Samsung Galaxy A52 5G (6+128GB),2021年4月,3398.0,2021年4月,5G/4G/3G/2G,"Android 11, One UI 3.1",Qualcomm SM7225 Snapdragon 750G 5G (8 nm),6.5吋,1080 x 2400,120Hz,...,6400+1200+500+500萬像素,6GB,128GB,"雙卡,指紋解鎖,SD卡槽,NFC,快速充電,3.5mm插頭,立體聲喇叭",25W,Wi-Fi 5 (802.11ac),5.0,189g,159.9 x 75.1 x 8.4mm,4500mAh
4,Xiaomi 小米 10S 5G (8+128GB),2021年3月,3380.0,2021年3月,5G/4G+/4G/3G/2G,Android 11,Qualcomm Snapdragon 870,6.67吋,2340 x 1080,90Hz,...,1億+1300+200+200萬像素,8GB,128GB,"雙卡,快速充電,無線充電,立體聲喇叭",33W,Wi-Fi 6 (802.11ax),5.1,208g,162.6 x 74.8 x 8.96mm,4780mAh
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4700,LG G3 Stylus - D690,2014年9月,,2014年9月,,Android 4.4.2,,5.5吋,,,...,1300萬像素,,,,,,,163g,149.3 x 75.9 x 10.2mm,
4701,HUAWEI Ascend G7,2014年10月,,2014年10月,,Android 4.4.4 (KitKat) + HUAWEI Emotion UI 3.0,,5.5吋,1280 x 720,,...,1300萬像素,,,SD卡槽,,,,165g,153.5 x 77.3 x 7.6mm,
4702,HTC Desire 826W Dual SIM 16GB,2015年1月,,2015年1月,FDD-LTE/TD-LTE /WCDMA/TD-SCDMA/GSM,Android 5.0,,5.5吋,1920 x 1080,,...,1300萬像素,,,SD卡槽,,,,,158 x 77.5 x 7.99mm,
4703,,,,,,,,,,,...,,,,,,,,,,


In [36]:
# Write the DataFrame to a CSV file named Price_Cellphone.csv (relative path, so it lands in the working directory)
# NOTE(review): with the default arguments the row index is also written as an unnamed first column;
# pass index=False if that column is not wanted
final_data.to_csv('Price_Cellphone.csv')
