# Web Scraper

In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from typing import List, Dict
import pandas as pd
import os
import openpyxl
import time

In [2]:
class ScraperBase(object):
    def __init__(self, webdriver: webdriver, name: str, web_url: str) -> None:
        self._webdriver = webdriver
        self._name = name
        self._web_url = web_url
        self._storage = {}
    
    def _search_desired_item(self) -> None:
        raise NotImplementedError
    
    def _get_all_product_links(self) -> List[str]:
        raise NotImplementedError
    
    def _parse_data(self, link: str) -> None:
        raise NotImplementedError
    
    def run(self) -> Dict:
        start_time = time.time()
        self._webdriver.get(self._web_url)
        self._search_desired_item()
        links = self._get_all_product_links()
        print("Num of links: %d" % len(links))
        for i in range(len(links)):
            self._parse_data(link=links[i])
            print("Progress: %d/%d" % (i + 1, len(links)), end='\r' )
        print("Progress: %d/%d" % (len(links), len(links)))
        print("Total time elapsed: %.2fs" % (time.time() - start_time))
        return self._storage

In [3]:
class EbayScraper(ScraperBase):
    def __init__(self, webdriver: webdriver, name: str, web_url: str) -> None:
        super(EbayScraper, self).__init__(webdriver=webdriver, name=name, web_url=web_url)

    def _search_desired_item(self) -> None:
        search_bar = self._webdriver.find_element_by_xpath('//*[@id="gh-ac"]')
        search_bar.send_keys("laptop", Keys.ENTER)
        refurbished_items = self._webdriver.find_element_by_xpath(
            '//*[@id="x-refine__group__2"]/ul/li[3]/div/a/div/div/span[1]')
        refurbished_items.click()
        non_auction_items = self._webdriver.find_element_by_xpath(
            '//*[@id="s0-14-11-5-1[0]"]/div[2]/div/div/ul/li[3]/a/h2')
        non_auction_items.click()
        self._webdriver.get(self._webdriver.current_url + "&_ipg=200")
    
    def _get_all_product_links(self) -> List[str]:
        link_container = self._webdriver.find_element_by_xpath(
            '//*[@id="srp-river-results"]/ul').find_elements_by_tag_name("a")
        links = []
        for link in link_container:
            links.append(link.get_attribute("href"))
        links = list(set(links))
        return links
    
    def _parse_data(self, link: str) -> None:
        self._webdriver.get(link)
        product_id = str(len(self._storage.keys()) + 1)
        try:
            product_title = self._webdriver.find_element_by_xpath('//*[@id="itemTitle"]').text.lower()
            product_cost = self._webdriver.find_element_by_xpath('//*[@id="prcIsum"]').get_attribute("content")
            specs = self._webdriver.find_element_by_xpath(
                '//*[@id="viTabs_0_is"]/div/table').find_elements_by_tag_name("tr")
            try:
                product_desc = self._webdriver.find_element_by_xpath(
                    '//*[@id="ds_div"]/font/font/font/font/font/font/font/font/font/ul/li/p').text.lower()
            except NoSuchElementException:
                product_desc = ""
            self._storage[product_id] = {"Title": product_title, 'Link': link, "Price": product_cost, 
                                         "Description": product_desc}
            for spec in specs:
                spec_titles = spec.find_elements_by_class_name("attrLabels")
                spec_values = spec.find_elements_by_tag_name("span")
                if len(spec_titles) == len(spec_values):
                    for j in range(len(spec_titles)):
                        self._storage[product_id][spec_titles[j].text.strip().capitalize()[: -1]] = \
                            spec_values[j].text.strip().lower()
        except NoSuchElementException:
            print("\n\rParsing failed, link: %s" % link)
            self._storage.pop(product_id, None)


In [4]:
class ReebeloScraper(ScraperBase):
    def __init__(self, webdriver: webdriver, name: str, web_url: str) -> None:
        super(ReebeloScraper, self).__init__(webdriver=webdriver, name=name, web_url=web_url)
    
    def _search_desired_item(self) -> None:
        search_bar = self._webdriver.find_element_by_xpath('//*[@id="bc-sf-search-box-0"]')
        search_bar.send_keys("laptop", Keys.ENTER)
        time.sleep(3)
        self._webdriver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)

    def _get_all_product_links(self) -> List[str]:
        link_container = self._webdriver.find_element_by_xpath(
            '//*[@id="bc-sf-filter-products"]').find_elements_by_tag_name("a")
        links = []
        for link in link_container:
            links.append(link.get_attribute("href"))
        links = list(set(links))
        return links
    
    def _parse_data(self, link: str) -> None:
        self._webdriver.get(link)
        product_id = str(len(self._storage.keys()) + 1)
        product_title = self._webdriver.find_element_by_xpath(
            '/html/body/main/div[2]/section/div[1]/div[3]/div/'
            'div[2]/div/div[1]/form/div[1]/div[1]/h1').text.lower()
        product_cost = self._webdriver.find_element_by_xpath(
            '/html/body/main/div[2]/section/div[1]/div[3]/div/div[2]/div/div[1]/form/div[1]/div[2]/div/'
            'div[1]/span[1]/span').text[2: ]
        self._storage[product_id] = {"Title": product_title, 'Link': link, "Price": product_cost, 
                                     "Description": ""}
        try:
            view_more_btn = self._webdriver.find_element_by_class_name("cus-btn").find_element_by_tag_name("a")
            view_more_btn.click()
        except NoSuchElementException:
            pass
        try:
            specs = self._webdriver.find_element_by_class_name("table-wrapper").find_elements_by_tag_name("tr")
        except NoSuchElementException:
            specs = self._webdriver.find_element_by_class_name("custom-table").find_elements_by_tag_name("li")
            for spec in specs:
                parsed_spec = spec.text.split(":")
                if len(parsed_spec) == 2:
                    spec_title = parsed_spec[0].strip().capitalize()
                    spec_value = parsed_spec[1].strip().lower()
                    self._storage[product_id][spec_title] = spec_value
                elif len(parsed_spec) == 1:
                    pass
                else:
                    raise Exception("parsed_spec: %s" % parsed_spec)
        else:
            try:
                for spec in specs:
                    parsed_spec = spec.find_element_by_tag_name("td").text.split(":")
                    if len(parsed_spec) == 3:
                        spec_title = parsed_spec[0].strip().capitalize()
                        spec_value = parsed_spec[1].split()[0].strip().lower()
                        self._storage[product_id][spec_title] = spec_value.strip()
                        spec_title = parsed_spec[1][len(spec_value) + 2:].strip().capitalize()
                        spec_value = parsed_spec[2].strip().lower()
                    elif len(parsed_spec) == 2:
                        spec_title = parsed_spec[0].strip().capitalize()
                        spec_value = parsed_spec[1].strip().lower()
                    elif len(parsed_spec) == 1:
                        print(parsed_spec[0])
                        spec_title, spec_value = "Model", parsed_spec[0].strip().lower()
                    else:
                        raise Exception("parsed_spec: %s" % parsed_spec)
                    self._storage[product_id][spec_title] = spec_value
            except NoSuchElementException:
                print("\n\rParsing failed, link: %s" % link)
                self._storage.pop(product_id, None)


In [5]:
# Instantiate the webdriver
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(executable_path=os.getenv("CHROMEDRIVER_PATH"), options=options)

In [6]:
# Instantiate all scrapers
ebay_scraper = EbayScraper(webdriver=driver, name="eBay", web_url="https://www.ebay.com.sg/")
reebelo_scraper = ReebeloScraper(webdriver=driver, name="Reebelo", web_url="https://www.reebelo.com/")

In [7]:
# Start scraping Ebay
ebay_storage = ebay_scraper.run()

Num of links: 64
Progress: 6/64
Parsing failed, link: https://www.ebay.com.sg/sch/i.html?_from=R40&_nkw=laptop&_sacat=0&LH_ItemCondition=2500&rt=nc&LH_BIN=1&_ipg=200&RAM%2520Size=1%252D19%2520GB&_dcat=177
Progress: 9/64
Parsing failed, link: https://www.ebay.com.sg/sch/i.html?_from=R40&_nkw=laptop&_sacat=0&LH_ItemCondition=2500&rt=nc&LH_BIN=1&_ipg=200&RAM%2520Size=12%2520GB&_dcat=177
Progress: 21/64
Parsing failed, link: https://www.ebay.com.sg/sch/i.html?_from=R40&_nkw=laptop&_sacat=0&LH_ItemCondition=2500&rt=nc&LH_BIN=1&_ipg=200&RAM%2520Size=16%2520GB&_dcat=177
Progress: 41/64
Parsing failed, link: https://www.ebay.com.sg/sch/i.html?_from=R40&_nkw=laptop&_sacat=0&LH_ItemCondition=2500&rt=nc&LH_BIN=1&_ipg=200&RAM%2520Size=4%2520GB&_dcat=177
Progress: 55/64
Parsing failed, link: https://www.ebay.com.sg/sch/i.html?_from=R40&_nkw=laptop&_sacat=0&LH_ItemCondition=2500&rt=nc&LH_BIN=1&_ipg=200&RAM%2520Size=8%2520GB&_dcat=177
Progress: 64/64
Total time elapsed: 164.63s


In [8]:
# Start scraping Reebelo
reebelo_storage = reebelo_scraper.run()

Num of links: 45
Lenovo ThinkPad Yoga 370 (Silver) (TOUCHSCREEN)
Model & OS17/45
Model
Installed OS
 
Processor
Processor
Processor Type
Processor Speed
Memory
Memory Type
Memory Std.
Memory Max.
Memory Slots
Drives / Storage
SSD Capacity
Display
LED backlight
Display Type
Display Size
Display Max. Res.
Multimedia
Integrated Camera
Graphics
Graphic Controller
Audio
Audio Controller
Integrated Speakers
Built in Mic
Interfaces
Ports - USB 3 Series Type-A
Ports - USB 3 Series Type-C
Ports - Audio
HDMI Connector
Slots / Expansions
Card Reader
Connectivity
Network
Modem
Bluetooth
Input
KeyBoard
Pointing Device
Security
Fingerprint Reader
Power
Battery Type
Battery Runtime
Ac Adaptor
Number Of Cells
Dimensions
Width
Height
Depth
Weight
Other
Special Features
Lenovo ThinkPad X1 Yoga G1 (Black) (TOUCHSCREEN)
FUJITSU Notebook LIFEBOOK E546 (Black)
HP ZBook 15 G3 Mobile Workstation Laptop
Progress: 45/45
Total time elapsed: 155.24s


In [9]:
# Stop the webdriver
driver.quit()

In [10]:
ebay_storage

{'1': {'Title': 'dell latitude 5404 rugged extreme laptop i7-4650u16gb 1tb ssd win10 4g lte bt',
  'Link': 'https://www.ebay.com.sg/itm/Dell-Latitude-5404-Rugged-Extreme-Laptop-i7-4650U16GB-1TB-SSD-WIN10-4G-LTE-BT/164402678993?hash=item264729ccd1%3Ag%3AseoAAOSwyKBfb021&LH_BIN=1&LH_ItemCondition=2500',
  'Price': '1472.12',
  'Description': '',
  'Condition': 'remanufactured',
  'Memory': '8gb',
  'Model': '5404',
  'Operating system edition': 'professional',
  'Processor': 'intel core i7 4th gen.',
  'Ram size': '16 gb',
  'Release year': '2014',
  'Type': 'notebook/laptop',
  'Graphics processing type': 'integrated/on-board graphics',
  'Processor speed': '2.0ghz',
  'Features': 'triple rf - passthough, 4g lte cellular, rugged, bluetooth, backlit keyboard',
  'Operating system': 'windows 10',
  'Mpn': '5404',
  'Storage type': 'ssd (solid state drive)',
  'Series': 'latitude',
  'Brand': 'dell',
  'Hard drive capacity': '1 tb',
  'Ssd capacity': '1 tb',
  'Product line': 'latitude',
 

In [11]:
reebelo_storage

{'1': {'Title': 'refurbished lenovo thinkpad x250 touch 240gb',
  'Link': 'https://www.reebelo.com/collections/all/products/refurbished-lenovo-thinkpadx250touch-black-240gb-caslam9fhj',
  'Price': '649',
  'Description': '',
  'Condition': 'mint',
  'Keyboard language': 'english (us)',
  'Keyboard type': 'qwerty',
  'Screen size (inches)': '13',
  'Resolution': '',
  'Storage (gb)': '240',
  'Memory (gb)': '8',
  'Storage type': 'ssd',
  'Bluetooth': 'yes',
  'Webcam': 'yes',
  'Hard drive interface': '',
  'Memory type': '',
  'Processor speed (ghz)': '',
  'Processor brand': 'intel',
  'Processor type': 'i5-5300u',
  'Processor core': '',
  'Graphic card type': '',
  'Graphic card memory type': '',
  'Os': 'windows 10',
  'Network': '',
  'Backlit keyboard': '',
  'Color': 'black',
  'Processor': 'intel core i5-5300u',
  'Ports': '',
  'Manufacturer ref': 'lenovo thinkpad x250 touch'},
 '2': {'Title': 'refurbished lenovo thinkpad t460',
  'Link': 'https://www.reebelo.com/collections/

In [12]:
df_ebay = pd.DataFrame.from_dict(ebay_storage, orient='index')
df_ebay

Unnamed: 0,Title,Link,Price,Description,Condition,Memory,Model,Operating system edition,Processor,Ram size,...,Ethernet,Dimensions,Touchscreen,Depth (mm),Bluetooth,Wifi,Audio connections,Storage capacity,Compatible model,Country/region of manufacture
1,dell latitude 5404 rugged extreme laptop i7-46...,https://www.ebay.com.sg/itm/Dell-Latitude-5404...,1472.12,,remanufactured,8gb,5404,professional,intel core i7 4th gen.,16 gb,...,,,,,,,,,,
2,"asus zenbook ux31a ultrabook 13.3"" full hd i7-...",https://www.ebay.com.sg/itm/ASUS-Zenbook-UX31A...,649.49,,remanufactured,,asus zenbook,,intel core i7 3rd gen.,4 gb,...,,,,,,,,,,
3,notebook laptop siemens simatic filed pg 6es77...,https://www.ebay.com.sg/itm/Notebook-Laptop-SI...,1550.2,,remanufactured,,filed pg,,does not apply,,...,,,,,,,,,,
4,"hp 14-cf1502sa 14"" full hd laptop intel quad c...",https://www.ebay.com.sg/itm/HP-14-CF1502sa-14-...,1037.08,,remanufactured,8gb,hp 14-cf1502sa,,intel core i7 8th gen.,,...,,,,,,,,,,
5,"hp 15s-eq1510sa 15.6"" full hd laptop amd ryzen...",https://www.ebay.com.sg/itm/HP-15s-eq1510sa-15...,932.73,,remanufactured,,hp 15s-eq1510sa,,amd ryzen 5,8 gb,...,,,,,,,,,,
6,notebook laptop siemens simatic filed pg m 6es...,https://www.ebay.com.sg/itm/Notebook-Laptop-SI...,1416.1,,remanufactured,,filed pg m,,does not apply,,...,,,,,,,,,,
7,"acer nitro 5 an515 15.6"" full hd intel quad co...",https://www.ebay.com.sg/itm/Acer-Nitro-5-AN515...,1115.62,,remanufactured,,,,does not apply,,...,,,,,,,,,,
8,"lenovo yoga s730 13.3"" full hd laptop quad cor...",https://www.ebay.com.sg/itm/Lenovo-YOGA-S730-1...,1074.2,,remanufactured,,lenovo yoga 730,,intel core i7 8th gen.,16 gb,...,,,,,,,,,,
9,laptop computer lenovo l412 i3 4gb 120gb ssd 1...,https://www.ebay.com.sg/itm/Laptop-Computer-Le...,697.88,,remanufactured,,i3-m380,,i3 m380,,...,,,,,,,,,,
10,"hp 14-cf2503sa 14"" full hd intel quad core i5-...",https://www.ebay.com.sg/itm/HP-14-cf2503sa-14-...,853.93,,remanufactured,,hp 14-cf2503sa,,intel core i5 10th gen.,4 gb,...,,,,,,,,,,


In [13]:
df_reebelo = pd.DataFrame.from_dict(reebelo_storage, orient='index')
df_reebelo

Unnamed: 0,Title,Link,Price,Description,Condition,Keyboard language,Keyboard type,Screen size (inches),Resolution,Storage (gb),...,Weight,Operating system,Ram,Storage,Unnamed: 16,Ram installed size,Display resolution,Features,Gpu,"Speakers, stereo microphone data link protocol"
1,refurbished lenovo thinkpad x250 touch 240gb,https://www.reebelo.com/collections/all/produc...,649,,mint,english (us),qwerty,13,,240.0,...,,,,,,,,,,
2,refurbished lenovo thinkpad t460,https://www.reebelo.com/collections/all/produc...,643,,,,,,,,...,,,,,,,,,,
3,refurbished dell latitude e7280 12.5' / core i...,https://www.reebelo.com/collections/all/produc...,649,,,,,,,,...,2.6 lbs,microsoft windows 10 pro 64-bit,,,,,,,,
4,refurbished lenovo thinkpad yoga 370 (touchscr...,https://www.reebelo.com/collections/all/produc...,1029,,,,,,,,...,,,8gb,256gb ssd,,,,,,
5,refurbished dell latitude e7470,https://www.reebelo.com/collections/all/produc...,809,,,1399,,english (us),qwerty,14.0,...,,,,,dell latitude e7470,,,,,
6,refurbished dell latitude e5470 256gb,https://www.reebelo.com/collections/all/produc...,849,,mint,english (us),qwerty,14,,256.0,...,,,,,,,,,,
7,refurbished lenovo thinkpad t440p,https://www.reebelo.com/collections/all/produc...,528,,,,,,,,...,,,,,,,,,,
8,refurbished dell latitude 7270 128gb,https://www.reebelo.com/collections/all/produc...,649,,mint,english (us),qwerty,13,,128.0,...,,,,,,,,,,
9,refurbished acer travelmate p249,https://www.reebelo.com/collections/all/produc...,575,,,,standard,14,1366*768,500.0,...,,,,,na,,,,,
10,refurbished lenovo thinkpad x240,https://www.reebelo.com/collections/all/produc...,459,,,,,,,,...,,,,,,,,,,


In [14]:
df_ebay.to_excel("ebay.xlsx")

In [15]:
df_reebelo.to_excel("reebelo.xlsx")

In [None]:
# # Lazada example
# driver = webdriver.Chrome(executable_path=os.getenv("CHROMEDRIVER_PATH"))
# driver.get('https://www.lazada.sg/')

# # Search for laptop
# search_bar = driver.find_element_by_id("q")
# search_bar.send_keys("laptop", Keys.ENTER)

# # Filter by Refurbished items
# used_checkbox = driver.find_element_by_xpath("//span[contains(text(), 'Refurbish')]")
# used_checkbox.click()

# link_container = driver.find_element_by_xpath('//*[@id="root"]/div/div[2]/div[1]/div/div[1]/div[3]').find_elements_by_tag_name("a")
# links = []
# for link in link_container:
#     links.append(link.get_attribute("href"))
# links = list(set(links))

# storage = {}
# for i in range(len(links)):
#     driver.get(links[i])
#     try:
#         product_title = driver.find_element_by_class_name("pdp-mod-product-badge-title").text
#         specs = driver.find_element_by_class_name("specification-keys").find_elements_by_tag_name("li")
#         storage[i] = {"title": product_title}
#         for spec in specs:
#             spec_title = spec.find_element_by_class_name("key-title").text
#             storage[i][spec_title] = spec.find_element_by_class_name("key-value").text 
#     except NoSuchElementException:
#         pass