# Web Scraper

In [None]:
import os
import time
from typing import List, Dict

import openpyxl
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [None]:
class ScraperBase(object):
    """Template for a site-specific product scraper.

    Subclasses provide the three hook methods ``_search_desired_item``,
    ``_get_all_product_links`` and ``_parse_data``; ``run`` calls them in
    that order and returns the dictionary stored on the instance.
    """

    def __init__(self, webdriver: webdriver, name: str, web_url: str) -> None:
        """Store the driver, a site name and the start URL; begin with empty storage."""
        self._storage = {}
        self._web_url = web_url
        self._name = name
        self._webdriver = webdriver

    def _search_desired_item(self) -> None:
        """Hook called by ``run`` right after the start URL has been loaded."""
        raise NotImplementedError

    def _get_all_product_links(self) -> List[str]:
        """Hook returning the links that ``run`` passes to ``_parse_data``."""
        raise NotImplementedError

    def _parse_data(self, link: str) -> None:
        """Hook called by ``run`` once per link."""
        raise NotImplementedError

    def run(self) -> Dict:
        """Load the start URL, run the hooks and return ``self._storage``.

        Progress and the total elapsed wall-clock time are printed to stdout.
        """
        started_at = time.time()
        self._webdriver.get(self._web_url)
        self._search_desired_item()
        links = self._get_all_product_links()
        total = len(links)
        print("Num of links: %d" % total)
        for position, link in enumerate(links, start=1):
            self._parse_data(link=link)
            # '\r' keeps the counter on a single console line.
            print("Progress: %d/%d" % (position, total), end='\r')
        print("Progress: %d/%d" % (total, total))
        print("Total time elapsed: %.2fs" % (time.time() - started_at))
        return self._storage

In [None]:
class EbayScraper(ScraperBase):
    """Scraper for eBay search results.

    Searches for ``search_term``, clicks two result filters, then visits
    every link found in the results list and records the listing's title,
    price, description and spec-table rows in ``self._storage``.

    Element lookups use ``find_element(By.…)``, which is available in both
    Selenium 3 and Selenium 4 (the ``find_element_by_*`` helpers were removed
    in Selenium 4.3).
    """

    def __init__(self, webdriver: webdriver, name: str, web_url: str,
                 search_term: str = "laptop") -> None:
        """Create the scraper.

        Args:
            webdriver: Selenium driver used for all page loads.
            name: Display name of the site.
            web_url: URL that ``run`` opens first.
            search_term: Text typed into the site's search box.
        """
        super().__init__(webdriver=webdriver, name=name, web_url=web_url)
        self._search_term = search_term

    def _search_desired_item(self) -> None:
        """Submit the search and apply the two filters, then reload with ``_ipg=200``."""
        search_bar = self._webdriver.find_element(By.XPATH, '//*[@id="gh-ac"]')
        search_bar.send_keys(self._search_term, Keys.ENTER)
        # NOTE(review): the XPaths below are positional; per the variable names they
        # select the "refurbished" and "non-auction" filters — confirm against the live page.
        refurbished_items = self._webdriver.find_element(
            By.XPATH, '//*[@id="x-refine__group__2"]/ul/li[3]/div/a/div/div/span[1]')
        refurbished_items.click()
        non_auction_items = self._webdriver.find_element(
            By.XPATH, '//*[@id="s0-14-11-5-1[0]"]/div[2]/div/div/ul/li[3]/a/h2')
        non_auction_items.click()
        # presumably `_ipg` sets the number of items per page — verify.
        self._webdriver.get(self._webdriver.current_url + "&_ipg=200")

    def _get_all_product_links(self) -> List[str]:
        """Return the unique hrefs of all anchors in the results list, in page order."""
        anchors = self._webdriver.find_element(
            By.XPATH, '//*[@id="srp-river-results"]/ul').find_elements(By.TAG_NAME, "a")
        hrefs = (anchor.get_attribute("href") for anchor in anchors)
        # Anchors without an href yield None, which cannot be passed to driver.get();
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        return list(dict.fromkeys(href for href in hrefs if href))

    def _parse_data(self, link: str) -> None:
        """Open ``link`` and store one record for it.

        The record is stored under the next sequential string id. If the
        title, price or spec table is missing, a failure message is printed
        and nothing is stored. A missing description is stored as ``""``.
        """
        self._webdriver.get(link)
        product_id = str(len(self._storage) + 1)
        try:
            product_title = self._webdriver.find_element(By.XPATH, '//*[@id="itemTitle"]').text
            product_cost = self._webdriver.find_element(
                By.XPATH, '//*[@id="prcIsum"]').get_attribute("content")
            specs = self._webdriver.find_element(
                By.XPATH, '//*[@id="viTabs_0_is"]/div/table').find_elements(By.TAG_NAME, "tr")
        except NoSuchElementException:
            print("\n\rParsing failed, link: %s" % link)
            return
        try:
            product_desc = self._webdriver.find_element(
                By.XPATH,
                '//*[@id="ds_div"]/font/font/font/font/font/font/font/font/font/ul/li/p').text
        except NoSuchElementException:
            # The description is optional.
            product_desc = ""
        # Build the record locally so a half-filled entry never reaches the storage.
        record = {"title": product_title, 'link': link, "price": product_cost,
                  "description": product_desc}
        for spec in specs:
            spec_titles = spec.find_elements(By.CLASS_NAME, "attrLabels")
            spec_values = spec.find_elements(By.TAG_NAME, "span")
            # Only rows where labels and values pair up one-to-one are used.
            if len(spec_titles) == len(spec_values):
                for title_element, value_element in zip(spec_titles, spec_values):
                    record[title_element.text.strip()] = value_element.text.strip()
        self._storage[product_id] = record


In [None]:
class ReebeloScraper(ScraperBase):
    """Scraper for Reebelo search results.

    Searches for ``search_term``, collects the product links from the result
    container, then visits each link and records its title, price and
    specification entries in ``self._storage``.

    Element lookups use ``find_element(By.…)``, which is available in both
    Selenium 3 and Selenium 4 (the ``find_element_by_*`` helpers were removed
    in Selenium 4.3).
    """

    def __init__(self, webdriver: webdriver, name: str, web_url: str,
                 search_term: str = "laptop") -> None:
        """Create the scraper.

        Args:
            webdriver: Selenium driver used for all page loads.
            name: Display name of the site.
            web_url: URL that ``run`` opens first.
            search_term: Text typed into the site's search box.
        """
        super().__init__(webdriver=webdriver, name=name, web_url=web_url)
        self._search_term = search_term

    def _search_desired_item(self) -> None:
        """Submit the search, wait, scroll to the bottom of the page and wait again."""
        search_bar = self._webdriver.find_element(By.XPATH, '//*[@id="bc-sf-search-box-0"]')
        search_bar.send_keys(self._search_term, Keys.ENTER)
        # Fixed pauses; presumably they give the results time to load — verify.
        time.sleep(3)
        self._webdriver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)

    def _get_all_product_links(self) -> List[str]:
        """Return the unique hrefs of all anchors in the product container, in page order."""
        anchors = self._webdriver.find_element(
            By.XPATH, '//*[@id="bc-sf-filter-products"]').find_elements(By.TAG_NAME, "a")
        hrefs = (anchor.get_attribute("href") for anchor in anchors)
        # Anchors without an href yield None, which cannot be passed to driver.get();
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        return list(dict.fromkeys(href for href in hrefs if href))

    def _parse_data(self, link: str) -> None:
        """Open ``link`` and store one record for it.

        The record is stored under the next sequential string id. When a
        required element (title, price, either spec container, or a ``td``
        in a spec row) is missing, a failure message is printed and nothing
        is stored, so one unusual page does not abort the whole run.

        Raises:
            ValueError: If a spec entry contains an unexpected number of
                ``:`` separators.
        """
        self._webdriver.get(link)
        product_id = str(len(self._storage) + 1)
        try:
            product_title = self._webdriver.find_element(
                By.XPATH,
                '/html/body/main/div[2]/section/div[1]/div[3]/div/div[2]/div/div[1]/form/div[1]/div[1]/h1').text
            product_cost = self._webdriver.find_element(
                By.XPATH,
                '/html/body/main/div[2]/section/div[1]/div[3]/div/div[2]/div/div[1]/form/div[1]/div[2]/div/'
                'div[1]/span[1]/span').text
        except NoSuchElementException:
            print("\n\rParsing failed, link: %s" % link)
            return
        # Build the record locally so a half-filled entry never reaches the storage.
        record = {"title": product_title, 'link': link, "price": product_cost,
                  "description": ""}
        try:
            view_more_btn = self._webdriver.find_element(
                By.CLASS_NAME, "cus-btn").find_element(By.TAG_NAME, "a")
            view_more_btn.click()
        except NoSuchElementException:
            # The button is optional; pages without it are parsed as they are.
            pass
        try:
            try:
                rows = self._webdriver.find_element(
                    By.CLASS_NAME, "table-wrapper").find_elements(By.TAG_NAME, "tr")
            except NoSuchElementException:
                # No spec table: fall back to the list-style spec layout.
                items = self._webdriver.find_element(
                    By.CLASS_NAME, "custom-table").find_elements(By.TAG_NAME, "li")
                self._parse_list_specs(items, record)
            else:
                self._parse_table_specs(rows, record)
        except NoSuchElementException:
            print("\n\rParsing failed, link: %s" % link)
            return
        self._storage[product_id] = record

    @staticmethod
    def _parse_list_specs(items, record: Dict) -> None:
        """Add ``title: value`` list entries to ``record``; entries without ``:`` are skipped."""
        for item in items:
            parsed_spec = item.text.split(":")
            if len(parsed_spec) == 2:
                record[parsed_spec[0].strip()] = parsed_spec[1].strip()
            elif len(parsed_spec) != 1:
                raise ValueError("parsed_spec: %s" % parsed_spec)

    @staticmethod
    def _parse_table_specs(rows, record: Dict) -> None:
        """Add the spec found in the first ``td`` of every table row to ``record``."""
        for row in rows:
            parsed_spec = row.find_element(By.TAG_NAME, "td").text.split(":")
            if len(parsed_spec) == 3:
                # Two "title: value" pairs share one cell. The first value is taken
                # as the first whitespace-delimited token of the middle part.
                spec_title = parsed_spec[0].strip()
                spec_value = parsed_spec[1].split()[0].strip()
                record[spec_title] = spec_value
                # The rest of the middle part is the second title; the +2 offset
                # assumes one separator character on each side of the value — TODO confirm.
                spec_title = parsed_spec[1][len(spec_value) + 2:].strip()
                spec_value = parsed_spec[2].strip()
            elif len(parsed_spec) == 2:
                spec_title = parsed_spec[0].strip()
                spec_value = parsed_spec[1].strip()
            elif len(parsed_spec) == 1:
                # A cell without ":" is stored under the key "Model".
                # NOTE(review): this print looks like a debugging aid — confirm it is wanted.
                print(parsed_spec[0])
                spec_title, spec_value = "Model", parsed_spec[0].strip()
            else:
                raise ValueError("parsed_spec: %s" % parsed_spec)
            record[spec_title] = spec_value


In [None]:
# Instantiate the Chrome webdriver that the scrapers below are given.
options = webdriver.ChromeOptions()
# Open the browser window maximized.
options.add_argument("--start-maximized")
# The chromedriver location is read from the CHROMEDRIVER_PATH environment variable;
# os.getenv returns None when the variable is not set.
# NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of a Service
# object — confirm against the installed Selenium version.
driver = webdriver.Chrome(executable_path=os.getenv("CHROMEDRIVER_PATH"), options=options)

In [None]:
# Instantiate all scrapers. Both are given the same `driver` object, so they
# operate the same browser one after the other.
ebay_scraper = EbayScraper(webdriver=driver, name="eBay", web_url="https://www.ebay.com.sg/")
reebelo_scraper = ReebeloScraper(webdriver=driver, name="Reebelo", web_url="https://www.reebelo.com/")

In [None]:
# Start scraping eBay; run() returns the scraper's storage dictionary.
ebay_storage = ebay_scraper.run()

In [None]:
# Start scraping Reebelo; run() returns the scraper's storage dictionary.
reebelo_storage = reebelo_scraper.run()

In [None]:
# Stop the webdriver now that both scraping cells above have been run.
driver.quit()

In [None]:
# Display the raw eBay records returned by run().
ebay_storage

In [None]:
# Display the raw Reebelo records returned by run().
reebelo_storage

In [None]:
# Build a table with one row per eBay record (orient='index' turns the
# storage keys into the row index), then display it.
df_ebay = pd.DataFrame.from_dict(ebay_storage, orient='index')
df_ebay

In [None]:
# Build a table with one row per Reebelo record (orient='index' turns the
# storage keys into the row index), then display it.
df_reebelo = pd.DataFrame.from_dict(reebelo_storage, orient='index')
df_reebelo

In [None]:
# Write the eBay table to ebay.xlsx (relative path).
df_ebay.to_excel("ebay.xlsx")

In [None]:
# Write the Reebelo table to reebelo.xlsx (relative path).
df_reebelo.to_excel("reebelo.xlsx")

In [None]:
# # Lazada example
# driver = webdriver.Chrome(executable_path=os.getenv("CHROMEDRIVER_PATH"))
# driver.get('https://www.lazada.sg/')

# # Search for laptop
# search_bar = driver.find_element_by_id("q")
# search_bar.send_keys("laptop", Keys.ENTER)

# # Filter by Refurbished items
# used_checkbox = driver.find_element_by_xpath("//span[contains(text(), 'Refurbish')]")
# used_checkbox.click()

# link_container = driver.find_element_by_xpath('//*[@id="root"]/div/div[2]/div[1]/div/div[1]/div[3]').find_elements_by_tag_name("a")
# links = []
# for link in link_container:
#     links.append(link.get_attribute("href"))
# links = list(set(links))

# storage = {}
# for i in range(len(links)):
#     driver.get(links[i])
#     try:
#         product_title = driver.find_element_by_class_name("pdp-mod-product-badge-title").text
#         specs = driver.find_element_by_class_name("specification-keys").find_elements_by_tag_name("li")
#         storage[i] = {"title": product_title}
#         for spec in specs:
#             spec_title = spec.find_element_by_class_name("key-title").text
#             storage[i][spec_title] = spec.find_element_by_class_name("key-value").text 
#     except NoSuchElementException:
#         pass