Improve oikotie performance and add area parameter (#71)
* Improve oikotie performance and add area parameter

* flake8 fixes
jmyrberg committed Jul 14, 2022
1 parent 6ac368d commit 3b8be11
Showing 5 changed files with 112 additions and 64 deletions.
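
For context, a minimal usage sketch of the new parameter through the public wrapper (the area value 'helsinki' is illustrative):

from finscraper.spiders import OikotieApartment

# Scrape listings of a single area; the spider forms the start URL as
# 'https://asunnot.oikotie.fi/myytavat-asunnot/helsinki'
spider = OikotieApartment(area='helsinki')
spider.scrape(10)  # scrape 10 items
df = spider.get()  # fetched items as a pandas DataFrame
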
21 changes: 1 addition & 20 deletions finscraper/middlewares.py
@@ -5,9 +5,6 @@
 from scrapy.exceptions import NotConfigured
 from scrapy.http import HtmlResponse
 
-from selenium.webdriver.chrome.options import Options
-from selenium.common.exceptions import WebDriverException
-
 from finscraper.request import SeleniumCallbackRequest
 from finscraper.utils import get_chromedriver

@@ -33,24 +30,8 @@ def from_crawler(cls, crawler):
         return middleware
 
     def spider_opened(self, spider):
-        options = Options()
-        options.add_argument('--no-sandbox')
-        options.add_argument("--disable-extensions")
-        options.add_argument("--disable-gpu")
-        options.add_argument('--disable-dev-shm-usage')
-        options.add_experimental_option(
-            'prefs', {'intl.accept_languages': 'fi,fi_FI'})
-        if not self.settings.get('DISABLE_HEADLESS', False):
-            options.add_argument("--headless")
-        if self.settings.get('PROGRESS_BAR_ENABLED', True):
-            options.add_argument('--disable-logging')
         try:
-            self.driver = get_chromedriver(options)
-            if self.settings.get('MINIMIZE_WINDOW', False):
-                try:
-                    self.driver.minimize_window()
-                except WebDriverException:
-                    pass
+            self.driver = get_chromedriver(settings=self.settings)
         except Exception:
             raise NotConfigured('Could not get chromedriver')

116 changes: 78 additions & 38 deletions finscraper/scrapy_spiders/oikotieapartment.py
@@ -20,39 +20,37 @@
 from finscraper.request import SeleniumCallbackRequest
 from finscraper.text_utils import strip_join, drop_empty_elements, \
     paragraph_join
+from finscraper.utils import get_chromedriver
 
 
 logger = logging.getLogger(__name__)
 
 
 class _OikotieApartmentSpider(Spider):
     name = 'oikotieapartment'
-    start_urls = ['https://asunnot.oikotie.fi/myytavat-asunnot']
-    follow_link_extractor = LinkExtractor(
-        attrs=('href', 'ng-href'),
-        allow_domains=('asunnot.oikotie.fi'),
-        allow=(r'.*\/myytavat-asunnot\/.*'),
-        deny=(r'.*?origin\=.*'),
-        deny_domains=(),
-        canonicalize=True
-    )
+    base_url = 'https://asunnot.oikotie.fi/myytavat-asunnot'
     item_link_extractor = LinkExtractor(
         allow_domains=('asunnot.oikotie.fi'),
-        allow=(r'.*/myytavat-asunnot/.*/[0-9]+'),
+        allow=(r'.*\/myytavat-asunnot\/.*\/[0-9]{3,}'),
         deny=(r'.*?origin\=.*'),
         deny_domains=(),
         canonicalize=True
     )
     custom_settings = {
-        # The following needs to be set
+        # Custom
+        'DISABLE_HEADLESS': True,
+        'MINIMIZE_WINDOW': True,
+        # Scrapy
         'AUTOTHROTTLE_ENABLED': True,
+        'AUTOTHROTTLE_TARGET_CONCURRENCY': 0.9,
         'CONCURRENT_REQUESTS': 4,
         'ROBOTSTXT_OBEY': False,
         'DOWNLOADER_MIDDLEWARES': {
             'finscraper.middlewares.SeleniumCallbackMiddleware': 800
         }
     }
     itemcount = 0
+    listings_per_page = 24
     title2field = {
         # Perustiedot
         'Sijainti': 'location',
@@ -131,52 +129,100 @@ class _OikotieApartmentSpider(Spider):
         'Pintamateriaalit': 'wallcovering'
     }
 
-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args, area=None, **kwargs):
+        """Fetch oikotie.fi apartments.
+
+        Args:
+            area (str, optional): Scrape listings based on area, e.g.
+                "helsinki" or "hausjärvi". The final URL will be formed as:
+                'https://asunnot.oikotie.fi/myytavat-asunnot/{area}'. Defaults
+                to None.
+        """
         kwargs['follow_request_type'] = SeleniumCallbackRequest
         super(_OikotieApartmentSpider, self).__init__(*args, **kwargs)
+        self.area = area
 
+        self._last_page = None
 
     def start_requests(self):
-        for url in self.start_urls:
+        # Render start page with headed Chrome
+        driver = get_chromedriver(settings=self.settings)
+
+        area = '' if self.area is None else f'/{self.area}'
+        base_url_with_area = f'{self.base_url}{area}'
+        logger.info(f'Using "{base_url_with_area}" as start URL')
+
+        driver.get(base_url_with_area)
+
+        # Click yes on modal, if it exists (Selenium)
+        self._handle_start_modal(driver)
+
+        # Find the last page in pagination
+        self._last_page = self._get_last_page(driver)
+
+        driver.close()
+
+        # Iterate pagination pages one-by-one and extract links + items
+        for page in range(1, self._last_page + 1):
+            url = f'{base_url_with_area}?pagination={page}'
             yield SeleniumCallbackRequest(
-                url, selenium_callback=self._handle_start)
+                url,
+                priority=10,
+                meta={'page': page},
+                selenium_callback=self._handle_pagination_page)
 
-    @staticmethod
-    def _handle_start(request, spider, driver):
-        driver.get(request.url)
+    def _get_last_page(self, driver):
+        logger.debug('Getting last page...')
+        last_page_xpath = '//span[contains(@ng-bind, "ctrl.totalPages")]'
+        last_page_element = driver.find_element(By.XPATH, last_page_xpath)
+        last_page = int(last_page_element.text.split('/')[-1].strip())
+        logger.debug(f'Last page found: {last_page}')
+        return last_page
+
+    def _handle_start_modal(self, driver):
         # Click modal, if it exists
         try:
             # Find iframe
-            logger.info('Waiting for iframe...')
+            logger.debug('Waiting for iframe...')
             iframe_xpath = "//iframe[contains(@id, 'sp_message_iframe')]"
-            iframe = WebDriverWait(driver, 5).until(
+            iframe = WebDriverWait(driver, 2).until(
                 EC.presence_of_element_located((By.XPATH, iframe_xpath)))
             driver.switch_to.frame(iframe)
-            logger.info(f'Switched to iframe {iframe}')
+            logger.debug(f'Switched to iframe {iframe}')
 
             # Find button
-            logger.info('Finding button...')
+            logger.debug('Finding button...')
             button_xpath = "//button[contains(., 'Hyväksy')]"
-            WebDriverWait(driver, 5).until(
+            WebDriverWait(driver, 2).until(
                 EC.presence_of_element_located((By.XPATH, button_xpath)))
             modal = driver.find_element(By.XPATH, button_xpath)
-            logger.info('Clicking modal...')
+            logger.debug('Clicking modal...')
             modal.click()
-            logger.info('Waiting 1 second...')
-            driver.implicitly_wait(1)
-            logger.info('Waiting for modal to disappear...')
-            WebDriverWait(driver, 10).until(
+            logger.debug('Waiting for modal to disappear...')
+            WebDriverWait(driver, 2).until(
                 EC.invisibility_of_element_located((By.XPATH, button_xpath)))
 
-            logger.info('Switching to default frame')
+            logger.debug('Switching to default frame')
             driver.switch_to.default_content()
-            logger.info('Modal handled successfully!')
+            logger.debug('Modal handled successfully!')
         except TimeoutException:
             logger.warning('No modal found, assuming does not exist')
 
+    def _handle_pagination_page(self, request, spider, driver):
+        driver.get(request.url)
+
+        logger.debug('Scrolling pagination page to bottom...')
+        listings_xpath = '//div[contains(@class, "cards__card")]'
+        driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
+
+        logger.debug('Waiting for listings to be available...')
+        page = request.meta['page']
+        n_listings = self.listings_per_page if page < self._last_page else 1
+        WebDriverWait(driver, 10).until(
+            lambda browser:
+                len(browser.find_elements(By.XPATH, listings_xpath)) >=
+                n_listings)
+        logger.debug('Listings rendered, returning response')
+
         return HtmlResponse(
             driver.current_url,
             body=driver.page_source.encode('utf-8'),
@@ -190,22 +236,16 @@ def parse(self, resp, to_parse=False):
         if self.itemcount and self.itemcount == max_itemcount:
             raise CloseSpider
 
-        if to_parse:
+        if to_parse:  # Parse listing item
             yield self._parse_item(resp)
             self.itemcount += 1
 
-        # Parse items and further on extract links from those pages
+        # Extract listing links and parse them
         item_links = self.item_link_extractor.extract_links(resp)
         for link in item_links:
             yield Request(link.url, callback=self.parse, priority=20,
                           cb_kwargs={'to_parse': True})
-
-        # Extract all links from this page
-        follow_links = self.follow_link_extractor.extract_links(resp)
-        for link in follow_links:
-            yield SeleniumCallbackRequest(
-                link.url, callback=self.parse, priority=10)
 
     def _parse_item(self, resp):
         il = ItemLoader(item=_OikotieApartmentItem(), response=resp)
         il.add_value('url', resp.url)
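
The speed-up in this file comes from enumerating pagination pages directly instead of recursively following links through Selenium. A standalone sketch of the URL scheme used by start_requests above (the helper name pagination_urls is hypothetical):

def pagination_urls(base_url, area=None, last_page=1):
    # Mirrors start_requests: optional '/{area}' suffix plus
    # '?pagination={page}' for each page up to the detected last page.
    area_path = '' if area is None else f'/{area}'
    return [f'{base_url}{area_path}?pagination={page}'
            for page in range(1, last_page + 1)]

# pagination_urls('https://asunnot.oikotie.fi/myytavat-asunnot',
#                 area='helsinki', last_page=2)
# ['https://asunnot.oikotie.fi/myytavat-asunnot/helsinki?pagination=1',
#  'https://asunnot.oikotie.fi/myytavat-asunnot/helsinki?pagination=2']
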
5 changes: 3 additions & 2 deletions finscraper/spiders.py
@@ -105,10 +105,11 @@ def __init__(self, jobdir=None, progress_bar=True, log_level=None):
 class OikotieApartment(_SpiderWrapper):
     __doc__ = _get_docstring(_OikotieApartmentSpider, _OikotieApartmentItem)
 
-    def __init__(self, jobdir=None, progress_bar=True, log_level=None):
+    def __init__(self, area=None, jobdir=None, progress_bar=True,
+                 log_level=None):
         super(OikotieApartment, self).__init__(
             spider_cls=_OikotieApartmentSpider,
-            spider_params=dict(),
+            spider_params=dict(area=area),
             jobdir=jobdir,
             progress_bar=progress_bar,
             log_level=log_level)
30 changes: 27 additions & 3 deletions finscraper/utils.py
Expand Up @@ -6,6 +6,7 @@
import pickle

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
@@ -86,17 +87,40 @@ def emit(self, record):
             self.handleError(record)
 
 
-def get_chromedriver(options=None):
-    if not options:
+def get_chromedriver(options=None, settings=None):
+    """Get chromedriver automatically.
+
+    Args:
+        options (selenium.webdriver.chrome.options.Options, optional):
+            Options to start chromedriver with. If None, will use default
+            settings. Defaults to None.
+        settings (scrapy.settings.Settings, optional): Scrapy settings to
+            take into consideration when starting chromedriver. If None,
+            will not be taken into consideration. Defaults to None.
+
+    Returns:
+        Selenium webdriver for Chrome (selenium.webdriver.Chrome).
+    """
+    settings = settings or {}
+    if options is None:
         options = Options()
         options.add_argument('--no-sandbox')
         options.add_argument("--disable-extensions")
         options.add_argument("--disable-gpu")
         options.add_argument('--disable-dev-shm-usage')
-        options.add_argument("--headless")
         options.add_experimental_option(
             'prefs', {'intl.accept_languages': 'fi,fi_FI'})
+        if not settings.get('DISABLE_HEADLESS', False):
+            options.add_argument("--headless")
+        if settings.get('PROGRESS_BAR_ENABLED', True):
+            options.add_argument('--disable-logging')
 
     service = ChromeService(ChromeDriverManager().install())
     driver = webdriver.Chrome(service=service, options=options)
+    if settings.get('MINIMIZE_WINDOW', False):
+        try:
+            driver.minimize_window()
+        except WebDriverException:
+            pass
+
     return driver
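
The settings keys read here (DISABLE_HEADLESS, PROGRESS_BAR_ENABLED, MINIMIZE_WINDOW) match the spider's custom_settings above. A sketch of the call pattern; since only .get() is used, a plain dict works as well as scrapy.settings.Settings:

from scrapy.settings import Settings

from finscraper.utils import get_chromedriver

settings = Settings({
    'DISABLE_HEADLESS': True,      # keep a visible (headed) Chrome window
    'MINIMIZE_WINDOW': True,       # minimize the window right after startup
    'PROGRESS_BAR_ENABLED': True,  # also passes --disable-logging to Chrome
})
driver = get_chromedriver(settings=settings)
driver.get('https://asunnot.oikotie.fi/myytavat-asunnot')
driver.quit()
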
4 changes: 3 additions & 1 deletion tests/test_spiders.py
@@ -48,7 +48,7 @@
     },
     {
         'class': OikotieApartment,
-        'params': [None],
+        'params': [None, {'area': 'vuosaari'}],
         'n_fields': 80,
         'mark': pytest.mark.oikotieapartment
     },
@@ -97,6 +97,8 @@ def test_scraping(spider_cls, spider_params, n_fields, capsys, n_items=10):
 def test_functionality(spider_cls, spider_params):
     # Save and load
     spider = spider_cls(**spider_params).scrape(1)
+    for k, v in spider_params.items():
+        assert getattr(spider, k) == v
     df = spider.get()
     jobdir = spider.save()
     del spider
