Skip to content

Commit

Permalink
let the OS pick an available port, to avoid what appear to be timing issues causing multiple browsers to choose the same port
Browse files
  • Loading branch information
nlevitt committed Feb 22, 2017
1 parent 3c4ab83 commit 2398031
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 55 deletions.
33 changes: 18 additions & 15 deletions brozzler/browser.py
@@ -30,6 +30,7 @@
import base64
from brozzler.chrome import Chrome
import surt
import socket

class BrowsingException(Exception):
pass
@@ -41,10 +42,12 @@ class BrowsingTimeout(BrowsingException):
pass

class BrowserPool:
'''
Manages pool of browsers. Automatically chooses available port for the
debugging protocol.
'''
logger = logging.getLogger(__module__ + '.' + __qualname__)

BASE_PORT = 9200

def __init__(self, size=3, **kwargs):
'''
Initializes the pool.
@@ -54,13 +57,8 @@ def __init__(self, size=3, **kwargs):
**kwargs: arguments for Browser(...)
'''
self.size = size
self._available = set()
self.kwargs = kwargs
self._in_use = set()

for i in range(0, size):
browser = Browser(port=BrowserPool.BASE_PORT + i, **kwargs)
self._available.add(browser)

self._lock = threading.Lock()

def acquire(self):
@@ -74,30 +72,35 @@ def acquire(self):
NoBrowsersAvailable if none available
'''
with self._lock:
try:
browser = self._available.pop()
except KeyError:
if len(self._in_use) >= self.size:
raise NoBrowsersAvailable

# choose available port
sock = socket.socket()
sock.bind(('0.0.0.0', 0))
port = sock.getsockname()[1]
sock.close()

browser = Browser(port=port, **self.kwargs)
self._in_use.add(browser)
return browser

def release(self, browser):
browser.stop() # make sure
with self._lock:
self._available.add(browser)
self._in_use.remove(browser)

def shutdown_now(self):
self.logger.info(
'shutting down browser pool (%s browsers in use)',
len(self._in_use))
with self._lock:
for browser in self._available:
browser.stop()
for browser in self._in_use:
browser.stop()
self._in_use.clear()

def num_available(self):
return len(self._available)
return self.size - len(self._in_use)

def num_in_use(self):
return len(self._in_use)
Expand Down
20 changes: 0 additions & 20 deletions brozzler/chrome.py
@@ -28,7 +28,6 @@
import signal
import sqlite3
import json
import psutil
import tempfile

class Chrome:
@@ -61,24 +60,6 @@ def __enter__(self):
def __exit__(self, *args):
self.stop()

def _find_available_port(self, default_port=9200):
try:
conns = psutil.net_connections(kind='tcp')
except psutil.AccessDenied:
return default_port

if not any(conn.laddr[1] == default_port for conn in conns):
return default_port

for p in range(9999,8999,-1):
if not any(conn.laddr[1] == p for conn in conns):
self.logger.warn(
'port %s already in use, using %s instead',
default_port, p)
return p

return default_port

def _init_cookie_db(self, cookie_db):
cookie_dir = os.path.join(self._chrome_user_data_dir, 'Default')
cookie_location = os.path.join(cookie_dir, 'Cookies')
@@ -140,7 +121,6 @@ def start(self, proxy=None, cookie_db=None):

new_env = os.environ.copy()
new_env['HOME'] = self._home_tmpdir.name
self.port = self._find_available_port(self.port)
chrome_args = [
self.chrome_exe,
'--remote-debugging-port=%s' % self.port,
Expand Down
3 changes: 1 addition & 2 deletions setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):

setuptools.setup(
name='brozzler',
version='1.1b9.dev192',
version='1.1b9.dev193',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',
@@ -71,7 +71,6 @@ def find_package_data(package):
'surt>=0.3.0',
'rethinkstuff>=0.1.5',
'rethinkdb>=2.3,<2.4',
'psutil==4.3.0',
'cerberus==1.0.1',
'jinja2',
],
Expand Down
18 changes: 0 additions & 18 deletions tests/test_units.py
@@ -25,7 +25,6 @@
import brozzler.chrome
import socket
import logging
import psutil
import yaml

@pytest.fixture(scope='module')
@@ -57,23 +56,6 @@ def test_robots(httpd):
site = brozzler.Site(seed=url, user_agent='im/a bAdBOt/uh huh')
assert not brozzler.is_permitted_by_robots(site, url)

def test_find_available_port():
x = brozzler.chrome.Chrome(None, None)
try:
psutil.net_connections(kind='tcp')
except psutil.AccessDenied:
logging.warn(
'skipping _find_available_port() test because '
'psutil.net_connections(kind="tcp") raised AccessDenied')
return
assert x._find_available_port(9800) == 9800
sock = socket.socket()
sock.bind(('localhost', 9800))
sock.listen(0)
assert x._find_available_port(9800) >= 9990
sock.close()
assert x._find_available_port(9800) == 9800

def test_scoping():
test_scope = yaml.load('''
max_hops: 100
Expand Down

0 comments on commit 2398031

Please sign in to comment.