Skip to content

Commit

Permalink
feature: private tlds can be used at call-time
Browse files Browse the repository at this point in the history
- Adds `include_psl_private_domains` to the `__call__` method.  This is now something you can choose on a per-call basis.  The object level argument now is only a default value for each call.
- The entire dataset from publicsuffix.org is saved to cache
- Ensured no weird cache issues happen when using with different `suffix_list_urls` by using different filenames per `suffix_list_urls`
- Use filelock to support multiprocessing and multithreading use cases
- Updates the bundled snapshot to be the raw publicsuffix data. Need to look at performance impact of this.
- Breaking change cache_file => cache_dir
- various other cleanups
  • Loading branch information
brycedrennan committed Oct 8, 2020
1 parent 1dd19cf commit d68e6de
Show file tree
Hide file tree
Showing 15 changed files with 13,721 additions and 7,496 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,6 @@ tldextract/_version.py
tldextract_app/tldextract
tldextract_app/web
tldextract.egg-info
tldextract/.suffix_cache/*
.tox
.pytest_cache
21 changes: 13 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,19 +114,19 @@ when I haven't kept this code up to date.)

To avoid this fetch or control the cache's location, use your own extract
callable by setting TLDEXTRACT_CACHE environment variable or by setting the
cache_file path in TLDExtract initialization.
cache_dir path in TLDExtract initialization.

```python
# extract callable that falls back to the included TLD snapshot, no live HTTP fetching
no_fetch_extract = tldextract.TLDExtract(suffix_list_urls=None)
no_fetch_extract('http://www.google.com')

# extract callable that reads/writes the updated TLD set to a different path
custom_cache_extract = tldextract.TLDExtract(cache_file='/path/to/your/cache/file')
custom_cache_extract = tldextract.TLDExtract(cache_dir='/path/to/your/cache/')
custom_cache_extract('http://www.google.com')

# extract callable that doesn't use caching
no_cache_extract = tldextract.TLDExtract(cache_file=False)
no_cache_extract = tldextract.TLDExtract(cache_dir=False)
no_cache_extract('http://www.google.com')
```

Expand Down Expand Up @@ -167,10 +167,15 @@ ExtractResult(subdomain='waiterrant', domain='blogspot', suffix='com')
```

The following overrides this.
```python
>>> extract = tldextract.TLDExtract()
>>> extract('waiterrant.blogspot.com', include_psl_private_domains=True)
ExtractResult(subdomain='', domain='waiterrant', suffix='blogspot.com')
```

or to change the default for all extract calls,
```python
>>> extract = tldextract.TLDExtract(include_psl_private_domains=True)
>>> extract.update() # necessary until #66 is fixed
>>> extract = tldextract.TLDExtract(include_psl_private_domains=True)
>>> extract('waiterrant.blogspot.com')
ExtractResult(subdomain='', domain='waiterrant', suffix='blogspot.com')
```
Expand All @@ -189,19 +194,19 @@ extract = tldextract.TLDExtract(
suffix_list_urls=["http://foo.bar.baz"],
# Recommended: Specify your own cache directory, to minimize ambiguities about where
# tldextract is getting its data, or cached data, from.
cache_file='/path/to/your/cache/file',
cache_dir='/path/to/your/cache/',
fallback_to_snapshot=False)
```

The above snippet will fetch from the URL *you* specified, upon first need to download the
suffix list (i.e. if the cache_file doesn't exist).
suffix list (i.e. if the cached version doesn't exist).

If you want to use input data from your local filesystem, just use the `file://` protocol:

```python
extract = tldextract.TLDExtract(
suffix_list_urls=["file://absolute/path/to/your/local/suffix/list/file"],
cache_file='/path/to/your/cache/file',
cache_dir='/path/to/your/cache/',
fallback_to_snapshot=False)
```

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
"version of tldextract." % (sys.version_info[0], sys.version_info[1])
)

INSTALL_REQUIRES = ["idna", "requests>=2.1.0", "requests-file>=1.4"]
INSTALL_REQUIRES = ["idna", "requests>=2.1.0", "requests-file>=1.4", "filelock>=3.0.8"]

setup(
name="tldextract",
Expand Down
20 changes: 14 additions & 6 deletions tests/custom_suffix_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,7 @@
import os

import tldextract

from .helpers import temporary_file

from .helpers import temporary_dir

FAKE_SUFFIX_LIST_URL = "file://" + os.path.join(
os.path.dirname(os.path.abspath(__file__)),
Expand All @@ -15,21 +13,31 @@

# pylint: disable=invalid-name
extract_using_fake_suffix_list = tldextract.TLDExtract(
cache_file=temporary_file(),
cache_dir=temporary_dir(),
suffix_list_urls=[FAKE_SUFFIX_LIST_URL]
)
extract_using_fake_suffix_list_no_cache = tldextract.TLDExtract(
cache_file=None,
cache_dir=None,
suffix_list_urls=[FAKE_SUFFIX_LIST_URL]
)
extract_using_extra_suffixes = tldextract.TLDExtract(
cache_file=None,
cache_dir=None,
suffix_list_urls=[FAKE_SUFFIX_LIST_URL],
extra_suffixes=EXTRA_SUFFIXES
)
# pylint: enable=invalid-name


def test_private_extraction():
    """By default private PSL domains are ignored; a per-call flag enables them."""
    extractor = tldextract.TLDExtract(
        suffix_list_urls=[],
        cache_dir=temporary_dir()
    )

    # Default behavior: blogspot.com's private-domain entry is not applied.
    default_result = extractor("foo.blogspot.com")
    # Opting in at call time treats blogspot.com as the suffix.
    private_result = extractor("foo.blogspot.com", include_psl_private_domains=True)

    assert default_result == ('foo', 'blogspot', 'com')
    assert private_result == ('', 'foo', 'blogspot.com')


def test_suffix_which_is_not_in_custom_list():
for fun in (extract_using_fake_suffix_list, extract_using_fake_suffix_list_no_cache):
result = fun("www.google.com")
Expand Down
4 changes: 2 additions & 2 deletions tests/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def check_output(*popenargs, **kwargs):
return output


def temporary_file():
def temporary_dir():
""" Make a writable temporary file and return its absolute path.
"""
return tempfile.mkstemp()[1]
return tempfile.mkdtemp()
23 changes: 1 addition & 22 deletions tests/integration_test.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,12 @@
'''tldextract integration tests.'''

import logging
import os
import traceback

import pytest

import tldextract


def test_log_snapshot_diff(mocker):
    """A fresh extraction (cache removed) should log exactly one debug message
    describing the diff between the bundled snapshot and the fetched TLD list.
    """
    # Force the root logger to DEBUG so the library's debug call is not filtered out.
    mocker.patch.object(logging.getLogger(), 'level', logging.DEBUG)
    debug_mock = mocker.patch.object(logging.getLogger('tldextract'), 'debug')

    extractor = tldextract.TLDExtract()
    try:
        # Remove any existing cache so the extractor recomputes (and logs) the diff.
        os.remove(extractor.cache_file)
    except (IOError, OSError):
        # Cache may not exist yet; best-effort cleanup, log and continue.
        logging.warning(traceback.format_exc())

    extractor('ignore.com')

    # Exactly one debug call, and it must be the TLD-diff message.
    assert debug_mock.call_count == 1
    log_str = debug_mock.call_args[0][0]
    assert log_str.startswith('computed TLD diff')


def test_bad_kwargs():
with pytest.raises(ValueError):
tldextract.TLDExtract(
cache_file=False, suffix_list_urls=False, fallback_to_snapshot=False
cache_dir=False, suffix_list_urls=False, fallback_to_snapshot=False
)
21 changes: 13 additions & 8 deletions tests/main_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,23 @@

import sys

import pytest
import responses
import tldextract
from .helpers import temporary_file
from tldextract.cache import DiskCache
from tldextract.suffix_list import SuffixListNotFound
from .helpers import temporary_dir
if sys.version_info >= (3,): # pragma: no cover
unicode = str # pylint: disable=invalid-name,redefined-builtin


# pylint: disable=invalid-name
extract = tldextract.TLDExtract(cache_file=temporary_file())
extract_no_cache = tldextract.TLDExtract(cache_file=False)
extract_using_real_local_suffix_list = tldextract.TLDExtract(cache_file=temporary_file())
extract_using_real_local_suffix_list_no_cache = tldextract.TLDExtract(cache_file=False)
extract = tldextract.TLDExtract(cache_dir=temporary_dir())
extract_no_cache = tldextract.TLDExtract(cache_dir=False)
extract_using_real_local_suffix_list = tldextract.TLDExtract(cache_dir=temporary_dir())
extract_using_real_local_suffix_list_no_cache = tldextract.TLDExtract(cache_dir=False)
extract_using_fallback_to_snapshot_no_cache = tldextract.TLDExtract(
cache_file=None,
cache_dir=None,
suffix_list_urls=None
)
# pylint: enable=invalid-name
Expand Down Expand Up @@ -236,12 +239,14 @@ def test_result_as_dict():


@responses.activate # pylint: disable=no-member
def test_cache_timeouts():
def test_cache_timeouts(tmpdir):
server = 'http://some-server.com'
responses.add( # pylint: disable=no-member
responses.GET, # pylint: disable=no-member
server,
status=408
)
cache = DiskCache(tmpdir)

assert tldextract.remote.find_first_response([server], 5) == unicode('')
with pytest.raises(SuffixListNotFound):
tldextract.suffix_list.find_first_response(cache, [server], 5)
18 changes: 18 additions & 0 deletions tests/test_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""Test the caching functionality"""
import pytest

from tldextract.cache import DiskCache


def test_disk_cache(tmpdir):
    """DiskCache round-trips values, clears completely, and accepts overwrites."""
    disk_cache = DiskCache(tmpdir)

    # Round-trip a value through the cache.
    disk_cache.set("testing", "foo", "bar")
    assert disk_cache.get("testing", "foo") == "bar"

    # After clearing, lookups for the old key must raise KeyError.
    disk_cache.clear()
    with pytest.raises(KeyError):
        disk_cache.get("testing", "foo")

    # The same key can be written again with a new value.
    disk_cache.set("testing", "foo", "baz")
    assert disk_cache.get("testing", "foo") == "baz"
Loading

0 comments on commit d68e6de

Please sign in to comment.