Skip to content

Commit

Permalink
feature: private tlds can be used at call-time
Browse files Browse the repository at this point in the history
- Adds `include_psl_private_domains` to the `__call__` method.  This is now something you can choose on a per-call basis.  The object level argument now is only a default value for each call.
- The entire dataset from publicsuffix.org is saved to cache
- Ensured no weird cache issues happen when using with different `suffix_list_urls` by using different filenames per `suffix_list_urls`
- Use filelock to support multiprocessing and multithreading use cases
- Updates the bundled snapshot to be the raw publicsuffix data. Need to look at performance impact of this.
- Breaking change cache_file => cache_dir
- various other cleanups
  • Loading branch information
brycedrennan committed Oct 8, 2020
1 parent 1dd19cf commit d68e6de
Show file tree
Hide file tree
Showing 15 changed files with 13,721 additions and 7,496 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,6 @@ tldextract/_version.py
tldextract_app/tldextract
tldextract_app/web
tldextract.egg-info
tldextract/.suffix_cache/*
.tox
.pytest_cache
21 changes: 13 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,19 +114,19 @@ when I haven't kept this code up to date.)

To avoid this fetch or control the cache's location, use your own extract
callable by setting TLDEXTRACT_CACHE environment variable or by setting the
cache_file path in TLDExtract initialization.
cache_dir path in TLDExtract initialization.

```python
# extract callable that falls back to the included TLD snapshot, no live HTTP fetching
no_fetch_extract = tldextract.TLDExtract(suffix_list_urls=None)
no_fetch_extract('http://www.google.com')

# extract callable that reads/writes the updated TLD set to a different path
custom_cache_extract = tldextract.TLDExtract(cache_file='/path/to/your/cache/file')
custom_cache_extract = tldextract.TLDExtract(cache_dir='/path/to/your/cache/')
custom_cache_extract('http://www.google.com')

# extract callable that doesn't use caching
no_cache_extract = tldextract.TLDExtract(cache_file=False)
no_cache_extract = tldextract.TLDExtract(cache_dir=False)
no_cache_extract('http://www.google.com')
```

Expand Down Expand Up @@ -167,10 +167,15 @@ ExtractResult(subdomain='waiterrant', domain='blogspot', suffix='com')
```

The following overrides this.
```python
>>> extract = tldextract.TLDExtract()
>>> extract('waiterrant.blogspot.com', include_psl_private_domains=True)
ExtractResult(subdomain='', domain='waiterrant', suffix='blogspot.com')
```

or to change the default for all extract calls,
```python
>>> extract = tldextract.TLDExtract(include_psl_private_domains=True)
>>> extract.update() # necessary until #66 is fixed
>>> extract = tldextract.TLDExtract(include_psl_private_domains=True)
>>> extract('waiterrant.blogspot.com')
ExtractResult(subdomain='', domain='waiterrant', suffix='blogspot.com')
```
Expand All @@ -189,19 +194,19 @@ extract = tldextract.TLDExtract(
suffix_list_urls=["http://foo.bar.baz"],
# Recommended: Specify your own cache directory, to minimize ambiguities about where
# tldextract is getting its data, or cached data, from.
cache_file='/path/to/your/cache/file',
cache_dir='/path/to/your/cache/',
fallback_to_snapshot=False)
```

The above snippet will fetch from the URL *you* specified, upon first need to download the
suffix list (i.e. if the cache_file doesn't exist).
suffix list (i.e. if the cached version doesn't exist).

If you want to use input data from your local filesystem, just use the `file://` protocol:

```python
extract = tldextract.TLDExtract(
suffix_list_urls=["file://absolute/path/to/your/local/suffix/list/file"],
cache_file='/path/to/your/cache/file',
cache_dir='/path/to/your/cache/',
fallback_to_snapshot=False)
```

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
"version of tldextract." % (sys.version_info[0], sys.version_info[1])
)

INSTALL_REQUIRES = ["idna", "requests>=2.1.0", "requests-file>=1.4"]
INSTALL_REQUIRES = ["idna", "requests>=2.1.0", "requests-file>=1.4", "filelock>=3.0.8"]

setup(
name="tldextract",
Expand Down
20 changes: 14 additions & 6 deletions tests/custom_suffix_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,7 @@
import os

import tldextract

from .helpers import temporary_file

from .helpers import temporary_dir

FAKE_SUFFIX_LIST_URL = "file://" + os.path.join(
os.path.dirname(os.path.abspath(__file__)),
Expand All @@ -15,21 +13,31 @@

# pylint: disable=invalid-name
extract_using_fake_suffix_list = tldextract.TLDExtract(
cache_file=temporary_file(),
cache_dir=temporary_dir(),
suffix_list_urls=[FAKE_SUFFIX_LIST_URL]
)
extract_using_fake_suffix_list_no_cache = tldextract.TLDExtract(
cache_file=None,
cache_dir=None,
suffix_list_urls=[FAKE_SUFFIX_LIST_URL]
)
extract_using_extra_suffixes = tldextract.TLDExtract(
cache_file=None,
cache_dir=None,
suffix_list_urls=[FAKE_SUFFIX_LIST_URL],
extra_suffixes=EXTRA_SUFFIXES
)
# pylint: enable=invalid-name


def test_private_extraction():
    """By default private PSL domains are ignored; a per-call flag enables them."""
    extractor = tldextract.TLDExtract(
        suffix_list_urls=[],
        cache_dir=temporary_dir()
    )

    # Default behavior: blogspot.com's private-domain entry is not applied.
    default_result = extractor("foo.blogspot.com")
    # Opting in at call time treats blogspot.com as the suffix.
    private_result = extractor("foo.blogspot.com", include_psl_private_domains=True)

    assert default_result == ('foo', 'blogspot', 'com')
    assert private_result == ('', 'foo', 'blogspot.com')


def test_suffix_which_is_not_in_custom_list():
for fun in (extract_using_fake_suffix_list, extract_using_fake_suffix_list_no_cache):
result = fun("www.google.com")
Expand Down
4 changes: 2 additions & 2 deletions tests/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def check_output(*popenargs, **kwargs):
return output


def temporary_file():
def temporary_dir():
""" Make a writable temporary file and return its absolute path.
"""
return tempfile.mkstemp()[1]
return tempfile.mkdtemp()
23 changes: 1 addition & 22 deletions tests/integration_test.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,12 @@
'''tldextract integration tests.'''

import logging
import os
import traceback

import pytest

import tldextract


def test_log_snapshot_diff(mocker):
    """A fresh extraction (cache removed) should log exactly one debug message
    describing the diff between the bundled snapshot and the fetched TLD list.
    """
    # Force the root logger to DEBUG so the library's debug call is not filtered out.
    mocker.patch.object(logging.getLogger(), 'level', logging.DEBUG)
    debug_mock = mocker.patch.object(logging.getLogger('tldextract'), 'debug')

    extractor = tldextract.TLDExtract()
    try:
        # Remove any existing cache so the extractor recomputes (and logs) the diff.
        os.remove(extractor.cache_file)
    except (IOError, OSError):
        # Cache may not exist yet; best-effort cleanup, log and continue.
        logging.warning(traceback.format_exc())

    extractor('ignore.com')

    # Exactly one debug call, and it must be the TLD-diff message.
    assert debug_mock.call_count == 1
    log_str = debug_mock.call_args[0][0]
    assert log_str.startswith('computed TLD diff')


def test_bad_kwargs():
with pytest.raises(ValueError):
tldextract.TLDExtract(
cache_file=False, suffix_list_urls=False, fallback_to_snapshot=False
cache_dir=False, suffix_list_urls=False, fallback_to_snapshot=False
)
21 changes: 13 additions & 8 deletions tests/main_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,23 @@

import sys

import pytest
import responses
import tldextract
from .helpers import temporary_file
from tldextract.cache import DiskCache
from tldextract.suffix_list import SuffixListNotFound
from .helpers import temporary_dir
if sys.version_info >= (3,): # pragma: no cover
unicode = str # pylint: disable=invalid-name,redefined-builtin


# pylint: disable=invalid-name
extract = tldextract.TLDExtract(cache_file=temporary_file())
extract_no_cache = tldextract.TLDExtract(cache_file=False)
extract_using_real_local_suffix_list = tldextract.TLDExtract(cache_file=temporary_file())
extract_using_real_local_suffix_list_no_cache = tldextract.TLDExtract(cache_file=False)
extract = tldextract.TLDExtract(cache_dir=temporary_dir())
extract_no_cache = tldextract.TLDExtract(cache_dir=False)
extract_using_real_local_suffix_list = tldextract.TLDExtract(cache_dir=temporary_dir())
extract_using_real_local_suffix_list_no_cache = tldextract.TLDExtract(cache_dir=False)
extract_using_fallback_to_snapshot_no_cache = tldextract.TLDExtract(
cache_file=None,
cache_dir=None,
suffix_list_urls=None
)
# pylint: enable=invalid-name
Expand Down Expand Up @@ -236,12 +239,14 @@ def test_result_as_dict():


@responses.activate # pylint: disable=no-member
def test_cache_timeouts():
def test_cache_timeouts(tmpdir):
server = 'http://some-server.com'
responses.add( # pylint: disable=no-member
responses.GET, # pylint: disable=no-member
server,
status=408
)
cache = DiskCache(tmpdir)

assert tldextract.remote.find_first_response([server], 5) == unicode('')
with pytest.raises(SuffixListNotFound):
tldextract.suffix_list.find_first_response(cache, [server], 5)
18 changes: 18 additions & 0 deletions tests/test_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""Test the caching functionality"""
import pytest

from tldextract.cache import DiskCache


def test_disk_cache(tmpdir):
    """DiskCache round-trips values, clears completely, and accepts overwrites."""
    disk_cache = DiskCache(tmpdir)

    # Round-trip a value through the cache.
    disk_cache.set("testing", "foo", "bar")
    assert disk_cache.get("testing", "foo") == "bar"

    # After clearing, lookups for the old key must raise KeyError.
    disk_cache.clear()
    with pytest.raises(KeyError):
        disk_cache.get("testing", "foo")

    # The same key can be written again with a new value.
    disk_cache.set("testing", "foo", "baz")
    assert disk_cache.get("testing", "foo") == "baz"
Loading

0 comments on commit d68e6de

Please sign in to comment.