feature: private tlds can be used at call-time #207

Merged
Changes from 2 commits
2 changes: 2 additions & 0 deletions .gitignore
@@ -8,4 +8,6 @@ tldextract/_version.py
tldextract_app/tldextract
tldextract_app/web
tldextract.egg-info
tldextract/.suffix_cache/*
.tox
.pytest_cache
12 changes: 12 additions & 0 deletions CHANGELOG.md
@@ -3,6 +3,18 @@
After upgrading, update your cache file by deleting it or via `tldextract
--update`.

## Unreleased

* Breaking Changes
    * Rename `cache_file` to `cache_dir` as it is no longer a single file but a directory ([#207](https://github.com/john-kurkowski/tldextract/issues/207))
    * The CLI arg also changed from `--cache_file` to `--cache_dir`
* Features
    * Can pass `include_psl_private_domains` on call, not only on construction (see the usage sketch below)
    * Use file locking to support multiprocessing and multithreading environments
* Bugfixes
    * Select public or private suffixes at runtime. Fixes [#66](https://github.com/john-kurkowski/tldextract/issues/66).
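A minimal usage sketch of the changes above, mirroring the README examples later in this PR (the cache path is a placeholder):

```python
import tldextract

# cache_file is now cache_dir: point it at a directory, not a single file.
extract = tldextract.TLDExtract(cache_dir='/path/to/your/cache/')

extract('waiterrant.blogspot.com')
# ExtractResult(subdomain='waiterrant', domain='blogspot', suffix='com')

# New: opt into private PSL domains per call instead of per instance.
extract('waiterrant.blogspot.com', include_psl_private_domains=True)
# ExtractResult(subdomain='', domain='waiterrant', suffix='blogspot.com')
```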


## 2.2.3 (2020-08-05)

* Bugfixes
21 changes: 13 additions & 8 deletions README.md
@@ -114,19 +114,19 @@ when I haven't kept this code up to date.)

To avoid this fetch or control the cache's location, use your own extract
callable by setting the TLDEXTRACT_CACHE environment variable or by setting the
cache_file path in TLDExtract initialization.
cache_dir path in TLDExtract initialization.

```python
# extract callable that falls back to the included TLD snapshot, no live HTTP fetching
no_fetch_extract = tldextract.TLDExtract(suffix_list_urls=None)
no_fetch_extract('http://www.google.com')

# extract callable that reads/writes the updated TLD set to a different path
custom_cache_extract = tldextract.TLDExtract(cache_file='/path/to/your/cache/file')
custom_cache_extract = tldextract.TLDExtract(cache_dir='/path/to/your/cache/')
custom_cache_extract('http://www.google.com')

# extract callable that doesn't use caching
no_cache_extract = tldextract.TLDExtract(cache_file=False)
no_cache_extract = tldextract.TLDExtract(cache_dir=False)
no_cache_extract('http://www.google.com')
```

@@ -167,10 +167,15 @@ ExtractResult(subdomain='waiterrant', domain='blogspot', suffix='com')
```

The following overrides this.
```python
>>> extract = tldextract.TLDExtract()
>>> extract('waiterrant.blogspot.com', include_psl_private_domains=True)
ExtractResult(subdomain='', domain='waiterrant', suffix='blogspot.com')
```

or to change the default for all extract calls,
```python
>>> extract = tldextract.TLDExtract(include_psl_private_domains=True)
>>> extract.update() # necessary until #66 is fixed
>>> extract = tldextract.TLDExtract(include_psl_private_domains=True)
>>> extract('waiterrant.blogspot.com')
ExtractResult(subdomain='', domain='waiterrant', suffix='blogspot.com')
```
@@ -189,19 +194,19 @@ extract = tldextract.TLDExtract(
suffix_list_urls=["http://foo.bar.baz"],
# Recommended: Specify your own cache directory, to minimize ambiguities about where
# tldextract is getting its data, or cached data, from.
cache_file='/path/to/your/cache/file',
cache_dir='/path/to/your/cache/',
fallback_to_snapshot=False)
```

The above snippet will fetch from the URL *you* specified, upon first need to download the
suffix list (i.e. if the cache_file doesn't exist).
suffix list (i.e. if the cached version doesn't exist).

If you want to use input data from your local filesystem, just use the `file://` protocol:

```python
extract = tldextract.TLDExtract(
suffix_list_urls=["file://absolute/path/to/your/local/suffix/list/file"],
cache_file='/path/to/your/cache/file',
cache_dir='/path/to/your/cache/',
fallback_to_snapshot=False)
```

2 changes: 1 addition & 1 deletion setup.py
@@ -38,7 +38,7 @@
"version of tldextract." % (sys.version_info[0], sys.version_info[1])
)

INSTALL_REQUIRES = ["idna", "requests>=2.1.0", "requests-file>=1.4"]
INSTALL_REQUIRES = ["idna", "requests>=2.1.0", "requests-file>=1.4", "filelock>=3.0.8"]

setup(
name="tldextract",
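The new `filelock` dependency backs the multiprocess/multithread safety noted in the changelog. Independent of how this PR wires it into the cache, the library's basic pattern looks roughly like this (the lock path below is a made-up example):

```python
import os
import tempfile

from filelock import FileLock

# Hypothetical lock location; in practice it would sit alongside the cached suffix list.
lock_path = os.path.join(tempfile.gettempdir(), "tldextract-example.lock")

# Only one process or thread at a time runs the guarded block; others wait here.
with FileLock(lock_path):
    # read or write the cached suffix list inside the critical section
    pass
```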
20 changes: 14 additions & 6 deletions tests/custom_suffix_test.py
@@ -3,9 +3,7 @@
import os

import tldextract

from .helpers import temporary_file

from .helpers import temporary_dir

FAKE_SUFFIX_LIST_URL = "file://" + os.path.join(
os.path.dirname(os.path.abspath(__file__)),
@@ -15,21 +13,31 @@

# pylint: disable=invalid-name
extract_using_fake_suffix_list = tldextract.TLDExtract(
cache_file=temporary_file(),
cache_dir=temporary_dir(),
suffix_list_urls=[FAKE_SUFFIX_LIST_URL]
)
extract_using_fake_suffix_list_no_cache = tldextract.TLDExtract(
cache_file=None,
cache_dir=None,
suffix_list_urls=[FAKE_SUFFIX_LIST_URL]
)
extract_using_extra_suffixes = tldextract.TLDExtract(
cache_file=None,
cache_dir=None,
suffix_list_urls=[FAKE_SUFFIX_LIST_URL],
extra_suffixes=EXTRA_SUFFIXES
)
# pylint: enable=invalid-name


def test_private_extraction():
tld = tldextract.TLDExtract(
cache_dir=temporary_dir(),
suffix_list_urls=[]
)

assert tld("foo.blogspot.com") == ('foo', 'blogspot', 'com')
assert tld("foo.blogspot.com", include_psl_private_domains=True) == ('', 'foo', 'blogspot.com')


def test_suffix_which_is_not_in_custom_list():
for fun in (extract_using_fake_suffix_list, extract_using_fake_suffix_list_no_cache):
result = fun("www.google.com")
4 changes: 2 additions & 2 deletions tests/helpers.py
@@ -19,7 +19,7 @@ def check_output(*popenargs, **kwargs):
return output


def temporary_file():
def temporary_dir():
""" Make a writable temporary file and return its absolute path.
"""
return tempfile.mkstemp()[1]
return tempfile.mkdtemp()
23 changes: 1 addition & 22 deletions tests/integration_test.py
@@ -1,33 +1,12 @@
'''tldextract integration tests.'''

import logging
import os
import traceback

import pytest

import tldextract


def test_log_snapshot_diff(mocker):
mocker.patch.object(logging.getLogger(), 'level', logging.DEBUG)
debug_mock = mocker.patch.object(logging.getLogger('tldextract'), 'debug')

extractor = tldextract.TLDExtract()
try:
os.remove(extractor.cache_file)
except (IOError, OSError):
logging.warning(traceback.format_exc())

extractor('ignore.com')

assert debug_mock.call_count == 1
log_str = debug_mock.call_args[0][0]
assert log_str.startswith('computed TLD diff')


def test_bad_kwargs():
with pytest.raises(ValueError):
tldextract.TLDExtract(
cache_file=False, suffix_list_urls=False, fallback_to_snapshot=False
cache_dir=False, suffix_list_urls=False, fallback_to_snapshot=False
)
21 changes: 13 additions & 8 deletions tests/main_test.py
@@ -3,20 +3,23 @@

import sys

import pytest
import responses
import tldextract
from .helpers import temporary_file
from tldextract.cache import DiskCache
from tldextract.suffix_list import SuffixListNotFound
from .helpers import temporary_dir
if sys.version_info >= (3,): # pragma: no cover
unicode = str # pylint: disable=invalid-name,redefined-builtin


# pylint: disable=invalid-name
extract = tldextract.TLDExtract(cache_file=temporary_file())
extract_no_cache = tldextract.TLDExtract(cache_file=False)
extract_using_real_local_suffix_list = tldextract.TLDExtract(cache_file=temporary_file())
extract_using_real_local_suffix_list_no_cache = tldextract.TLDExtract(cache_file=False)
extract = tldextract.TLDExtract(cache_dir=temporary_dir())
extract_no_cache = tldextract.TLDExtract(cache_dir=False)
extract_using_real_local_suffix_list = tldextract.TLDExtract(cache_dir=temporary_dir())
extract_using_real_local_suffix_list_no_cache = tldextract.TLDExtract(cache_dir=False)
extract_using_fallback_to_snapshot_no_cache = tldextract.TLDExtract(
cache_file=None,
cache_dir=None,
suffix_list_urls=None
)
# pylint: enable=invalid-name
@@ -236,12 +239,14 @@ def test_result_as_dict():


@responses.activate # pylint: disable=no-member
def test_cache_timeouts():
def test_cache_timeouts(tmpdir):
server = 'http://some-server.com'
responses.add( # pylint: disable=no-member
responses.GET, # pylint: disable=no-member
server,
status=408
)
cache = DiskCache(tmpdir)

assert tldextract.remote.find_first_response([server], 5) == unicode('')
with pytest.raises(SuffixListNotFound):
tldextract.suffix_list.find_first_response(cache, [server], 5)
18 changes: 18 additions & 0 deletions tests/test_cache.py
@@ -0,0 +1,18 @@
"""Test the caching functionality"""
import pytest

from tldextract.cache import DiskCache


def test_disk_cache(tmpdir):
cache = DiskCache(tmpdir)
cache.set("testing", "foo", "bar")
assert cache.get("testing", "foo") == "bar"

cache.clear()

with pytest.raises(KeyError):
cache.get("testing", "foo")

cache.set("testing", "foo", "baz")
assert cache.get("testing", "foo") == "baz"