Skip to content

Commit

Permalink
webhdfs: expose kerberos and https options (#6936)
Browse files: browse the repository at this point in the history
* Add fsspec options for webhdfs

* Add requests-kerberos dependency

* Update dvc/fs/webhdfs.py

Simplify config.pop.

Co-authored-by: Ruslan Kuprieiev <kupruser@gmail.com>

* Update dvc/fs/webhdfs.py

* Simplify kerb_kwargs config setting.

Co-authored-by: Ruslan Kuprieiev <kupruser@gmail.com>

Co-authored-by: Ruslan Kuprieiev <kupruser@gmail.com>
  • Loading branch information
gudmundur-heimisson and efiop committed Nov 7, 2021
1 parent 044e790 commit c5b1a73
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 23 deletions.
10 changes: 6 additions & 4 deletions dvc/config_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,10 +179,12 @@ class RelPath(str):
},
"hdfs": {"user": str, "kerb_ticket": str, **REMOTE_COMMON},
"webhdfs": {
"hdfscli_config": str,
"webhdfs_token": str,
"user": str,
"webhdfs_alias": str,
"kerberos": Bool,
"kerberos_principal": str,
"proxy_to": str,
"ssl_verify": Any(Bool, str),
"token": str,
"use_https": Bool,
**REMOTE_COMMON,
},
"azure": {
Expand Down
11 changes: 7 additions & 4 deletions dvc/fs/webhdfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,20 @@ def _get_kwargs_from_urls(urlpath):
)

# Normalises the remote's config options and returns the adjusted dict.
# NOTE(review): this text comes from a rendered diff whose +/- markers and
# indentation were stripped; the hunk header reports 7 additions and
# 4 deletions for this file, so the `webhdfs_token` lines below are
# presumably the removed (pre-change) side -- confirm against the repository.
def _prepare_credentials(self, **config):
# Legacy option name: expose `webhdfs_token` under the key `token`.
if "webhdfs_token" in config:
config["token"] = config.pop("webhdfs_token")

# `ssl_verify` is taken out of the returned config and kept on the instance
# instead (defaults to True when absent); the `fs` property reads it later.
self._ssl_verify = config.pop("ssl_verify", True)
# `kerberos_principal` is removed from the config and, only when truthy,
# re-added wrapped as `kerb_kwargs={"principal": ...}`.
principal = config.pop("kerberos_principal", None)
if principal:
config["kerb_kwargs"] = {"principal": principal}
return config

# Property that builds the fsspec WebHDFS client from `self.fs_args`.
# It is wrapped with `cached_property` and `wrap_prop(threading.Lock())`.
@wrap_prop(threading.Lock())
@cached_property
def fs(self):
# Imported inside the function rather than at module import time.
from fsspec.implementations.webhdfs import WebHDFS

# NOTE(review): this text comes from a rendered diff with +/- markers
# stripped. The next line is presumably the removed (pre-change) one-liner;
# read literally, it would return before the session's `verify` is set.
# Confirm against the repository.
return WebHDFS(**self.fs_args)
fs = WebHDFS(**self.fs_args)
# Apply the value stored by `_prepare_credentials` to the client's session
# after the client has been constructed.
fs.session.verify = self._ssl_verify
return fs

def checksum(self, path_info):
path = self._with_bucket(path_info)
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ ssh_gssapi = sshfs[gssapi]>=2021.8.1
webdav = webdav4>=0.9.3
# not to break `dvc[webhdfs]`
webhdfs =
requests-kerberos==0.13.0
terraform = tpi[ssh]>=2.1.0
tests =
%(terraform)s
Expand Down
63 changes: 48 additions & 15 deletions tests/unit/remote/test_webhdfs.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,54 @@
from unittest.mock import Mock, create_autospec

import pytest
import requests

from dvc.fs.webhdfs import WebHDFSFileSystem

# Shared values used to build the WebHDFS URL and config in the tests below.
host = "host"
kerberos = False
kerberos_principal = "principal"
port = 12345
proxy_to = "proxy"
ssl_verify = False
token = "token"
use_https = True
user = "test"
# NOTE(review): the three names below are only referenced by the config dict
# of the first `test_init` fragment on this page; with diff markers stripped
# they are presumably removed-side lines -- confirm against the repository.
webhdfs_token = "token"
webhdfs_alias = "alias-name"
hdfscli_config = "path/to/cli/config"


def test_init(dvc):
url = "webhdfs://test@127.0.0.1:50070"
config = {
"host": url,
"webhdfs_token": webhdfs_token,
"webhdfs_alias": webhdfs_alias,
"hdfscli_config": hdfscli_config,
"user": user,


@pytest.fixture()
def webhdfs_config():
    """Return a WebHDFS config dict: explicit options plus URL-derived kwargs."""
    from_url = WebHDFSFileSystem._get_kwargs_from_urls(
        f"webhdfs://{user}@{host}:{port}"
    )
    options = dict(
        kerberos=kerberos,
        kerberos_principal=kerberos_principal,
        proxy_to=proxy_to,
        ssl_verify=ssl_verify,
        token=token,
        use_https=use_https,
    )
    # URL-derived values are merged last, so they win on any key clash.
    options.update(from_url)
    return options

fs = WebHDFSFileSystem(**config)
assert fs.fs_args["token"] == webhdfs_token

def test_init(dvc, webhdfs_config):
    """Check that constructor options show up in ``fs_args`` as expected."""
    fs_args = WebHDFSFileSystem(**webhdfs_config).fs_args
    # The principal is expected wrapped in `kerb_kwargs`, not as a flat key.
    expected = [
        ("host", host),
        ("token", token),
        ("user", user),
        ("port", port),
        ("kerberos", kerberos),
        ("kerb_kwargs", {"principal": kerberos_principal}),
        ("proxy_to", proxy_to),
        ("use_https", use_https),
    ]
    for key, value in expected:
        assert fs_args[key] == value


def test_verify_ssl(dvc, webhdfs_config, monkeypatch):
    """Check that ``ssl_verify`` from the config lands on the HTTP session."""
    session = create_autospec(requests.Session)
    session_factory = Mock(return_value=session)
    monkeypatch.setattr(requests, "Session", session_factory)

    # can't have token at the same time as user or proxy_to
    webhdfs_config.pop("token")
    filesystem = WebHDFSFileSystem(**webhdfs_config)

    # ssl verify can't be set until after the file system is instantiated,
    # so touch the property to force the client to be built.
    _ = filesystem.fs
    assert session.verify == ssl_verify

0 comments on commit c5b1a73

Please sign in to comment.