From 2fbf5ec1c8b7f3cd683e950431cc348e4b8ebe4a Mon Sep 17 00:00:00 2001 From: Luis Date: Thu, 16 Sep 2021 10:36:13 -0400 Subject: [PATCH] Update dateinfer and urlparser imports --- optimus/engines/base/dask/functions.py | 40 ++++++++++----------- optimus/engines/base/functions.py | 48 +++++++++++++------------- optimus/engines/base/io/connect.py | 5 ++- optimus/infer.py | 4 +-- 4 files changed, 48 insertions(+), 49 deletions(-) diff --git a/optimus/engines/base/dask/functions.py b/optimus/engines/base/dask/functions.py index 76bb1893c..5178ed90c 100644 --- a/optimus/engines/base/dask/functions.py +++ b/optimus/engines/base/dask/functions.py @@ -135,41 +135,41 @@ def min_max_scaler(self, series): # return counts, edges[0], edges[1] def domain(self, series): - import url_parser - return self.to_string(series).map(lambda v: url_parser.parse_url(v)["domain"], na_action=None, meta=(series.name, "str")) + import hiurlparser + return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["domain"], na_action=None, meta=(series.name, "str")) def top_domain(self, series): - import url_parser - return self.to_string(series).map(lambda v: url_parser.parse_url(v)["top_domain"], na_action=None, meta=(series.name, "str")) + import hiurlparser + return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["top_domain"], na_action=None, meta=(series.name, "str")) def sub_domain(self, series): - import url_parser - return self.to_string(series).map(lambda v: url_parser.parse_url(v)["sub_domain"], na_action=None, meta=(series.name, "str")) + import hiurlparser + return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["sub_domain"], na_action=None, meta=(series.name, "str")) def url_scheme(self, series): - import url_parser - return self.to_string(series).map(lambda v: url_parser.parse_url(v)["protocol"], na_action=None, meta=(series.name, "str")) + import hiurlparser + return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["protocol"], na_action=None, meta=(series.name, "str")) def url_path(self, series): - import url_parser - return self.to_string(series).map(lambda v: url_parser.parse_url(v)["path"], na_action=None, meta=(series.name, "str")) + import hiurlparser + return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["path"], na_action=None, meta=(series.name, "str")) def url_file(self, series): - import url_parser - return self.to_string(series).map(lambda v: url_parser.parse_url(v)["file"], na_action=None, meta=(series.name, "str")) + import hiurlparser + return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["file"], na_action=None, meta=(series.name, "str")) def url_query(self, series): - import url_parser - return self.to_string(series).map(lambda v: url_parser.parse_url(v)["query"], na_action=None, meta=(series.name, "str")) + import hiurlparser + return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["query"], na_action=None, meta=(series.name, "str")) def url_fragment(self, series): - import url_parser - return self.to_string(series).map(lambda v: url_parser.parse_url(v)["fragment"], na_action=None, meta=(series.name, "str")) + import hiurlparser + return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["fragment"], na_action=None, meta=(series.name, "str")) def host(self, series): - import url_parser - return self.to_string(series).map(lambda v: url_parser.parse_url(v)["host"], na_action=None, meta=(series.name, "str")) + import hiurlparser + return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["host"], na_action=None, meta=(series.name, "str")) def port(self, series): - import url_parser - return self.to_string(series).map(lambda v: url_parser.parse_url(v)["port"], na_action=None, meta=(series.name, "str")) + import hiurlparser + return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["port"], na_action=None, meta=(series.name, "str")) diff --git a/optimus/engines/base/functions.py b/optimus/engines/base/functions.py index d6dd483b3..0fad028ab 100644 --- a/optimus/engines/base/functions.py +++ b/optimus/engines/base/functions.py @@ -204,8 +204,8 @@ def impute(series, strategy, fill_value): def date_format(self, series): dtype = str(series.dtype) if dtype in self.constants.STRING_TYPES: - import pydateinfer - result = pydateinfer.infer(self.compute(series).values) + import hidateinfer + result = hidateinfer.infer(self.compute(series).values) return result if is_valid_datetime_format(result) else False elif dtype in self.constants.DATETIME_TYPES: return True @@ -753,44 +753,44 @@ def seconds_between(self, series, value=None, date_format=None): return series.dt.days * 86400 + series.dt.seconds def domain(self, series): - import url_parser - return self.to_string(series).map(lambda v: url_parser.parse_url(v)["domain"], na_action=None) + import hiurlparser + return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["domain"], na_action=None) def top_domain(self, series): - import url_parser - return self.to_string(series).map(lambda v: url_parser.parse_url(v)["top_domain"], na_action=None) + import hiurlparser + return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["top_domain"], na_action=None) def sub_domain(self, series): - import url_parser - return self.to_string(series).map(lambda v: url_parser.parse_url(v)["sub_domain"], na_action=None) + import hiurlparser + return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["sub_domain"], na_action=None) def url_scheme(self, series): - import url_parser - return self.to_string(series).map(lambda v: url_parser.parse_url(v)["protocol"], na_action=None) + import hiurlparser + return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["protocol"], na_action=None) def url_path(self, series): - import url_parser - return self.to_string(series).map(lambda v: url_parser.parse_url(v)["path"], na_action=None) + import hiurlparser + return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["path"], na_action=None) def url_file(self, series): - import url_parser - return self.to_string(series).map(lambda v: url_parser.parse_url(v)["file"], na_action=None) + import hiurlparser + return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["file"], na_action=None) def url_query(self, series): - import url_parser - return self.to_string(series).map(lambda v: url_parser.parse_url(v)["query"], na_action=None) + import hiurlparser + return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["query"], na_action=None) def url_fragment(self, series): - import url_parser - return self.to_string(series).map(lambda v: url_parser.parse_url(v)["fragment"], na_action=None) + import hiurlparser + return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["fragment"], na_action=None) def host(self, series): - import url_parser - return self.to_string(series).map(lambda v: url_parser.parse_url(v)["host"], na_action=None) + import hiurlparser + return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["host"], na_action=None) def port(self, series): - import url_parser - return self.to_string(series).map(lambda v: url_parser.parse_url(v)["port"], na_action=None) + import hiurlparser + return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["port"], na_action=None) def email_username(self, series): return self.to_string_accessor(series).split('@').str[0] @@ -846,8 +846,8 @@ def infer_data_types(self, value, cols_data_types): return dtype def date_formats(self, series): - import pydateinfer - return series.map(lambda v: pydateinfer.infer([v])) + import hidateinfer + return series.map(lambda v: hidateinfer.infer([v])) def metaphone(self, series): return self.to_string(series).map(jellyfish.metaphone, na_action='ignore') diff --git a/optimus/engines/base/io/connect.py b/optimus/engines/base/io/connect.py index 39aab51f5..d46402218 100644 --- a/optimus/engines/base/io/connect.py +++ b/optimus/engines/base/io/connect.py @@ -1,5 +1,3 @@ -import url_parser - from optimus.helpers.types import * from optimus.engines.base.dask.io.jdbc import DaskBaseJDBC from optimus.engines.spark.io.properties import DriverProperties @@ -75,7 +73,8 @@ def __init__(self, endpoint_url=None, bucket=None, **kwargs): if endpoint_url is None: RaiseIt.value_error(endpoint_url, "") - schema = url_parser.parse_url(endpoint_url)["schema"] + import hiurlparser + schema = hiurlparser.parse_url(endpoint_url)["schema"] if schema is not None: endpoint_url = endpoint_url[len(schema + "://"):] # removes schema from endpoint_url diff --git a/optimus/infer.py b/optimus/infer.py index e2ea37708..6503cfdb2 100644 --- a/optimus/infer.py +++ b/optimus/infer.py @@ -10,7 +10,7 @@ import fastnumbers import pandas as pd import pendulum -import pydateinfer +import hidateinfer @@ -20,7 +20,7 @@ def is_datetime_str(_value: str): try: - pdi = pydateinfer.infer([_value]) + pdi = hidateinfer.infer([_value]) code_count = pdi.count('%') value_code_count = _value.count('%') return code_count >= 2 and value_code_count < code_count and code_count >= len(_value) / 7 and any(