Skip to content

Commit

Permalink
Update dateinfer and urlparser imports
Browse files (browse the repository at this point in the history)
  • Loading branch information
luis11011 committed Sep 16, 2021
1 parent 4201bf5 commit 2fbf5ec
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 49 deletions.
40 changes: 20 additions & 20 deletions optimus/engines/base/dask/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,41 +135,41 @@ def min_max_scaler(self, series):
# return counts, edges[0], edges[1]

def domain(self, series):
import url_parser
return self.to_string(series).map(lambda v: url_parser.parse_url(v)["domain"], na_action=None, meta=(series.name, "str"))
import hiurlparser
return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["domain"], na_action=None, meta=(series.name, "str"))

def top_domain(self, series):
import url_parser
return self.to_string(series).map(lambda v: url_parser.parse_url(v)["top_domain"], na_action=None, meta=(series.name, "str"))
import hiurlparser
return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["top_domain"], na_action=None, meta=(series.name, "str"))

def sub_domain(self, series):
import url_parser
return self.to_string(series).map(lambda v: url_parser.parse_url(v)["sub_domain"], na_action=None, meta=(series.name, "str"))
import hiurlparser
return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["sub_domain"], na_action=None, meta=(series.name, "str"))

def url_scheme(self, series):
import url_parser
return self.to_string(series).map(lambda v: url_parser.parse_url(v)["protocol"], na_action=None, meta=(series.name, "str"))
import hiurlparser
return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["protocol"], na_action=None, meta=(series.name, "str"))

def url_path(self, series):
import url_parser
return self.to_string(series).map(lambda v: url_parser.parse_url(v)["path"], na_action=None, meta=(series.name, "str"))
import hiurlparser
return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["path"], na_action=None, meta=(series.name, "str"))

def url_file(self, series):
import url_parser
return self.to_string(series).map(lambda v: url_parser.parse_url(v)["file"], na_action=None, meta=(series.name, "str"))
import hiurlparser
return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["file"], na_action=None, meta=(series.name, "str"))

def url_query(self, series):
import url_parser
return self.to_string(series).map(lambda v: url_parser.parse_url(v)["query"], na_action=None, meta=(series.name, "str"))
import hiurlparser
return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["query"], na_action=None, meta=(series.name, "str"))

def url_fragment(self, series):
import url_parser
return self.to_string(series).map(lambda v: url_parser.parse_url(v)["fragment"], na_action=None, meta=(series.name, "str"))
import hiurlparser
return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["fragment"], na_action=None, meta=(series.name, "str"))

def host(self, series):
import url_parser
return self.to_string(series).map(lambda v: url_parser.parse_url(v)["host"], na_action=None, meta=(series.name, "str"))
import hiurlparser
return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["host"], na_action=None, meta=(series.name, "str"))

def port(self, series):
import url_parser
return self.to_string(series).map(lambda v: url_parser.parse_url(v)["port"], na_action=None, meta=(series.name, "str"))
import hiurlparser
return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["port"], na_action=None, meta=(series.name, "str"))
48 changes: 24 additions & 24 deletions optimus/engines/base/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,8 +204,8 @@ def impute(series, strategy, fill_value):
def date_format(self, series):
dtype = str(series.dtype)
if dtype in self.constants.STRING_TYPES:
import pydateinfer
result = pydateinfer.infer(self.compute(series).values)
import hidateinfer
result = hidateinfer.infer(self.compute(series).values)
return result if is_valid_datetime_format(result) else False
elif dtype in self.constants.DATETIME_TYPES:
return True
Expand Down Expand Up @@ -753,44 +753,44 @@ def seconds_between(self, series, value=None, date_format=None):
return series.dt.days * 86400 + series.dt.seconds

def domain(self, series):
import url_parser
return self.to_string(series).map(lambda v: url_parser.parse_url(v)["domain"], na_action=None)
import hiurlparser
return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["domain"], na_action=None)

def top_domain(self, series):
import url_parser
return self.to_string(series).map(lambda v: url_parser.parse_url(v)["top_domain"], na_action=None)
import hiurlparser
return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["top_domain"], na_action=None)

def sub_domain(self, series):
import url_parser
return self.to_string(series).map(lambda v: url_parser.parse_url(v)["sub_domain"], na_action=None)
import hiurlparser
return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["sub_domain"], na_action=None)

def url_scheme(self, series):
import url_parser
return self.to_string(series).map(lambda v: url_parser.parse_url(v)["protocol"], na_action=None)
import hiurlparser
return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["protocol"], na_action=None)

def url_path(self, series):
import url_parser
return self.to_string(series).map(lambda v: url_parser.parse_url(v)["path"], na_action=None)
import hiurlparser
return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["path"], na_action=None)

def url_file(self, series):
import url_parser
return self.to_string(series).map(lambda v: url_parser.parse_url(v)["file"], na_action=None)
import hiurlparser
return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["file"], na_action=None)

def url_query(self, series):
import url_parser
return self.to_string(series).map(lambda v: url_parser.parse_url(v)["query"], na_action=None)
import hiurlparser
return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["query"], na_action=None)

def url_fragment(self, series):
import url_parser
return self.to_string(series).map(lambda v: url_parser.parse_url(v)["fragment"], na_action=None)
import hiurlparser
return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["fragment"], na_action=None)

def host(self, series):
import url_parser
return self.to_string(series).map(lambda v: url_parser.parse_url(v)["host"], na_action=None)
import hiurlparser
return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["host"], na_action=None)

def port(self, series):
import url_parser
return self.to_string(series).map(lambda v: url_parser.parse_url(v)["port"], na_action=None)
import hiurlparser
return self.to_string(series).map(lambda v: hiurlparser.parse_url(v)["port"], na_action=None)

def email_username(self, series):
return self.to_string_accessor(series).split('@').str[0]
Expand Down Expand Up @@ -846,8 +846,8 @@ def infer_data_types(self, value, cols_data_types):
return dtype

def date_formats(self, series):
import pydateinfer
return series.map(lambda v: pydateinfer.infer([v]))
import hidateinfer
return series.map(lambda v: hidateinfer.infer([v]))

def metaphone(self, series):
return self.to_string(series).map(jellyfish.metaphone, na_action='ignore')
Expand Down
5 changes: 2 additions & 3 deletions optimus/engines/base/io/connect.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import url_parser

from optimus.helpers.types import *
from optimus.engines.base.dask.io.jdbc import DaskBaseJDBC
from optimus.engines.spark.io.properties import DriverProperties
Expand Down Expand Up @@ -75,7 +73,8 @@ def __init__(self, endpoint_url=None, bucket=None, **kwargs):
if endpoint_url is None:
RaiseIt.value_error(endpoint_url, "")

schema = url_parser.parse_url(endpoint_url)["schema"]
import hiurlparser
schema = hiurlparser.parse_url(endpoint_url)["schema"]

if schema is not None:
endpoint_url = endpoint_url[len(schema + "://"):] # removes schema from endpoint_url
Expand Down
4 changes: 2 additions & 2 deletions optimus/infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import fastnumbers
import pandas as pd
import pendulum
import pydateinfer
import hidateinfer



Expand All @@ -20,7 +20,7 @@

def is_datetime_str(_value: str):
try:
pdi = pydateinfer.infer([_value])
pdi = hidateinfer.infer([_value])
code_count = pdi.count('%')
value_code_count = _value.count('%')
return code_count >= 2 and value_code_count < code_count and code_count >= len(_value) / 7 and any(
Expand Down

0 comments on commit 2fbf5ec

Please sign in to comment.