diff --git a/emm/preprocessing/functions.py b/emm/preprocessing/functions.py
index b88c1c8..ed8cf91 100644
--- a/emm/preprocessing/functions.py
+++ b/emm/preprocessing/functions.py
@@ -19,11 +19,25 @@
 
 from __future__ import annotations
 
+import warnings
 from functools import partial
 from typing import Any, Callable
 
 import cleanco
-from unidecode import unidecode
+
+# 'unidecode' is an optional dependency (see the 'preprocessing' extra in pyproject.toml).
+# Without it 'strip_accents_unicode' degrades to an identity function, so warn with a category
+# that is displayed by default: ImportWarning is ignored by Python's default warning filters.
+try:
+    from unidecode import unidecode
+except ImportError:
+    unidecode = None
+    warnings.warn(
+        "The 'unidecode' module is not installed. 'strip_accents_unicode' will default to an identity function. "
+        "Install 'unidecode' to enable accent stripping functionality.",
+        UserWarning,
+        stacklevel=2,
+    )
 
 from emm.preprocessing.abbreviation_util import abbreviations_to_words, legal_abbreviations_to_words
 
@@ -50,7 +64,7 @@ def map_shorthands(name):
 
     return {
         # Replace accented characters by their normalized representation, e.g. replace 'ä' with 'A\xa4'
-        "strip_accents_unicode": F.run_custom_function(unidecode),
+        "strip_accents_unicode": F.run_custom_function(unidecode if unidecode is not None else (lambda x: x)),
         # Replace all dash and underscore characters with a space characters
         "strip_hyphens": F.regex_replace(r"""[-_]""", " ", simple=True),
         # Replace all punctuation characters (e.g. '.', '-', '_', ''', ';') with spaces
diff --git a/pyproject.toml b/pyproject.toml
index 4ee6d89..89a8caa 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,8 +31,7 @@ dependencies = [
     "sparse-dot-topn>=1.1.1",
     "joblib",
     "pyarrow>=6.0.1",  # seems to work with spark 3.1.2 - 3.3.1
-    "requests",
-    "unidecode"
+    "requests"
 ]
 
 dynamic = ["version"]
@@ -53,10 +52,14 @@ dev = [
     "pandoc",
     "pympler"
 ]
+preprocessing = [
+    "unidecode"
+]
 test = [
     "pytest",
     "pytest-ordering",
-    "virtualenv"
+    "virtualenv",
+    "unidecode"
 ]
 test-cov = [
     "coverage",