From 080b851aa6fe72f05e872ac10222eebb6edf65a6 Mon Sep 17 00:00:00 2001 From: chrispiros Date: Thu, 16 Oct 2025 15:35:53 +0200 Subject: [PATCH 1/6] Update functions.py --- emm/preprocessing/functions.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/emm/preprocessing/functions.py b/emm/preprocessing/functions.py index b88c1c8..b81b8ce 100644 --- a/emm/preprocessing/functions.py +++ b/emm/preprocessing/functions.py @@ -23,7 +23,11 @@ from typing import Any, Callable import cleanco -from unidecode import unidecode + +try: + from unidecode import unidecode +except ImportError: + unidecode = None from emm.preprocessing.abbreviation_util import abbreviations_to_words, legal_abbreviations_to_words @@ -50,7 +54,7 @@ def map_shorthands(name): return { # Replace accented characters by their normalized representation, e.g. replace 'ä' with 'A\xa4' - "strip_accents_unicode": F.run_custom_function(unidecode), + "strip_accents_unicode": F.run_custom_function(unidecode if unidecode is not None else (lambda x: x)), # Replace all dash and underscore characters with a space characters "strip_hyphens": F.regex_replace(r"""[-_]""", " ", simple=True), # Replace all punctuation characters (e.g. 
'.', '-', '_', ''', ';') with spaces From f44f64b896c5389e0b664d19285a8ffcbbef9d1d Mon Sep 17 00:00:00 2001 From: chrispiros Date: Thu, 16 Oct 2025 15:36:29 +0200 Subject: [PATCH 2/6] Update pyproject.toml --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4ee6d89..edcf4da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,8 +31,7 @@ dependencies = [ "sparse-dot-topn>=1.1.1", "joblib", "pyarrow>=6.0.1", # seems to work with spark 3.1.2 - 3.3.1 - "requests", - "unidecode" + "requests" ] dynamic = ["version"] @@ -51,7 +50,8 @@ dev = [ "matplotlib", "pygments", "pandoc", - "pympler" + "pympler", + "unidecode" ] test = [ "pytest", From bdc1089800f87ee2eb26ab95f07fce761be02ed5 Mon Sep 17 00:00:00 2001 From: chrispiros Date: Mon, 3 Nov 2025 15:58:14 +0100 Subject: [PATCH 3/6] added preprocessing group in pyproject.toml --- pyproject.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index edcf4da..57fb37a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,7 +50,9 @@ dev = [ "matplotlib", "pygments", "pandoc", - "pympler", + "pympler" +] +preprocessing = [ "unidecode" ] test = [ From f8b4567302d8f77716ef9d9ce8aa7d2ff3161a10 Mon Sep 17 00:00:00 2001 From: chrispiros Date: Mon, 3 Nov 2025 16:03:22 +0100 Subject: [PATCH 4/6] added warning in functions.py --- emm/preprocessing/functions.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/emm/preprocessing/functions.py b/emm/preprocessing/functions.py index b81b8ce..1d2163a 100644 --- a/emm/preprocessing/functions.py +++ b/emm/preprocessing/functions.py @@ -23,11 +23,17 @@ from typing import Any, Callable import cleanco +import warnings try: from unidecode import unidecode except ImportError: unidecode = None + warnings.warn( + "The 'unidecode' module is not installed. 'strip_accents_unicode' will default to an identity function. 
" + "Install 'unidecode' to enable accent stripping functionality.", + ImportWarning + ) from emm.preprocessing.abbreviation_util import abbreviations_to_words, legal_abbreviations_to_words From 348ef3a0a3685301d1baad2498ed1de30b3a45c6 Mon Sep 17 00:00:00 2001 From: chrispiros Date: Mon, 24 Nov 2025 14:46:23 +0100 Subject: [PATCH 5/6] pre-commit applied --- emm/preprocessing/functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/emm/preprocessing/functions.py b/emm/preprocessing/functions.py index 1d2163a..ed8cf91 100644 --- a/emm/preprocessing/functions.py +++ b/emm/preprocessing/functions.py @@ -19,11 +19,11 @@ from __future__ import annotations +import warnings from functools import partial from typing import Any, Callable import cleanco -import warnings try: from unidecode import unidecode @@ -32,7 +32,7 @@ warnings.warn( "The 'unidecode' module is not installed. 'strip_accents_unicode' will default to an identity function. " "Install 'unidecode' to enable accent stripping functionality.", - ImportWarning + ImportWarning, ) from emm.preprocessing.abbreviation_util import abbreviations_to_words, legal_abbreviations_to_words From 966a09c54c14449838bde9ff32ab14300d31441c Mon Sep 17 00:00:00 2001 From: chrispiros Date: Tue, 25 Nov 2025 14:25:44 +0100 Subject: [PATCH 6/6] added 'unidecode' to test deps --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 57fb37a..89a8caa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,7 +58,8 @@ preprocessing = [ test = [ "pytest", "pytest-ordering", - "virtualenv" + "virtualenv", + "unidecode" ] test-cov = [ "coverage",