From aa978a1d6be598e2b2bf9aab59d5afdcab374a2a Mon Sep 17 00:00:00 2001 From: Benjamin Ramser Date: Sun, 28 Feb 2021 15:50:37 +0100 Subject: [PATCH] feat: optionally allow case insensitive lookup --- docs/geotext.md | 21 ++++++++++++-- docs/lookup.md | 2 +- flashgeotext/geotext.py | 32 ++++++++++++++++----- flashgeotext/lookup.py | 29 +++++++------------ readme.md | 2 +- tests/conftest.py | 2 +- tests/integration/test_geotext_extractor.py | 16 +++++++++-- 7 files changed, 69 insertions(+), 35 deletions(-) diff --git a/docs/geotext.md b/docs/geotext.md index 0a37096..6fb09b9 100644 --- a/docs/geotext.md +++ b/docs/geotext.md @@ -1,6 +1,7 @@ # Table of Contents * [flashgeotext.geotext](#flashgeotext.geotext) + * [GeoTextConfiguration](#flashgeotext.geotext.GeoTextConfiguration) * [GeoText](#flashgeotext.geotext.GeoText) * [\_\_init\_\_](#flashgeotext.geotext.GeoText.__init__) * [extract](#flashgeotext.geotext.GeoText.extract) @@ -8,6 +9,20 @@ # flashgeotext.geotext + +## GeoTextConfiguration Objects + +```python +class GeoTextConfiguration(BaseModel) +``` + +GeoText configuration + +**Arguments**: + +- `use_demo_data` _bool_ - load demo data or not, default True +- `case_sensitive` _bool_ - case sensitive lookup, default True + ## GeoText Objects @@ -29,7 +44,7 @@ span info. ```python from flashgeotext.geotext import GeoText - geotext = GeoText(use_demo_data=True) + geotext = GeoText() input_text = '''Shanghai. The Chinese Ministry of Finance in Shanghai said that China plans to cut tariffs on $75 billion worth of goods that the country @@ -65,14 +80,14 @@ span info. #### \_\_init\_\_ ```python - | __init__(use_demo_data: bool = True) -> None + | __init__(config: GeoTextConfiguration = GeoTextConfiguration().dict()) -> None ``` instantiate an empty LookupDataPool, optionally/by default with demo data **Arguments**: -- `use_demo_data` _bool_ - optionally use demo data, defaults to True. +- `config` - GeoTextConfiguration = { use_demo_data: True, case_sensitive: True }. #### extract diff --git a/docs/lookup.md b/docs/lookup.md index 9e5d6c1..69bf653 100644 --- a/docs/lookup.md +++ b/docs/lookup.md @@ -158,7 +158,7 @@ Collection of KeywordProcessors from LookupData #### add ```python - | add(lookup: LookupData, update: bool = False) -> None + | add(lookup: LookupData, update: bool = False, case_sensitive: bool = True) -> None ``` Add LookupData to LookupDataPool diff --git a/flashgeotext/geotext.py b/flashgeotext/geotext.py index 39fc722..cfb219e 100644 --- a/flashgeotext/geotext.py +++ b/flashgeotext/geotext.py @@ -1,7 +1,23 @@ +from typing import Optional + +from pydantic import BaseModel + from flashgeotext.lookup import LookupDataPool from flashgeotext.lookup import MissingLookupDataError +class GeoTextConfiguration(BaseModel): + """GeoText configuration + + Args: + use_demo_data (bool): load demo data or not, default True + case_sensitive (bool): case sensitive lookup, default True + """ + + use_demo_data: Optional[bool] = True + case_sensitive: Optional[bool] = True + + class GeoText(LookupDataPool): """Extract LookupData from input text @@ -16,7 +32,7 @@ class GeoText(LookupDataPool): ```python from flashgeotext.geotext import GeoText - geotext = GeoText(use_demo_data=True) + geotext = GeoText() input_text = '''Shanghai. The Chinese Ministry of Finance in Shanghai said that China plans to cut tariffs on $75 billion worth of goods that the country @@ -50,21 +66,23 @@ class GeoText(LookupDataPool): """ - def __init__(self, use_demo_data: bool = True) -> None: + def __init__( + self, config: GeoTextConfiguration = GeoTextConfiguration().dict() + ) -> None: """ instantiate an empty LookupDataPool, optionally/by default with demo data Args: - use_demo_data (bool): optionally use demo data, defaults to True. + config: GeoTextConfiguration = { use_demo_data: True, case_sensitive: True }. """ self.pool: dict = {} - if use_demo_data: - self._add_demo_data() + if config["use_demo_data"]: + self._add_demo_data(case_sensitive=config["case_sensitive"]) def extract(self, input_text: str, span_info: bool = True) -> dict: """Extract LookupData from an input_text - Arguments: + Args: input_text (str): String to extract LookupData from. span_info (bool): Optionally, return span_info. Defaults to True. @@ -94,7 +112,7 @@ def _parse_extract(self, extract_data: list, span_info: bool = True) -> dict: Parse flashtext.KeywordProcessor.extract_keywords() output to count occurances, and optionally span_info. - Arguments: + Args: extract_data (list): flashtext.KeywordProcessor.extract_keywords() return value span_info (bool): optionally, parse span_info diff --git a/flashgeotext/lookup.py b/flashgeotext/lookup.py index dd85480..ad6dfb6 100644 --- a/flashgeotext/lookup.py +++ b/flashgeotext/lookup.py @@ -47,18 +47,6 @@ class LookupValidation: error_count (int): Error count in validation data. errors (dict): - Example: { - "Berlin": [ - "Berlin missing in list of synonyms", - "data['Berlin'] is not a list of synonyms" - ] - } - - Arguments: - status (str): Humanreadible string containing the Error status. - error_count (int): Error count in validation data. - errors (dict): - Example: { "Berlin": [ "Berlin missing in list of synonyms", @@ -161,7 +149,9 @@ class LookupDataPool: def __init__(self) -> None: self.pool: dict = {} - def add(self, lookup: LookupData, update: bool = False) -> None: + def add( + self, lookup: LookupData, update: bool = False, case_sensitive: bool = True + ) -> None: """Add LookupData to LookupDataPool Add LookupData to LookupDataPool. @@ -170,17 +160,18 @@ def add(self, lookup: LookupData, update: bool = False) -> None: Args: lookup (LookupData): LookupData to add to pool - update (bool): Allow update of an existing entry in LookupDataPool + update (bool): Allow update of an existing entry in LookupDataPool, default False + case_sensitive (bool): Allow case-sensitive lookup, default True """ if not isinstance(lookup, LookupData): - raise TypeError(f"lookup has to be instance of LookupData") + raise TypeError("lookup has to be instance of LookupData") if lookup.name in self.pool and not update: raise LookupDuplicateError( f"'{lookup.name}' has already been added. Set update=True to update" ) else: - self.pool[lookup.name] = KeywordProcessor(case_sensitive=True) + self.pool[lookup.name] = KeywordProcessor(case_sensitive=case_sensitive) self.pool[lookup.name].add_keywords_from_dict(lookup.data) # if there is a script specified, then update non word boundaries with @@ -209,7 +200,7 @@ def remove_all(self): self.pool = {} - def _add_demo_data(self): + def _add_demo_data(self, case_sensitive: bool = True): """(private) Add demo data to pool Adds DEMODATA_CITIES and DEMODATA_COUNTRIES to LookupDataPool @@ -220,8 +211,8 @@ def _add_demo_data(self): countries = LookupData( name="countries", data=load_data_from_file(file=DEMODATA_COUNTRIES) ) - self.add(cities) - self.add(countries) + self.add(cities, case_sensitive=case_sensitive) + self.add(countries, case_sensitive=case_sensitive) logger.debug(f"demo data loaded for: {list(self.pool.keys())}") diff --git a/readme.md b/readme.md index 125b11e..d356318 100644 --- a/readme.md +++ b/readme.md @@ -20,7 +20,7 @@ Extract and count countries and cities (+their synonyms) from text, like [GeoTex ```python from flashgeotext.geotext import GeoText -geotext = GeoText(use_demo_data=True) +geotext = GeoText() input_text = '''Shanghai. The Chinese Ministry of Finance in Shanghai said that China plans to cut tariffs on $75 billion worth of goods that the country diff --git a/tests/conftest.py b/tests/conftest.py index 2e09666..f30d369 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -18,4 +18,4 @@ def test_data_countries(): @pytest.fixture def geotext(): - return GeoText(use_demo_data=True) + return GeoText() diff --git a/tests/integration/test_geotext_extractor.py b/tests/integration/test_geotext_extractor.py index 0342f12..a949e35 100644 --- a/tests/integration/test_geotext_extractor.py +++ b/tests/integration/test_geotext_extractor.py @@ -8,7 +8,7 @@ def test_geotext_demo_data(): - geotext = GeoText(use_demo_data=True) + geotext = GeoText() assert geotext.pool["cities"] assert geotext.pool["countries"] @@ -20,7 +20,7 @@ def test_geotext_extract(geotext): def test_geotext_raises_on_empty_pool(): - output = GeoText(use_demo_data=False) + output = GeoText(config={"use_demo_data": False}) with pytest.raises(MissingLookupDataError): output.extract(text) @@ -39,6 +39,16 @@ def test_geotext_extract_with_count_span_info_false(geotext): assert output["cities"]["Berlin"]["span_info"] == [(0, 6), (43, 49)] +def test_geotext_case_sensitive_demo_data(): + geotext = GeoText(config={"use_demo_data": True, "case_sensitive": False}) + text = "berlin ist ne tolle stadt" + output = geotext.extract(input_text=text, span_info=True) + + print(output) + + assert output["cities"]["Berlin"]["span_info"] == [(0, 6)] + + # tests used in geotext (https://github.com/elyase/geotext) @@ -161,7 +171,7 @@ def test_geotext_with_script_added_to_non_word_boundaries(): cyrillic = LookupData( name="test_1", data={"Нижневартовск": ["Нижневартовск"]}, script="cyrillic" ) - geotext = GeoText(use_demo_data=False) + geotext = GeoText(config={"use_demo_data": False}) geotext.add(cyrillic) text = """