Skip to content

Commit

Permalink
feat: optionally allow case insensitive lookup
Browse files Browse the repository at this point in the history
  • Loading branch information
iwpnd committed Feb 28, 2021
1 parent d247ca7 commit aa978a1
Show file tree
Hide file tree
Showing 7 changed files with 69 additions and 35 deletions.
21 changes: 18 additions & 3 deletions docs/geotext.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,28 @@
# Table of Contents

* [flashgeotext.geotext](#flashgeotext.geotext)
* [GeoTextConfiguration](#flashgeotext.geotext.GeoTextConfiguration)
* [GeoText](#flashgeotext.geotext.GeoText)
* [\_\_init\_\_](#flashgeotext.geotext.GeoText.__init__)
* [extract](#flashgeotext.geotext.GeoText.extract)

<a name="flashgeotext.geotext"></a>
# flashgeotext.geotext

<a name="flashgeotext.geotext.GeoTextConfiguration"></a>
## GeoTextConfiguration Objects

```python
class GeoTextConfiguration(BaseModel)
```

GeoText configuration

**Arguments**:

- `use_demo_data` _bool_ - load demo data or not, default True
- `case_sensitive` _bool_ - case sensitive lookup, default True

<a name="flashgeotext.geotext.GeoText"></a>
## GeoText Objects

Expand All @@ -29,7 +44,7 @@ span info.
```python
from flashgeotext.geotext import GeoText

geotext = GeoText(use_demo_data=True)
geotext = GeoText()

input_text = '''Shanghai. The Chinese Ministry of Finance in Shanghai said that China plans
to cut tariffs on $75 billion worth of goods that the country
Expand Down Expand Up @@ -65,14 +80,14 @@ span info.
#### \_\_init\_\_

```python
| __init__(use_demo_data: bool = True) -> None
| __init__(config: GeoTextConfiguration = GeoTextConfiguration().dict()) -> None
```

instantiate an empty LookupDataPool, optionally/by default with demo data

**Arguments**:

- `use_demo_data` _bool_ - optionally use demo data, defaults to True.
- `config` - GeoTextConfiguration = { use_demo_data: True, case_sensitive: True }.

<a name="flashgeotext.geotext.GeoText.extract"></a>
#### extract
Expand Down
2 changes: 1 addition & 1 deletion docs/lookup.md
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ Collection of KeywordProcessors from LookupData
#### add

```python
| add(lookup: LookupData, update: bool = False) -> None
| add(lookup: LookupData, update: bool = False, case_sensitive: bool = True) -> None
```

Add LookupData to LookupDataPool
Expand Down
32 changes: 25 additions & 7 deletions flashgeotext/geotext.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,23 @@
from typing import Optional

from pydantic import BaseModel

from flashgeotext.lookup import LookupDataPool
from flashgeotext.lookup import MissingLookupDataError


class GeoTextConfiguration(BaseModel):
"""GeoText configuration
Args:
use_demo_data (bool): load demo data or not, default True
case_sensitive (bool): case sensitive lookup, default True
"""

use_demo_data: Optional[bool] = True
case_sensitive: Optional[bool] = True


class GeoText(LookupDataPool):
"""Extract LookupData from input text
Expand All @@ -16,7 +32,7 @@ class GeoText(LookupDataPool):
```python
from flashgeotext.geotext import GeoText
geotext = GeoText(use_demo_data=True)
geotext = GeoText()
input_text = '''Shanghai. The Chinese Ministry of Finance in Shanghai said that China plans
to cut tariffs on $75 billion worth of goods that the country
Expand Down Expand Up @@ -50,21 +66,23 @@ class GeoText(LookupDataPool):
"""

def __init__(self, use_demo_data: bool = True) -> None:
def __init__(
self, config: GeoTextConfiguration = GeoTextConfiguration().dict()
) -> None:
""" instantiate an empty LookupDataPool, optionally/by default with demo data
Args:
use_demo_data (bool): optionally use demo data, defaults to True.
config: GeoTextConfiguration = { use_demo_data: True, case_sensitive: True }.
"""
self.pool: dict = {}

if use_demo_data:
self._add_demo_data()
if config["use_demo_data"]:
self._add_demo_data(case_sensitive=config["case_sensitive"])

def extract(self, input_text: str, span_info: bool = True) -> dict:
"""Extract LookupData from an input_text
Arguments:
Args:
input_text (str): String to extract LookupData from.
span_info (bool): Optionally, return span_info. Defaults to True.
Expand Down Expand Up @@ -94,7 +112,7 @@ def _parse_extract(self, extract_data: list, span_info: bool = True) -> dict:
Parse flashtext.KeywordProcessor.extract_keywords() output to count occurrences,
and optionally span_info.
Arguments:
Args:
extract_data (list): flashtext.KeywordProcessor.extract_keywords() return value
span_info (bool): optionally, parse span_info
Expand Down
29 changes: 10 additions & 19 deletions flashgeotext/lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,18 +47,6 @@ class LookupValidation:
error_count (int): Error count in validation data.
errors (dict):
Example: {
"Berlin": [
"Berlin missing in list of synonyms",
"data['Berlin'] is not a list of synonyms"
]
}
Arguments:
status (str): Human-readable string containing the error status.
error_count (int): Error count in validation data.
errors (dict):
Example: {
"Berlin": [
"Berlin missing in list of synonyms",
Expand Down Expand Up @@ -161,7 +149,9 @@ class LookupDataPool:
def __init__(self) -> None:
self.pool: dict = {}

def add(self, lookup: LookupData, update: bool = False) -> None:
def add(
self, lookup: LookupData, update: bool = False, case_sensitive: bool = True
) -> None:
"""Add LookupData to LookupDataPool
Add LookupData to LookupDataPool.
Expand All @@ -170,17 +160,18 @@ def add(self, lookup: LookupData, update: bool = False) -> None:
Args:
lookup (LookupData): LookupData to add to pool
update (bool): Allow update of an existing entry in LookupDataPool
update (bool): Allow update of an existing entry in LookupDataPool, default False
case_sensitive (bool): Whether keyword matching is case sensitive, default True
"""
if not isinstance(lookup, LookupData):
raise TypeError(f"lookup has to be instance of LookupData")
raise TypeError("lookup has to be instance of LookupData")

if lookup.name in self.pool and not update:
raise LookupDuplicateError(
f"'{lookup.name}' has already been added. Set update=True to update"
)
else:
self.pool[lookup.name] = KeywordProcessor(case_sensitive=True)
self.pool[lookup.name] = KeywordProcessor(case_sensitive=case_sensitive)
self.pool[lookup.name].add_keywords_from_dict(lookup.data)

# if there is a script specified, then update non word boundaries with
Expand Down Expand Up @@ -209,7 +200,7 @@ def remove_all(self):

self.pool = {}

def _add_demo_data(self):
def _add_demo_data(self, case_sensitive: bool = True):
"""(private) Add demo data to pool
Adds DEMODATA_CITIES and DEMODATA_COUNTRIES to LookupDataPool
Expand All @@ -220,8 +211,8 @@ def _add_demo_data(self):
countries = LookupData(
name="countries", data=load_data_from_file(file=DEMODATA_COUNTRIES)
)
self.add(cities)
self.add(countries)
self.add(cities, case_sensitive=case_sensitive)
self.add(countries, case_sensitive=case_sensitive)
logger.debug(f"demo data loaded for: {list(self.pool.keys())}")


Expand Down
2 changes: 1 addition & 1 deletion readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ Extract and count countries and cities (+their synonyms) from text, like [GeoTex
```python
from flashgeotext.geotext import GeoText

geotext = GeoText(use_demo_data=True)
geotext = GeoText()

input_text = '''Shanghai. The Chinese Ministry of Finance in Shanghai said that China plans
to cut tariffs on $75 billion worth of goods that the country
Expand Down
2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ def test_data_countries():

@pytest.fixture
def geotext():
return GeoText(use_demo_data=True)
return GeoText()
16 changes: 13 additions & 3 deletions tests/integration/test_geotext_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


def test_geotext_demo_data():
geotext = GeoText(use_demo_data=True)
geotext = GeoText()

assert geotext.pool["cities"]
assert geotext.pool["countries"]
Expand All @@ -20,7 +20,7 @@ def test_geotext_extract(geotext):


def test_geotext_raises_on_empty_pool():
output = GeoText(use_demo_data=False)
output = GeoText(config={"use_demo_data": False})

with pytest.raises(MissingLookupDataError):
output.extract(text)
Expand All @@ -39,6 +39,16 @@ def test_geotext_extract_with_count_span_info_false(geotext):
assert output["cities"]["Berlin"]["span_info"] == [(0, 6), (43, 49)]


def test_geotext_case_sensitive_demo_data():
geotext = GeoText(config={"use_demo_data": True, "case_sensitive": False})
text = "berlin ist ne tolle stadt"
output = geotext.extract(input_text=text, span_info=True)

print(output)

assert output["cities"]["Berlin"]["span_info"] == [(0, 6)]


# tests used in geotext (https://github.com/elyase/geotext)


Expand Down Expand Up @@ -161,7 +171,7 @@ def test_geotext_with_script_added_to_non_word_boundaries():
cyrillic = LookupData(
name="test_1", data={"Нижневартовск": ["Нижневартовск"]}, script="cyrillic"
)
geotext = GeoText(use_demo_data=False)
geotext = GeoText(config={"use_demo_data": False})
geotext.add(cyrillic)

text = """
Expand Down

0 comments on commit aa978a1

Please sign in to comment.