Commit
Merge pull request #2 from iwpnd/dataclass-approach
Refactoring
iwpnd committed Feb 19, 2020
2 parents 9e5f210 + f0d5e11 commit adb26e6
Showing 11 changed files with 451 additions and 150 deletions.
37 changes: 24 additions & 13 deletions TODO
@@ -1,16 +1,27 @@
Extractor:
✔ write extractor class with KeywordProcessor globals for now @done(20-02-17 13:49)
✔ write test for extract method in Extractor @done(20-02-17 14:29)
✔ add extract method to Extractor @done(20-02-17 14:29)
✔ refactor, so extract actually extracts content @done(20-02-17 15:45)
☐ write test to check if extract returns counts and span_info properly
☐ refactor extract so it returns counts and span_info properly
LookupData:
✔ LookupData class with .name and .data, validated with pydantic @done(20-02-18 08:54)
✔ see if pytest.raises(ValidationError) @done(20-02-19 09:13)
✘ use custom validator to check .data @cancelled(20-02-19 09:13)
☐ since a pydantic validator on the input would be too expensive, add an optional validate method
☐ use .validate() to write tests to validate demo_data (cities, countries)

Dataclass:
☐ maybe have a Class per KeywordProcessor to pass around?
LookupDataPool:
✔ LookupDataPool with .add() method to add LookupData instances to a dictionary of KeywordProcessors @done(20-02-18 08:54)
✔ validate if LookupData has already been added to LookupDataPool.pool, add update flag to let the user update the pool @done(20-02-18 08:55)
✔ add remove method to LookupDataPool @done(20-02-18 13:27)

GeoText:
✔ add demo data method @done(20-02-18 14:30)
✔ Extractor to extract LookupData from an input text @done(20-02-18 15:36)
✔ Extractor should count the occurrences, add span_info and update a result dictionary to be returned @done(20-02-18 15:49)
✔ optionally be able to set whether span_info is returned or not @done(20-02-18 15:49)
☐ handle different alphabets as non_word_boundaries
☐ maybe add toponym?

GeoLookup:
✔ build keywordprocessors with demo data by default @done(20-02-17 13:49)
✔ make sure that the user can also add her own data @done(20-02-17 13:49)
✔ test init with multiple inputs using pytest.mark.parametrize @done(20-02-17 13:49)
General:
☐ docs
☐ readme
☐ version
☐ travis.yml or github actions
☐ release to testpypi
☐ release to pypi
59 changes: 0 additions & 59 deletions flashgeotext/extractor.py

This file was deleted.

53 changes: 53 additions & 0 deletions flashgeotext/geotext.py
@@ -0,0 +1,53 @@
from flashgeotext.lookup import LookupDataPool
from flashgeotext.lookup import MissingLookupDataError


class GeoText(LookupDataPool):
def __init__(self, use_demo_data: bool = True) -> None:
self.pool: dict = {}

if use_demo_data:
self._add_demo_data()

def extract(self, input_text: str, span_info: bool = True) -> dict:
if not self.pool:
raise MissingLookupDataError(
"Empty LookupDataPool. use .add(LookupData) to add data."
)

output: dict = {}

for lookup in self.pool.keys():
extract = self.pool[lookup].extract_keywords(
input_text, span_info=span_info
)
output[lookup] = self._parse_extract(extract, span_info=span_info)

return output

def _parse_extract(self, extract_data: list, span_info: bool = True) -> dict:
parsed_extract: dict = {}

if span_info:
for entry in extract_data:
if entry[0] not in parsed_extract:
parsed_extract[entry[0]] = {
"count": 1,
"span_info": [(entry[1], entry[2])],
}
else:
parsed_extract[entry[0]]["count"] = (
parsed_extract[entry[0]]["count"] + 1
)
parsed_extract[entry[0]]["span_info"] = parsed_extract[entry[0]][
"span_info"
] + [(entry[1], entry[2])]

else:
for entry in extract_data:
if entry not in parsed_extract:
parsed_extract[entry] = {"count": 1}
else:
parsed_extract[entry]["count"] = parsed_extract[entry]["count"] + 1

return parsed_extract
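
A minimal usage sketch for the GeoText class added above (not part of the commit; the example text, the span offsets, and the assumption that the demo data contains "Berlin" and "Germany" are illustrative only):

from flashgeotext.geotext import GeoText

geotext = GeoText(use_demo_data=True)

input_text = "Berlin is the capital of Germany."

# With span_info=True every match carries its character offsets.
result = geotext.extract(input_text, span_info=True)
# Expected shape, assuming the demo data includes "Berlin" and "Germany":
# {
#     "cities": {"Berlin": {"count": 1, "span_info": [(0, 6)]}},
#     "countries": {"Germany": {"count": 1, "span_info": [(25, 32)]}},
# }

# With span_info=False only the counts are kept.
counts = geotext.extract(input_text, span_info=False)
# {"cities": {"Berlin": {"count": 1}}, "countries": {"Germany": {"count": 1}}}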
90 changes: 90 additions & 0 deletions flashgeotext/lookup.py
@@ -0,0 +1,90 @@
import json

from flashtext import KeywordProcessor
from loguru import logger
from pydantic import BaseModel
from pydantic import StrictStr

from flashgeotext.settings import DEMODATA_CITIES
from flashgeotext.settings import DEMODATA_COUNTRIES


class LookupDuplicateError(Exception):
pass


class MissingLookupDataError(Exception):
pass


class LookupData(BaseModel):
name: StrictStr
data: dict

def validate(self) -> dict:
validation = {}
validation["status"] = "No errors detected"
validation["error_count"] = 0
validation["errors"] = {}

for key, value in self.data.items():
if not isinstance(value, list):
validation["errors"][key] = [f"data[{key}] is not a list of synonyms"]
validation["error_count"] = validation["error_count"] + 1

if key not in value:
if key in validation["errors"]:
validation["errors"][key] = validation["errors"][key] + [
f"{key} missing in list of synonyms"
]
else:
validation["errors"][key] = [f"{key} missing in list of synonyms"]

validation["error_count"] = validation["error_count"] + 1

if validation["error_count"] > 0:
validation["status"] = f"Found {validation['error_count']} errors"

return validation


class LookupDataPool:
"""
"""

def __init__(self) -> None:
self.pool: dict = {}

def add(self, lookup: LookupData, update: bool = False) -> None:
if not isinstance(lookup, LookupData):
raise TypeError(f"lookup has to be instance of LookupData")

if lookup.name in self.pool and not update:
raise LookupDuplicateError(
f"'{lookup.name}' has already been added. Set update=True to update"
)
else:
self.pool[lookup.name] = KeywordProcessor(case_sensitive=True)
self.pool[lookup.name].add_keywords_from_dict(lookup.data)
logger.debug(f"{lookup.name} added to pool")

def remove(self, lookup_to_remove: str) -> None:
if lookup_to_remove in self.pool:
del self.pool[lookup_to_remove]
logger.debug(f"{lookup_to_remove} removed from pool")

def _add_demo_data(self):
cities = LookupData(
name="cities", data=load_data_from_file(file=DEMODATA_CITIES)
)
countries = LookupData(
name="countries", data=load_data_from_file(file=DEMODATA_COUNTRIES)
)
self.add(cities)
self.add(countries)
logger.debug(f"demo data loaded for: {list(self.pool.keys())}")


def load_data_from_file(file: str) -> dict:
with open(file, "r", encoding="utf-8") as f:
return json.loads(f.read())
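
A short sketch of how the lookup pieces above fit together (illustrative only; "districts" and its data are made-up example values, not part of the shipped resources):

from flashgeotext.lookup import LookupData, LookupDataPool

districts = LookupData(
    name="districts",
    data={
        "Kreuzberg": ["Kreuzberg", "Xberg"],
        "Mitte": ["Mitte", "Berlin-Mitte"],
    },
)

# Optional sanity check instead of an expensive pydantic validator on .data.
report = districts.validate()
# {'status': 'No errors detected', 'error_count': 0, 'errors': {}}
# A value that is not a list, or a list that does not contain its own key,
# would be reported under 'errors' instead.

pool = LookupDataPool()
pool.add(districts)               # wraps the data in a case-sensitive KeywordProcessor
pool.add(districts, update=True)  # without update=True this raises LookupDuplicateError
pool.remove("districts")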
2 changes: 1 addition & 1 deletion flashgeotext/resources/countries.json

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion requirements.txt
@@ -1 +1,2 @@
-flashtext
+flashtext==2.7
+loguru==0.4.1
19 changes: 15 additions & 4 deletions tests/unit/conftest.py
@@ -1,10 +1,21 @@
 import pytest
 
-from flashgeotext.extractor import GeoText
+from flashgeotext.geotext import GeoText
 
 
 @pytest.fixture
-def geotext_demodata():
-    geotext_demodata = GeoText()
+def test_data_cities():
+    return {"Berlin": ["Berlin", "Dickes B"], "Hamburg": ["Hamburg", "Dickes H"]}
 
-    return geotext_demodata
+
+@pytest.fixture
+def test_data_countries():
+    return {
+        "Germany": ["Germany", "Deutschland"],
+        "Netherlands": ["The Netherlands", "Netherlands", "Holland"],
+    }
+
+
+@pytest.fixture
+def geotext():
+    return GeoText(use_demo_data=True)
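
A hypothetical test on top of these fixtures (not part of this commit; the file name and assertion are assumptions) could look like:

# tests/unit/test_geotext.py (hypothetical)
from flashgeotext.lookup import LookupData


def test_extract_with_custom_lookup(geotext, test_data_cities):
    geotext.add(LookupData(name="test_cities", data=test_data_cities))

    result = geotext.extract("Dickes B ist Berlin.", span_info=False)

    # "Dickes B" and "Berlin" are both synonyms for Berlin in test_data_cities.
    assert result["test_cities"]["Berlin"]["count"] == 2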
72 changes: 0 additions & 72 deletions tests/unit/test_extractor.py

This file was deleted.

