-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from iwpnd/dataclass-approach
Refactoring
- Loading branch information
Showing
11 changed files
with
451 additions
and
150 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,16 +1,27 @@ | ||
Extractor: | ||
✔ write extractor class with KeywordProcessor globals for now @done(20-02-17 13:49) | ||
✔ write test for extract method in Extractor @done(20-02-17 14:29) | ||
✔ add extract method to Extractor @done(20-02-17 14:29) | ||
✔ refactor, so extract actually extracts content @done(20-02-17 15:45) | ||
☐ write test to check if extract returns counts and span_info properly | ||
☐ refactor extract so it returns counts and span_info properly | ||
LookupData: | ||
✔ LookupData class with .name and .data, validated with pydantic @done(20-02-18 08:54) | ||
✔ see if pytest.raises(ValidationError) @done(20-02-19 09:13) | ||
✘ use custom validator to check .data @cancelled(20-02-19 09:13) | ||
☐ since a pydantic validator on the input would be too expensive, add an optional validate method | ||
☐ use .validate() to write tests to validate demo_data (cities, countries) | ||
|
||
Dataclass: | ||
☐ maybe have a Class per KeywordProcessor to pass around? | ||
LookupDataPool: | ||
✔ LookupDataPool with .add() method to add LookupData instances to a dictionary of KeywordProcessors @done(20-02-18 08:54) | ||
✔ validate if LookupData has already been added to LookupDataPool.pool, add update flag to let the user update the pool @done(20-02-18 08:55) | ||
✔ add remove method to LookupDataPool @done(20-02-18 13:27) | ||
|
||
GeoText: | ||
✔ add demo data method @done(20-02-18 14:30) | ||
✔ Extractor to extract LookupData from an input text @done(20-02-18 15:36) | ||
✔ Extractor should count the occurrences, add span_info and update a result dictionary to be returned @done(20-02-18 15:49) | ||
✔ optionally be able to set whether span_info is returned or not @done(20-02-18 15:49) | ||
☐ handle different alphabets as non_word_boundaries | ||
☐ maybe add toponym? | ||
|
||
GeoLookup: | ||
✔ build keywordprocessors with demo data by default @done(20-02-17 13:49) | ||
✔ make sure that user can also add her own data @done(20-02-17 13:49) | ||
✔ test init with multiple inputs using pytest.mark.parametrize @done(20-02-17 13:49) | ||
General: | ||
☐ docs | ||
☐ readme | ||
☐ version | ||
☐ travis.yml or github actions | ||
☐ release to testpypi | ||
☐ release to pypi |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
from flashgeotext.lookup import LookupDataPool | ||
from flashgeotext.lookup import MissingLookupDataError | ||
|
||
|
||
class GeoText(LookupDataPool):
    """Extract the keywords of every lookup in the pool from a text.

    Pool handling (``add``, ``remove``, ``_add_demo_data``) is inherited from
    ``LookupDataPool``; every pool entry is queried through its
    ``extract_keywords`` method.
    """

    def __init__(self, use_demo_data: bool = True) -> None:
        """Create an empty pool and optionally fill it with the demo data.

        Args:
            use_demo_data: When true, ``_add_demo_data()`` is called so the
                instance is usable right away.
        """
        # The parent initialiser creates the empty ``self.pool`` dictionary.
        super().__init__()

        if use_demo_data:
            self._add_demo_data()

    def extract(self, input_text: str, span_info: bool = True) -> dict:
        """Run every lookup in the pool against ``input_text``.

        Args:
            input_text: Text to search.
            span_info: Forwarded to ``extract_keywords``; when true each hit
                also records its position pair.

        Returns:
            A dictionary ``{lookup name: parsed hits}`` where the parsed hits
            have the shape produced by ``_parse_extract``.

        Raises:
            MissingLookupDataError: If the pool is empty.
        """
        if not self.pool:
            raise MissingLookupDataError(
                "Empty LookupDataPool. use .add(LookupData) to add data."
            )

        return {
            name: self._parse_extract(
                processor.extract_keywords(input_text, span_info=span_info),
                span_info=span_info,
            )
            for name, processor in self.pool.items()
        }

    def _parse_extract(self, extract_data: list, span_info: bool = True) -> dict:
        """Aggregate raw hits into per-keyword counts (and positions).

        Args:
            extract_data: Raw result of ``extract_keywords``. With
                ``span_info`` each entry is indexed as
                ``(keyword, position, position)`` -- presumably start and end
                offsets as reported by flashtext (confirm); without it each
                entry is the keyword itself.
            span_info: Selects which of the two entry shapes is expected.

        Returns:
            ``{keyword: {"count": int}}``, with an additional
            ``"span_info": [(pos, pos), ...]`` list per keyword when
            ``span_info`` is true.
        """
        parsed_extract: dict = {}

        if span_info:
            for entry in extract_data:
                record = parsed_extract.setdefault(
                    entry[0], {"count": 0, "span_info": []}
                )
                record["count"] += 1
                # Append in place instead of rebuilding the list on every hit.
                record["span_info"].append((entry[1], entry[2]))
        else:
            for keyword in extract_data:
                record = parsed_extract.setdefault(keyword, {"count": 0})
                record["count"] += 1

        return parsed_extract
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
import json | ||
|
||
from flashtext import KeywordProcessor | ||
from loguru import logger | ||
from pydantic import BaseModel | ||
from pydantic import StrictStr | ||
|
||
from flashgeotext.settings import DEMODATA_CITIES | ||
from flashgeotext.settings import DEMODATA_COUNTRIES | ||
|
||
|
||
class LookupDuplicateError(Exception):
    """Raised when a lookup name is already in the pool and update is not set."""
|
||
|
||
class MissingLookupDataError(Exception):
    """Raised when an extraction is requested while the lookup pool is empty."""
|
||
|
||
class LookupData(BaseModel):
    """Named lookup table mapping a keyword to its list of synonyms.

    The content of ``data`` is only inspected on demand through
    ``validate()``.
    """

    # Name under which the lookup is registered in a pool.
    name: StrictStr
    # Mapping of keyword -> list of synonyms (the keyword itself is expected
    # to be part of that list, see ``validate``).
    data: dict

    # NOTE(review): pydantic's BaseModel appears to ship its own ``validate``;
    # this instance method shadows it -- confirm that is intended.
    def validate(self) -> dict:
        """Check ``data`` and report problems instead of raising.

        Two checks run for every ``key, value`` pair:

        * ``value`` must be a list;
        * ``key`` must be contained in ``value``.

        Returns:
            A dictionary with the keys ``"status"`` (summary message),
            ``"error_count"`` (number of failed checks) and ``"errors"``
            (``{key: [message, ...]}`` for every key that failed a check).
        """
        errors: dict = {}
        error_count = 0

        for key, value in self.data.items():
            messages = []

            if not isinstance(value, list):
                messages.append(f"data[{key}] is not a list of synonyms")

            try:
                key_missing = key not in value
            except TypeError:
                # ``value`` does not support this membership test (e.g. None
                # or a number): report the key as missing rather than letting
                # the validation helper itself crash.
                key_missing = True

            if key_missing:
                messages.append(f"{key} missing in list of synonyms")

            if messages:
                errors[key] = messages
                error_count += len(messages)

        if error_count > 0:
            status = f"Found {error_count} errors"
        else:
            status = "No errors detected"

        return {"status": status, "error_count": error_count, "errors": errors}
|
||
|
||
class LookupDataPool:
    """Collection of ``KeywordProcessor`` instances keyed by lookup name.

    Attributes:
        pool: Dictionary mapping ``LookupData.name`` to the case-sensitive
            ``KeywordProcessor`` built from that lookup's ``data``.
    """

    def __init__(self) -> None:
        """Start with an empty pool."""
        self.pool: dict = {}

    def add(self, lookup: LookupData, update: bool = False) -> None:
        """Build a keyword processor for ``lookup`` and store it in the pool.

        Args:
            lookup: The lookup data to register under ``lookup.name``.
            update: Allow replacing an entry that already exists.

        Raises:
            TypeError: If ``lookup`` is not a ``LookupData`` instance.
            LookupDuplicateError: If ``lookup.name`` is already in the pool
                and ``update`` is false.
        """
        if not isinstance(lookup, LookupData):
            raise TypeError("lookup has to be instance of LookupData")

        if lookup.name in self.pool and not update:
            raise LookupDuplicateError(
                f"'{lookup.name}' has already been added. Set update=True to update"
            )

        # Fill the processor before publishing it, so a failure while loading
        # the keywords neither leaves an empty entry behind nor replaces an
        # existing one.
        processor = KeywordProcessor(case_sensitive=True)
        processor.add_keywords_from_dict(lookup.data)
        self.pool[lookup.name] = processor
        logger.debug(f"{lookup.name} added to pool")

    def remove(self, lookup_to_remove: str) -> None:
        """Drop the entry named ``lookup_to_remove``; unknown names are ignored."""
        if lookup_to_remove in self.pool:
            del self.pool[lookup_to_remove]
            logger.debug(f"{lookup_to_remove} removed from pool")

    def _add_demo_data(self) -> None:
        """Load the cities and countries demo files and add both to the pool."""
        cities = LookupData(
            name="cities", data=load_data_from_file(file=DEMODATA_CITIES)
        )
        countries = LookupData(
            name="countries", data=load_data_from_file(file=DEMODATA_COUNTRIES)
        )
        self.add(cities)
        self.add(countries)
        logger.debug(f"demo data loaded for: {list(self.pool.keys())}")
|
||
|
||
def load_data_from_file(file: str) -> dict:
    """Parse the UTF-8 encoded JSON document stored at ``file`` and return it."""
    with open(file, mode="r", encoding="utf-8") as handle:
        content = json.load(handle)
    return content
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
flashtext | ||
flashtext==2.7 | ||
loguru==0.4.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,21 @@ | ||
import pytest | ||
|
||
from flashgeotext.extractor import GeoText | ||
from flashgeotext.geotext import GeoText | ||
|
||
|
||
@pytest.fixture | ||
def geotext_demodata(): | ||
geotext_demodata = GeoText() | ||
def test_data_cities(): | ||
return {"Berlin": ["Berlin", "Dickes B"], "Hamburg": ["Hamburg", "Dickes H"]} | ||
|
||
return geotext_demodata | ||
|
||
@pytest.fixture
def test_data_countries():
    """Provide a small country lookup: keyword -> accepted spellings."""
    germany = ["Germany", "Deutschland"]
    netherlands = ["The Netherlands", "Netherlands", "Holland"]
    return {"Germany": germany, "Netherlands": netherlands}
|
||
|
||
@pytest.fixture
def geotext():
    """Provide a GeoText instance created with ``use_demo_data=True``."""
    instance = GeoText(use_demo_data=True)
    return instance
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.