From 42f362fee9f3ee184e49f49d4c0a76dc377383c3 Mon Sep 17 00:00:00 2001 From: iwpnd Date: Tue, 18 Feb 2020 14:33:37 +0100 Subject: [PATCH] added geotext class that upon init loads demo data by default. added tests. updated extractor.py --- TODO | 1 + flashgeotext/extractor.py | 53 -------------------------------- flashgeotext/geotext.py | 10 ++++++ flashgeotext/lookup.py | 26 +++++++++++++--- tests/unit/test_dataprocessor.py | 35 ++++++++++----------- tests/unit/test_extractor.py | 0 6 files changed, 49 insertions(+), 76 deletions(-) create mode 100644 flashgeotext/geotext.py create mode 100644 tests/unit/test_extractor.py diff --git a/TODO b/TODO index be571aa..59c431d 100644 --- a/TODO +++ b/TODO @@ -3,5 +3,6 @@ DataclassApproach: ✔ LookupData processor with .add() method to add LookupData instances to a dictionary of KeywordProcessors @done(20-02-18 08:54) ✔ validate if LookupData has already been added to LookupDataProcessor.pool, add update flag to let the user update the pool @done(20-02-18 08:55) ✔ add remove method to LookupDataProcessor @done(20-02-18 13:27) + ☐ add demo data method ☐ Extractor to extract LookupData from an input text ☐ Extractor should count the occurances, add span_info and update a result dictionary to be returned diff --git a/flashgeotext/extractor.py b/flashgeotext/extractor.py index 2ff7224..e8a21ab 100644 --- a/flashgeotext/extractor.py +++ b/flashgeotext/extractor.py @@ -1,56 +1,3 @@ -import json - -from flashtext import KeywordProcessor - -from flashgeotext.settings import DEMODATA_CITIES -from flashgeotext.settings import DEMODATA_COUNTRIES - - -class Alphabets(object): - pass - - class Extractor(object): - _cities_processor: KeywordProcessor = KeywordProcessor(case_sensitive=True) - _countries_processor: KeywordProcessor = KeywordProcessor(case_sensitive=True) - def extract(self, input_text: str, span_info: bool = True): pass - - -class GeoText(Extractor): - cities: dict = {} - countries: dict = {} - - def __init__(self, use_demo_data: bool = True): - self._flush_processor() - - if use_demo_data: - self.cities = load_data_from_file(file=DEMODATA_CITIES) - self.countries = load_data_from_file(file=DEMODATA_COUNTRIES) - self.build_cities_processor() - self.build_countries_processor() - - def build(self) -> None: - self._flush_processor() - self.build_cities_processor() - self.build_countries_processor() - - def build_cities_processor(self) -> None: - if self.cities: - self._cities_processor.add_keywords_from_dict(self.cities) - - def build_countries_processor(self) -> None: - if self.countries: - self._countries_processor.add_keywords_from_dict(self.countries) - - def _flush_processor(self) -> None: - self._cities_processor.keyword_trie_dict = dict() - self._cities_processor._terms_in_trie = 0 - self._countries_processor.keyword_trie_dict = dict() - self._countries_processor._terms_in_trie = 0 - - -def load_data_from_file(file: str) -> dict: - with open(file, "r", encoding="utf-8") as f: - return json.loads(f.read()) diff --git a/flashgeotext/geotext.py b/flashgeotext/geotext.py new file mode 100644 index 0000000..b5cd0d6 --- /dev/null +++ b/flashgeotext/geotext.py @@ -0,0 +1,10 @@ +from flashgeotext.extractor import Extractor +from flashgeotext.lookup import LookupDataPool + + +class GeoText(LookupDataPool, Extractor): + def __init__(self, use_demo_data: bool = True) -> None: + self.pool: dict = {} + + if use_demo_data: + self._add_demo_data() diff --git a/flashgeotext/lookup.py b/flashgeotext/lookup.py index 3b84399..68a5824 100644 --- a/flashgeotext/lookup.py +++ b/flashgeotext/lookup.py @@ -1,7 +1,12 @@ +import json + from flashtext import KeywordProcessor from loguru import logger from pydantic import BaseModel +from flashgeotext.settings import DEMODATA_CITIES +from flashgeotext.settings import DEMODATA_COUNTRIES + class LookupDuplicateError(Exception): pass @@ -12,7 +17,7 @@ class LookupData(BaseModel): data: dict -class LookupDataProcessor: +class LookupDataPool: """ """ @@ -34,7 +39,18 @@ def remove(self, lookup_to_remove: str) -> None: del self.pool[lookup_to_remove] logger.debug(f"{lookup_to_remove} removed from pool") - -class GeoText(LookupDataProcessor): - def __init__(self): - pass + def _add_demo_data(self): + cities = LookupData( + name="cities", data=load_data_from_file(file=DEMODATA_CITIES) + ) + countries = LookupData( + name="countries", data=load_data_from_file(file=DEMODATA_COUNTRIES) + ) + self.add(cities) + self.add(countries) + logger.debug(f"demo data loaded for: {list(self.pool.keys())}") + + +def load_data_from_file(file: str) -> dict: + with open(file, "r", encoding="utf-8") as f: + return json.loads(f.read()) diff --git a/tests/unit/test_dataprocessor.py b/tests/unit/test_dataprocessor.py index 87a71fc..298b1dd 100644 --- a/tests/unit/test_dataprocessor.py +++ b/tests/unit/test_dataprocessor.py @@ -1,8 +1,9 @@ import pytest from pydantic import ValidationError +from flashgeotext.geotext import GeoText from flashgeotext.lookup import LookupData -from flashgeotext.lookup import LookupDataProcessor +from flashgeotext.lookup import LookupDataPool from flashgeotext.lookup import LookupDuplicateError @@ -23,51 +24,49 @@ def test_lookup_data_fails(): assert isinstance(lookup.data, dict) -def test_lookup_data_processor(test_data_cities): +def test_lookup_data_pool(test_data_cities): lookup = LookupData(name="cities", data=test_data_cities) - processor = LookupDataProcessor() - processor.add(lookup) - - assert processor - - -def test_lookup_data_processor_pool(test_data_cities): - lookup = LookupData(name="cities", data=test_data_cities) - - processor = LookupDataProcessor() + processor = LookupDataPool() processor.add(lookup) assert processor.pool[lookup.name] -def test_lookup_data_processor_pool_duplicate_data(test_data_cities): +def test_lookup_data_pool_duplicate_data(test_data_cities): lookup = LookupData(name="cities", data=test_data_cities) - processor = LookupDataProcessor() + processor = LookupDataPool() processor.add(lookup) with pytest.raises(LookupDuplicateError): processor.add(lookup) -def test_lookup_data_processor_pool_duplicate_data_update_true(test_data_cities): +def test_lookup_data_pool_duplicate_data_update_true(test_data_cities): lookup = LookupData(name="cities", data=test_data_cities) - processor = LookupDataProcessor() + processor = LookupDataPool() processor.add(lookup=lookup, update=True) processor.add(lookup=lookup, update=True) assert processor.pool["cities"] -def test_lookup_data_processor_remove_lookup_from_pool(test_data_cities): +def test_lookup_data_pool_remove_lookup_from_pool(test_data_cities): lookup = LookupData(name="cities", data=test_data_cities) - processor = LookupDataProcessor() + processor = LookupDataPool() processor.add(lookup) processor.remove(lookup_to_remove="cities") with pytest.raises(KeyError): assert processor.pool["cities"] + + +def test_lookup_data_pool_with_test_data(): + geotext = GeoText(use_demo_data=True) + + assert geotext.pool["cities"] + assert geotext.pool["countries"] diff --git a/tests/unit/test_extractor.py b/tests/unit/test_extractor.py new file mode 100644 index 0000000..e69de29