Skip to content

Commit

Permalink
added geotext class that upon init loads demo data by default. added …
Browse files Browse the repository at this point in the history
…tests. updated extractor.py
  • Loading branch information
iwpnd committed Feb 18, 2020
1 parent 4c84dc8 commit 42f362f
Show file tree
Hide file tree
Showing 6 changed files with 49 additions and 76 deletions.
1 change: 1 addition & 0 deletions TODO
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,6 @@ DataclassApproach:
✔ LookupData processor with .add() method to add LookupData instances to a dictionary of KeywordProcessors @done(20-02-18 08:54)
✔ validate if LookupData has already been added to LookupDataProcessor.pool, add update flag to let the user update the pool @done(20-02-18 08:55)
✔ add remove method to LookupDataProcessor @done(20-02-18 13:27)
☐ add demo data method
☐ Extractor to extract LookupData from an input text
☐ Extractor should count the occurrences, add span_info and update a result dictionary to be returned
53 changes: 0 additions & 53 deletions flashgeotext/extractor.py
Original file line number Diff line number Diff line change
@@ -1,56 +1,3 @@
import json

from flashtext import KeywordProcessor

from flashgeotext.settings import DEMODATA_CITIES
from flashgeotext.settings import DEMODATA_COUNTRIES


class Alphabets:
    """Empty placeholder class; it defines no attributes or behaviour yet."""


class Extractor:
    """Base class holding the keyword processors used for extraction.

    Both processors are case sensitive and are defined on the class itself,
    so they are shared by every instance (and subclass) that does not
    assign its own.
    """

    # Class-level attributes: one shared processor per lookup category.
    _cities_processor: KeywordProcessor = KeywordProcessor(case_sensitive=True)
    _countries_processor: KeywordProcessor = KeywordProcessor(case_sensitive=True)

    def extract(self, input_text: str, span_info: bool = True):
        """Stub: extraction is not implemented yet, so this returns ``None``.

        Args:
            input_text: Text to search. Currently unused.
            span_info: Currently unused; presumably meant to toggle
                reporting of match positions (TODO confirm once
                implemented).
        """


class GeoText(Extractor):
    """Extractor with city and country lookup data.

    ``cities`` and ``countries`` are dictionaries handed to
    ``KeywordProcessor.add_keywords_from_dict``. Each instance owns its
    dictionaries and its keyword processors, so building or flushing one
    instance never affects another.
    """

    # Class-level defaults; every instance works on its own copy.
    cities: dict = {}
    countries: dict = {}

    def __init__(self, use_demo_data: bool = True):
        """Set up per-instance state and optionally load the demo data.

        Args:
            use_demo_data: When true, load the bundled demo city and
                country files and build both processors from them.
        """
        # The processors inherited from ``Extractor`` are class attributes
        # shared by all instances. Flushing or filling them here would
        # wipe or pollute every other instance, so shadow them with
        # fresh, empty, per-instance processors instead.
        self._cities_processor = KeywordProcessor(case_sensitive=True)
        self._countries_processor = KeywordProcessor(case_sensitive=True)

        # Copy the class-level dicts so in-place edits on one instance do
        # not leak into the shared defaults.
        self.cities = dict(self.cities)
        self.countries = dict(self.countries)

        if use_demo_data:
            self.cities = load_data_from_file(file=DEMODATA_CITIES)
            self.countries = load_data_from_file(file=DEMODATA_COUNTRIES)
            self.build_cities_processor()
            self.build_countries_processor()

    def build(self) -> None:
        """Rebuild both processors from the current lookup dictionaries."""
        self._flush_processor()
        self.build_cities_processor()
        self.build_countries_processor()

    def build_cities_processor(self) -> None:
        """Add ``self.cities`` to the cities processor if it is non-empty."""
        if self.cities:
            self._cities_processor.add_keywords_from_dict(self.cities)

    def build_countries_processor(self) -> None:
        """Add ``self.countries`` to the countries processor if non-empty."""
        if self.countries:
            self._countries_processor.add_keywords_from_dict(self.countries)

    def _flush_processor(self) -> None:
        """Empty both processors in place by resetting trie and term count."""
        for processor in (self._cities_processor, self._countries_processor):
            processor.keyword_trie_dict = dict()
            processor._terms_in_trie = 0


def load_data_from_file(file: str) -> dict:
    """Read a UTF-8 encoded JSON file and return the parsed content.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(file, "r", encoding="utf-8") as handle:
        # json.load parses straight from the file object.
        return json.load(handle)
10 changes: 10 additions & 0 deletions flashgeotext/geotext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from flashgeotext.extractor import Extractor
from flashgeotext.lookup import LookupDataPool


class GeoText(LookupDataPool, Extractor):
    """Lookup-data pool combined with the extractor interface."""

    def __init__(self, use_demo_data: bool = True) -> None:
        """Start with an empty pool and optionally fill it with demo data.

        Args:
            use_demo_data: When true, call ``_add_demo_data`` so the pool
                is populated right after construction.
        """
        self.pool: dict = {}

        if not use_demo_data:
            return

        self._add_demo_data()
26 changes: 21 additions & 5 deletions flashgeotext/lookup.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import json

from flashtext import KeywordProcessor
from loguru import logger
from pydantic import BaseModel

from flashgeotext.settings import DEMODATA_CITIES
from flashgeotext.settings import DEMODATA_COUNTRIES


class LookupDuplicateError(Exception):
    """Error for lookup data that is already present in a pool.

    The unit tests expect it when the same lookup is added to a pool a
    second time without ``update=True``.
    """
Expand All @@ -12,7 +17,7 @@ class LookupData(BaseModel):
data: dict


class LookupDataProcessor:
class LookupDataPool:
"""
"""

Expand All @@ -34,7 +39,18 @@ def remove(self, lookup_to_remove: str) -> None:
del self.pool[lookup_to_remove]
logger.debug(f"{lookup_to_remove} removed from pool")


class GeoText(LookupDataProcessor):
    """Placeholder subclass of ``LookupDataProcessor`` without own behaviour."""

    def __init__(self):
        """Do nothing.

        NOTE(review): the base class initializer is not called here;
        confirm that is intended.
        """
def _add_demo_data(self):
    """Load the bundled demo cities and countries and add them to the pool.

    Both files are read and wrapped in ``LookupData`` objects before
    anything is added, so a failure while reading leaves the pool
    untouched. Errors raised by ``self.add`` are propagated unchanged.
    """
    demo_sources = (
        ("cities", DEMODATA_CITIES),
        ("countries", DEMODATA_COUNTRIES),
    )
    demo_lookups = [
        LookupData(name=lookup_name, data=load_data_from_file(file=path))
        for lookup_name, path in demo_sources
    ]

    for demo_lookup in demo_lookups:
        self.add(demo_lookup)

    logger.debug(f"demo data loaded for: {list(self.pool.keys())}")


def load_data_from_file(file: str) -> dict:
    """Read a UTF-8 encoded JSON file and return the parsed content.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(file, "r", encoding="utf-8") as handle:
        # json.load parses straight from the file object.
        return json.load(handle)
35 changes: 17 additions & 18 deletions tests/unit/test_dataprocessor.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import pytest
from pydantic import ValidationError

from flashgeotext.geotext import GeoText
from flashgeotext.lookup import LookupData
from flashgeotext.lookup import LookupDataProcessor
from flashgeotext.lookup import LookupDataPool
from flashgeotext.lookup import LookupDuplicateError


Expand All @@ -23,51 +24,49 @@ def test_lookup_data_fails():
assert isinstance(lookup.data, dict)


def test_lookup_data_processor(test_data_cities):
def test_lookup_data_pool(test_data_cities):
lookup = LookupData(name="cities", data=test_data_cities)

processor = LookupDataProcessor()
processor.add(lookup)

assert processor


def test_lookup_data_processor_pool(test_data_cities):
lookup = LookupData(name="cities", data=test_data_cities)

processor = LookupDataProcessor()
processor = LookupDataPool()
processor.add(lookup)

assert processor.pool[lookup.name]


def test_lookup_data_processor_pool_duplicate_data(test_data_cities):
def test_lookup_data_pool_duplicate_data(test_data_cities):
lookup = LookupData(name="cities", data=test_data_cities)

processor = LookupDataProcessor()
processor = LookupDataPool()
processor.add(lookup)

with pytest.raises(LookupDuplicateError):
processor.add(lookup)


def test_lookup_data_processor_pool_duplicate_data_update_true(test_data_cities):
def test_lookup_data_pool_duplicate_data_update_true(test_data_cities):
lookup = LookupData(name="cities", data=test_data_cities)

processor = LookupDataProcessor()
processor = LookupDataPool()
processor.add(lookup=lookup, update=True)
processor.add(lookup=lookup, update=True)

assert processor.pool["cities"]


def test_lookup_data_processor_remove_lookup_from_pool(test_data_cities):
def test_lookup_data_pool_remove_lookup_from_pool(test_data_cities):
lookup = LookupData(name="cities", data=test_data_cities)

processor = LookupDataProcessor()
processor = LookupDataPool()
processor.add(lookup)

processor.remove(lookup_to_remove="cities")

with pytest.raises(KeyError):
assert processor.pool["cities"]


def test_lookup_data_pool_with_test_data():
    """A GeoText built with demo data has non-empty cities and countries."""
    demo_pool = GeoText(use_demo_data=True).pool

    for lookup_name in ("cities", "countries"):
        assert demo_pool[lookup_name]
Empty file added tests/unit/test_extractor.py
Empty file.

0 comments on commit 42f362f

Please sign in to comment.