diff --git a/flashgeotext/extractor.py b/flashgeotext/extractor.py
index 3d62d60..55f4a98 100644
--- a/flashgeotext/extractor.py
+++ b/flashgeotext/extractor.py
@@ -1,5 +1,4 @@
 import json
-from typing import Union
 
 from flashtext import KeywordProcessor
 
@@ -12,30 +11,18 @@ class Alphabets(object):
 
 class DemoData(object):
 
-    cities: Union[list, dict] = []
-    countries: Union[list, dict] = []
+    cities: dict = {}
+    countries: dict = {}
 
-    def __init__(self, with_synonyms: bool = True):
-        self.load_demo_data(with_synonyms=with_synonyms)
+    def load(self) -> None:
 
-    def load_demo_data(self, with_synonyms: bool = True) -> None:
+        self.cities = self._load_data_dict(file=DEMODATA_CITIES)
+        self.countries = self._load_data_dict(file=DEMODATA_COUNTRIES)
 
-        if with_synonyms:
-            self.cities = self._load_data_dict(file=DEMODATA_CITIES)
-            self.countries = self._load_data_dict(file=DEMODATA_COUNTRIES)
-
-        else:
-            self.cities = self._load_data_list(file=DEMODATA_CITIES)
-            self.countries = self._load_data_list(file=DEMODATA_COUNTRIES)
-
-    def _load_data_dict(self, file: str = "") -> dict:
+    def _load_data_dict(self, file: str) -> dict:
         with open(file, "r", encoding="utf-8") as f:
             return json.loads(f.read())
 
-    def _load_data_list(self, file: str = "") -> list:
-        with open(file, "r", encoding="utf-8") as f:
-            return list(json.loads(f.read()).keys())
-
 class Extractor:
 
     cities: KeywordProcessor = KeywordProcessor(case_sensitive=True)
diff --git a/scripts/Untitled.ipynb b/scripts/Untitled.ipynb
deleted file mode 100644
index cce25f8..0000000
--- a/scripts/Untitled.ipynb
+++ /dev/null
@@ -1,708 +0,0 @@
[708 deleted lines of scratch-notebook JSON omitted: an %autoreload preamble, a pasted help(pd.read_csv) dump, a flashtext KeywordProcessor experiment over the German entries of geonames' cities15000.txt, a CLDR alphabet table (https://www.unicode.org/cldr/charts/latest/summary/root.html) filtered to Germany, and a short flashgeotext Extractor smoke test.]
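Note: the {clean name: [synonyms, ...]} dict shape that DemoData.load() now always produces is the shape flashtext consumes, which is what the deleted notebook was experimenting with. A minimal sketch of that hand-off (the two-entry dict here is made up for illustration; the real data comes from the DEMODATA_CITIES JSON):

    from flashtext import KeywordProcessor

    # clean name -> synonym list, the same shape DemoData.load() returns
    german_cities = {"Berlin": ["Berlin", "Berlins"], "Erlangen": ["Erlangen"]}

    processor = KeywordProcessor(case_sensitive=True)
    processor.add_keywords_from_dict(german_cities)

    # flashtext reports the clean name plus the matched span
    print(processor.extract_keywords(
        "Berlins ist die schoenste Stadt Deutschlands.", span_info=True
    ))
    # expected: [('Berlin', 0, 7)]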
" - ], - "text/plain": [ - " name silcode source region countries script \\\n", - "dsb Lower Sorbian NaN cldr_dsb eur Germany latn \n", - "hsb Upper Sorbian NaN cldr_hsb,udhr_hsb eur Germany latn \n", - "ksh Kölsch (Colognian) NaN cldr_ksh eur Germany latn \n", - "\n", - " speakers letter punctuation \\\n", - "dsb 6900 óÓčćěłńŕšśžźČĆĚŁŃŔŠŚŽŹ «»§‐–—…‘’‚“„ \n", - "hsb 13000 čćźěłńřšžČĆŹĚŁŃŘŠŽóÓ «»§‐–—…‘’‚“„ \n", - "ksh 250000 ėœůĖŒŮåäæëößüÅÄÆËÖÜ ‐–—…‘‚“„†‡§⸗ \n", - "\n", - " local ... related separator \\\n", - "dsb dolnoserbšćina, dolnoserbski [ˈdɔlnɔˌsɛrskʲi] ... NaN NaN \n", - "hsb Hornjoserbski, Hornjoserbšćina ... NaN NaN \n", - "ksh Kölsch ... NaN NaN \n", - "\n", - " symbol number redirect other sildcode deprecated more symbols \n", - "dsb NaN NaN NaN NaN NaN NaN NaN NaN \n", - "hsb NaN NaN NaN NaN NaN NaN NaN NaN \n", - "ksh ° NaN NaN NaN NaN NaN NaN NaN \n", - "\n", - "[3 rows x 26 columns]" - ] - }, - "execution_count": 102, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df2.T[df2.T.countries == \"Germany\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# flashgeotext" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from flashgeotext.extractor import Extractor" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "ext = Extractor()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "hasattr(ext, \"cities\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.5" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/tests/unit/test_extractor.py b/tests/unit/test_extractor.py index 722d226..f39b1c2 100644 --- a/tests/unit/test_extractor.py +++ b/tests/unit/test_extractor.py @@ -14,3 +14,11 @@ def test_demodata_content(): assert hasattr(demodata, "cities") assert hasattr(demodata, "countries") + + +def test_demodata_load_data(): + demodata = DemoData() + demodata.load() + + assert demodata.cities + assert demodata.countries