In [1]:
# !pip install redisearch

In [2]:
from jamboree.utils.core import consistent_hash

  from pandas.util.testing import assert_frame_equal
  from pandas import DataFrame, Series, Panel


In [3]:
from redisearch import TextField, NumericField, TagField, GeoField

## Requirements Conversion

In [4]:
def is_nested(d):
    """Return True when at least one value of the mapping ``d`` is a dict."""
    for value in d.values():
        if isinstance(value, dict):
            return True
    return False

In [5]:
def is_gen_type(item, _type):
    """Loosely check whether ``item`` matches ``_type``.

    Returns True when ``item`` is an instance of ``_type``, a subclass of
    ``_type``, or compares equal to ``_type``.  Any ordinary error raised by
    those checks (for example the ``TypeError`` that ``issubclass`` raises
    when ``item`` is not a class) is treated as "no match" and yields False.
    """
    try:
        return isinstance(item, _type) or issubclass(item, _type) or item == _type
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate to the caller.
        return False

In [6]:
def name_match(item:str, name:str):
    """Case-insensitive equality test between two strings."""
    lowered_item = item.lower()
    lowered_name = name.lower()
    return lowered_item == lowered_name

In [7]:
# Sanity check: the comparison ignores letter case.
name_match("HELLO", "hello")

True

In [8]:
class GEO(type):
    """Geotype placeholder.

    A metaclass used as a marker for geo fields.  Classes created with
    ``metaclass=GEO`` display as "GEO" and are instantiated without arguments.
    """

    def __call__(cls):
        # Runs when a class whose metaclass is GEO is called; only ``__new__``
        # is used, so the class's ``__init__`` is not invoked.
        return cls.__new__(cls)

    def __repr__(self):
        return "GEO"

    def __str__(self):
        return "GEO"

In [9]:
# Example requirements dictionary: each key is a field name and each value is
# either a Python type or the string "GEO".
exp_requirements = {
    "name": str,
    "category": str,
    "location": "GEO",
    "subcategories": dict,
    "names": list
}

In [10]:
class SearchSchemaGenerator(object):
    """
        Creates a search schema and hash for a given requirements dictionary.

        A requirements dictionary maps field names to a Python type (``str``,
        ``float``, ``int``, ``list``, ``bool``, ``dict``) or to a geo marker
        (``GEO`` or the string ``"geo"`` in any letter case). ``process``
        validates such a dictionary and derives:

        * ``string_version`` - field name -> label ("TEXT", "NUMERIC", "TAG",
          "BOOL" or "GEO"); ``hashed`` passes it to ``consistent_hash``.
        * ``arr`` - the redisearch field objects returned by ``index_array``.
        * ``geo_fields`` - the names of the geo fields.
        * ``invalid_dict`` - the entry that failed validation, if any.

        The derived state is rebuilt from scratch on every validation pass, so
        one instance can process several schemas (or the same schema several
        times) without accumulating duplicate fields.
    """

    def __init__(self):
        self.start = {}
        self.string_version = {}
        self.sub_dict_keys = []
        self.arr = set()
        self.invalid_dict = {}
        self.geo_fields = set()

    def _reset_derived(self):
        """Discard everything computed by a previous validation pass."""
        self.string_version = {}
        self.arr = set()
        self.invalid_dict = {}
        self.geo_fields = set()

    @property
    def generics(self):
        """Python types accepted as plain (non-geo) field types."""
        return [
            str, float, int, list, bool, dict
        ]

    def is_generic(self, _k):
        """Return True when ``_k`` is one of the types in ``generics``."""
        return _k in self.generics

    def is_geo(self, k) -> bool:
        """Return True when ``k`` is a geo marker.

        A marker is anything ``is_gen_type`` accepts for ``GEO``, or a string
        equal to "geo" ignoring letter case.
        """
        if is_gen_type(k, GEO):
            return True
        return bool(is_gen_type(k, str) and name_match(k, "geo"))

    def to_str(self, i, k):
        """Register an index field named ``k`` for type ``i`` and return its label.

        Returns "BOOL", "NUMERIC", "TEXT" or "TAG". No branch matches ``dict``,
        so for it nothing is registered and None is returned.
        """
        if i == bool:
            # This will be text that we'll force exact queries on
            self.arr.add(TextField(k, no_stem=True))
            return "BOOL"
        elif i == float or i == int:
            self.arr.add(NumericField(k))
            return "NUMERIC"
        elif i == str:
            self.arr.add(TextField(k))
            return "TEXT"
        elif i == list:
            self.arr.add(TagField(k))
            return "TAG"
        # NOTE(review): ``dict`` is listed in ``generics`` but is not handled
        # here and ``sub_dict_keys`` is never filled - confirm whether nested
        # schemas are meant to be supported.

    @property
    def subs(self):
        """The ``sub_dict_keys`` list."""
        return self.sub_dict_keys

    @property
    def is_valid(self) -> bool:
        """Validate ``start`` and rebuild the derived schema state.

        Returns False for an empty schema, or as soon as a value is neither a
        generic type nor a geo marker (that entry is stored in
        ``invalid_dict``). Returns True once every field has been registered.
        """
        # Start from a clean slate: without this, each pass would add another
        # copy of every field object to ``arr`` and keep stale labels around.
        self._reset_derived()
        if len(self.start) == 0:
            return False
        for k, v in self.start.items():
            if self.is_generic(v):
                self.string_version[k] = self.to_str(v, k)
                continue

            if not self.is_geo(v):
                self.invalid_dict[k] = v
                return False
            self.string_version[k] = "GEO"
            self.geo_fields.add(k)
            self.arr.add(GeoField(k))
        return True

    def index_array(self):
        """Return the registered index field objects as a list."""
        return list(self.arr)

    def hashed(self):
        """Return ``consistent_hash`` of the field-name -> label mapping."""
        return consistent_hash(self.string_version)

    def process(self, required:dict):
        """Takes a schema - usually the required schema - and converts it into
        index fields and a label mapping.

        Returns ``self`` so calls can be chained.

        Raises:
            TypeError: if the schema is empty or contains an unsupported value.
        """
        self.start = required
        if not self.is_valid:
            raise TypeError(f"This information is not the right type: {self.invalid_dict}")
        return self
    

## Example Queries

Here are some example queries/data types we'll encounter.

### Get all US economic indicators

The query would include the following indicators:

1. Economic specific information (something labelled as economics)
2. A label stating that it's for the United States
3. We'd probably want everything pertaining to the markets too.

Storing the information would look similar to the following:

* **data_type** - dataset
* **category** - markets
* **subcategories**
    * **field** - economics
    * **country** - US

The exact name wouldn't matter, but it should return all the datasets with complete information that we can use for an actual backtest system. There are two parts to this:

1. Actually inserting information into the system so we can find it later
2. Using a dictionary to find that information


The schema everything will likely rely on is the following:
```py
{
    "name": str,
    "data_type":str,
    "category": str,
    "subcategories": dict,
    "description": str
}
```

This schema would allow us to save information inside of the database in the following way:


```py
{
    "name": "Real GDP",
    "data_type": "dataset",
    "category": "markets",
    "subcategories": {
        "aspect": "economic",
        "country": "US"
    },
    "description": "The Federal Reserve uses data such as the real GDP and other related economic indicators to adjust its monetary policy."
}
```

Our search query would then look like the following:

```py
query = {
   "data_type": "dataset",
   "category": "markets",
   "subcategories": {
       "aspect": "economic",
       "country": "US"
   }
}
```

If we enter this query, it should return the Real GDP record in the form of a list. If we're wrapping the dataset search into its own class, we wouldn't need to add the data_type field.


Instead of documenting it at length, let's create an example.

In [11]:
from redisearch import Client, Query

In [22]:
from loguru import logger
from typing import List

In [13]:
# Example schema used for the demo index: field name -> Python type, with the
# string "geo" marking the geo field.
schema = {
#     "type": "feature",
    "meta_type": str,
    "name": str,
    "category": str,
    "description": str,
    "location": "geo",
    "derp": list
}

In [14]:
# Shared generator instance; create_client() reads this module-level name.
scheme_gen = SearchSchemaGenerator()

In [15]:
def create_client(_schema):
    """Create a redisearch client and index for ``_schema``.

    Uses the module-level ``scheme_gen`` to validate the schema and to derive
    both the index fields and the index name (the schema hash).  Any exception
    is printed rather than raised: the result is ``None`` when the failure
    happened before the ``Client`` was constructed, and the client object
    otherwise (even if ``create_index`` itself failed).
    """
    client = None
    try:
        generator = scheme_gen.process(_schema)
        fields = generator.index_array()
        index_name = generator.hashed()
        client = Client(index_name)
        client.create_index(fields)
    except Exception as err:
        print(err)
    return client

In [21]:
# Build the index for the example schema; errors are printed, not raised.
create_client(schema)

Duplicate field in schema


<redisearch.client.Client at 0x7f51abce0908>

In [25]:
class QueryBuilder(object):
    """Collects per-field filter descriptions for a search query.

    Every filter method stores a placeholder dictionary of the form
    ``{"filter": <kind>, "value": ...}`` in ``qset`` under the field name and
    returns ``self`` so calls can be chained.  Turning the collected filters
    into a query string is not implemented yet: the ``_process_*`` helpers are
    stubs and ``build`` always returns an empty string.
    """

    def __init__(self):
        self._query_set = {}
        self._boolean_fields = set()
        self._number_fields = set()
        self._text_fields = set()
        self._tag_fields = set()
        self._geo_fields = []

    @property
    def qset(self):
        """Mapping of field name -> filter placeholder."""
        return self._query_set

    @property
    def geos(self):
        """Names of the geo fields that ``near`` applies to."""
        return self._geo_fields

    @geos.setter
    def geos(self, _geos):
        self._geo_fields = _geos

    def _add_number_filter(self, field, lower, upper):
        """Record a numeric range filter for ``field``."""
        self._number_fields.add(field)
        self.qset[field] = {
            "filter": "number",
            "value": {
                "upper": upper,
                "lower": lower
            }
        }
        return self

    def greater(self, field, num):
        """Filter ``field`` to the range from ``num`` up to "+inf"."""
        return self._add_number_filter(field, lower=num, upper="+inf")

    def less(self, field, num):
        """Filter ``field`` to the range from "-inf" up to ``num``."""
        return self._add_number_filter(field, lower="-inf", upper=num)

    def between(self, field, upper, lower):
        """Filter ``field`` to the range ``lower``..``upper`` (note the argument order)."""
        return self._add_number_filter(field, lower=lower, upper=upper)

    def near(self, long, lat, distance=1, metric="km"):
        """Record a geo filter around (``long``, ``lat``) for every field in ``geos``.

        The method takes no field name, so the filter is applied to each
        registered geo field; with no geo fields registered it is a no-op.
        """
        for field in self.geos:
            # One placeholder per field so later edits to one do not leak
            # into the others.
            self.qset[field] = {
                "filter": "geo",
                "value": {
                    "long": long,
                    "lat": lat,
                    "distance": distance,
                    "metric": metric
                }
            }
        return self

    def boolean(self, field:str, is_true=False):
        """Record a boolean filter for ``field`` with value "TRUE" or "FALSE"."""
        placeholder = {
            "filter": "boolean",
            "value": "TRUE" if is_true else "FALSE"
        }
        self._boolean_fields.add(field)
        self.qset[field] = placeholder
        return self

    def tags(self, field, tags:list):
        """Record a tag filter for ``field`` holding a copy of ``tags``."""
        self._tag_fields.add(field)
        self.qset[field] = {
            "filter": "tag",
            "value": list(tags)
        }
        return self

    def _process_boolean(self) -> str:
        """ Do an exact match on all boolean values (not implemented yet) """

    def _process_geo_filter(self) -> str:
        """Stub: not implemented yet."""
        pass

    def _process_tag_filter(self):
        """Stub: not implemented yet."""
        pass

    def _process_number_filter(self):
        """Stub: not implemented yet."""
        pass

    def build(self, terms:str):
        """Builds a query to be executed (currently always an empty string)."""
        return ""

In [17]:
class SampleSearch(SearchSchemaGenerator):
    """
        # SampleSearch


        This is a prototype of how the actual search would work.

        It pairs a fixed requirements dictionary with a lazily created
        redisearch client and collects filter tuples in ``queryset``:

        * ``("geofilter", long, lat, distance, metric, field)``
        * ``("numfilter", lower, upper, field)``
    """
    def __init__(self):
        super().__init__()
        self._required = {
            "meta_type": str,
            "name": str,
            "category": str,
            "description": str,
            "location": "geo"
        }
        self._client = None
        # Filters collected by around()/greater_than()/less_than()/between().
        # The ``queryset`` property reads this attribute, so it must exist
        # before any filter method is called.
        self._query_set = []

    @property
    def required(self):
        """The requirements dictionary this search is built from."""
        return self._required

    @property
    def client(self):
        """Return the redisearch client, creating it and its index on first use.

        Exceptions raised while building are handled by ``logger.catch()``
        instead of propagating, so this may return None when the failure
        happened before the ``Client`` was constructed.
        """
        if self._client is not None:
            return self._client
        with logger.catch():
            self.process(self.required)

            index_list = self.index_array()
            self._client = Client(self.hashed())
            self._client.create_index(index_list)
        return self._client

    @property
    def queryset(self):
        """List of filter tuples collected so far."""
        return self._query_set

    def around(self, long, lat, distance=1, metric="km"):
        """Search around a given point for every geo field.

        ``geo_fields`` is only filled once the schema has been processed
        (which the ``client`` property does); before that this is a no-op.
        """
        for field in self.geo_fields:
            self.queryset.append(("geofilter", long, lat, distance, metric, field))
        return self

    def greater_than(self, field, num):
        """Add a numeric filter for ``field`` from ``num`` up to "+inf"."""
        self.queryset.append(("numfilter", num, "+inf", field))
        return self

    def less_than(self, field, num):
        """Add a numeric filter for ``field`` from "-inf" up to ``num``."""
        self.queryset.append(("numfilter", "-inf", num, field))
        return self

    def between(self, field, upper, lower):
        """Add a numeric filter for ``field`` from ``lower`` to ``upper`` (note the argument order)."""
        self.queryset.append(("numfilter", lower, upper, field))
        return self

    def switch(self, field, _bool=False):
        """Stub: not implemented yet."""
        pass

In [18]:
# Instantiate the prototype; the redisearch Client is only created when
# `.client` is first accessed.
sample_search = SampleSearch()

In [19]:
# First access processes the required schema and tries to create the index.
sample_search.client

2020-04-02 16:03:07.831 | ERROR    | __main__:client:34 - An error has been caught in function 'client', process 'MainProcess' (7731), thread 'MainThread' (139989967102336):
Traceback (most recent call last):
  File "/home/kevin/.pyenv/versions/3.7.3/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
                └ ModuleSpec(name='ipykernel_launcher', loader=<_frozen_importlib_external.SourceFileLoader object at 0x7f51f34bcc18>, origin='...
  File "/home/kevin/.pyenv/versions/3.7.3/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
         │     └ {'__name__': '__main__', '__doc__': 'Entry point for launching an IPython kernel.\n\nThis is separate from the ipykernel pack...
         └ <code object <module> at 0x7f51f3539930, file "/home/kevin/.cache/pypoetry/virtualenvs/jamboree-8IC9UDZl-py3.7/lib/python3.7/...
  File "/home/kevin/.cache/pypoetry/virtualenvs/jamboree-8IC9UDZl-py3.7/lib/python3.7/site-packages/ipykernel_launc

<redisearch.client.Client at 0x7f51abcd09b0>

In [20]:
# print(client)