In [1]:
# !pip install redisearch

Collecting redisearch
  Using cached https://files.pythonhosted.org/packages/b0/3e/d71a9770fc397b5ac24d636ae46bbf03899bc82a5259e56b9e2c0c452b00/redisearch-0.9.0.tar.gz
Collecting rmtest>=0.2 (from redisearch)
  Using cached https://files.pythonhosted.org/packages/38/7f/a9ce6b95913e477995476e1a80f20ba199cc90f0a80636e37d795a0af7e1/rmtest-0.7.0.tar.gz
Installing collected packages: rmtest, redisearch
  Running setup.py install for rmtest ... [?25ldone
[?25h  Running setup.py install for redisearch ... [?25ldone
[?25hSuccessfully installed redisearch-0.9.0 rmtest-0.7.0
[33mYou are using pip version 19.0.3, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [128]:
from jamboree.utils.core import consistent_hash

  from pandas.util.testing import assert_frame_equal
  from pandas import DataFrame, Series, Panel


## Requirements Conversion

In [12]:
def is_nested(d):
    """Return True when at least one value of the mapping ``d`` is a dict."""
    for value in d.values():
        if isinstance(value, dict):
            return True
    return False

In [76]:
def is_gen_type(item, _type):
    """Tell whether ``item`` is an instance of, a subclass of, or equal to ``_type``.

    Args:
        item: A value or a class to test.
        _type: The class (or tuple of classes) to test against.

    Returns:
        True when any of the three checks succeeds, otherwise False. False is
        also returned when the checks cannot be performed at all, i.e. when
        ``isinstance``/``issubclass`` reject their arguments with ``TypeError``
        (``item`` is not a class, or ``_type`` is not a valid class spec).
    """
    try:
        return isinstance(item, _type) or issubclass(item, _type) or item == _type
    except TypeError:
        # Only the "wrong kind of argument" failure means "not that type".
        # A bare ``except`` would also swallow KeyboardInterrupt/SystemExit.
        return False

In [77]:
def name_match(item:str, name:str):
    """Compare ``item`` and ``name`` after lower-casing both; True when they match."""
    lowered_item = item.lower()
    lowered_name = name.lower()
    return lowered_item == lowered_name

In [78]:
# Sanity check: the comparison ignores letter case, so this evaluates to True.
name_match("HELLO", "hello")

True

In [175]:
class GEO(type):
    """Geotype placeholder.

    A metaclass (it derives from ``type``) used as the marker for geo fields.
    Classes created with ``metaclass=GEO`` render as ``"GEO"`` through both
    ``repr`` and ``str`` and are instantiated without running ``__init__``.
    """

    def __call__(cls):
        # Replaces type.__call__: the instance is allocated through __new__
        # only, so __init__ is never invoked and no arguments are accepted.
        return cls.__new__(cls)

    def __repr__(self):
        return "GEO"

    def __str__(self):
        return "GEO"

In [177]:
# Example requirements mapping: field name -> expected type. The location
# entry uses the string marker "GEO" instead of a Python type.
exp_requirements = dict(
    name=str,
    category=str,
    location="GEO",
    subcategories=dict,
    names=list,
)

In [345]:
class SearchSchemaGenerator(object):
    """
        Creates a search schema and hash for a given requirements dictionary.

        A requirements dictionary maps field names to one of the generic Python
        types listed in ``generics`` or to a geo marker (the ``GEO`` class or
        the string ``"geo"``, matched case-insensitively).

        Call ``generate`` with such a dictionary, then read:

        * ``index_array()`` - the field objects built for the index,
        * ``hashed_schema`` - ``consistent_hash`` of the name -> kind mapping,
        * ``subs``          - ``"<hash>:<field>"`` keys, one per ``dict`` field.

        Field objects are only built for str/bool (``TextField``), int/float
        (``NumericField``) and geo (``GeoField``) entries; ``list`` and ``dict``
        entries are recorded as "TAG"/"DICT" in ``string_version`` only.
    """

    def __init__(self):
        self.start = {}            # requirements dict last passed to generate()
        self.end = {}              # not read or written anywhere else in this class
        self.string_version = {}   # field name -> "TEXT"/"NUMERIC"/"BOOL"/"TAG"/"DICT"/"GEO"
        self.indexable = {}        # the TEXT/NUMERIC/BOOL subset of string_version
        self.sub_dict_keys = []    # "<hash>:<field>" for every DICT field
        self.arr = []              # field objects handed out by index_array()

    @property
    def generics(self):
        """The plain Python types accepted as requirement values."""
        return [
            str, float, int, list, bool, dict
        ]

    def is_generic(self, _k):
        """Return True when ``_k`` is one of the types in ``generics``."""
        return _k in self.generics

    def is_geo(self, k):
        """Return True when ``k`` is a geo marker (``GEO`` or the string "geo")."""
        if is_gen_type(k, GEO):
            return True

        if is_gen_type(k, str):
            if name_match(k, "geo"):
                return True
        return False

    def to_str(self, i):
        """Converts the item to a string version of it"""
        if i == bool:
            # This will be text that we'll force exact queries on
            return "BOOL"
        elif i == float or i == int:
            return "NUMERIC"
        elif i == str:
            return "TEXT"
        elif i == list:
            return "TAG"
        else:
            return "DICT"

    @property
    def subs(self):
        """The ``"<hash>:<field>"`` keys collected by the last ``generate`` call."""
        return self.sub_dict_keys

    @property
    def is_valid(self) -> bool:
        """Validate ``self.start`` and rebuild the derived schema state.

        Returns:
            False when ``self.start`` is not a non-empty dict or holds a value
            that is neither a generic type nor a geo marker; True otherwise.

        The derived state (``string_version``, ``indexable``, ``arr``) is
        rebuilt from scratch on every access, so checking validity twice or
        reusing the instance never duplicates fields. It is left empty when
        validation fails.
        """
        self.string_version = {}
        self.indexable = {}
        self.arr = []

        if not isinstance(self.start, dict) or len(self.start) == 0:
            return False

        string_version = {}
        indexable = {}
        fields = []
        for key, kind in self.start.items():
            if not self.is_generic(kind):
                if not self.is_geo(kind):
                    return False
                string_version[key] = "GEO"
                fields.append(GeoField(key))
                continue

            label = self.to_str(kind)
            string_version[key] = label
            if label in ("NUMERIC", "TEXT", "BOOL"):
                indexable[key] = label
                # Decide on the converted label: ``kind`` is a Python type and
                # never equals the string "NUMERIC".
                if label == "NUMERIC":
                    fields.append(NumericField(key))
                else:
                    fields.append(TextField(key))

        # Commit only after the whole dictionary validated.
        self.string_version = string_version
        self.indexable = indexable
        self.arr = fields
        return True

    def index_array(self):
        """Return the list of field objects built by the last validation."""
        return self.arr

    @property
    def hashed_schema(self):
        """``consistent_hash`` of the field name -> kind mapping."""
        return consistent_hash(self.string_version)

    def generate(self, required:dict):
        """Build the schema state for ``required``.

        Args:
            required: Mapping of field name to generic type or geo marker.

        Returns:
            This instance, so calls can be chained.

        Raises:
            TypeError: If ``required`` is empty, is not a dict, or contains an
                unsupported value.
        """
        self.start = required
        # Start clean so a reused instance does not accumulate old keys.
        self.sub_dict_keys = []
        if not self.is_valid:
            raise TypeError("This information is not the right type")
        _hashed = self.hashed_schema
        self.sub_dict_keys = [
            f"{_hashed}:{k}"
            for k, v in self.string_version.items()
            if v == "DICT"
        ]
        return self
    
    

## Example Queries

Here are some example queries/data types we'll encounter.

### Get all US economic indicators

The query would include the following indicators:

1. Economic specific information (something labelled as economics)
2. A label stating that it's for the United States
3. We'd probably want everything pertaining to the markets too.

Storing the information would look similar to the following:

* **data_type** - dataset
* **category** - markets
* **subcategories**
    * **field** - economics
    * **country** - US

The exact name wouldn't matter, but it should return all the datasets with complete information that we can use for an actual backtest system. There are two parts to this:

1. Actually inserting information into the system so we can find it later
2. Using a dictionary to find that information


The schema everything will likely rely on is the following:
```py
{
    "name": str,
    "data_type":str,
    "category": str,
    "subcategories": dict,
    "description": str
}
```

This schema would allow us to save information inside of the database in the following way:


```py
{
    "name": "Real GDP",
    "data_type": "dataset",
    "category": "markets",
    "subcategories": {
        "aspect": "economic",
        "country": "US"
    },
    "description": "The Federal Reserve uses data such as the real GDP and other related economic indicators to adjust its monetary policy."
}
```

While our search query would look like the following:

```py
query = {
   "data_type": "dataset",
   "category": "markets",
   "subcategories": {
       "aspect": "economic",
       "country": "US"
   }
}
```

It should return the Real GDP if we enter it in the form of a list. If we're wrapping the dataset search into its own class, we wouldn't need to add the data_type field.


Instead of documenting it at length, let's create an example.

In [346]:
from redisearch import Client, GeoField, NumericField, Query, TextField

In [347]:
# Requirements used for the demo below. A "type": "feature" entry is
# deliberately left out. "poop" maps to a plain string that is neither a
# generic type nor a geo marker.
schema = dict(
    meta_type=str,
    name=str,
    category=str,
    subcategories=dict,
    description=str,
    poop="world",
    location="geo",
)

In [348]:
# Module-level generator instance; create_client() reads this global.
scheme_gen = SearchSchemaGenerator()

In [349]:
def create_client(_schema):
    """Build a search client and its index for ``_schema``.

    Uses the module-level ``scheme_gen`` to turn ``_schema`` into index fields
    and a hashed index name. Any exception is printed rather than raised.

    Returns:
        The ``Client`` if one was constructed (even when ``create_index``
        failed afterwards), otherwise None.
    """
    built = None
    try:
        scheme_gen.generate(_schema)
        fields = scheme_gen.index_array()
        index_name = scheme_gen.hashed_schema
        built = Client(index_name)
        built.create_index(fields)
    except Exception as err:
        print(str(err))
    return built

In [350]:
# "poop": "world" is neither a generic type nor a geo marker, so generate()
# raises TypeError; create_client prints the message and returns None.
create_client(schema)

This information is not the right type


In [386]:
class SampleSearch(SearchSchemaGenerator):
    """
        # SampleSearch

        This is a prototype of how the actual search would work.

        The fixed ``required`` schema is turned into an index the first time
        ``client`` is read; the resulting client is cached on the instance.
    """
    def __init__(self):
        super().__init__()
        # Field name -> type (or geo marker) describing the indexed records.
        self.required = {
            "meta_type": str,
            "name": str,
            "category": str,
            "subcategories": dict,
            "description": str,
            "location": "geo"
        }
        self._client = None

    @property
    def client(self):
        """Return the search client, creating it on first access.

        Returns:
            The cached ``Client``. If setup fails before the client object is
            constructed, None is returned and the next access tries again. If
            only ``create_index`` fails, the already constructed client is
            still cached and returned.
            NOTE(review): presumably create_index fails when the index already
            exists, which would make that case harmless - confirm.

        Setup errors are reported as a ``RuntimeWarning`` instead of being
        dropped silently, so problems such as a bad schema stay visible.
        """
        import warnings

        if self._client is not None:
            return self._client

        try:
            self.generate(self.required)

            index_list = self.index_array()
            self._client = Client(self.hashed_schema)
            self._client.create_index(index_list)
        except Exception as err:
            # Best effort is kept (no re-raise), but the cause is surfaced.
            warnings.warn(
                f"SampleSearch client setup did not complete: {err!r}",
                RuntimeWarning,
                stacklevel=2,
            )
        return self._client

In [387]:
# Instantiating does not create the client; that happens when `.client` is read.
sample_search = SampleSearch()

In [406]:
# Reading the property builds the client if none is cached yet and returns it.
sample_search.client

<redisearch.client.Client at 0x7f6d552ecd68>

In [389]:
# print(client)