# Vexpresso Walkthrough

## In this walkthrough, we will be using a small dataset of Pokemon statistics to showcase vexpresso's capabilities

### Imports

In [1]:
import json
import pandas as pd
from langchain.embeddings import HuggingFaceHubEmbeddings, HuggingFaceEmbeddings
from vexpresso.collection import Collection
from vexpresso.retrieval import FaissRetrievalStrategy, TopKRetrievalStrategy
from vexpresso.embedding_function import LangChainEmbeddingsFunction

### Loading in data

In [2]:
# Load the processed Pokedex records from disk.
# The encoding is pinned to UTF-8: the descriptions contain non-ASCII
# characters (e.g. the accented "e" in the species labels), and the platform
# default encoding used by open() is not UTF-8 on every system.
with open("./data/pokedex_processed.json", "r", encoding="utf-8") as f:
    documents = json.load(f)

In [3]:
# Build a DataFrame from the loaded records.
df = pd.DataFrame(data=documents)

In [4]:
# Preview the first 10 rows.
df.head(10)

Unnamed: 0,id,type,info,profile,name,description
0,1,"['Grass', 'Poison']","{'HP': 45, 'Attack': 49, 'Defense': 49, 'Sp. A...","{'height': '0.7 m', 'weight': '6.9 kg', 'egg':...",Bulbasaur,Seed Pokémon: Bulbasaur can be seen napping in...
1,2,"['Grass', 'Poison']","{'HP': 60, 'Attack': 62, 'Defense': 63, 'Sp. A...","{'height': '1 m', 'weight': '13 kg', 'egg': ['...",Ivysaur,Seed Pokémon: There is a bud on this Pokémon’s...
2,3,"['Grass', 'Poison']","{'HP': 80, 'Attack': 82, 'Defense': 83, 'Sp. A...","{'height': '2 m', 'weight': '100 kg', 'egg': [...",Venusaur,Seed Pokémon: There is a large flower on Venus...
3,4,['Fire'],"{'HP': 39, 'Attack': 52, 'Defense': 43, 'Sp. A...","{'height': '0.6 m', 'weight': '8.5 kg', 'egg':...",Charmander,Lizard Pokémon: The flame that burns at the ti...
4,5,['Fire'],"{'HP': 58, 'Attack': 64, 'Defense': 58, 'Sp. A...","{'height': '1.1 m', 'weight': '19 kg', 'egg': ...",Charmeleon,Flame Pokémon: Charmeleon mercilessly destroys...
5,6,"['Fire', 'Flying']","{'HP': 78, 'Attack': 84, 'Defense': 78, 'Sp. A...","{'height': '1.7 m', 'weight': '90.5 kg', 'egg'...",Charizard,Flame Pokémon: Charizard flies around the sky ...
6,7,['Water'],"{'HP': 44, 'Attack': 48, 'Defense': 65, 'Sp. A...","{'height': '0.5 m', 'weight': '9 kg', 'egg': [...",Squirtle,Tiny Turtle Pokémon: Squirtle’s shell is not m...
7,8,['Water'],"{'HP': 59, 'Attack': 63, 'Defense': 80, 'Sp. A...","{'height': '1 m', 'weight': '22.5 kg', 'egg': ...",Wartortle,Turtle Pokémon: Its tail is large and covered ...
8,9,['Water'],"{'HP': 79, 'Attack': 83, 'Defense': 100, 'Sp. ...","{'height': '1.6 m', 'weight': '85.5 kg', 'egg'...",Blastoise,Shellfish Pokémon: Blastoise has water spouts ...
9,10,['Bug'],"{'HP': 45, 'Attack': 30, 'Defense': 35, 'Sp. A...","{'height': '0.3 m', 'weight': '2.9 kg', 'egg':...",Caterpie,Worm Pokémon: Its body is soft and weak. In na...


In [5]:
# Dimensions of the frame as (rows, columns).
df.shape

(809, 6)

## Assemble content, metadata, embedding_fn

In [6]:
# Text to embed: the "description" column as a plain Python list.
content = df["description"].tolist()

# Embedding function: a LangChain HuggingFaceEmbeddings instance (default
# constructor arguments) wrapped in vexpresso's LangChain adapter.
embeddings_fn = LangChainEmbeddingsFunction(
    HuggingFaceEmbeddings()
)

  from .autonotebook import tqdm as notebook_tqdm


## Create Collection
### This may take some time because we are embedding 809 entries

In [7]:
# Retrieval backend. TopKRetrievalStrategy is used here; FaissRetrievalStrategy
# is a faster alternative but requires faiss to be installed:
# retrieval_strategy = FaissRetrievalStrategy()
retrieval_strategy = TopKRetrievalStrategy()

# NOTE(review): the saved query outputs further down show numeric values in
# the `ids` column, while this cell passes Pokemon names -- confirm which one
# is intended and re-run the notebook so code and outputs agree.
collection = Collection(
    content=content,
    embedding_fn=embeddings_fn,
    metadata=df,
    ids=df["name"].tolist(),
    retrieval_strategy=retrieval_strategy,
)

### You can also create this without metadata or ids

In [8]:
# Same construction, supplying only the content, the embedding function and
# the retrieval strategy.
without_metadata = Collection(
    content,
    embedding_fn=embeddings_fn,
    retrieval_strategy=retrieval_strategy,
)

## Query by Pokemon description

### Let's find a Pokemon that "loves to sleep"

In [9]:
# Free-text query against the collection; k=10 asks for ten results.
# The query string is defined once and passed by name so the variable and the
# actual search text cannot drift apart.
query = "Loves to sleep"
sleepy_pokemon = collection.query(query, k=10)

In [10]:
# Display the query results in tabular form.
sleepy_pokemon.df()

Unnamed: 0,id,type,info,profile,name,description,vexpresso_index,ids,content
142,143,['Normal'],"{'HP': 160, 'Attack': 110, 'Defense': 65, 'Sp....","{'height': '2.1 m', 'weight': '460 kg', 'egg':...",Snorlax,Sleeping Pokémon: It has no interest in anythi...,0,143,Sleeping Pokémon: It has no interest in anythi...
660,661,"['Normal', 'Flying']","{'HP': 45, 'Attack': 50, 'Defense': 43, 'Sp. A...","{'height': '0.3 m', 'weight': '1.7 kg', 'egg':...",Fletchling,Tiny Robin Pokémon: Its body is always warm. T...,1,661,Tiny Robin Pokémon: Its body is always warm. T...
774,775,['Normal'],"{'HP': 65, 'Attack': 115, 'Defense': 65, 'Sp. ...","{'height': '0.4 m', 'weight': '19.9 kg', 'egg'...",Komala,Drowsing Pokémon: It remains asleep from birth...,2,775,Drowsing Pokémon: It remains asleep from birth...
160,161,['Normal'],"{'HP': 35, 'Attack': 46, 'Defense': 34, 'Sp. A...","{'height': '0.8 m', 'weight': '6 kg', 'egg': [...",Sentret,"Scout Pokémon: When Sentret sleeps, it does so...",3,161,"Scout Pokémon: When Sentret sleeps, it does so..."
300,301,['Normal'],"{'HP': 70, 'Attack': 65, 'Defense': 65, 'Sp. A...","{'height': '1.1 m', 'weight': '32.6 kg', 'egg'...",Delcatty,Prim Pokémon: Delcatty sleeps anywhere it want...,4,301,Prim Pokémon: Delcatty sleeps anywhere it want...
69,70,"['Grass', 'Poison']","{'HP': 65, 'Attack': 90, 'Defense': 50, 'Sp. A...","{'height': '1 m', 'weight': '6.4 kg', 'egg': [...",Weepinbell,Flycatcher Pokémon: Weepinbell has a large hoo...,5,70,Flycatcher Pokémon: Weepinbell has a large hoo...
703,704,['Dragon'],"{'HP': 45, 'Attack': 50, 'Defense': 35, 'Sp. A...","{'height': '0.3 m', 'weight': '2.8 kg', 'egg':...",Goomy,Soft Tissue Pokémon: Its body is mostly water....,6,704,Soft Tissue Pokémon: Its body is mostly water....
95,96,['Psychic'],"{'HP': 60, 'Attack': 48, 'Defense': 45, 'Sp. A...","{'height': '1 m', 'weight': '32.4 kg', 'egg': ...",Drowzee,Hypnosis Pokémon: It puts its prey to sleep an...,7,96,Hypnosis Pokémon: It puts its prey to sleep an...
490,491,['Dark'],"{'HP': 70, 'Attack': 90, 'Defense': 90, 'Sp. A...","{'height': '1.5 m', 'weight': '50.5 kg', 'egg'...",Darkrai,Pitch-Black Pokémon: It can lull people to sle...,8,491,Pitch-Black Pokémon: It can lull people to sle...
24,25,['Electric'],"{'HP': 35, 'Attack': 55, 'Defense': 40, 'Sp. A...","{'height': '0.4 m', 'weight': '6 kg', 'egg': [...",Pikachu,"Mouse Pokémon: While sleeping, it generates el...",9,25,"Mouse Pokémon: While sleeping, it generates el..."


## Grabbing metadata fields

In [11]:
# get a single field
names = sleepy_pokemon.get_field("name")
types = sleepy_pokemon.get_field("type")
descriptions = sleepy_pokemon.get_field("description")

# get multiple
names, types, descriptions = sleepy_pokemon.get_fields(["name", "type", "description"])
for n, t, d in zip(names, types, descriptions):
    print(f"{n}: {t} -- {d}")

Snorlax: ['Normal'] -- Sleeping Pokémon: It has no interest in anything other than eating. Even if you climb up on its stomach while it’s napping, it doesn’t seem to mind at all!
Fletchling: ['Normal', 'Flying'] -- Tiny Robin Pokémon: Its body is always warm. Trainers who live in cold areas apparently sleep with it in their bed.
Komala: ['Normal'] -- Drowsing Pokémon: It remains asleep from birth to death as a result of the sedative properties of the leaves that form its diet.
Sentret: ['Normal'] -- Scout Pokémon: When Sentret sleeps, it does so while another stands guard. The sentry wakes the others at the first sign of danger. When this Pokémon becomes separated from its pack, it becomes incapable of sleep due to fear.
Delcatty: ['Normal'] -- Prim Pokémon: Delcatty sleeps anywhere it wants without keeping a permanent nest. If other Pokémon approach it as it sleeps, this Pokémon will never fight—it will just move away somewhere else.
Weepinbell: ['Grass', 'Poison'] -- Flycatcher Pokém

## Performing filters

### With vexpresso we can leverage duckdb for powerful queries

#### In order to filter on metadata, you must supply a filter dictionary to the filter method. The dictionary must have the following structure:

```python
{
    <field>: {
        <filter_method>: <value>
    },
    <field>: {
        <filter_method>: <value>
    },
}

```

#### An example query to only get Pokemon with the name "Snorlax":


In [12]:
# Filter spec: field "name" must equal "Snorlax".
filter_condition = {"name": {"eq": "Snorlax"}}

In [13]:
# Apply the filter to the query results and display the matching rows.
sleepy_pokemon.filter(filter_condition).df()

Unnamed: 0,id,type,info,profile,name,description,vexpresso_index,ids,content
142,143,['Normal'],"{'HP': 160, 'Attack': 110, 'Defense': 65, 'Sp....","{'height': '2.1 m', 'weight': '460 kg', 'egg':...",Snorlax,Sleeping Pokémon: It has no interest in anythi...,0,143,Sleeping Pokémon: It has no interest in anythi...


#### Here are the supported filter methods:

In [14]:
# Print the name and a short description of each supported filter method.
sleepy_pokemon.metadata.print_filter_methods()

contains: 
        {field} (str) contains {value} (str)
        
----------------------------------
eq: 
        {field} equal to {value} (str, int, float)
        
----------------------------------
gt: 
        {field} greater than {value} (int, float)
        
----------------------------------
gte: 
        {field} greater than or equal to {value} (int, float)
        
----------------------------------
isin: 
        {field} is in list of {values} (list of str, int, or float)
        
----------------------------------
lt: 
        {field} less than {value} (int, float)
        
----------------------------------
lte: 
        {field} less than or equal to {value} (int, float)
        
----------------------------------
neq: 
        {field} not equal to {value} (str, int, float)
        
----------------------------------
notcontains: 
        {field} (str) does not contains {value} (str)
        
----------------------------------
notin: 
        {field} not in list of {values} 

#### Let's filter the above collection for "Normal" type Pokemon

In [15]:
# Keep only the results whose "type" field contains "Normal".
filter_condition = {
    "type": {
        "contains": "Normal",
    },
}
normal_sleepy_pokemon = sleepy_pokemon.filter(filter_condition)

In [16]:
# Display the Normal-type subset of the query results.
normal_sleepy_pokemon.df()

Unnamed: 0,id,type,info,profile,name,description,vexpresso_index,ids,content
142,143,['Normal'],"{'HP': 160, 'Attack': 110, 'Defense': 65, 'Sp....","{'height': '2.1 m', 'weight': '460 kg', 'egg':...",Snorlax,Sleeping Pokémon: It has no interest in anythi...,0,143,Sleeping Pokémon: It has no interest in anythi...
660,661,"['Normal', 'Flying']","{'HP': 45, 'Attack': 50, 'Defense': 43, 'Sp. A...","{'height': '0.3 m', 'weight': '1.7 kg', 'egg':...",Fletchling,Tiny Robin Pokémon: Its body is always warm. T...,1,661,Tiny Robin Pokémon: Its body is always warm. T...
774,775,['Normal'],"{'HP': 65, 'Attack': 115, 'Defense': 65, 'Sp. ...","{'height': '0.4 m', 'weight': '19.9 kg', 'egg'...",Komala,Drowsing Pokémon: It remains asleep from birth...,2,775,Drowsing Pokémon: It remains asleep from birth...
160,161,['Normal'],"{'HP': 35, 'Attack': 46, 'Defense': 34, 'Sp. A...","{'height': '0.8 m', 'weight': '6 kg', 'egg': [...",Sentret,"Scout Pokémon: When Sentret sleeps, it does so...",3,161,"Scout Pokémon: When Sentret sleeps, it does so..."
300,301,['Normal'],"{'HP': 70, 'Attack': 65, 'Defense': 65, 'Sp. A...","{'height': '1.1 m', 'weight': '32.6 kg', 'egg'...",Delcatty,Prim Pokémon: Delcatty sleeps anywhere it want...,4,301,Prim Pokémon: Delcatty sleeps anywhere it want...


### We can also filter for multiple conditions on nested data

#### For example, let's filter the above collection for Pokemon with HP >= 35 and Defense < 45

In [17]:
# Dotted keys address nested entries of the "info" field; both conditions are
# applied together (HP >= 35 and Defense < 45).
filter_condition = {
    "info.HP": {"gte": 35},
    "info.Defense": {"lt": 45},
}
filtered = normal_sleepy_pokemon.filter(filter_condition)

In [18]:
# Display the rows that satisfy both nested-field conditions.
filtered.df()

Unnamed: 0,id,type,info,profile,name,description,vexpresso_index,ids,content
660,661,"['Normal', 'Flying']","{'HP': 45, 'Attack': 50, 'Defense': 43, 'Sp. A...","{'height': '0.3 m', 'weight': '1.7 kg', 'egg':...",Fletchling,Tiny Robin Pokémon: Its body is always warm. T...,0,661,Tiny Robin Pokémon: Its body is always warm. T...
160,161,['Normal'],"{'HP': 35, 'Attack': 46, 'Defense': 34, 'Sp. A...","{'height': '0.8 m', 'weight': '6 kg', 'egg': [...",Sentret,"Scout Pokémon: When Sentret sleeps, it does so...",1,161,"Scout Pokémon: When Sentret sleeps, it does so..."
