# Showcase

#### This is a simple tutorial to go over vexpresso capabilities

Imports

In [1]:
import vexpresso
import numpy as np
from vexpresso.retriever import Retriever

In [1]:
data = {
    "status": ["read", "unread", "read", "unread", "read", "unread", "read", "unread"],
    "documents": ["A document that discusses domestic policy", "A document that discusses international affairs", "A document that discusses kittens", "A document that discusses dogs", "A document that discusses chocolate", "A document that is sixth that discusses government", "A document that discusses international affairs", "A document that discusses global affairs"],
    "ids": ["id1", "id2", "id3", "id4", "id5", "id6", "id7", "id8"]
}
embeddings=[
        [1.1, 2.3, 3.2],
        [4.5, 6.9, 4.4],
        [1.1, 2.3, 3.2],
        [4.5, 6.9, 4.4],
        [1.1, 2.3, 3.2],
        [4.5, 6.9, 4.4],
        [1.1, 2.3, 3.2],
        [4.5, 6.9, 4.4],
    ]

In [4]:
import inspect
import daft

In [5]:
inspect.getmembers(daft.DataFrame)

[('_DataFrame__column_input_to_expression',
  <function daft.dataframe.dataframe.DataFrame.__column_input_to_expression(self, columns: Iterable[Union[daft.expressions.expressions.Expression, str]]) -> daft.expressions.expressions.ExpressionsProjection>),
 ('__class__', type),
 ('__delattr__', <slot wrapper '__delattr__' of 'object' objects>),
 ('__dict__',
  mappingproxy({'__module__': 'daft.dataframe.dataframe',
                '__doc__': 'A Daft DataFrame is a table of data. It has columns, where each column has a type and the same\n    number of items (rows) as all other columns.\n    ',
                '__init__': <function daft.dataframe.dataframe.DataFrame.__init__(self, plan: daft.logical.logical_plan.LogicalPlan) -> None>,
                '_plan': <property at 0x7f66a27ad180>,
                '_result': <property at 0x7f669bf3ee50>,
                'plan': <function daft.dataframe.dataframe.DataFrame.plan(self) -> daft.logical.logical_plan.LogicalPlan>,
                'explain

In [6]:
collection = vexpresso.create(data=data)
collection = collection.add_column( 'embeddings', embeddings)

ValueError: If `on` is None then both `left_on` and `right_on` must not be None

In [4]:
collection.show(5)

status Utf8,documents Utf8,ids Utf8
read,A document that discusses domestic policy,id1
unread,A document that discusses international affairs,id2
read,A document that discusses kittens,id3
unread,A document that discusses dogs,id4
read,A document that discusses chocolate,id5


## Collection Creation

#### First we'll create some sample data. Here we're using just strings, but because `vexpresso` uses `daft`, you can use any datatype!

In [2]:
data = {"numbers":list(range(1, 100)), "strings":[f"test_{i}" for i in range(1, 100)]}

#### To create the collection, use the `create` method. Lets also use a NumpyRetriever that uses euclidian distance. This by default is lazy execution, meaning that we actually don't load in any data until `execute` or `show` is called. (Or if `lazy` is passed)

In [3]:
collection = vexpresso.create(data=data, retriever=Retriever(similarity_fn="euclidian"))
collection

2023-06-13 20:27:32.649 | INFO     | daft.context:runner:80 - Using PyRunner


0,1
numbers Int64,strings Utf8


### If you want to operate directly

### Vexpresso also works on clusters with Ray!

```python
collection = vexpresso.create(data=data, retriever=Retriever(similarity_fn="euclidian"), backend="ray", cluster_address=..., cluster_kwargs=...)
```

#### Lets see what's in the collection now!

In [5]:
collection.show(5)

status Utf8,documents Utf8,ids Utf8
read,A document that discusses domestic policy,id1
unread,A document that discusses international affairs,id2
read,A document that discusses kittens,id3
unread,A document that discusses dogs,id4
read,A document that discusses chocolate,id5


#### vexpresso's `Collection` methods return `Collection` objects, allowing for complex chaining of calls

## Embed Data

#### Lets embed the data using a simple "fake" embedding function. This simply returns a vector of integers based on the row number. For example, row 1's embedding is an array of 1s of size 100

In [5]:
import numpy as np

def embed_fn(strings):
    return [np.array([i]*100) for i in range(len(strings))]

In [6]:
collection = collection.embed("strings", embedding_fn=embed_fn) # returns a new collection

#### By default vexpresso is "lazy", meaning that nothing is executed until `.execute` is called
Note: this can be bypassed by passing `lazy=False`

```python
collection = collection.embed("strings", embedding_fn=embed_fn, lazy=False)
```

In [7]:
collection

0,1,2
numbers Int64,strings Utf8,embeddings_strings Python


#### Let's execute it to get embeddings

In [8]:
collection = collection.execute()

In [9]:
collection.show(5)

numbers Int64,strings Utf8,embeddings_strings Python
1,test_1,"<np.ndarray shape=(100,) dtype=int64>"
2,test_2,"<np.ndarray shape=(100,) dtype=int64>"
3,test_3,"<np.ndarray shape=(100,) dtype=int64>"
4,test_4,"<np.ndarray shape=(100,) dtype=int64>"
5,test_5,"<np.ndarray shape=(100,) dtype=int64>"


In [10]:
collection.to_dict()["embeddings_strings"][:3]

[array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 

In [11]:
embed_fn(["test_3"])

[array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])]

## Query

#### as you can see we now have an `embeddings_strings` column, let's query it and return the top 5 results!

In [12]:
queried = collection.query("embeddings_strings", query="test_3", k=5).execute()

#### As expected, the closest strings to `test_3` (according to our embedding function above) are `test_3`, `test_2`, `test_4`, `test_1`, `test_5`.
#### In addition, we can see the actual similarity scores in `embeddings_strings_score` column

In [13]:
queried.show(5)

numbers Int64,strings Utf8,embeddings_strings Python,embeddings_strings_score Float64
1,test_1,"<np.ndarray shape=(100,) dtype=int64>",1.0
2,test_2,"<np.ndarray shape=(100,) dtype=int64>",0.0909091
3,test_3,"<np.ndarray shape=(100,) dtype=int64>",0.047619
4,test_4,"<np.ndarray shape=(100,) dtype=int64>",0.0322581
5,test_5,"<np.ndarray shape=(100,) dtype=int64>",0.0243902


#### Sometimes you will want to batch queries together into a single call. vexpresso has a convenient `batch_query` function. This will return a list of Collections

In [14]:
queries = ["test_1", "test_5", "test_10"]

In [15]:
batch_queried = collection.batch_query("embeddings_strings", queries=queries, k=2)

#### We now have collections for each query

In [16]:
batch_queried[0].show(2)

numbers Int64,strings Utf8,embeddings_strings Python,embeddings_strings_score Float64
3,test_3,"<np.ndarray shape=(100,) dtype=int64>",1.0
2,test_2,"<np.ndarray shape=(100,) dtype=int64>",0.0909091


In [17]:
batch_queried[1].show(2)

numbers Int64,strings Utf8,embeddings_strings Python,embeddings_strings_score Float64
3,test_3,"<np.ndarray shape=(100,) dtype=int64>",1.0
2,test_2,"<np.ndarray shape=(100,) dtype=int64>",0.0909091


In [18]:
batch_queried[2].show(2)

numbers Int64,strings Utf8,embeddings_strings Python,embeddings_strings_score Float64
3,test_3,"<np.ndarray shape=(100,) dtype=int64>",1.0
2,test_2,"<np.ndarray shape=(100,) dtype=int64>",0.0909091


## Filtering

#### With `vexpresso`, filtering is super easy. The syntax is similar to `chromadb`

#### Filter dictionary must have the following structure:

```python
{
    <field>: {
        <filter_method>: <value>
    },
    <field>: {
        <filter_method>: <value>
    },
}

```

Let's filter the original collection to only include rows with `numbers` > 95

In [19]:
filtered_collection = collection.filter(
    {
        "numbers":{
            "gt":95
        }
    }
).execute()

In [20]:
filtered_collection.show(5)

numbers Int64,strings Utf8,embeddings_strings Python
96,test_96,"<np.ndarray shape=(100,) dtype=int64>"
97,test_97,"<np.ndarray shape=(100,) dtype=int64>"
98,test_98,"<np.ndarray shape=(100,) dtype=int64>"
99,test_99,"<np.ndarray shape=(100,) dtype=int64>"


#### We can use multiple filter conditions as well
Let's filter the collection to only return rows with numbers <= 50 and strings with "0" in them

In [21]:
filtered_collection = collection.filter(
    {
        "numbers":{
            "lte":50
        },
        "strings":{
            "contains":"0"
        }
    }
).execute()

In [22]:
filtered_collection.show(5)

numbers Int64,strings Utf8,embeddings_strings Python
10,test_10,"<np.ndarray shape=(100,) dtype=int64>"
20,test_20,"<np.ndarray shape=(100,) dtype=int64>"
30,test_30,"<np.ndarray shape=(100,) dtype=int64>"
40,test_40,"<np.ndarray shape=(100,) dtype=int64>"
50,test_50,"<np.ndarray shape=(100,) dtype=int64>"


#### Sometimes you need a custom filtering function, with vexpresso its easy to do that with the `custom` filter keyword!
Lets filter a collection to only return rows with even `numbers` and `strings` that contain a "3"

In [23]:
def custom_filter(number, mod_val) -> bool:
    return number % mod_val == 0

In [24]:
filtered_collection = collection.filter(
    {
        "numbers":{
            "custom":{"function":custom_filter, "function_kwargs":{"mod_val":2}}
        },
        "strings":{
            "contains":"3"
        }
    }
).execute()

In [25]:
filtered_collection.show(5)

numbers Int64,strings Utf8,embeddings_strings Python
30,test_30,"<np.ndarray shape=(100,) dtype=int64>"
32,test_32,"<np.ndarray shape=(100,) dtype=int64>"
34,test_34,"<np.ndarray shape=(100,) dtype=int64>"
36,test_36,"<np.ndarray shape=(100,) dtype=int64>"
38,test_38,"<np.ndarray shape=(100,) dtype=int64>"


#### You can also combine filters + queries in the same call

 Lets query the collection with "test_10" and filter only even numbers

In [26]:
even_filter = {
    "numbers":{
        "custom":{"function":custom_filter, "function_kwargs":{"mod_val":2}}
    }
}

In [27]:
query_filtered_collection = collection.query("embeddings_strings", "test_10", k=10, filter_conditions=even_filter).execute()

In [28]:
query_filtered_collection.show(5)

numbers Int64,strings Utf8,embeddings_strings Python,embeddings_strings_score Float64
2,test_2,"<np.ndarray shape=(100,) dtype=int64>",0.0909091
4,test_4,"<np.ndarray shape=(100,) dtype=int64>",0.0322581
6,test_6,"<np.ndarray shape=(100,) dtype=int64>",0.0196078
8,test_8,"<np.ndarray shape=(100,) dtype=int64>",0.0140845
10,test_10,"<np.ndarray shape=(100,) dtype=int64>",0.010989


## Chaining Functions

#### We can chain functions lazily easily

For instance, lets query and filter multiple times

In [29]:
even_filter = {
    "numbers":{
        "custom":{"function":custom_filter, "function_kwargs":{"mod_val":2}}
    }
}

In [30]:
chained_collection = collection.query("embeddings_strings", "test_10", k=50) \
                               .filter(even_filter) \
                               .query("embeddings_strings", "test_30", k=50) \
                               .filter({"numbers":{"lte":30}})

In [31]:
chained_collection.daft_df

0,1,2,3
numbers Int64,strings Utf8,embeddings_strings Python,embeddings_strings_score Float64


Here we queried for the closest 50 elements to "test_10", filtered for only even numbers, queried top 50 of "test_30", then filtered for numbers <= 30

In [32]:
chained_collection = chained_collection.execute()

In [33]:
chained_collection.show(5)

numbers Int64,strings Utf8,embeddings_strings Python,embeddings_strings_score Float64
2,test_2,"<np.ndarray shape=(100,) dtype=int64>",0.0909091
4,test_4,"<np.ndarray shape=(100,) dtype=int64>",0.0322581
6,test_6,"<np.ndarray shape=(100,) dtype=int64>",0.0196078
8,test_8,"<np.ndarray shape=(100,) dtype=int64>",0.0140845
10,test_10,"<np.ndarray shape=(100,) dtype=int64>",0.010989


get_text_features## Transforms

#### Sometimes you want to transform your data. Because of `daft`, you can use `vexpresso` to do this easily! 

#### For example, lets add a new column where we change "test" to "example" in the strings column. Lets specify that this output is also a string type

For a full list of datatypes, visit daft documentation: https://www.getdaft.io/projects/docs/en/latest/api_docs/datatype.html

In [34]:
def simple_apply_fn(strings):
    return [
        s.replace("test", "example") for s in strings
    ]

In [35]:
transformed_collection = collection.apply(simple_apply_fn, collection["strings"], datatype=vexpresso.DataType.string()).execute()

In [36]:
transformed_collection.show(5)

numbers Int64,strings Utf8,embeddings_strings Python,tranformed_strings Utf8
1,test_1,"<np.ndarray shape=(100,) dtype=int64>",example_1
2,test_2,"<np.ndarray shape=(100,) dtype=int64>",example_2
3,test_3,"<np.ndarray shape=(100,) dtype=int64>",example_3
4,test_4,"<np.ndarray shape=(100,) dtype=int64>",example_4
5,test_5,"<np.ndarray shape=(100,) dtype=int64>",example_5


#### We can also pass in args, kwargs, and multiple columns into the apply function

For instance, lets replace the "test" chars in string column with "modified" and also replace the suffix with `number` times 1000. In addition lets name the column `modified`

In [37]:
def multi_column_apply_fn(string_columns, numbers):
    out = []
    for string, num in zip(string_columns, numbers):
        replaced = string.replace("test", "modified").split("_")[0]
        modified = f"{replaced}_{num*1000}"
        out.append(modified)
    return out

In [38]:
transformed_collection = collection.apply(
    multi_column_apply_fn,
    collection["strings"],
    numbers=collection["numbers"],
    to="modified",
    datatype=vexpresso.DataType.string()
).execute()

In [39]:
transformed_collection.show(5)

numbers Int64,strings Utf8,embeddings_strings Python,modified Utf8
1,test_1,"<np.ndarray shape=(100,) dtype=int64>",modified_1000
2,test_2,"<np.ndarray shape=(100,) dtype=int64>",modified_2000
3,test_3,"<np.ndarray shape=(100,) dtype=int64>",modified_3000
4,test_4,"<np.ndarray shape=(100,) dtype=int64>",modified_4000
5,test_5,"<np.ndarray shape=(100,) dtype=int64>",modified_5000


## Adding data

## Saving + Loading

#### Once you've done a bunch of processing on a collection, you probably want to save it somewhere. Vexpresso supports local file saving + huggingface datasets

Lets save the `transformed_collection` above to a directory `saved_transformed_collection`

In [40]:
transformed_collection.save("./saved_collection/saved_transformed_collection")

saving to ./saved_collection/saved_transformed_collection


We can then load the collection with the same `create` function. Make sure to also include the embedding functions that were used on the original collection!

In [41]:
loaded_collection = vexpresso.create(
    directory_or_repo_id = "./saved_collection/saved_transformed_collection",
    embedding_functions = {"embeddings_strings":embed_fn}
)

In [42]:
loaded_collection.show(5)

numbers Int64,strings Utf8,embeddings_strings Python,modified Utf8
1,test_1,"<np.ndarray shape=(100,) dtype=int64>",modified_1000
2,test_2,"<np.ndarray shape=(100,) dtype=int64>",modified_2000
3,test_3,"<np.ndarray shape=(100,) dtype=int64>",modified_3000
4,test_4,"<np.ndarray shape=(100,) dtype=int64>",modified_4000
5,test_5,"<np.ndarray shape=(100,) dtype=int64>",modified_5000


#### Now let's upload to huggingface!

For this you'll need to install huggingfacehub

In [43]:
# !pip install huggingface-hub

Automatically gets token from env variable: HUGGINGFACEHUB_API_TOKEN = ...

or you can pass in token directly via `collection.save(token=...)`

In [44]:
username = "shyamsn97"
repo_name = "vexpresso_test_showcase"
# username = "REPLACE"
# repo_name = "REPLACE"

In [45]:
loaded_collection.save(hf_username = username, repo_name = repo_name, to_hub=True, )

Uploading collection to None


  from .autonotebook import tqdm as notebook_tqdm

content.parquet: 100%|█████████████████████████████████████████████| 4.55k/4.55k [00:00<00:00, 10.1kB/s][A

Upload 1 LFS files: 100%|█████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.56it/s][A


Upload to shyamsn97/vexpresso_test_showcase complete!


'shyamsn97/vexpresso_test_showcase'

The example is private by default, but this can be changed by the `private` flag

In [46]:
# loaded_collection.save(hf_username = username, repo_name = repo_name, to_hub=True, private=False)

You can see an example of the above data: https://huggingface.co/datasets/shyamsn97/vexpresso_test_showcase

#### Now lets load it!

In [47]:
loaded_collection = vexpresso.create(
    hf_username = username,
    repo_name = repo_name,
    embedding_functions = {"embeddings_strings":embed_fn}
)

Retrieving from hf repo: shyamsn97/vexpresso_test_showcase


Fetching 2 files:  50%|█████████████████████████▌                         | 1/2 [00:00<00:00,  9.21it/s]
Downloading content.parquet: 100%|██████████████████████████████████| 4.55k/4.55k [00:00<00:00, 535kB/s][A
Fetching 2 files: 100%|███████████████████████████████████████████████████| 2/2 [00:00<00:00,  3.21it/s]


In [48]:
loaded_collection.show(5)

numbers Int64,strings Utf8,embeddings_strings Python,modified Utf8
1,test_1,"<np.ndarray shape=(100,) dtype=int64>",modified_1000
2,test_2,"<np.ndarray shape=(100,) dtype=int64>",modified_2000
3,test_3,"<np.ndarray shape=(100,) dtype=int64>",modified_3000
4,test_4,"<np.ndarray shape=(100,) dtype=int64>",modified_4000
5,test_5,"<np.ndarray shape=(100,) dtype=int64>",modified_5000
