## Vectors v1

### Goal

Let users specify notification topics, for example:

"Hey BeepGPT, let me know when there’s an important engineering decision being made about the Fraud Detection Project."

### Method

Create embeddings for conversations and store them in a vector store. Then, for each topic, search for the nearest
matches and alert if any conversation comes within some threshold.

#### Pros:
* Fast response. Only requires an LLM to do embeddings of the conversation and search over the vector space
* Fairly cheap to run.
* Can potentially learn a user's specific interests over time via clustering analysis in the vector space.

#### Cons:
* Requires a vector database

### Questions:
* Will the topic embeddings be close to the associated conversations in the vector space?
* Are there different embedding methods that will work better than others?


### The code:

#### Install the tools, initiate the things

In [None]:
%pip install -q kaskada llama-cpp-python ipywidgets

In [None]:
import pandas as pd
import kaskada as kd

# Start a Kaskada session that runs in a local execution context.
kd.init_session()

# Show every float that pandas displays with six digits after the decimal point.
six_decimals = '{:.6f}'.format
pd.options.display.float_format = six_decimals

#### Pull in the user list, create a `format_user()` method

In [None]:
# Read the Slack user list and index the records by user id.
columns_to_keep = ["id", "team_id", "name", "deleted", "real_name", "is_bot", "updated"]

users_df = pd.read_json("slack-generation.users.json")

# Discard every column that is not on the keep-list.
unwanted_columns = users_df.columns.difference(columns_to_keep)
users_df = users_df.drop(columns=unwanted_columns)

# user id -> user record; this is the mapping that get_user() reads.
users = {record["id"]: record for record in users_df.to_dict(orient="records")}

In [None]:
def get_user(user_id):
    """Look up a user record by id.

    Args:
        user_id: Key into the module-level ``users`` mapping.

    Returns:
        The matching user record, or ``None`` when the id is not in ``users``.
    """
    # dict.get performs the membership test and the lookup in a single step.
    return users.get(user_id)

def format_user(user_id):
    """Render a user id as ``"name (id)"``, or as ``"(id)"`` when no record is found."""
    record = get_user(user_id)
    if not record:
        return f"({user_id})"
    return f"{record['name']} ({user_id})"

# Quick check of the formatter in the notebook output.
format_user("UBB9D2B01")

#### Load the slack data, clean the message text, format message users

In [None]:
# Load events from a Parquet file
#
# if you wan to load in your own slack data, change this to the path of your output file from 1.1 above
# otherwise continue with `slack-generation.parquet`, which contains generated slack data for
# example purposes. See the `slack-generation/notebook.ipynb` notebook for more info.
input_file = "slack-generation.parquet"

# Use the "ts" column as the time associated with each row,
# and the "channel" column as the entity associated with each row.
raw_msgs = await kd.sources.Parquet.create(
    input_file,
    time_column = "ts",
    key_column = "channel",
    time_unit = "s"
)
raw_msgs.preview(5)

In [None]:
import json

@kd.udf("f<N: any>(x: N) -> string")
def format_users(batch: pd.Series):
    """Kaskada UDF: run ``format_user`` over every value in ``batch``."""
    formatted = batch.map(format_user)
    return formatted

In [None]:
# Clean Text
import re

def strip_code_blocks(line):
    """Remove triple-backtick fenced code blocks from a message.

    Args:
        line: Message text, possibly spanning several lines.

    Returns:
        The text with every fenced span (fences included) removed.
    """
    # DOTALL lets `.` match newlines so that multi-line code blocks are removed
    # too; the non-greedy `.*?` keeps separate blocks from being merged.
    return re.sub(r"```.*?```", '', line, flags=re.DOTALL)

def user_repl(match_obj):
    """``re.sub`` replacement callback: format the user id captured in group 1."""
    return format_user(match_obj.group(1))

def update_users(line):
    """Replace every ``<@...>`` mention in ``line`` with the output of ``user_repl``."""
    mention_pattern = r"<@(.*?)>"
    return re.sub(mention_pattern, user_repl, line)

def clean_message(text):
    """Normalize one message: expand user mentions, drop code blocks, trim whitespace.

    Args:
        text: Raw message text. Non-string values (e.g. ``None`` or NaN for a
            missing message body) are treated as "no text".

    Returns:
        The cleaned text, or ``None`` when there is no text left.
    """
    # Series.map passes missing values through to the callback; re.sub would
    # raise TypeError on them, so treat anything that is not a string as empty.
    if not isinstance(text, str):
        return None
    cleaned = strip_code_blocks(update_users(text)).strip()
    return cleaned or None

@kd.udf("f<N: any>(x: N) -> string")
def clean_text(batch: pd.Series):
    """Kaskada UDF: run ``clean_message`` over every value in ``batch``."""
    cleaned = batch.map(clean_message)
    return cleaned

In [None]:
# Extend the raw messages with a cleaned "text" column and a formatted "user" column.
cleaned_text_col = raw_msgs.col("text").pipe(clean_text)
formatted_user_col = raw_msgs.col("user").pipe(format_users)
formatted_msgs = raw_msgs.extend({"text": cleaned_text_col, "user": formatted_user_col})
formatted_msgs.preview(5)

#### Convert the non-threaded messages into threaded messages. See `FineTuning_v2.ipynb` for more details on this.

In [None]:
from datetime import timedelta

# Column references on the formatted messages.
ts = formatted_msgs.col("ts")
thread_ts = formatted_msgs.col("thread_ts")

# split messages into two subgroups: threads (thread_ts set) and non-threads (thread_ts null)
threads = formatted_msgs.filter(thread_ts.is_not_null())
non_threads = formatted_msgs.filter(thread_ts.is_null())

# for non-threads, consider a message a new conversation when
# more than 10 mins have elapsed since the previous message
# NOTE(review): `ts` is taken from `formatted_msgs`, not from `non_threads`, so the gap is
# presumably measured against the previous message of any kind -- confirm this is intended.
is_new = ts.seconds_since_previous() > timedelta(minutes=10)

# Eventually this will just be: `thread_ts = ts.first(window=kd.windows.Since(is_new, start="inclusive"))`
#
# However, `Since()` is currently exclusive on the start of the window, inclusive on the end.
# But we need inclusive on the start and exclusive on the end.
#
# The hack below does what we need until `Since()` provides additional options for inclusivity
# NOTE(review): Python's `timedelta` has microsecond resolution and rounds its arguments, so
# `timedelta(microseconds=0.001)` evaluates to `timedelta(0)`. Confirm that a zero-length shift
# is what this hack relies on, or whether a non-zero shift was intended.
shifted_non_threads = non_threads.shift_by(timedelta(microseconds=0.001))
shifted_ts = shifted_non_threads.lag(1).col("ts").first(window=kd.windows.Since(is_new))
# `thread_ts` is rebound here: from this point on it is the derived conversation start
# (the message's own ts when it starts a new conversation, otherwise the shifted value),
# no longer the raw "thread_ts" column read above.
thread_ts = ts.if_(is_new).else_(shifted_ts)

# create the thread_ts column for non-threaded messages, keeping only rows where both
# ts and the derived thread_ts are non-null
non_threads_threads = non_threads.extend({"thread_ts": thread_ts}).filter(ts.is_not_null().and_(thread_ts.is_not_null()))

# re-join the two message subgroups
joined = threads.else_(non_threads_threads)

# re-key the joined messages by conversation: a (channel, thread) record
messages = joined.with_key(kd.record({
        "channel": joined.col("channel"),
        "thread": joined.col("thread_ts"),
    }))
messages.preview(5)

#### Collect up all the messages, reactions, users in each conversation.

In [None]:
@kd.udf("f<N: any>(x: N) -> string")
def format_message(batch: pd.Series):
    """Kaskada UDF: render each record in ``batch`` as ``"<user> --> <text>"``."""
    # Reactions are not part of the rendered line for now.
    return batch.map(lambda raw: f"{raw['user']} --> {raw['text']}")

@kd.udf("f<N: any>(x: N) -> string")
def format_messages(batch: pd.Series):
    """Kaskada UDF: join each collection of message strings into one string."""
    separator = "\n---\n"
    return batch.map(lambda lines: separator.join(lines))

@kd.udf("f<N: any>(x: N) -> string")
def extract_users(batch: pd.Series):
    """Kaskada UDF: encode each record's "user" value as a one-element JSON list."""
    # Users who only reacted to a message are not included for now.
    return batch.map(lambda raw: json.dumps([raw["user"]]))

@kd.udf("f<N: any>(x: N) -> string")
def unique_users(batch: pd.Series):
    """Kaskada UDF: merge JSON-encoded user lists into one de-duplicated JSON list.

    Each value in ``batch`` is an iterable of JSON strings, each of which
    decodes to a list of users. The result for each value is a JSON list of
    the distinct users in first-seen order.
    """
    def merge_user_sets(raw):
        # Dict keys de-duplicate while keeping insertion order, so the output
        # is reproducible; list(set(...)) order can vary between runs.
        seen = {}
        for user_set in raw:
            seen.update(dict.fromkeys(json.loads(user_set)))
        return json.dumps(list(seen))
    return batch.map(merge_user_sets)

# Build one record per conversation: the joined message text and the distinct users.
# Reaction data is not used for now; only "user" and "text" are selected.
conversation_text = (
    messages.select("user", "text")
    .pipe(format_message)
    .collect(max=None)
    .pipe(format_messages)
)
conversation_users = (
    messages.select("user")
    .pipe(extract_users)
    .collect(max=None)
    .pipe(unique_users)
)
conversations = kd.record({
    "conversation": conversation_text,
    "users": conversation_users,
})
conversations.preview(5)

#### Use LlamaIndex to insert all the conversations into a vector store.

In [None]:
# first create LlamaIndex documents

from llama_index import Document

documents = []
async for row in conversations.run_iter(results=kd.results.Snapshot(), kind="row"):
    document = Document(
        id= row["_key"],
        text=row["conversation"],
        metadata={
            'users': row["users"],
        })
    documents.append(document)

In [None]:
# then insert the documents into the vector store

from llama_index import VectorStoreIndex, ServiceContext, set_global_service_context

# This will use llama2-chat-13B with LlamaCPP, and assumes you have llama-cpp-python installed
# NOTE(review): only `llm` is set to "local" here; the embedding model is left to
# ServiceContext's defaults -- confirm that is the embedding backend you intend to use.
service_context = ServiceContext.from_defaults(llm="local")

# Register the context globally before the index is created (presumably so that the
# index and its inserts pick it up -- keep this ordering).
set_global_service_context(service_context)
# Start from an empty index and add the conversation documents one at a time.
index = VectorStoreIndex([])
for doc in documents:
    index.insert(doc)

#### Use a LlamaIndex retriever to test topics. 

This will convert the topic into an embedding and then find the nearest matches in the vector store.

In [None]:
from llama_index import get_response_synthesizer
from llama_index.indices.vector_store.retrievers import VectorIndexRetriever
from llama_index.response.notebook_utils import display_source_node

# Retriever over the index that returns the top three matches for a query.
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=3,
    vector_store_query_mode="default",
    alpha=None,
    doc_ids=None,
)

# Example topic to test against the stored conversations.
topic = "Tell me about engineering discussions related to the Supply Chain Management project."
nodes = retriever.retrieve(topic)
for hit in nodes:
    display_source_node(hit, source_length=2000, show_source_metadata=True)