## Vectors v1

### Goal

Let users specify notification topics, for example:

"Hey BeepGPT, let me know when there’s an important engineering decision being made about the Fraud Detection Project."

### Method

This method will do the opposite of Vectors v0. We will store the topics in a vector store and do retrieval on each conversation. 

Also, we will explore several embedding models to determine which one works best for this use case.

#### Pros:
* Fast response. Only requires an LLM to do embeddings of the conversation and search over the vector space.
* Fairly cheap to run.
* Can potentially learn a user's specific interests over time via clustering analysis in the vector space.

#### Cons:
* Requires a vector database

### Questions:
* Will the topic embeddings be close to the associated conversations in the vector space?
* Are there different embedding methods that will work better than others?


### The code:

#### Install the tools, initiate the things

In [None]:
%pip install -q kaskada openai llama-cpp-python ipywidgets

In [None]:
import openai, getpass

# Initialize OpenAI: prompt for the API key interactively (getpass does not
# echo the typed value) and store it on the `openai` module.
openai.api_key = getpass.getpass('OpenAI: API Key')

In [None]:
import pandas as pd
import kaskada as kd

# Initialize Kaskada with a local execution context.
# This must run before any `kd.sources` / query calls in the cells below.
kd.init_session()

# Set pandas to display all floats with 6 decimal places, so previews and
# printed frames show values in a uniform fixed-point format.
pd.options.display.float_format = '{:.6f}'.format

#### Pull in the user list, create a `format_user()` method

In [None]:
# Load the exported Slack user list and keep only the fields used below.
users_df = pd.read_json("slack-generation.users.json")

columns_to_keep = ["id", "team_id", "name", "deleted", "real_name", "is_bot", "updated"]
extra_columns = users_df.columns.difference(columns_to_keep)
users_df = users_df.drop(columns=extra_columns)

# Index the user records by their "id" field for direct lookup.
users = {
    record["id"]: record
    for record in users_df.to_dict(orient='index').values()
}

In [None]:
def get_user(user_id):
    """Return the record stored in ``users`` for ``user_id``, or ``None`` if absent."""
    return users.get(user_id)

def format_user(user_id):
    """Render ``user_id`` as ``"name (id)"``, or as ``"(id)"`` when no record is found."""
    record = get_user(user_id)
    if not record:
        return f"({user_id})"
    return f"{record['name']} ({user_id})"

format_user("UBB9D2B01")

#### Load the slack data, clean the message text, format message users

In [None]:
# Load events from a Parquet file.
#
# If you want to load your own Slack data, change this to the path of your own
# output file (the original note refers to it as the output "from 1.1 above").
# Otherwise continue with `slack-generation.parquet`, which contains generated
# Slack data for example purposes. See the `slack-generation/notebook.ipynb`
# notebook for more info.
input_file = "slack-generation.parquet"

# Use the "ts" column as the time associated with each row,
# and the "channel" column as the entity associated with each row.
# `time_unit = "s"` declares the "ts" values to be expressed in seconds.
raw_msgs = await kd.sources.Parquet.create(
    input_file,
    time_column = "ts",
    key_column = "channel",
    time_unit = "s"
)
# Show the first rows as a sanity check (must stay the last expression of the
# cell so the notebook displays it).
raw_msgs.preview(5)

In [None]:
import json

@kd.udf("f<N: any>(x: N) -> string")
def format_users(batch: pd.Series):
    """Kaskada UDF: run ``format_user`` over every value in ``batch``."""
    formatted = batch.map(format_user)
    return formatted

In [None]:
# Clean Text
import re

def strip_code_blocks(line):
    """Remove every triple-backtick fenced code block from ``line``.

    Args:
        line: Message text, possibly spanning several lines.

    Returns:
        The text with each fenced span (fences included) removed. An
        unterminated fence is left untouched.
    """
    # re.DOTALL lets "." match newlines; without it a fenced block that spans
    # more than one line is never matched and therefore never stripped.
    return re.sub(r"```.*?```", '', line, flags=re.DOTALL)

def user_repl(match_obj):
    """Regex replacement callback: format the user id captured in group 1."""
    return format_user(match_obj.group(1))

def update_users(line):
    """Replace each ``<@...>`` mention in ``line`` with the output of ``user_repl``."""
    mention = re.compile(r"<@(.*?)>")
    return mention.sub(user_repl, line)

def clean_message(text):
    """Normalize one message: expand user mentions, drop code blocks, trim whitespace.

    Args:
        text: Raw message text. Null or non-string values (``None``, ``NaN``)
            are accepted so that a batch containing missing text does not fail.

    Returns:
        The cleaned text, or ``None`` when the input is not a string or
        nothing remains after cleaning.
    """
    # A missing value would otherwise raise TypeError inside re.sub.
    if not isinstance(text, str):
        return None
    cleaned = strip_code_blocks(update_users(text)).strip()
    return cleaned or None

@kd.udf("f<N: any>(x: N) -> string")
def clean_text(batch: pd.Series):
    """Kaskada UDF: run ``clean_message`` over every value in ``batch``."""
    cleaned = batch.map(clean_message)
    return cleaned

In [None]:
# Derive the working message stream from the raw one by supplying new
# "text" and "user" columns:
formatted_msgs = raw_msgs.extend({
    # message text passed through the `clean_text` UDF
    "text": raw_msgs.col("text").pipe(clean_text),
    # user column passed through the `format_users` UDF
    "user": raw_msgs.col("user").pipe(format_users)
})
# Preview a few rows to check the result of the two UDFs.
formatted_msgs.preview(5)

#### Convert the non-threaded messages into threaded messages. See `FineTuning_v2.ipynb` for more details on this.

In [None]:
from datetime import timedelta

# Column references used throughout this cell. Both are taken from the full
# `formatted_msgs` stream, not from the filtered subgroups created below.
ts = formatted_msgs.col("ts")
thread_ts = formatted_msgs.col("thread_ts")

# split messages into two subgroups: threads and non-threads
threads = formatted_msgs.filter(thread_ts.is_not_null())
non_threads = formatted_msgs.filter(thread_ts.is_null())

# for non-threads, consider a message a new conversation when
# more than 10 mins have elapsed since the previous message
is_new = ts.seconds_since_previous() > timedelta(minutes=10)

# Eventually this will just be: `thread_ts = ts.first(window=kd.windows.Since(is_new, start="inclusive"))`
#
# However, `Since()` is currently exclusive on the start of the window, inclusive on the end.
# But we need inclusive on the start and exclusive on the end.
#
# The hack below does what we need until `Since()` provides additional options for inclusivity
#
# NOTE(review): Python's `timedelta` has microsecond resolution and rounds
# fractional microseconds, so `timedelta(microseconds=0.001)` evaluates to
# `timedelta(0)`. If a one-nanosecond shift was intended, this does not
# express it -- confirm that a zero-length shift still produces the ordering
# this hack relies on.
shifted_non_threads = non_threads.shift_by(timedelta(microseconds=0.001))
shifted_ts = shifted_non_threads.lag(1).col("ts").first(window=kd.windows.Since(is_new))
# `thread_ts` is rebound here: a message flagged `is_new` uses its own
# timestamp, every other message uses the value computed from the shifted stream.
thread_ts = ts.if_(is_new).else_(shifted_ts)

# create threads_ts column for non-threaded messages
# (rows where either `ts` or the derived `thread_ts` is null are dropped)
non_threads_threads = non_threads.extend({"thread_ts": thread_ts}).filter(ts.is_not_null().and_(thread_ts.is_not_null()))

# re-join the two message subgroups
joined = threads.else_(non_threads_threads)

# join non-threads and threads back up, and key by conversations:
# the new key is the (channel, thread_ts) pair
messages = joined.with_key(kd.record({
        "channel": joined.col("channel"),
        "thread": joined.col("thread_ts"),
    }))
messages.preview(5)

#### Collect up all the messages, reactions, users in each conversation.

In [None]:
@kd.udf("f<N: any>(x: N) -> string")
def format_message(batch: pd.Series):
    """Kaskada UDF: render each record in ``batch`` as ``"<user> --> <text>"``."""
    # Reactions are currently not part of the rendered line.
    return batch.map(lambda raw: f"{raw['user']} --> {raw['text']}")

@kd.udf("f<N: any>(x: N) -> string")
def format_messages(batch: pd.Series):
    """Kaskada UDF: join each collection of rendered messages with a ``---`` separator line."""
    return batch.map(lambda parts: "\n---\n".join(parts))

@kd.udf("f<N: any>(x: N) -> string")
def extract_users(batch: pd.Series):
    """Kaskada UDF: encode each record's ``user`` as a one-element JSON list."""
    # Only the message author is emitted; users taken from reactions are not
    # included at the moment.
    return batch.map(lambda raw: json.dumps([raw["user"]]))

@kd.udf("f<N: any>(x: N) -> string")
def unique_users(batch: pd.Series):
    """Kaskada UDF: merge JSON-encoded user lists into one de-duplicated JSON list."""
    def merge(raw):
        # Flatten every decoded list, then de-duplicate through a set.
        combined = [user for user_set in raw for user in json.loads(user_set)]
        return json.dumps(list(set(combined)))
    return batch.map(merge)

# Build one record per conversation key with two fields:
#   "conversation": every message rendered by `format_message`, collected
#                   (no maximum) and joined by `format_messages`
#   "users":        every message's user list from `extract_users`, collected
#                   (no maximum) and merged by `unique_users`
conversations = kd.record({
    # Variant that also includes reactions (currently disabled):
    # "conversation": messages.select("user", "text", "reactions").pipe(format_message).collect(max=None).pipe(format_messages),
    # "users": messages.select("user", "reactions").pipe(extract_users).collect(max=None).pipe(unique_users),
    "conversation": messages.select("user", "text").pipe(format_message).collect(max=None).pipe(format_messages),
    "users": messages.select("user").pipe(extract_users).collect(max=None).pipe(unique_users),
})
conversations.preview(5)

#### Create LlamaIndex documents for all the topics

In [None]:
from llama_index import Document

# Notification topics to match conversations against, keyed by topic id.
topics = {
    "ID1": "Tell me about engineering discussions related to the Supply Chain Management project",
    "ID2": "Alert me when people are making streaming technology decisions",
    "ID3": "Poke me when there are people chatting about SRE topics like monitoring and alerting",
    "ID4": "Let me know when people are talking about their weekends",
    "ID5": "Inform me of any important discussions happening on the Fraud Detection project",
}

# One Document per topic. The topic id is kept in metadata so it can be read
# back from retrieval results, but it is excluded from both the LLM and the
# embedding views of the document.
# The loop variable is named `topic_id` rather than `id` so the builtin `id()`
# is not shadowed in the notebook namespace.
topic_documents = []
for topic_id, topic_text in topics.items():
    document = Document(
        doc_id=topic_id,
        text=topic_text,
        metadata={
            "id": topic_id,
        })
    document.excluded_llm_metadata_keys = ["id"]
    document.excluded_embed_metadata_keys = ["id"]
    topic_documents.append(document)

#### Create LlamaIndex documents for all the conversations

In [None]:
import json

conversation_documents = []
max_conversation_length = 0
async for row in conversations.run_iter(results=kd.results.Snapshot(), kind="row"):
    id = json.dumps(row["_key"])
    conversation = row["conversation"]
    users = row["users"]

    max_conversation_length = max(max_conversation_length, len(conversation))

    document = Document(
        doc_id=id,
        text=conversation,
        metadata={
            "users": users,
        })
    document.excluded_llm_metadata_keys = ["users"]
    document.excluded_embed_metadata_keys = ["users"]
    conversation_documents.append(document)

print(f"Max conversation length in chars: {max_conversation_length}")

#### Using LlamaIndex and HuggingFace, create embeddings for all the topics and conversations using various different embedding models.

* Store the embeddings for topics in a `topic_indexes` map of vector indexes
* Store the embeddings for conversations in a `conversation_indexes` map of vector indexes

In [None]:
from llama_index import VectorStoreIndex, ServiceContext
from llama_index.embeddings import HuggingFaceEmbedding, InstructorEmbedding
from llama_index.embeddings.utils import EmbedType
from llama_index.node_parser.simple import SimpleNodeParser

from typing import Dict

# Per-model vector indexes, keyed by the same labels as `embedding_models`.
topic_indexes = {}
conversation_indexes = {}

# Embedding models to compare, keyed by a short label.
# (The annotation is a real mapping type; `{str: EmbedType}` is a dict display,
# not a type.)
embedding_models: Dict[str, EmbedType] = {
    # This will use llama2-chat-13B with LlamaCPP, and assumes you have llama-cpp-python installed
    "llama2": "local",
    # This will use open-ai embeddings, and assumes OpenAI has already been initialized
    "openai": "default",

    "bge-small-en-v1.5" : HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5"),
    "bge-base-en-v1.5" : HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5"),
    "bge-large-en-v1.5" : HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5"),
    "ember-v1" : HuggingFaceEmbedding(model_name="llmrails/ember-v1"),
    "gte-large" : HuggingFaceEmbedding(model_name="thenlper/gte-large"),
    "gte-base" : HuggingFaceEmbedding(model_name="thenlper/gte-base"),
    "e5-large-v2" : HuggingFaceEmbedding(model_name="intfloat/e5-large-v2"),
    "e5-base-v2" : HuggingFaceEmbedding(model_name="intfloat/e5-base-v2"),
}

# limit documents to single-node: the chunk size is set to the length of the
# longest conversation seen when the conversation documents were built
node_parser = SimpleNodeParser.from_defaults(
    chunk_size = max_conversation_length
)

# Create an empty topic index and an empty conversation index per model; both
# share one service context so they are embedded with the same model.
for model_name, embed_model in embedding_models.items():
    service_context = ServiceContext.from_defaults(
        llm=None, # explicitly disable llm use
        embed_model=embed_model,
        node_parser=node_parser,
    )
    topic_indexes[model_name] = VectorStoreIndex([], service_context=service_context)
    conversation_indexes[model_name] = VectorStoreIndex([], service_context=service_context)

In [None]:
# Fill both indexes of every model: topics first, then conversations.
# Inserting the documents is what generates their embeddings.
for model_name in embedding_models:
    topic_index = topic_indexes[model_name]
    for topic_doc in topic_documents:
        topic_index.insert(topic_doc)

    conversation_index = conversation_indexes[model_name]
    for conversation_doc in conversation_documents:
        conversation_index.insert(conversation_doc)

#### For each embedding model, perform retrieval on each conversation embedding against the topic embeddings vector index

Save score results to files.

In [None]:
# Number of topics, and a template result in which every topic scores 0.0.
topic_count = len(topics)
base_result = dict.fromkeys(topics, 0.0)

from llama_index.indices.vector_store.retrievers import VectorIndexRetriever
from llama_index.indices.query.schema import QueryBundle

from llama_index.indices.vector_store.retrievers import VectorIndexRetriever


# For every embedding model, score each conversation node against all topics
# and write one JSON line per node to `Vectors_v1_<model>_scores.jsonl`.
for model_name in embedding_models.keys():
    topic_index = topic_indexes[model_name]
    conversation_index = conversation_indexes[model_name]

    # Retrieve over the topic index, asking for as many hits as there are
    # topics so that every topic can receive a score.
    retriever = VectorIndexRetriever(
        index=topic_index,
        similarity_top_k=topic_count,
        vector_store_query_mode="default",
        alpha=None,
        doc_ids=None,
    )

    with open(f"Vectors_v1_{model_name}_scores.jsonl", "w") as out_file:
        for doc_id in conversation_index.ref_doc_info.keys():
            # Doc ids are JSON strings; decode so the output carries the
            # structured id rather than a nested string.
            parsed_doc_id = json.loads(doc_id)
            for node_id in conversation_index.ref_doc_info[doc_id].node_ids:
                # Look up the node's stored embedding and text and pass both
                # in the query -- presumably so the retriever uses the stored
                # vector instead of embedding the text again (confirm against
                # the llama_index retriever behavior).
                embedding = conversation_index.vector_store.get(node_id)
                text = conversation_index.docstore.docs[node_id].text

                qb = QueryBundle(query_str=text, embedding=embedding)
                # Start from 0.0 for every topic; topics returned by the
                # retriever overwrite their entry with the similarity score.
                result = base_result.copy()
                for node_score in retriever.retrieve(qb):
                    topic_id = node_score.metadata["id"]
                    result[topic_id] = node_score.score
                out = {"id": parsed_doc_id, "result": result}
                out_file.write(json.dumps(out) + "\n")

#### Load the results from `ChatCompletions_v1` to use as a baseline for comparison to this method.

In [None]:
# Baseline results keyed by the JSON-serialized conversation id.
chat_completion_results = {}

# The handle is named `baseline_file` rather than `input` so the builtin
# `input()` is not shadowed in the notebook namespace; the file is iterated
# line by line instead of being read into a list first.
with open("ChatCompletion_v1_results_avg.jsonl") as baseline_file:
    for line in baseline_file:
        result = json.loads(line)
        key = json.dumps(result["id"])
        chat_completion_results[key] = result["result"]

#### Create methods to optimize the alerting threshold in order to maximize the F1 score for each embedding model

Make comparisons against the data from `chat_completion_results`

In [None]:
def get_results_array(file_name:str, threshold:float):
    """Read a JSONL results file and list, per row, the topic ids counted as positive.

    Each non-blank line must be a JSON object with a ``"result"`` mapping of
    topic id to value. A value counts as positive when it is:

    * a boolean that is ``True``, or
    * a number (int or float) strictly greater than ``threshold``.

    Values of any other type are reported with a printed message and treated
    as not positive.

    Args:
        file_name: Path to the JSONL file.
        threshold: Exclusive lower bound applied to numeric values.

    Returns:
        A list with one entry per row, in file order; each entry is the list
        of positive topic ids in the order they appear in that row.
    """
    results = []
    with open(file_name) as in_file:
        for line in in_file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            row = json.loads(line)
            true_ids = []
            for topic_id, value in row["result"].items():
                # bool is checked first because it is a subclass of int.
                if isinstance(value, bool):
                    if value:
                        true_ids.append(topic_id)
                elif isinstance(value, (int, float)):
                    # JSON numbers without a fractional part decode as int.
                    if value > threshold:
                        true_ids.append(topic_id)
                else:
                    print(f"unknown value type: {type(value)}")
            results.append(true_ids)
    return results

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score

def get_f1_score(file_name, threshold, baseline_file="ChatCompletion_v1_results_avg.jsonl"):
    """Compute the macro F1 score of a scores file against a baseline file.

    Args:
        file_name: JSONL file of per-topic scores to evaluate.
        threshold: Score above which a topic counts as predicted.
        baseline_file: JSONL file treated as ground truth. Its values are
            binarized with a threshold of 0.0. Defaults to the
            ChatCompletion_v1 averaged results.

    Returns:
        The macro-averaged F1 score over all topics.

    NOTE(review): rows of the two files are compared by position, not by
    their ``"id"`` field -- this assumes both files list the same
    conversations in the same order; confirm against how the files are written.
    """
    # Fit the binarizer on the full topic set so every topic gets a column,
    # even one that never appears as positive.
    mlb = MultiLabelBinarizer()
    mlb.fit([topics.keys()])

    test = get_results_array(baseline_file, 0.0)
    pred = get_results_array(file_name, threshold)

    y_test_transformed = mlb.transform(test)
    y_pred_transformed = mlb.transform(pred)

    return f1_score(y_test_transformed, y_pred_transformed, average='macro') # Or 'micro', 'weighted' based on need

In [None]:
import numpy as np
from scipy.optimize import minimize_scalar

def optimize_f1_score(file_name) -> "tuple[float, float]":
    """Search for the score threshold in [0.1, 1.0] that maximizes the F1 score.

    Args:
        file_name: JSONL scores file passed through to ``get_f1_score``.

    Returns:
        ``(threshold, f1)``: the threshold found and the F1 score obtained there.
    """
    # minimize_scalar minimizes, so the F1 score is negated to maximize it.
    # The bounded method is requested explicitly: relying on the default
    # method while passing `bounds` raises an error on older SciPy releases.
    result = minimize_scalar(
        fun=lambda x: -get_f1_score(file_name, x),
        bounds=(0.1, 1.0),
        method="bounded",
    )

    return (result.x, -result.fun)

In [ ]:
# Report the best threshold and the resulting F1 score for every embedding model.
for model_name in embedding_models:
    file_name = f"Vectors_v1_{model_name}_scores.jsonl"
    threshold, score = optimize_f1_score(file_name)
    print(f"Model: {model_name} \tf1: {round(score,2)} \tthreshold: {round(threshold,2)}")

Model: llama2 			f1: 0.36 	threshold: 0.82
Model: openai 			f1: 0.52 	threshold: 0.77
Model: bge-small-en-v1.5 	f1: 0.42 	threshold: 0.65
Model: bge-base-en-v1.5 	f1: 0.47 	threshold: 0.57
Model: bge-large-en-v1.5 	f1: 0.52 	threshold: 0.63
Model: ember-v1 		f1: 0.48 	threshold: 0.57
Model: gte-large 		f1: 0.29 	threshold: 0.44
Model: gte-base 		f1: 0.49 	threshold: 0.79
Model: e5-large-v2 		f1: 0.29 	threshold: 0.44
Model: e5-base-v2 		f1: 0.33 	threshold: 0.7


#### Results

OpenAI and bge-large-en-v1.5 perform the best; however, their F1 scores don't come close to the level we were able to obtain using fine-tuning.