## ChatCompletion v1

### Goal

Let users specify notification topics, for example:

"Hey BeepGPT, let me know when there’s an important engineering decision being made about the Fraud Detection Project."

### Method

Use chat completion with a single-shot example to determine if a user should be notified.

#### Pros:
* Easy implementation: Doesn't require a vector database or fine-tuning
* Fast response

#### Cons:
* Limited to small notification sets. If there were 1000s of users each with different notification requests, this method would probably not work.
* Expensive to run

### Notes:

This notebook was primarily created as a baseline. Future notebooks will explore completing the same task with different methods.


### The code:

#### Install the tools and initialize the clients

In [None]:
%pip install -q kaskada==0.6.0a4 openai

In [None]:
import openai, getpass

# Initialize OpenAI.
# The key is read interactively via getpass rather than hard-coded in the notebook.
openai.api_key = getpass.getpass('OpenAI: API Key')

In [None]:
import pandas as pd
import kaskada as kd

# Initialize Kaskada with a local execution context.
# This must run before any Kaskada source is created in later cells.
kd.init_session()

# Set pandas to display all floats with 6 decimal places
# (this only affects how values are displayed, not the stored values).
pd.options.display.float_format = '{:.6f}'.format

#### Pull in the user list, create a `format_user()` method

In [None]:
# Read the user list and keep only the profile fields used by this notebook.
columns_to_keep = ["id", "team_id", "name", "deleted", "real_name", "is_bot", "updated"]
users_df = pd.read_json("slack-generation.users.json")
unwanted_columns = users_df.columns.difference(columns_to_keep)
users_df.drop(columns=unwanted_columns, inplace=True)

# Index the remaining records by user ID so they can be looked up directly.
users = {
    record["id"]: record
    for record in users_df.to_dict(orient='index').values()
}

In [None]:
def get_user(user_id):
    """Return the record stored for ``user_id`` in ``users``.

    Returns ``None`` when the ID is not a key of the module-level
    ``users`` mapping.
    """
    return users.get(user_id)

def format_user(user_id):
    """Render a user ID for display.

    Known users become ``"<name> (<user_id>)"``; IDs without a record fall
    back to ``"(<user_id>)"``.
    """
    profile = get_user(user_id)
    if not profile:
        return f"({user_id})"
    return f"{profile['name']} ({user_id})"

# Quick check of the formatter on a single user ID (the notebook displays the result).
format_user("UBB9D2B01")

#### Load the slack data, clean the message text, format message users

In [None]:
# Load events from a Parquet file
#
# if you wan to load in your own slack data, change this to the path of your output file from 1.1 above
# otherwise continue with `slack-generation.parquet`, which contains generated slack data for
# example purposes. See the `slack-generation/notebook.ipynb` notebook for more info.
input_file = "slack-generation.parquet"

# Use the "ts" column as the time associated with each row,
# and the "channel" column as the entity associated with each row.
raw_msgs = await kd.sources.Parquet.create(
    input_file,
    time_column = "ts",
    key_column = "channel",
    time_unit = "s"
)
raw_msgs.preview(5)

In [None]:
import json

@kd.udf("f<N: any>(x: N) -> string")
def format_users(batch: pd.Series):
    """Kaskada UDF: run ``format_user`` over every value in ``batch``."""
    formatted = batch.map(format_user)
    return formatted

In [None]:
# Clean Text
import re

def strip_code_blocks(line):
    """Remove every triple-backtick fenced code block from ``line``.

    The match is non-greedy, so several separate blocks in one message are
    each removed and the text between them is kept.

    Args:
        line: The message text to clean.

    Returns:
        ``line`` with all fenced code blocks (fences included) removed.
    """
    # DOTALL lets `.` match newlines; without it a code block that spans
    # several lines is never matched and stays in the text.
    return re.sub(r"```.*?```", "", line, flags=re.DOTALL)

def user_repl(match_obj):
    """Substitution callback: format the user ID captured in group 1."""
    return format_user(match_obj.group(1))

def update_users(line):
    """Replace each ``<@ID>`` mention in ``line`` with ``user_repl``'s output."""
    mention_pattern = r"<@(.*?)>"
    return re.sub(mention_pattern, user_repl, line)

def clean_message(text):
    """Clean one message's text.

    Mentions are expanded first, then code blocks are stripped and the
    surrounding whitespace is trimmed. Returns ``None`` when nothing is left.
    """
    cleaned = strip_code_blocks(update_users(text)).strip()
    if cleaned == "":
        return None
    return cleaned

@kd.udf("f<N: any>(x: N) -> string")
def clean_text(batch: pd.Series):
    """Kaskada UDF: run ``clean_message`` over every value in ``batch``."""
    cleaned = batch.map(clean_message)
    return cleaned

In [None]:
# Replace the raw "text" and "user" columns with their cleaned / formatted forms.
cleaned_text_col = raw_msgs.col("text").pipe(clean_text)
formatted_user_col = raw_msgs.col("user").pipe(format_users)

formatted_msgs = raw_msgs.extend({
    "text": cleaned_text_col,
    "user": formatted_user_col,
})
formatted_msgs.preview(5)

#### Convert the non-threaded messages into threaded messages. See `FineTuning_v2.ipynb` for more details on this.

In [None]:
from datetime import timedelta

# Column references on the formatted message stream.
ts = formatted_msgs.col("ts")
thread_ts = formatted_msgs.col("thread_ts")

# split messages into two subgroups: threads and non-threads
# (a message counts as threaded when its `thread_ts` is non-null)
threads = formatted_msgs.filter(thread_ts.is_not_null())
non_threads = formatted_msgs.filter(thread_ts.is_null())

# for non-threads, consider a message a new conversation when
# more than 10 mins have elapsed since the previous message
# NOTE(review): `ts` is taken from `formatted_msgs`, not `non_threads`, so
# "previous message" presumably includes threaded messages — confirm intended.
is_new = ts.seconds_since_previous() > timedelta(minutes=10)

# Eventually this will just be: `thread_ts = ts.first(window=kd.windows.Since(is_new, start="inclusive"))`
#
# However, `Since()` is currently exclusive on the start of the window, inclusive on the end.
# But we need inclusive on the start and exclusive on the end.
#
# The hack below does what we need until `Since()` provides additional options for inclusivity
#
# NOTE(review): `timedelta` has microsecond resolution, so
# `timedelta(microseconds=0.001)` rounds to a zero-length delta. Confirm that
# `shift_by` with a zero delta still produces the ordering this hack relies on.
shifted_non_threads = non_threads.shift_by(timedelta(microseconds=0.001))
shifted_ts = shifted_non_threads.lag(1).col("ts").first(window=kd.windows.Since(is_new))
# `thread_ts` is rebound here: a message that starts a conversation uses its own
# timestamp, any other message uses the value computed on the shifted stream.
thread_ts = ts.if_(is_new).else_(shifted_ts)

# create thread_ts column for non-threaded messages, keeping only rows where
# both `ts` and the synthesized `thread_ts` are non-null
non_threads_threads = non_threads.extend({"thread_ts": thread_ts}).filter(ts.is_not_null().and_(thread_ts.is_not_null()))

# re-join the two message subgroups
joined = threads.else_(non_threads_threads)

# re-key the joined messages by conversation, i.e. the (channel, thread_ts) pair
messages = joined.with_key(kd.record({
        "channel": joined.col("channel"),
        "thread": joined.col("thread_ts"),
    }))
messages.preview(5)

#### Collect the messages and users in each conversation (reaction handling is currently commented out). Output just the final collection.

In [None]:
@kd.udf("f<N: any>(x: N) -> string")
def format_message(batch: pd.Series):
    """Kaskada UDF: render each record in ``batch`` as ``"<user> --> <text>"``.

    The record's reactions are not part of the rendered line (that part of
    the format is disabled).
    """
    return batch.map(lambda raw: f"{raw['user']} --> {raw['text']}")

@kd.udf("f<N: any>(x: N) -> string")
def format_messages(batch: pd.Series):
    """Kaskada UDF: join each collection of message lines into one string.

    Consecutive lines are separated by a line containing only ``---``.
    """
    separator = "\n---\n"
    return batch.map(separator.join)

@kd.udf("f<N: any>(x: N) -> string")
def extract_users(batch: pd.Series):
    """Kaskada UDF: emit, per record, a JSON list holding the record's user."""
    def author_as_json(raw):
        participants = [raw["user"]]
        # Users who reacted are currently not added:
        # for user in json.loads(raw["reactions"]).keys():
        #     if user not in participants:
        #         participants.append(user)
        return json.dumps(participants)

    return batch.map(author_as_json)

@kd.udf("f<N: any>(x: N) -> string")
def unique_users(batch: pd.Series):
    """Kaskada UDF: merge collected JSON user lists into one de-duplicated list.

    Each value in ``batch`` is an iterable of JSON-encoded lists. The result
    for each value is a JSON-encoded list of the distinct users, ordered by
    first appearance.
    """
    def merge_user_lists(raw):
        merged = []
        for user_set in raw:
            merged.extend(json.loads(user_set))
        # dict.fromkeys drops duplicates while keeping first-seen order.
        # list(set(...)) can return the same users in a different order on
        # every run because string hashing is randomized per process.
        return json.dumps(list(dict.fromkeys(merged)))

    return batch.map(merge_user_lists)

# Per-message pieces: one formatted line of text and one JSON list of users.
# With reactions enabled these would instead be:
#   messages.select("user", "text", "reactions").pipe(format_message)
#   messages.select("user", "reactions").pipe(extract_users)
message_lines = messages.select("user", "text").pipe(format_message)
message_users = messages.select("user").pipe(extract_users)

# Collect the pieces without a size limit and fold each collection into one value.
conversations = kd.record({
    "conversation": message_lines.collect(max=None).pipe(format_messages),
    "users": message_users.collect(max=None).pipe(unique_users),
})

# Materialize the result as a snapshot and display it.
conversations_df = conversations.to_pandas(results=kd.results.Snapshot())
conversations_df

#### Use ChatCompletion to see if any of these conversations fall into the following groupings:

* Tell me about engineering discussions related to the Supply Chain Management project
* Alert me when people are making streaming technology decisions
* Poke me when there are people chatting about SRE topics like monitoring and alerting
* Let me know when people are talking about their weekends
* Inform me of any important discussions happening on the Fraud Detection project

First set up the prompt and the single-shot example:


In [None]:
# System prompt: describes the task, the plain-text conversation format
# (`-->` between author and text, `---` between messages), and the list of
# notification requests with their IDs.
system = """
You are a helpful assistant. You respond `true` or `false` if a passed conversation
matches a specific request. You should respond with just the request-id and `true`
or `false`, in json format. A conversation may match no requests, one request, or
many requests.

Conversations will be passed in plain text, where the person writing and their
text is separated by an arrow like this: -->. And `---` characters on their own
indicate that the next line will contain the text from the next user in the
conversation.

The requests are:
* ID1 - Tell me about engineering discussions related to the Supply Chain Management project
* ID2 - Alert me when people are making streaming technology decisions
* ID3 - Poke me when there are people chatting about SRE topics like monitoring and alerting
* ID4 - Let me know when people are talking about their weekends
* ID5 - Inform me of any important discussions happening on the Fraud Detection project
"""

# Example conversation used as the single-shot "user" turn.
user = """
userf (UEA27BBFF) --> Great engagement so far, team! To delve deeper into the topic of
Software-defined networking (SDN) integration, let's share implementation experiences,
and discuss the potential impact of SDN on our Network Monitoring project.
---
usera (U3E44CFA1) --> UserF, Let's delve into the technical intricacies of SDN
integration, such as the architecture of SDN controllers, protocols like OpenFlow,
and the challenges of achieving seamless integration between SDNand our streaming
technologies.
---
usere (U03CC4325) --> UserA, initiating thread 76 focused on Software-defined
networking (SDN) integration is an excellent proposal. Let's discuss the various
aspects of SDN integration, including the real-world benefits of leveraging SDN
in network monitoring.
"""

# Expected "assistant" reply for the example above: a JSON object with one
# boolean per request ID.
assistant = '{"ID1": false, "ID2": false, "ID3": true, "ID4": false, "ID5": false}'

Generate ChatCompletions and output the results. Run the code in a way that we can resume if an error occurs.

In [None]:
# only re-run this cell to restart the process
# (`rows_processed` counts the conversations already handled, so resetting it
# to 0 makes the processing cell start again from the first row)
rows_processed = 0

In [None]:
# re-run this cell as often as needed until the process is completed

from IPython.display import clear_output

# Open in append mode so results written by earlier runs of this cell are kept.
with open("ChatCompletion_v1_results.jsonl", "a", encoding="utf-8") as out:
    # Resume by position rather than by index label, so skipping the already
    # processed rows does not depend on `conversations_df` having a default
    # 0..n-1 index.
    for _, conversation in conversations_df.iloc[rows_processed:].iterrows():
        clear_output(wait=True)

        completion = openai.ChatCompletion.create(
            # model choices: gpt-4, gpt-4-32k, gpt-3.5-turbo, gpt-3.5-turbo-16k
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user},
                {"role": "assistant", "content": assistant},
                {"role": "user", "content": conversation["conversation"]},
            ],
        )
        response = completion.choices[0].message.content

        # json.loads raises if the reply is not valid JSON. The counter below
        # is then not advanced, so re-running the cell retries this row.
        result = {"id": conversation["_key"], "result": json.loads(response)}
        out.write(json.dumps(result) + "\n")
        # Flush after every row so completed work survives a later failure.
        out.flush()

        rows_processed += 1
        print(rows_processed)