# Next Users in Window

### For a set of recent messages, try to predict the set of users that might interact next.

For each set of messages, build a user set from the following properties:
* users that replied to the most-recent message
* users that reacted to the most-recent message
* any users that wrote a message in the next X minutes.  

In [13]:
!pip3 install --upgrade openai pip install file-read-backwards  -q

In [2]:
import json, openai, random, time, os
from openai import cli
from types import SimpleNamespace
from file_read_backwards import FileReadBackwards

# Directory used for every input and output file in this notebook.
# All paths are built as f'{work_dir}/<name>', so with the empty default they
# resolve to '/<name>' -- set this to a real directory before running.
work_dir = ""

# Remember to remove your key from your code when you're done.
openai.api_key = ""

## Pre-Output Stage

Pass through the dataset and for each message, find the set of users that write another message inside some time window

The code below reads the file backwards to make the algorithm easier

In [36]:
# Pre-compute, for every message, the users who write another message in the
# same channel within the following `window_seconds` seconds.
#
# messages.jsonl is read from its last line to its first, so by the time a
# message is processed every later message has already been seen and the
# "future" window can be kept in a small map.
# NOTE(review): this assumes messages.jsonl is grouped by channel with
# ascending, numeric "ts" values inside each channel -- confirm against the
# export that produced it.
window_seconds = 300

temp_path = f'{work_dir}/next_users_in_window_temp.jsonl'
pre_path = f'{work_dir}/next_users_in_window_pre.jsonl'

current_channel = ""
ts_user_map = {}  # ts -> author, for already-seen (i.e. later) messages still in the window

with FileReadBackwards(f'{work_dir}/messages.jsonl') as in_file, open(temp_path, 'w') as out_file:
    for line in in_file:
        data = json.loads(line)

        channel = data["channel"]
        msg_user = data["user"]
        msg_ts = data["ts"]

        # restart if the channel changes in the file
        if channel != current_channel:
            current_channel = channel
            ts_user_map = {}
            print(f'New channel: {current_channel}')

        # keep only timestamps that fall strictly inside this message's window
        window_end = msg_ts + window_seconds
        ts_user_map = {ts: user for ts, user in ts_user_map.items() if ts < window_end}

        # de-duplicated authors in the window, in first-seen order
        users_in_window = list(dict.fromkeys(ts_user_map.values()))

        out_file.write(json.dumps({ "channel": channel, "ts": msg_ts, "users_in_window": users_in_window })+"\n")

        # add the current message only after writing, so a message never
        # appears in its own window
        ts_user_map[msg_ts] = msg_user

# The temp file is in reverse message order; reverse it again so the final
# file lines up line-for-line with messages.jsonl.
with FileReadBackwards(temp_path) as in_file, open(pre_path, 'w') as out_file:
    for line in in_file:
        out_file.write(line+"\n")

# delete temp file
os.remove(temp_path)


New channel: team-compute
New channel: team-api
New channel: sales-team
New channel: rust
New channel: random
New channel: product
New channel: on-call
New channel: inbound-leads
New channel: general
New channel: games
New channel: fluff-posting
New channel: dev-ops
New channel: dev
New channel: conferences
New channel: articles


## Initial Data Output Stage

This is a Generative problem, so the goals for training aren't as strict.

Goals:
* prompt and completion length must not be longer than 2048 tokens

In [38]:
# Inputs: the raw messages and the per-message "users in window" file written by
# the pre-output stage. Output: one prompt/completion training example per line.
# These handles are closed at the end of this cell.
in_file = open(f'{work_dir}/messages.jsonl', 'r')
in_pre = open(f'{work_dir}/next_users_in_window_pre.jsonl', 'r')
out_file = open(f'{work_dir}/next_users_in_window.jsonl', 'w')

# Loop state carried from one message to the next.
current_channel = ""
previous_training_example = ""  # last serialized example, used to skip consecutive duplicates
recent_messages = []  # rolling history of message texts for the current channel
count = 0  # NOTE(review): not read anywhere in this cell

min_messages = 2  # an example is emitted only when the history holds MORE than this many messages
max_messages = 5  # the history is capped at this many messages
between_message_separator = " \n\n\n "  # joins the messages inside a prompt
prompt_separator = "\n\n###\n\n"  # appended to every prompt
completion_separator = " END"  # appended to every completion
reverse_messages = True  # when True the history is reversed before joining

def get_reaction_users(reactions):
    """Return every user listed under the "users" key of each reaction.

    Users are returned in encounter order and are not de-duplicated.
    A falsy `reactions` value (None or an empty list) yields an empty list.
    """
    if not reactions:
        return []
    return [user for entry in reactions for user in entry["users"]]

# Walk messages.jsonl and the pre-computed window file in lockstep (they must
# line up one-to-one) and write one training example per qualifying message:
#   prompt     = the recent message texts of the channel, joined
#   completion = the sorted users who reacted to, replied to, or posted within
#                the window after the newest message (its author excluded)
# The try/finally guarantees the three files are closed even if a line fails
# to parse.
try:
    # zip() stops as soon as either file runs out of lines
    for line, pre_line in zip(in_file, in_pre):
        data = json.loads(line)
        pre_data = json.loads(pre_line)

        text = data["text"].strip()
        channel = data["channel"]

        if pre_data["ts"] != data["ts"] or pre_data["channel"] != channel:
            print("Pre data and data are mis-aligned. Something went wrong. Stopping")
            break

        # skip message if empty
        if text == "":
            continue

        # skip message if it contains a code block
        if "```" in text:
            continue

        # restart message history count if the channel changes in the file
        if channel != current_channel:
            current_channel = channel
            recent_messages = []
            print(f'New channel: {current_channel}')

        # add the recent message to the history.
        # remove the oldest message if the size window is exceeded
        recent_messages.append(text)
        if len(recent_messages) > max_messages:
            recent_messages.pop(0)

        # NOTE(review): the strict ">" means an example needs at least
        # min_messages + 1 messages of history -- confirm that is intended.
        if len(recent_messages) <= min_messages:
            continue

        messages = recent_messages.copy()
        if reverse_messages:
            messages.reverse()

        prompt = between_message_separator.join(messages)

        # build up user set for the completion; missing or null fields are
        # treated as "no users"
        users = get_reaction_users(data.get("reactions"))
        users.extend(data.get("reply_users") or [])
        users.extend(pre_data["users_in_window"] or [])

        # de-duplicate, drop the message writer, and sort
        users = sorted(set(users) - {data["user"]})

        # make the completion
        completion = json.dumps(users)

        training_example = { "prompt": f'{prompt}{prompt_separator}', "completion": f' {completion}{completion_separator}' }
        training_example = json.dumps(training_example)

        # if training example doesn't match the previous one, then output
        if previous_training_example != training_example:
            out_file.write(training_example + "\n")

        previous_training_example = training_example
finally:
    in_file.close()
    in_pre.close()
    out_file.close()

New channel: articles
New channel: conferences
New channel: dev
New channel: dev-ops
New channel: fluff-posting
New channel: games
New channel: general
New channel: inbound-leads
New channel: on-call
New channel: product
New channel: random
New channel: rust
New channel: sales-team
New channel: team-api
New channel: team-compute


## Data Verification Stage

* make sure prompts end with same suffix
* make sure tokens per example are less than 2048
* ignore all other analysis
  * we are training for *conditional generation*, but the data prep tool incorrectly assumes we are fine-tuning for *classification*

In [39]:
# Run the OpenAI CLI data-preparation analysis on the generated training file.
# NOTE(review): quiet=True appears to auto-accept the tool's recommendations
# (see the "Y" answers in the output below) -- confirm against the CLI docs.
args = SimpleNamespace(file=f'{work_dir}/next_users_in_window.jsonl', quiet=True)
cli.FineTune.prepare_data(args)

Analyzing...

- Your file contains 19672 prompt-completion pairs
- Based on your data it seems like you're trying to fine-tune a model for classification
- For classification, we recommend you try one of the faster and cheaper models, such as `ada`
- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for training
- More than a third of your `completion` column/key is uppercase. Uppercase completions tends to perform worse than a mixture of case encountered in normal language. We recommend to lower case the data if that makes sense in your domain. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more details
- All prompts end with suffix `\n\n###\n\n`

Based on the analysis we will perform the following actions:
- [Recommended] Lowercase all your data in column/key `completion` [Y/n]: Y
- [Recommended] Would you like to split into training and validation set? [Y/n]: Y


Your data will be 

## Model Training Stage

* First, upload the training file to OpenAI (no validation file is used for this job)

In [40]:
# Path of the JSONL training file to upload (the file written by the data output stage).
training_file_name = f'{work_dir}/next_users_in_window.jsonl'

def check_status(training_id):
    """Look up an uploaded file, print its status and return that status.

    `training_id` is the file ID passed to `openai.File.retrieve`.
    """
    file_info = openai.File.retrieve(training_id)
    train_status = file_info["status"]
    print(f'Status (training_file): {train_status} ')
    return train_status

# Statuses at which polling of the uploaded file stops.
terminal_states = ("succeeded", "failed", "processed")

# Upload the training dataset file (only a training file is used here).
training_id = cli.FineTune._get_or_upload(training_file_name, True)

# Check the upload status immediately, then once a second until the file
# reaches one of the terminal states.
while True:
    train_status = check_status(training_id)
    if train_status in terminal_states:
        break
    time.sleep(1)

Upload progress: 100%|██████████| 15.1M/15.1M [00:00<00:00, 18.9Git/s]


Uploaded file from /Users/eric.pinzur/Documents/slackbot2000/next_users_in_window.jsonl: file-NjHHSOB0TJZK7iODaT2Da6Us
Status (training_file): uploaded 
Status (training_file): uploaded 
Status (training_file): uploaded 
Status (training_file): uploaded 
Status (training_file): uploaded 
Status (training_file): uploaded 
Status (training_file): processed 


## Start a fine-tuning Job

* no validation file since this is not a classification problem

In [41]:
# Define a fine-tune job that customizes the `ada` base model with a single
# pass (one epoch) over the uploaded training file. No validation file is
# supplied.
create_args = {
    "training_file": training_id,
    "model": "ada",
    "n_epochs": 1,
    "suffix": "next_users_in_window_full_kaskada",
}

# Submit the job and keep the job ID and initial status from the response.
resp = openai.FineTune.create(**create_args)
job_id, status = resp["id"], resp["status"]

# The job ID is what the later cells use to monitor the job, which may take
# some time to start and complete.
print(f'Fine-tuning model with job ID: {job_id}')

Fine-tuning model with job ID: ft-MqX1fWkyyyX3ZpwTrDo34rU2


In [4]:
# Hard-coded copy of the job ID printed by the job-creation cell above.
# NOTE(review): presumably here so the cells below can run in a fresh kernel
# without re-submitting the job -- replace with your own job ID.
job_id = "ft-MqX1fWkyyyX3ZpwTrDo34rU2"

## Wait for the fine-tuning to start

* Note that it can take several hours for the job to move from the `pending` state

In [None]:
# Statuses after which the job will not change any more. "cancelled" is
# included so that a cancelled job does not keep this cell polling forever.
terminal_statuses = ("succeeded", "failed", "cancelled")
poll_interval_seconds = 5

# Get the status of our fine-tune job.
status = openai.FineTune.retrieve(id=job_id)["status"]

# If the job isn't yet done, poll it every `poll_interval_seconds` seconds.
if status not in terminal_statuses:
    print(f'Job not in terminal status: {status}. Waiting.')
    while status not in terminal_statuses:
        time.sleep(poll_interval_seconds)
        status = openai.FineTune.retrieve(id=job_id)["status"]
        print(f'Status: {status}')
else:
    print(f'Fine-tune job {job_id} finished with status: {status}')


## Check fine-tuning events

* Lets us know specifics about the fine-tuning job


In [None]:
# Get the events of our fine-tune job.
events = openai.FineTune.stream_events(id=job_id)

# Print each event as it is yielded.
for event in events:
    print(event)


# Look at Training Results

* download the results file(s)

In [5]:
# Download every result file attached to the fine-tune job into work_dir.
file_prefix = "next_users_in_window_results"

result = openai.FineTune.retrieve(id=job_id)

# enumerate() gives each result file its own index, so several result files
# are written to distinct paths instead of all overwriting "..._0.csv".
for count, result_file in enumerate(result["result_files"]):
    file_name = f'{work_dir}/{file_prefix}_{count}.csv'
    # `with` closes the file even if the download or the write fails
    with open(file_name, 'wb') as file:
        file.write(openai.File.download(id=result_file["id"]))
    print(f'Outputted results to: {file_name}')


Outputted results to: /Users/eric.pinzur/Documents/slackbot2000/next_users_in_window_results_0.csv
