# Next Message User

### For a set of recent messages, try to predict the next user that will reply.

In [1]:
!pip3 install --upgrade openai -q

In [20]:
import json
import os
import random
import time
from types import SimpleNamespace

import openai
from openai import cli

# Directory that holds the input `messages.jsonl`; the files this notebook
# opens for writing are created here as well. Set it before running the
# cells below.
work_dir = ""

# Read the key from the environment so it never has to be pasted into (and
# later scrubbed out of) the notebook. Falls back to an empty key when the
# OPENAI_API_KEY variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

## Initial Data Output Stage

Goals:
* prompt and completion length must not be longer than 2048 tokens
* each completion is a classification class and must be token unique, thus switching the completion to a number is ideal (1-100)
* there SHOULD be 100 data points for each completion
* both the train and valid files need ALL completions present, ideally with at least 100 examples of each completion in each file.

In [23]:
# Build prompt/completion examples from the raw message log.
#
# Reads   {work_dir}/messages.jsonl               (one JSON message per line)
# Writes  {work_dir}/next_message_user.jsonl      (one example per line)
#         {work_dir}/user_map.json                (user_id -> user_num)
#         {work_dir}/next_message_user_counts.json (user_num -> example count)

# --- example-shaping settings ---
min_messages = 2
max_messages = 5
between_message_separator = " \n\n\n "
prompt_separator = "\n\n###\n\n"
completion_separator = ""
reverse_messages = True

# --- running state while scanning the log ---
current_channel = ""
previous_training_example = ""
recent_messages = []
user_map = {}      # user_id -> user_num, numbered from 1 in order of first appearance
user_count = 0
user_counts = {}   # user_num -> number of examples written for that user

# `with` guarantees both files are closed even if a line fails to parse.
with open(f'{work_dir}/messages.jsonl', 'r') as messages_file, \
        open(f'{work_dir}/next_message_user.jsonl', 'w') as examples_file:
    for line in messages_file:
        data = json.loads(line)
        text = data["text"].strip()

        # skip message if empty
        if text == "":
            continue

        # skip message if it contains a code block
        if "```" in text:
            continue

        channel = data["channel"]
        user_id = data["user"]

        # create a user_id -> user_num map
        #
        # also initialize a map to count training samples per user_id
        # goal is at least 300 samples for each user_id so that we can
        # ensure at least 100 samples for each user in both the
        # training and validation files
        #
        # note that we count by user_num and not user_id, to ease processing
        # in the next stage
        if user_id not in user_map:
            user_count += 1
            user_map[user_id] = user_count
            user_counts[user_count] = 0

        # restart message history if the channel changes in the file
        if channel != current_channel:
            current_channel = channel
            recent_messages = []
            print(f'New channel: {current_channel}')

        # An example is emitted only once the history holds MORE than
        # `min_messages` messages; the prompt is that history (newest first
        # when `reverse_messages` is set) and the completion is the author
        # of the current message.
        if len(recent_messages) > min_messages:
            messages = recent_messages.copy()
            if reverse_messages:
                messages.reverse()
            prompt = between_message_separator.join(messages)

            user_num = user_map[user_id]

            training_example = json.dumps({
                "prompt": f'{prompt}{prompt_separator}',
                "completion": f' {user_num}{completion_separator}',
            })

            # only consecutive duplicates are suppressed
            if previous_training_example != training_example:
                examples_file.write(training_example + "\n")
                # also update the example count for the user
                user_counts[user_num] += 1

            previous_training_example = training_example

        # add the current message to the history and drop the oldest one
        # if the size window is exceeded
        recent_messages.append(text)
        if len(recent_messages) > max_messages:
            recent_messages.pop(0)

with open(f'{work_dir}/user_map.json', 'w') as user_map_file:
    json.dump(user_map, user_map_file)

# json.dump turns the integer user_num keys into strings
with open(f'{work_dir}/next_message_user_counts.json', 'w') as user_counts_file:
    json.dump(user_counts, user_counts_file)

print(f'Total user count: {user_count}')


New channel: articles
New channel: conferences
New channel: dev
New channel: dev-ops
New channel: fluff-posting
New channel: games
New channel: general
New channel: inbound-leads
New channel: on-call
New channel: product
New channel: random
New channel: rust
New channel: sales-team
New channel: team-api
New channel: team-compute
Total user count: 17


## Cleanup Stage

* Load the files from the previous stage output
* remove any examples where the total completion count for the user is less than 300

In [24]:
# Drop every example whose completion belongs to a user with fewer than
# `min_examples` examples in total.
min_examples = 300

with open(f'{work_dir}/next_message_user_counts.json', 'r') as user_counts_file:
    # keys come back as strings because they went through JSON
    user_counts = json.load(user_counts_file)

user_nums_with_too_few_examples = []
user_nums_with_enough_examples = {}

for user_num, example_count in user_counts.items():
    if example_count < min_examples:
        user_nums_with_too_few_examples.append(user_num)
    else:
        user_nums_with_enough_examples[user_num] = example_count

print("Removing the following users from the example set:")
print(user_nums_with_too_few_examples)

print("\nThis leaves the remaing users and counts: ")
print(user_nums_with_enough_examples)

print(f'Total classes for training are: {len(user_nums_with_enough_examples)}')

# Completions are compared as written by the generation stage
# (leading space + user_num + completion_separator); a set keeps the
# per-line membership test constant-time.
completions_to_remove = {
    f' {user_num}{completion_separator}'
    for user_num in user_nums_with_too_few_examples
}

# `with` closes both files even if a line fails to parse.
with open(f'{work_dir}/next_message_user.jsonl', 'r') as in_file, \
        open(f'{work_dir}/next_message_user_cleaned.jsonl', 'w') as out_file:
    for training_example_line in in_file:
        training_example = json.loads(training_example_line)
        if training_example["completion"] not in completions_to_remove:
            # copy the original line through unchanged
            out_file.write(training_example_line)

Removing the following users from the example set:
['7', '8', '9', '11', '12', '14', '15', '16', '17']

This leaves the remaing users and counts: 
{'1': 6851, '2': 1488, '3': 786, '4': 3480, '5': 1138, '6': 2100, '10': 2566, '13': 406}
Total classes for training are: 8


## Data Split Stage

* Split the remaining examples into training and validation sets

In [25]:
# Hand the cleaned examples to the OpenAI CLI data-preparation tool, which
# analyses the file and offers its own train/validation split.
# NOTE(review): `quiet=True` presumably auto-accepts the tool's prompts — confirm.
# NOTE(review): the recorded output shows the tool writing `... (1).jsonl`
# files, presumably because earlier outputs already existed — check which
# files later cells actually read.
cleaned_examples_path = f'{work_dir}/next_message_user_cleaned.jsonl'
args = SimpleNamespace(
    file=cleaned_examples_path,
    quiet=True,
)
cli.FineTune.prepare_data(args)

Analyzing...

- Your file contains 18815 prompt-completion pairs
- Based on your data it seems like you're trying to fine-tune a model for classification
- For classification, we recommend you try one of the faster and cheaper models, such as `ada`
- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for training
- All prompts end with suffix `\n\n###\n\n`

No remediations found.
- [Recommended] Would you like to split into training and validation set? [Y/n]: Y


Your data will be written to a new JSONL file. Proceed [Y/n]: Y

Wrote modified files to `/Users/eric.pinzur/Documents/slackbot2000/next_message_user_cleaned_prepared_train (1).jsonl` and `/Users/eric.pinzur/Documents/slackbot2000/next_message_user_cleaned_prepared_valid (1).jsonl`
Feel free to take a look!

Now use that file when fine-tuning:
> openai api fine_tunes.create -t "/Users/eric.pinzur/Documents/slackbot2000/next_message_user_cleaned_prepared_train (1

## Data Validation Stage
* Open the `train` and `valid` files. 
* Check to make sure there are enough examples for each `user_num` in each file
* Ideally a minimum of 100 examples per user, per file


In [26]:
def get_completion_counts(file_path):
    """Count how often each completion occurs in a JSONL examples file.

    Args:
        file_path: Path to a file with one JSON object per line, each
            carrying a "completion" key.

    Returns:
        dict mapping each completion value to its number of occurrences,
        keyed in order of first appearance.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if an example has no "completion" key.
    """
    completion_counts = {}

    # `with` closes the file even when a line fails to parse.
    with open(file_path) as file:
        for training_example_line in file:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            if not training_example_line.strip():
                continue

            completion = json.loads(training_example_line)["completion"]
            completion_counts[completion] = completion_counts.get(completion, 0) + 1

    return completion_counts

# Tally completions per class in the split produced by the CLI tool.
# NOTE(review): the recorded tool output names files ending in ` (1).jsonl`,
# while the unsuffixed names are read here — confirm these are the intended files.
prepared_train_path = f'{work_dir}/next_message_user_cleaned_prepared_train.jsonl'
prepared_valid_path = f'{work_dir}/next_message_user_cleaned_prepared_valid.jsonl'

train_completion_counts = get_completion_counts(prepared_train_path)
valid_completion_counts = get_completion_counts(prepared_valid_path)

# heading and counts go on separate lines, one report per file
print("The training file has the following completion counts:", train_completion_counts, sep="\n")
print("\nThe validation file has the following completion counts:", valid_completion_counts, sep="\n")

The training file has the following completion counts:
{' 1': 6502, ' 5': 1082, ' 10': 2444, ' 3': 749, ' 6': 1979, ' 4': 3272, ' 2': 1407, ' 13': 380}

The validation file has the following completion counts:
{' 4': 208, ' 1': 349, ' 5': 56, ' 2': 81, ' 3': 37, ' 6': 121, ' 10': 122, ' 13': 26}


## Well, that ^^ is not ideal

* Need to split the files myself and re-check

In [27]:
# Randomly route each cleaned example to the validation file (about
# `validate_percent` % of lines) or to the training file (the rest).
validate_percent = 25
random_seed = 41

# fixed seed so the split is the same on every run
random.seed(random_seed)

# `with` closes all three files even if the loop is interrupted.
with open(f'{work_dir}/next_message_user_cleaned.jsonl', 'r') as file_in, \
        open(f'{work_dir}/next_message_user_cleaned_train.jsonl', 'w') as file_train, \
        open(f'{work_dir}/next_message_user_cleaned_valid.jsonl', 'w') as file_valid:
    for line in file_in:
        # randint is inclusive on both ends, giving 100 equally likely values
        if random.randint(0, 99) < validate_percent:
            file_valid.write(line)
        else:
            file_train.write(line)

In [28]:
## Recheck Files

# Tally completions per class in the manually split files and print one
# report per file.
train_completion_counts = get_completion_counts(
    f'{work_dir}/next_message_user_cleaned_train.jsonl'
)
valid_completion_counts = get_completion_counts(
    f'{work_dir}/next_message_user_cleaned_valid.jsonl'
)

report = [
    ("The training file has the following completion counts:", train_completion_counts),
    ("\nThe validation file has the following completion counts:", valid_completion_counts),
]
for heading, counts in report:
    print(heading)
    print(counts)

The training file has the following completion counts:
{' 1': 5093, ' 2': 1097, ' 3': 593, ' 4': 2622, ' 5': 859, ' 6': 1549, ' 10': 1904, ' 13': 304}

The validation file has the following completion counts:
{' 2': 391, ' 4': 858, ' 1': 1758, ' 6': 551, ' 5': 279, ' 3': 193, ' 10': 662, ' 13': 102}


## Fantastic, moving on to model training

* First need to upload the training & validation files to OpenAI

In [14]:
training_file_name = f'{work_dir}/next_message_user_cleaned_train.jsonl'
validation_file_name = f'{work_dir}/next_message_user_cleaned_valid.jsonl'

# Statuses at which polling stops.
upload_terminal_statuses = ("succeeded", "failed", "processed")

def check_status(training_id, validation_id):
    """Fetch and print the current status of both uploaded files.

    Args:
        training_id: File ID of the uploaded training file.
        validation_id: File ID of the uploaded validation file.

    Returns:
        Tuple `(train_status, valid_status)` of the two "status" values.
    """
    train_status = openai.File.retrieve(training_id)["status"]
    valid_status = openai.File.retrieve(validation_id)["status"]
    print(f'Status (training_file | validation_file): {train_status} | {valid_status}')
    return (train_status, valid_status)

# Upload the training and validation dataset files through the openai client.
# NOTE(review): `_get_or_upload` is a private CLI helper; the second argument
# is presumably its "check whether the file already exists" flag — confirm.
training_id = cli.FineTune._get_or_upload(training_file_name, True)
validation_id = cli.FineTune._get_or_upload(validation_file_name, True)

# Check on the upload status of the training and validation dataset files.
(train_status, valid_status) = check_status(training_id, validation_id)

# Poll and display the status once a second until both files have reached
# one of the terminal statuses.
while (train_status not in upload_terminal_statuses
        or valid_status not in upload_terminal_statuses):
    time.sleep(1)
    (train_status, valid_status) = check_status(training_id, validation_id)

# Stop here instead of letting a later cell start a fine-tune on a file that
# did not upload correctly.
if "failed" in (train_status, valid_status):
    raise RuntimeError(
        f'File upload failed (training_file | validation_file): '
        f'{train_status} | {valid_status}'
    )

Upload progress: 100%|██████████| 10.5M/10.5M [00:00<00:00, 10.1Git/s]


Uploaded file from /Users/eric.pinzur/Documents/slackbot2000/next_message_user_cleaned_train.jsonl: file-W2I8PmXypH8bzCPKAwcoXObW


Upload progress: 100%|██████████| 3.52M/3.52M [00:00<00:00, 3.56Git/s]


Uploaded file from /Users/eric.pinzur/Documents/slackbot2000/next_message_user_cleaned_valid.jsonl: file-sSSVNpj0IUAr1XUplVSpGrSG
Status (training_file | validation_file): uploaded | uploaded
Status (training_file | validation_file): uploaded | uploaded
Status (training_file | validation_file): uploaded | uploaded
Status (training_file | validation_file): processed | uploaded
Status (training_file | validation_file): processed | uploaded
Status (training_file | validation_file): processed | processed


## Start a fine-tuning Job

* `classification_n_classes` should match the number of classes (users) remaining after the cleanup stage, which is 8 here, not the total user count of 17.


In [15]:
# Define a fine-tune job that customizes the `ada` base model with a single
# pass (one epoch) over the training file. Classification metrics are
# requested as well, with the validation file supplied alongside.
# `classification_n_classes` is the number of distinct completions in the
# training data (8 after the cleanup stage).
create_args = dict(
    training_file=training_id,
    validation_file=validation_id,
    model="ada",
    compute_classification_metrics=True,
    n_epochs=1,
    classification_n_classes=8,
    suffix="next_message_user_full_kaskada",
)

# Submit the job, then keep its ID and initial status from the response.
resp = openai.FineTune.create(**create_args)
job_id, status = resp["id"], resp["status"]

# The job ID is what later cells use to monitor the job, which may take
# some time to start and complete.
print(f'Fine-tuning model with job ID: {job_id}')

Fine-tuning model with job ID: ft-kLSvHqK2gc0cF4PG61NjDYcL.


In [None]:
# Hard-coded ID of the fine-tune job reported by the creation cell's output.
# NOTE(review): presumably here so the cells below can be run without
# re-submitting the job (e.g. after a kernel restart) — update it whenever a
# new job is created.
job_id = "ft-kLSvHqK2gc0cF4PG61NjDYcL"

## Wait for the fine-tuning to start

* Note that it can take several hours for the job to move from the `pending` state

In [None]:
# Statuses after which the job will not change any more. `cancelled` is
# included so that a cancelled job does not leave this cell polling forever.
fine_tune_terminal_statuses = ("succeeded", "failed", "cancelled")

# Get the status of our fine-tune job.
status = openai.FineTune.retrieve(id=job_id)["status"]

# If the job isn't yet done, poll it every 5 seconds.
if status not in fine_tune_terminal_statuses:
    print(f'Job not in terminal status: {status}. Waiting.')
    while status not in fine_tune_terminal_statuses:
        time.sleep(5)
        status = openai.FineTune.retrieve(id=job_id)["status"]
        print(f'Status: {status}')

# Report the outcome whether or not we had to wait for it.
print(f'Fine-tune job {job_id} finished with status: {status}')


## Check fine-tuning events

* Lets us know specifics about the fine-tuning job
* Gives debug info on failure:
  Example from a previous attempt using a 1% random sample of the initial dataset before any cleaning:
  ```
  {
    "object": "fine-tune-event",
    "level": "info",
    "message": "Created fine-tune: ft-TUEyMRXlB2es1JaeILlCRH7i",
    "created_at": 1690925364
  }
  {
    "object": "fine-tune-event",
    "level": "error",
    "message": "Fine-tune failed. Errors:\nThe number of classes in file-GG6r3mCVJ6q4uEGsBWewYrq9 does not match the number of classes specified in the hyperparameters.\nThe number of classes in file-Pyd5oYAo0CkztKjiEvsSdzWO does not match the number of classes specified in the hyperparameters.",
    "created_at": 1690934403
  }
  ```

In [None]:
# Iterate over the events of our fine-tune job and print each one as it is
# yielded by the stream.
for fine_tune_event in openai.FineTune.stream_events(id=job_id):
    print(fine_tune_event)


# Look at Training Results

* download the results file(s)

In [32]:
# Download every result file attached to the fine-tune job into work_dir.
file_prefix = "next_message_user_results"

result = openai.FineTune.retrieve(id=job_id)

# Number the outputs with enumerate so that each result file gets its own
# name; a counter that never advances would write every file to `_0.csv`.
for file_index, result_file in enumerate(result["result_files"]):
    file_name = f'{work_dir}/{file_prefix}_{file_index}.csv'
    # binary mode: the downloaded content is written out unchanged
    with open(file_name, 'wb') as file:
        file.write(openai.File.download(id=result_file["id"]))
    print(f'Outputted results to: {file_name}')

Outputted results to: /Users/eric.pinzur/Documents/slackbot2000/next_message_user_results_0.csv
