# Conversations

### For a set of recent messages in a "conversation", try to predict the set of users that might interact next.

A "conversation" is defined as either:
* All the messages in a thread
* A collection of messages from a single channel that occur in succession. If no response is made for 10 minutes, the conversation has ended. The next message outside this window is the start of a new conversation.


For each set of messages in a "conversation", build a user set from the following properties:
* users that reacted to the conversation
* users that participated in the conversation
* exclude the conversation starter 

In [None]:
!pip3 install --upgrade openai pip install file-read-backwards  -q

In [None]:
import json, openai, time, pandas, random, getpass
from openai import cli
from types import SimpleNamespace
from sklearn.model_selection import train_test_split
from file_read_backwards import FileReadBackwards

# Local directory holding the exported message data; the file paths used by
# the cells below are all built from it.
work_dir = "/Users/eric.pinzur/Documents/slackbot2000"
# Read the API key with getpass so it is typed in at run time instead of being
# hard-coded in the notebook.
openai.api_key = getpass.getpass(prompt="Please enter your OpenAI API Key")

## Additional Data Prep

Convert the original input data into a set of "conversations".

### Split the data into "threads" and "non-threads"

Using DuckDB:

```sql
copy(
    select * from 
    read_json_auto('messages.jsonl', format='newline_delimited') 
    where thread_ts is not null 
    order by channel, thread_ts, ts
) to 'message_threads.jsonl' (FORMAT JSON);

copy(
    select * from 
    read_json_auto('messages.jsonl', format='newline_delimited') 
    where thread_ts is null 
    order by channel, ts
) to 'message_non_threads.jsonl' (FORMAT JSON);
```

### Make threads from non-threads

For non-threads, artificially group the messages into "threads".  Collect messages from a channel.  If there is a gap of more than 10 minutes between messages, convert the message collection to a "thread" and export.




In [9]:
# Input: the non-threaded messages written by the DuckDB export above.
in_file = open(f'{work_dir}/message_non_threads.jsonl', 'r')
# Output: the same messages regrouped into synthetic threads. Both handles are
# closed at the end of this cell.
out_file = open(f'{work_dir}/message_non_threads_threads.jsonl', 'w')

def write_thread(thread):
    """Write one synthetic thread to ``out_file`` as JSON lines.

    The first message is tagged with ``reply_users`` (the distinct authors of
    the thread, in order of first appearance) and every message receives a
    ``thread_ts`` equal to the first message's ``ts``.  The message dicts are
    modified in place.  An empty thread writes nothing.
    """
    if not thread:
        return

    first = thread[0]
    # dict.fromkeys drops repeated authors while keeping first-seen order.
    first["reply_users"] = list(dict.fromkeys(msg["user"] for msg in thread))

    root_ts = first["ts"]
    for msg in thread:
        msg["thread_ts"] = root_ts
        out_file.write(json.dumps(msg) + "\n")
# Walk the input (ordered by channel, then ts) and cut it into synthetic
# threads.  A thread ends when the channel changes or when the gap to the
# previous message exceeds 600 -- i.e. 10 minutes, assuming `ts` is a numeric
# timestamp in seconds (TODO confirm against the export).
next_thread = []
current_channel = ""
last_msg_ts = None

# try/finally guarantees both files are closed even if a line fails to parse.
try:
    for line in in_file:
        message = json.loads(line)
        channel = message["channel"]
        output_thread = False

        # output if channel changes in the file
        if channel != current_channel:
            current_channel = channel
            output_thread = True

        # output if message timestamp is more than 10 mins beyond last message
        if last_msg_ts is not None and message["ts"] > last_msg_ts + 600:
            output_thread = True

        # output and reset (write_thread ignores the empty thread that the
        # very first message triggers)
        if output_thread:
            write_thread(next_thread)
            next_thread = []

        next_thread.append(message)
        last_msg_ts = message["ts"]

    # output final thread
    write_thread(next_thread)
finally:
    in_file.close()
    out_file.close()

### Make Conversations

Re-join the two files into a set of conversations, using duckDB:

```sql
copy(
    select * from 
    read_json_auto(['message_non_threads_threads.jsonl', 'message_threads.jsonl'], format='newline_delimited') 
    order by channel, thread_ts, ts
) to 'message_conversations.jsonl' (FORMAT JSON);
```

## Generate Examples

This is a Generative problem, so the goals for training aren't as strict.

Goals:
* prompt and completion length must not be longer than 2048 tokens

### Strategy

* Group messages into "conversations".
* Use a method to write examples for a "conversation"

In [14]:
# build a user map: user_id -> user_num, numbered from 1 in order of first
# appearance in the conversations file.
user_map = {}

# The context manager closes the file even if a line fails to parse.
with open(f'{work_dir}/message_conversations.jsonl', 'r') as in_file:
    for line in in_file:
        message = json.loads(line)
        user = message["user"]

        # create a user_id -> user_num map
        if user not in user_map:
            user_map[user] = len(user_map) + 1

# Total number of distinct users seen.
user_count = len(user_map)

In [21]:
# Input: the merged conversations file written by the DuckDB export above.
in_file = open(f'{work_dir}/message_conversations.jsonl', 'r')
# Output: one JSON object per line with "prompt" and "completion" keys. Both
# handles are closed at the end of this cell.
out_file = open(f'{work_dir}/conversation_user_examples.jsonl', 'w')

# gets a list of all users that reacted to a message
def get_reaction_users(message):
    """Return the user ids of everyone who reacted to ``message``.

    Users are listed once per reaction they added, in reaction order, so the
    result may contain duplicates.  A missing or null ``reactions`` field, or
    a reaction with a missing or null ``users`` list, contributes no users.
    """
    users = []
    for reaction in message.get("reactions") or []:
        users.extend(reaction.get("users") or [])
    return users

# Appended to the end of every prompt written by output_conversation_example.
prompt_separator = "\n\n###\n\n"
# Appended to the end of every completion; presumably serves as the stop
# marker when sampling from the fine-tuned model -- confirm at inference time.
completion_separator = " end"

def output_conversation_example(conversation):
    """Write one prompt/completion training example for ``conversation``.

    The prompt lists each message as `` {user} --> {text} ``; messages whose
    text is empty, missing (null) or contains a code fence are left out of
    the prompt.  The completion is the space-separated ``user_map`` numbers
    of everyone who posted in or reacted to the conversation, excluding the
    conversation starter, or ``nil`` when nobody else took part.

    Nothing is written when the conversation is empty or when none of its
    messages has usable text.  The example is appended to ``out_file`` as a
    single JSON line.
    """
    if not conversation:
        return

    initial_user = conversation[0]["user"]
    prompt_lines = []
    users = []
    for message in conversation:
        user = message["user"]
        # Authors and reactors count as participants even when the message
        # text itself is skipped below.
        users.append(user)
        users.extend(get_reaction_users(message))

        text = message["text"]
        # `not text` also covers a null text, which would otherwise crash
        # on the code-fence check.
        if not text or "```" in text:
            continue

        prompt_lines.append(f' {user} --> {text} ')

    if not prompt_lines:
        return

    prompt = "\n\n".join(prompt_lines)

    # De-duplicate (keeping first-seen order), drop the conversation starter,
    # and convert the remaining known user_ids to user_nums.
    user_nums = [
        str(user_map[user])
        for user in dict.fromkeys(users)
        if user != initial_user and user in user_map
    ]

    completion = " ".join(user_nums) if user_nums else "nil"

    example = {
        "prompt": f'start -> {prompt}{prompt_separator}',
        "completion": f' {completion}{completion_separator}',
    }
    out_file.write(json.dumps(example) + "\n")

           
# Walk the conversations file (ordered by channel, thread_ts, ts) and emit one
# training example per run of messages that share a channel and thread_ts.
current_channel = ""
current_conversation = []
conversation_ts = None

# try/finally guarantees both files are closed even if a line fails to parse.
try:
    for line in in_file:
        message = json.loads(line)
        channel = message["channel"]
        thread_ts = message["thread_ts"]
        output_convo = False

        # output if channel changes
        if channel != current_channel:
            current_channel = channel
            output_convo = True

        # output if different thread_ts
        if conversation_ts is not None and conversation_ts != thread_ts:
            output_convo = True

        # output and reset (an empty conversation is ignored by
        # output_conversation_example)
        if output_convo:
            output_conversation_example(current_conversation)
            current_conversation = []

        current_conversation.append(message)
        conversation_ts = thread_ts

    # output final conversation
    output_conversation_example(current_conversation)
finally:
    in_file.close()
    out_file.close()

## Data Verification & Split Stage

* make sure prompts end with same suffix
* remove too long examples
* remove duplicated examples

Note, we aren't doing classification, so don't start a fine-tune as suggested by the output


In [22]:
# Run the OpenAI CLI data-preparation step on the generated examples file.
# The CLI handler expects an argparse-style object, so a SimpleNamespace stands
# in for parsed arguments.  quiet=True presumably applies the suggested fixes
# without prompting -- confirm against the CLI's `-q` flag.
args = SimpleNamespace(file=f'{work_dir}/conversation_user_examples.jsonl', quiet=True)
cli.FineTune.prepare_data(args)

Analyzing...

- Your file contains 5295 prompt-completion pairs
- Based on your data it seems like you're trying to fine-tune a model for classification
- For classification, we recommend you try one of the faster and cheaper models, such as `ada`
- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for training
- There are 37 duplicated prompt-completion sets. These are rows: [740, 772, 925, 932, 935, 936, 957, 1016, 1079, 1136, 1446, 1646, 1716, 2869, 2903, 3099, 3200, 3225, 3318, 3424, 3438, 3877, 3904, 4218, 4254, 4263, 4264, 4323, 4422, 4505, 4662, 4829, 4843, 4885, 4996, 5020, 5054]
- There are 3 examples that are very long. These are rows: [146, 1906, 5035]
For conditional generation, and for classification the examples shouldn't be longer than 2048 tokens.
- All prompts end with suffix ` \n\n###\n\n`
- All prompts start with prefix `start ->  U`

Based on the analysis we will perform the following actions:
- [Rec

## Model Training Stage

* First, upload the training file to OpenAI (no validation file is used for this job)

In [23]:
# Training file to upload; presumably produced by the data-preparation step
# above -- confirm it exists before running this cell.
training_file_name = f'{work_dir}/conversation_user_examples_prepared_train.jsonl'

def check_status(training_id):
    """Look up the uploaded file ``training_id``, print its status and return it."""
    uploaded_file = openai.File.retrieve(training_id)
    train_status = uploaded_file["status"]
    print(f'Status (training_file): {train_status} ')
    return train_status

# Upload the training dataset file to OpenAI (no validation file is used).
training_id = cli.FineTune._get_or_upload(training_file_name, True)

# Check on the upload status of the training dataset file.
train_status = check_status(training_id)

# Poll and display the upload status once a second until the file has reached
# one of the statuses that ends the wait.
while train_status not in ("succeeded", "failed", "processed"):
    time.sleep(1)
    train_status = check_status(training_id)

Upload progress: 100%|██████████| 2.65M/2.65M [00:00<00:00, 375Mit/s]


Uploaded file from /Users/eric.pinzur/Documents/slackbot2000/conversation_user_examples_prepared_train.jsonl: file-g9e5FgUEE9jW7U0wNsRRPOOc
Status (training_file): uploaded 
Status (training_file): uploaded 
Status (training_file): uploaded 
Status (training_file): uploaded 
Status (training_file): processed 


## Start a fine-tuning Job

* no validation file since this is not a classification problem

In [24]:
# This defines a fine-tune job that creates a customized model based on davinci,
# with two passes (epochs) through the training data. No validation file is
# supplied, so the job is configured from the training file alone.
create_args = {
    "training_file": training_id,
    "model": "davinci",
    "n_epochs": 2,
    "learning_rate_multiplier": 0.02,
    "suffix": "coversation_users_full_kaskada"
}
# Create the fine-tune job and retrieve the job ID
# and status from the response.
resp = openai.FineTune.create(**create_args)
job_id = resp["id"]
status = resp["status"]

# You can use the job ID to monitor the status of the fine-tune job.
# The fine-tune job may take some time to start and complete.
print(f'Fine-tuning model with job ID: "{job_id}"')

Fine-tuning model with job ID: "ft-2AZO7nL3LW9AlEbuonurYOlR"


In [25]:
# Hard-code the job ID printed by the previous cell; presumably so the cells
# below can be re-run in a fresh session without creating a new job.
job_id = "ft-2AZO7nL3LW9AlEbuonurYOlR"

## Wait for the fine-tuning to start

* Note that it can take several hours for the job to move from the `pending` state

In [None]:
# Statuses after which the job will not change any more.  "cancelled" is
# included so that a cancelled job does not leave this cell polling forever.
terminal_statuses = ("succeeded", "failed", "cancelled")

# Get the status of our fine-tune job.
status = openai.FineTune.retrieve(id=job_id)["status"]

# If the job isn't yet done, poll it every 5 seconds.
if status not in terminal_statuses:
    print(f'Job not in terminal status: {status}. Waiting.')
    while status not in terminal_statuses:
        time.sleep(5)
        status = openai.FineTune.retrieve(id=job_id)["status"]
        print(f'Status: {status}')
else:
    print(f'Fine-tune job {job_id} finished with status: {status}')


## Check fine-tuning events

* Lets us know specifics about the fine-tuning job


In [None]:
# Get the events of our fine-tune job.
events = openai.FineTune.stream_events(id=job_id)

# Print each event as the stream yields it.
for event in events:
    print(event)

# Look at Training Results

* download the results file(s)

In [None]:
# Prefix for the downloaded result files; they are written to
# {work_dir}/{file_prefix}_{index}.csv.
file_prefix = "coversation_users"

result = openai.FineTune.retrieve(id=job_id)

# Number the files with enumerate so each result file gets its own name
# instead of every download overwriting the `_0` file.
for count, result_file in enumerate(result["result_files"]):
    file_name = f'{work_dir}/{file_prefix}_{count}.csv'
    # The context manager closes the file even if the download fails.
    with open(file_name, 'wb') as results_out:
        results_out.write(openai.File.download(id=result_file["id"]))
    print(f'Outputted results to: {file_name}')
