### Single Modeler
This notebook will create a GPT Assistant using OpenAI's API and provide it with the training dataframe returned by the data engineer Assistant and a set of instructions for creating an "Extra Trees" Random Forest. Basic outline of instructions for the modeler:
<br>
1. Load the provided dataframe into a pandas df.
2. Split the data set into training and testing using a 75:25 split.
3. Train an Extra Trees random forest with 2000 trees.
4. Use the testing data to measure the model's accuracy, precision, recall, and generate a confusion matrix.
5. Return the results in a single csv table. 

In [1]:
!pip install openai



In [2]:
import getpass
import io
import json
import os
import time
from io import StringIO

import openai
import pandas as pd
import ipywidgets as widgets
from IPython.display import display

## Define Functions

def read_and_save_file(first_file_id, file_name):
    """Fetch a file through the OpenAI client and store it locally as CSV.

    The content is requested with ``client.files.content`` (``client`` is the
    module-level OpenAI client), parsed by pandas as CSV, and written to
    ``file_name`` without the index column.

    Args:
        first_file_id: ID of the file to request from the API.
        file_name: Path the parsed table is written to.

    Returns:
        The DataFrame parsed from the file content.

    Example:
        file = read_and_save_file(first_file_id, "analyst_output.csv")
    """
    # The response is binary: read the raw bytes and wrap them in an
    # in-memory buffer so pandas can treat them like a file.
    raw_bytes = client.files.content(first_file_id).read()
    frame = pd.read_csv(io.BytesIO(raw_bytes))
    frame.to_csv(file_name, index=False)
    return frame
    
def files_from_messages(messages, asst_name):
    """Save every file attached to the first message of a message listing.

    Reads ``file_ids`` from ``messages.data[0]``, prints that list, and hands
    each ID to ``read_and_save_file`` together with a numbered file name of
    the form ``"<asst_name>_output_<n>.csv"`` (``n`` starts at 1).

    Args:
        messages: Object exposing ``data``, a sequence whose first element
            has a ``file_ids`` attribute.
        asst_name: Prefix used when building the output file names.

    Returns:
        None. Progress is reported through ``print``.
    """
    attached_ids = messages.data[0].file_ids
    print(attached_ids)
    # Number the files from 1 so the names read output_1, output_2, ...
    for position, attached_id in enumerate(attached_ids, start=1):
        target_name = f"{asst_name}_output_{position}.csv"
        read_and_save_file(attached_id, target_name)
        print(f'saved {target_name}')

## Initialize API Session

In [5]:
# set key and create the client
# Take the key from the OPENAI_API_KEY environment variable so the secret is
# never stored in the notebook; when the variable is unset, ask for it with a
# hidden prompt instead.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")

# Instantiate the OpenAI client
client = openai.OpenAI(api_key=OPENAI_API_KEY)

In [6]:
# load and check the file produced by the engineer
# Path of the engineer's CSV; the same path is opened again later when the
# file is uploaded for the assistant.
asst_file = 'engineer_output_1.csv'
# Read it locally so the table can be inspected in the notebook
df = pd.read_csv(asst_file)

display(df)

Unnamed: 0.1,Unnamed: 0,Class,Uniformity of Cell Shape,Bland Chromatin,Normal Nucleoli,Mitoses,Sample code number_div_Uniformity of Cell Shape,Sample code number_x_Single Epithelial Cell Size,Sample code number_div_Bare Nuclei,Sample code number_x_Normal Nucleoli,...,Single Epithelial Cell Size_div_Bare Nuclei,Single Epithelial Cell Size_x_Bland Chromatin,Single Epithelial Cell Size_div_Normal Nucleoli,Single Epithelial Cell Size_x_Mitoses,Bare Nuclei_x_Bland Chromatin,Bare Nuclei_div_Bland Chromatin,Bare Nuclei_div_Normal Nucleoli,Bare Nuclei_div_Mitoses,Bland Chromatin_div_Normal Nucleoli,Normal Nucleoli_div_Mitoses
0,0,2,1,3,1,1,1.000015e+06,2000050,1.000015e+06,1000025,...,1.999980,6,1.999980,2,3,0.333332,0.999990,0.999990,2.999970,0.999990
1,1,2,4,3,2,1,2.507356e+05,7020615,1.002944e+05,2005890,...,0.699999,21,3.499983,7,30,3.333322,4.999975,9.999900,1.499993,1.999980
2,2,2,1,3,1,1,1.015415e+06,2030850,5.077100e+05,1015425,...,0.999995,6,1.999980,2,6,0.666664,1.999980,1.999980,2.999970,0.999990
3,3,2,8,3,7,1,1.270345e+05,3048831,2.540686e+05,7113939,...,0.749998,9,0.428571,3,12,1.333329,0.571428,3.999960,0.428571,6.999930
4,4,2,1,3,1,1,1.017013e+06,2034046,1.017013e+06,1017023,...,1.999980,6,1.999980,2,3,0.333332,0.999990,0.999990,2.999970,0.999990
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
678,678,2,1,1,1,1,7.767072e+05,2330145,3.883556e+05,776715,...,1.499993,3,2.999970,3,2,1.999980,1.999980,1.999980,0.999990,0.999990
679,679,2,1,1,1,1,8.417606e+05,1683538,8.417606e+05,841769,...,1.999980,2,1.999980,2,1,0.999990,0.999990,0.999990,0.999990,0.999990
680,680,4,10,8,10,2,8.888191e+04,6221740,2.962723e+05,8888200,...,2.333326,56,0.699999,14,24,0.375000,0.300000,1.499993,0.799999,4.999975
681,681,4,6,10,6,1,1.495783e+05,2692413,2.243672e+05,5384826,...,0.749998,30,0.499999,3,40,0.400000,0.666666,3.999960,1.666664,5.999940


## Create the Modeler and Pass the csv File

In [7]:
# create the assistant and give it the CSV file

# System instructions for the modeler assistant. Exactly one CSV is attached
# below, so the prompt refers to a single file.
mls = '''
You are a data scientist who will build a predictive model with data from a csv file uploaded to your files. 
When the user asks you to perform your actions, use the csv file to read the data into a pandas dataframe.
Then continue with each of the steps listed below in you ACTIONS. The user will identify the target variable. 

ACTIONS:

1. Load the engineer_output_1 csv file into a pandas df of the same name.
2. Split the data set into training and testing data sets with a 25% split.
3. Train an Extra Trees random forest with 2000 trees
4. Use the testing data to measure the models accuracy, presicion, recall, and confusion matrix.
5. Format the testing data results as a csv table and prepare it for download by the user. 

DO NOT:
1. Return any images. 
'''

# send the csv file to the assistant purpose files
# The context manager closes the local file handle once the upload is done.
with open(asst_file, "rb") as csv_handle:
    response = client.files.create(
        file=csv_handle,
        purpose="assistants"
    )
print(response)
file_1_id = response.id

# Create the assistant with the code interpreter tool and attach the upload
my_assistant = client.beta.assistants.create(
    instructions=mls,
    name="modeler_1",
    tools=[{"type": "code_interpreter"}],
    model="gpt-4-1106-preview", # gpt-4
    file_ids=[file_1_id] # multiple files: file_ids=[file_1_id, file_2_id]
)

# get the file id
fileId = my_assistant.file_ids[0]
print(my_assistant)

FileObject(id='file-ZDNjzCQklyHBYOeaOMQveOLJ', bytes=301066, created_at=1700945014, filename='engineer_output_1.csv', object='file', purpose='assistants', status='processed', status_details=None)
Assistant(id='asst_oDqcpYNUA8ZHkGeHYZpRHjuy', created_at=1700945016, description=None, file_ids=['file-ZDNjzCQklyHBYOeaOMQveOLJ'], instructions='\nYou are a data scientist who will build a predictive model with data from two csv files uploaded to your files. \nWhen the user asks you to perform your actions, use the csv file to read the data into a pandas dataframe.\nThen continue with each of the steps listed below in you ACTIONS. The user will identify the target variable. \n\nACTIONS:\n\n1. Load the engineer_output_1 csv file into a pandas df of the same name.\n2. Split the data set into training and testing data sets with a 25% split.\n3. Train an Extra Trees random forest with 2000 trees\n4. Use the testing data to measure the models accuracy, presicion, recall, and confusion matrix.\n5. F

### Create the Message

In [8]:
# make the request to the assistant

message_string = "Please execute your ACTIONS on " + fileId + " and prepare the resulting table for csv download. The Target variable is Class"
print(message_string)

# Step 2: Create a Thread
thread = client.beta.threads.create()

# Step 3: Add a Message to a Thread
message = client.beta.threads.messages.create(
    thread_id=thread.id,
    role="user",
    content= message_string
)

# Step 4: Run the Assistant
run = client.beta.threads.runs.create(
    thread_id=thread.id,
    assistant_id=my_assistant.id
    #instructions="Overwrite hard-coded instructions here"
)

print(run.model_dump_json(indent=4))

# Seconds to wait between status checks
sec = 60
# Statuses from which a run can no longer reach 'completed'. Without this
# check the loop below would poll forever after a failed run.
RUN_FAILURE_STATUSES = ('failed', 'cancelled', 'expired', 'incomplete')

while True:
    # Wait in between tries
    time.sleep(sec)
    # Retrieve the run status
    run_status = client.beta.threads.runs.retrieve(
        thread_id=thread.id,
        run_id=run.id
    )
    print('One eternity later...')
    # Stop with a clear error when the run ended without completing
    if run_status.status in RUN_FAILURE_STATUSES:
        raise RuntimeError(
            f"Run {run.id} ended with status '{run_status.status}': "
            f"{getattr(run_status, 'last_error', None)}"
        )
    # If run is completed, get messages
    if run_status.status == 'completed':
        messages = client.beta.threads.messages.list(
            thread_id=thread.id
        )
        # Loop through messages and print content based on role
        for msg in messages.data:
            role = msg.role
            try:
                content = msg.content[0].text.value
                print(f"{role.capitalize()}: {content}")
            except AttributeError:
                # This will execute if .text does not exist
                print(f"{role.capitalize()}: [Non-text content, possibly an image or other file type]")
        break

Please execute your ACTIONS on file-ZDNjzCQklyHBYOeaOMQveOLJ and prepare the resulting table for csv download. The Target variable is Class
{
    "id": "run_NobOiu8bKN0TkwqFyjlde0YO",
    "assistant_id": "asst_oDqcpYNUA8ZHkGeHYZpRHjuy",
    "cancelled_at": null,
    "completed_at": null,
    "created_at": 1700945097,
    "expires_at": 1700945697,
    "failed_at": null,
    "file_ids": [
        "file-ZDNjzCQklyHBYOeaOMQveOLJ"
    ],
    "instructions": "\nYou are a data scientist who will build a predictive model with data from two csv files uploaded to your files. \nWhen the user asks you to perform your actions, use the csv file to read the data into a pandas dataframe.\nThen continue with each of the steps listed below in you ACTIONS. The user will identify the target variable. \n\nACTIONS:\n\n1. Load the engineer_output_1 csv file into a pandas df of the same name.\n2. Split the data set into training and testing data sets with a 25% split.\n3. Train an Extra Trees random forest wi

## Extract Files from Response

In [9]:
# Prefix for the saved file names (files are written as "<asst_name>_output_<n>.csv")
asst_name = 'modeler'
# Save the files attached to the first message of the listing retrieved above
files_from_messages(messages, asst_name)

['file-0A9BwTt2oJB4IJPUbSdmgpmV']
saved modeler_output_1.csv


In [10]:
# Load the CSV saved by the previous cell and display the modeler's results
df1 = pd.read_csv('modeler_output_1.csv')
display(df1)

Unnamed: 0,Metric,Score,Predicted_1,Predicted_2
0,Accuracy,0.976608,,
1,Precision,0.978139,,
2,Recall,0.973087,,
3,,,102.0,1.0
4,,,3.0,65.0


## Clean Up the Modeler

In [11]:
# Clean up the assistant

# Delete the assistant created earlier in this notebook and print the
# deletion response returned by the API.
# NOTE(review): the CSV uploaded with client.files.create (file_1_id) is not removed here — confirm whether it should be deleted as well.
response = client.beta.assistants.delete(my_assistant.id)
print(response)

AssistantDeleted(id='asst_oDqcpYNUA8ZHkGeHYZpRHjuy', deleted=True, object='assistant.deleted')
