## GPT Agent Swarm
This notebook uses the OpenAI API to create a group of Assistants, each of which builds a different type of regression model on a shared set of data. The predictions each model makes on the testing data are then compared by their R^2 values, alongside an "ensemble" approach that averages the predictions of all the models.

In [69]:
!pip install openai



In [70]:
import io
import json
import os
import time
from io import StringIO

import ipywidgets as widgets
import openai
import pandas as pd
from IPython.display import display
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

## Define Functions

def read_and_save_file(first_file_id, file_name):
    """Download a CSV file through the API client, save a local copy, and return it.

    Args:
        first_file_id: ID of the file to fetch via the module-level ``client``.
        file_name: Local path the parsed table is written to (without the index).

    Returns:
        The pandas DataFrame parsed from the downloaded bytes.

    Example:
        file = read_and_save_file(first_file_id, "analyst_output.csv")
    """
    # The content comes back as binary, so wrap the bytes in an in-memory
    # buffer that pandas can treat like an open file.
    raw_bytes = client.files.content(first_file_id).read()
    returned_data = pd.read_csv(io.BytesIO(raw_bytes))
    returned_data.to_csv(file_name, index=False)
    return returned_data
    
def files_from_messages(messages, asst_name):
    """Save every file attached to the first listed thread message as a CSV.

    Files are written as ``<asst_name>_output_<n>.csv`` with ``n`` counting
    from 1 in the order the file IDs appear on the message.

    Args:
        messages: Message listing whose ``data[0].file_ids`` holds the IDs to save.
        asst_name: Prefix used for the generated file names.
    """
    attached_ids = messages.data[0].file_ids
    print(attached_ids)
    for position, attached_id in enumerate(attached_ids, start=1):
        file_name = f"{asst_name}_output_{position}.csv"
        read_and_save_file(attached_id, file_name)
        print(f'saved {file_name}')
        
def spin_up(target, base_instructions, file_id, model="gpt-4-1106-preview"):
    """Create an assistant, open a thread, post the kick-off message and start a run.

    Args:
        target: Name of the target field; appended to the user message.
        base_instructions: System instructions given to the assistant.
        file_id: File ID(s) the assistant may read. Despite the singular name this
            is normally a list of IDs; a single ID string is also accepted and is
            wrapped in a list.
        model: Model name used for the assistant.

    Returns:
        Tuple ``(assistant, thread, run)`` as returned by the module-level ``client``.

    Example:
        assistant, thread, run = spin_up(n, base_instructions, file_id)
    """
    # The API argument is a list of IDs, so a lone string has to be wrapped.
    file_ids = [file_id] if isinstance(file_id, str) else list(file_id)

    # create assistant
    my_assistant = client.beta.assistants.create(
        instructions=base_instructions,
        name="agent",
        tools=[{"type": "code_interpreter"}],
        model=model,
        file_ids=file_ids)
    message_string = "Please execute your ACTIONS on the csv file, the target field is " + target
    # Create a Thread
    thread = client.beta.threads.create()
    # Add a Message to a Thread
    client.beta.threads.messages.create(
        thread_id=thread.id,
        role="user",
        content=message_string)
    # Run the Assistant
    run = client.beta.threads.runs.create(
        thread_id=thread.id,
        assistant_id=my_assistant.id)
    # The original had a print after this return; it could never execute and was removed.
    return my_assistant, thread, run
    
def catch_response(assistant, thread, run, timeout=0, poll_interval=5):
    """Check a run and, once it has completed, return the thread's messages.

    Args:
        assistant: Assistant that owns the run (not used; kept for the call signature).
        thread: Thread object; only ``thread.id`` is read.
        run: Run object; only ``run.id`` is read.
        timeout: Seconds to keep polling while the run is unfinished. The default
            of 0 performs a single status check.
        poll_interval: Seconds to sleep between status checks while polling.

    Returns:
        ``(messages, content)`` where ``content`` is the text of the first listed
        message that has text content (``None`` if no message has any).
        ``(None, None)`` when the run has not completed, so callers can always
        unpack the result.

    Example:
        messages, content = catch_response(assistant, thread, run)
    """
    # Statuses after which the run will not change any more.
    terminal_states = ('completed', 'failed', 'cancelled', 'expired')
    deadline = time.monotonic() + timeout
    while True:
        # Retrieve the run status
        run_status = client.beta.threads.runs.retrieve(
            thread_id=thread.id,
            run_id=run.id)
        print('Checking for response...')
        if run_status.status in terminal_states or time.monotonic() >= deadline:
            break
        time.sleep(poll_interval)

    if run_status.status != 'completed':
        if run_status.status in terminal_states:
            # Waiting longer will not help; make the failure visible.
            print(f'run ended with status {run_status.status}')
        else:
            print('no response yet')
        return None, None

    messages = client.beta.threads.messages.list(
        thread_id=thread.id)
    # Return the first message that carries text; skip image/file-only messages.
    for msg in messages.data:
        role = msg.role
        try:
            content = msg.content[0].text.value
        except (AttributeError, IndexError):
            # No .text on the first content part, or the message has no content parts.
            print(f"{role.capitalize()}: [Non-text content, possibly an image or other file type]")
            continue
        print(f"{role.capitalize()}: {content}")
        return messages, content
    return messages, None
    
def spin_down(my_assistant_id):
    """Delete an assistant and report the API's deletion response.

    Args:
        my_assistant_id: ID of the assistant to delete.

    Returns:
        The deletion response from the module-level ``client``. It is also printed,
        so callers do not need to print it again.

    Example:
        spin_down(my_assistant_id)
    """
    response = client.beta.assistants.delete(my_assistant_id)
    print(response)
    # Previously nothing was returned, so `response = spin_down(...)` always gave None.
    return response

def upload_csv(file_name):
    """Upload a local file for use by assistants and return its file ID.

    Args:
        file_name: Path of the file to upload.

    Returns:
        The ``id`` of the file object created by the module-level ``client``.
    """
    # Open inside a context manager so the handle is closed once the upload
    # call returns; the original left it open.
    with open(file_name, "rb") as csv_file:
        response = client.files.create(
            file=csv_file,
            purpose="assistants")
    print(response)
    return response.id

def delete_all_assistant_files():
    """Delete every file that the client lists for the current API key.

    Every entry returned by ``client.files.list()`` is deleted, one at a time,
    with progress printed along the way.
    """
    listing = client.files.list()
    # Collect the IDs up front so the count can be reported before deleting.
    ids_to_remove = []
    for entry in listing.data:
        ids_to_remove.append(entry.id)
    print(f'Deleting {len(ids_to_remove)} files.')
    for doomed_id in ids_to_remove:
        client.files.delete(doomed_id)
        print(f"Deleted file with ID: {doomed_id}")
    print('Finished deleting all files')

def create_dataframes_from_messages(messages, client):
    """Load each file attached to the first listed thread message as a DataFrame.

    Args:
        messages: Message listing whose ``data[0].file_ids`` holds the IDs to load.
        client: API client used to download the file contents.

    Returns:
        List with one pandas DataFrame per attached file, in file-ID order
        (empty when the message has no files).
    """
    def _fetch_frame(file_id):
        # Downloaded content is binary; buffer it in memory so pandas can parse it as CSV.
        payload = client.files.content(file_id).read()
        return pd.read_csv(io.BytesIO(payload))

    return [_fetch_frame(file_id) for file_id in messages.data[0].file_ids]

def consolidate_response_dfs(df_list):
    """Combine each agent's prediction table into one frame with an averaged column.

    Args:
        df_list: One entry per agent, each a (possibly empty) sequence of DataFrames.
            Only the first DataFrame of an entry is used; it must have the columns
            ``row_id``, ``actual_mpg`` and ``predicted_mpg``.

    Returns:
        DataFrame with ``row_id``, ``actual_mpg``, one ``predicted_mpg<k>`` column per
        non-empty agent (``k`` is the agent's 1-based position in ``df_list``) and
        ``average_prediction``. Rows missing any value are dropped.

    Raises:
        ValueError: If no entry in ``df_list`` contains a DataFrame.
    """
    actual_mpg = None
    predicted_mpg_df = None
    for i, df_tuple in enumerate(df_list):
        if not df_tuple:
            print(f'run {i} is empty')
            continue
        df = df_tuple[0]
        # Take the actual values from the first agent that returned a table.
        # Indexing df_list[0][0] directly, as before, fails when agent 0 returned none.
        if actual_mpg is None:
            actual_mpg = df[['row_id', 'actual_mpg']].drop_duplicates('row_id')
        # Rename 'predicted_mpg' so the column records which agent produced it.
        column = f'predicted_mpg{i+1}'
        predictions = df.rename(columns={'predicted_mpg': column})[['row_id', column]]
        # Seed the merge with the first non-empty table rather than with position 0.
        if predicted_mpg_df is None:
            predicted_mpg_df = predictions
        else:
            predicted_mpg_df = predicted_mpg_df.merge(predictions, on='row_id', how='outer')

    if actual_mpg is None:
        raise ValueError('no agent returned a predictions table')

    # Join the actual values with all predictions
    consolidated_df = actual_mpg.merge(predicted_mpg_df, on='row_id', how='outer')
    # Average the per-agent predictions into the "ensemble" column
    prediction_columns = [col for col in consolidated_df.columns if col.startswith('predicted_mpg')]
    consolidated_df['average_prediction'] = consolidated_df[prediction_columns].mean(axis=1)
    return consolidated_df.dropna()

def calculate_r2(df, predicted_column):
    """Return the R² score of one prediction column against ``actual_mpg``.

    Args:
        df: Table containing ``actual_mpg`` and the column named by ``predicted_column``.
        predicted_column: Name of the column holding the predictions to score.

    Returns:
        The value of ``r2_score(actual, predicted)``.
    """
    return r2_score(df['actual_mpg'], df[predicted_column])

## Initialize API Session

In [71]:
# Read the API key from the OPENAI_API_KEY environment variable so the secret never
# has to be typed into (and saved with) the notebook. The placeholder is only a
# fallback for the old edit-in-place workflow.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', '<your API key goes here>')

# Instantiate the OpenAI client
client = openai.OpenAI(api_key=OPENAI_API_KEY)

## Upload Data Set and Split

In [72]:
# Load the data, drop the free-text name column, and expose the original row
# position as an explicit 'row_id' so predictions can be joined back later.
df = (
    pd.read_csv('auto-mpg.csv')
    .drop(columns='car name')
    .reset_index()
    .rename(columns={'index': 'row_id'})
)
display(df)

# create training and testing files (fixed seed so the split is reproducible)
train_data, test_data = train_test_split(df, test_size=.25, random_state=42)
train_data.to_csv('auto-mpg-train.csv', index=False)
test_data.to_csv('auto-mpg-test.csv', index=False)

# read them back in to check exactly what the assistants will receive
train_df = pd.read_csv('auto-mpg-train.csv')
test_df = pd.read_csv('auto-mpg-test.csv')
print(train_df.tail())
print(test_df.tail())

# upload both files to Assistants
train_file_id = upload_csv('auto-mpg-train.csv')
test_file_id = upload_csv('auto-mpg-test.csv')
file_ids = [train_file_id, test_file_id]
print(file_ids)

print(f' Row count in training set: {len(train_df)}, row count in testing set: {len(test_df)}')

Unnamed: 0,row_id,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,0,18.0,8,307.0,130,3504,12.0,70,1
1,1,15.0,8,350.0,165,3693,11.5,70,1
2,2,18.0,8,318.0,150,3436,11.0,70,1
3,3,16.0,8,304.0,150,3433,12.0,70,1
4,4,17.0,8,302.0,140,3449,10.5,70,1
...,...,...,...,...,...,...,...,...,...
393,393,27.0,4,140.0,86,2790,15.6,82,1
394,394,44.0,4,97.0,52,2130,24.6,82,2
395,395,32.0,4,135.0,84,2295,11.6,82,1
396,396,28.0,4,120.0,79,2625,18.6,82,1


     row_id   mpg  cylinders  displacement horsepower  weight  acceleration  \
293      71  19.0          3          70.0         97    2330          13.5   
294     106  12.0          8         350.0        180    4499          12.5   
295     270  21.1          4         134.0         95    2515          14.8   
296     348  37.7          4          89.0         62    2050          17.3   
297     102  26.0          4          97.0         46    1950          21.0   

     model year  origin  
293          72       3  
294          73       1  
295          78       3  
296          81       3  
297          73       2  
    row_id   mpg  cylinders  displacement horsepower  weight  acceleration  \
95     378  38.0          4         105.0         63    2125          14.7   
96     371  29.0          4         135.0         84    2525          16.0   
97     280  21.5          6         231.0        115    3245          15.4   
98     323  27.9          4         156.0        105    2

## Create Instruction Strings

In [73]:
# The assistant instructions are assembled as
#   base_instructions_1 + <model type> + base_instructions_2
# so the text below is split exactly where the model name is inserted ("1. Train a <model type> model ...").
base_instructions_1 = '''
You are a data scientist who will build a predictive model with data from the provided training and testing csv files. 
When the user asks you to perform your ACTIONS, carry out the described ACTIONS on the provided files.
Then continue with each of the steps listed below in your ACTIONS. The user will identify the target variable. 

ACTIONS:

1. Train a ''' 

# One assistant is created per entry; the entry text is spliced into the instructions verbatim.
model_types = ['Linear Regression','Decision Tree Regression','Random Forest Regression','Support Vector Regression']

# Second half of the instructions. The column names requested here ('row_id', 'actual_mpg',
# 'predicted_mpg') are the ones consolidate_response_dfs and calculate_r2 read, so keep them in sync.
base_instructions_2 = ''' model with the training data.
2. Test the model using the testing data.
3. Create a table with three columns, one for row_id value called 'row_id', one for the actual mpg values in the testing data called 'actual_mpg' and one for the predicted mpg values called 'predicted_mpg'  
4. Prepare the table as a csv file for the user to download. 

DO NOT:
1. Return any images. '''


## Create the Swarm

In [74]:
# run a loop to create Agent swarm 
agents = []
for i in model_types:
    print(f'Creating assistant {i}')
    instructions = base_instructions_1 + i + base_instructions_2
    assistant, thread, run = spin_up("mpg", instructions, file_ids) 
    agents.append((assistant, thread, run))    

Creating assistant Linear Regression
Creating assistant Decision Tree Regression
Creating assistant Random Forest Regression
Creating assistant Support Vector Regression


In [75]:
# run a loop to catch the Agent responses
time.sleep(240) 
agent_responses = []
for assistant, thread, run in agents:
    messages, content = catch_response(assistant, thread, run) 
    agent_responses.append((messages, content))
    time.sleep(10) 

Checking for response...
Assistant: The table with 'row_id', 'actual_mpg', and 'predicted_mpg' values has been saved to a CSV file. You can download it using the link below:

[Download the predictions CSV file](sandbox:/mnt/data/mpg_predictions.csv)
Checking for response...
Assistant: The table with the actual and predicted `mpg` values has been saved as a CSV file. You can download it using the following link:

[Download mpg_predictions.csv](sandbox:/mnt/data/mpg_predictions.csv)
Checking for response...
Assistant: The results table has been saved as a CSV file. You can download it using the following link:

[Download the predicted mpg CSV file](sandbox:/mnt/data/mpg_predictions.csv)
Checking for response...
Assistant: The predictive model has been trained and tested. The table with actual and predicted mpg values has been prepared and saved to a csv file. You can download the results using the link below:

[Download predicted_mpg_results.csv](sandbox:/mnt/data/predicted_mpg_results.c

In [76]:
#extract dataframes and compile for accuracy
df_list = []
for messages, content in agent_responses:
    dataframes = create_dataframes_from_messages(messages, client)
    df_list.append(dataframes)

In [77]:
# Inspect the raw per-agent results: one inner list of DataFrames per agent, in the same order as `agents`.
df_list

[[    row_id  actual_mpg  predicted_mpg
  0      198        33.0      32.952238
  1      396        28.0      29.533315
  2       33        19.0      21.123169
  3      208        13.0      16.771163
  4       93        14.0      12.557398
  ..     ...         ...            ...
  95     378        38.0      32.482212
  96     371        29.0      30.093171
  97     280        21.5      23.479354
  98     323        27.9      26.616858
  99      75        14.0      12.778561
  
  [100 rows x 3 columns]],
 [    row_id  actual_mpg  predicted_mpg
  0      198        33.0           31.0
  1      396        28.0           26.6
  2       33        19.0           21.0
  3      208        13.0           15.0
  4       93        14.0           17.5
  ..     ...         ...            ...
  95     378        38.0           36.0
  96     371        29.0           27.2
  97     280        21.5           22.4
  98     323        27.9           23.6
  99      75        14.0           13.0
  
  [100 

In [78]:
# Merge every agent's predictions on row_id and add the averaged "ensemble" column.
consolidated_df = consolidate_response_dfs(df_list)
consolidated_df

Unnamed: 0,row_id,actual_mpg,predicted_mpg1,predicted_mpg2,predicted_mpg3,predicted_mpg4,average_prediction
0,198,33.0,32.952238,31.0,30.321,32.041218,31.578614
1,396,28.0,29.533315,26.6,29.974,28.737055,28.711092
2,33,19.0,21.123169,21.0,20.113,20.873603,20.777443
3,208,13.0,16.771163,15.0,14.977,15.330378,15.519635
4,93,14.0,12.557398,17.5,14.565,13.699613,14.580503
...,...,...,...,...,...,...,...
95,378,38.0,32.482212,36.0,36.674,32.043098,34.299828
96,371,29.0,30.093171,27.2,29.365,29.582122,29.060073
97,280,21.5,23.479354,22.4,21.336,20.944252,22.039901
98,323,27.9,26.616858,23.6,26.488,26.247537,25.738099


In [79]:
# Map each prediction column to a readable label by column name. The earlier version
# appended 'Ensemble' to model_types, which made this cell fail when re-run and
# would create an "Ensemble" agent if the swarm cell were run again.
prediction_labels = {f'predicted_mpg{k}': name for k, name in enumerate(model_types, start=1)}
prediction_labels['average_prediction'] = 'Ensemble'

# Score every column except the join key and the ground truth.
score_columns = [name for name in consolidated_df.columns if name not in ('row_id', 'actual_mpg')]
R_scores = [calculate_r2(consolidated_df, name) for name in score_columns]

# Create dictionary with list names as keys; labels are looked up per column so they
# stay aligned with the scores even if an agent returned no predictions.
data = {
    'model_types': [prediction_labels.get(name, name) for name in score_columns],
    'R_scores': R_scores,
}

# Creating the DataFrame
models_and_scores_df = pd.DataFrame(data)
display(models_and_scores_df)

Unnamed: 0,model_types,R_scores
0,Linear Regression,0.841901
1,Decision Tree Regression,0.757636
2,Random Forest Regression,0.889145
3,Support Vector Regression,0.873117
4,Ensemble,0.887872


## Clean Up Agents and Files

In [80]:
# Delete every assistant created for the swarm. spin_down prints the deletion
# response itself, so printing its result again only added noise.
for assistant, thread, run in agents:
    spin_down(assistant.id)

AssistantDeleted(id='asst_UmmORqTu5THwaXBXp7TqZ4mu', deleted=True, object='assistant.deleted')
None
AssistantDeleted(id='asst_fNGcsPgzaPDMgROnRzwnbtKB', deleted=True, object='assistant.deleted')
None
AssistantDeleted(id='asst_wgxQFwA6d0KB4FgiaLYmzugq', deleted=True, object='assistant.deleted')
None
AssistantDeleted(id='asst_fidwhVdxZmnsUqOXhQf5N4pA', deleted=True, object='assistant.deleted')
None


In [81]:
# NOTE: this removes every file returned by client.files.list() for this API key, not only the two CSVs uploaded above.
delete_all_assistant_files()

Deleting 2 files.
Deleted file with ID: file-dwa2A18ABLcgLR7ix2jOCvPH
Deleted file with ID: file-OuQnwoGzOUgd9aVo6C19uWi4
Finished deleting all files
