## Base Model Swarm
This is the second in a series of three notebooks for the ODSC presentation 'Harnessing GPT Assistants for Superior Model Ensembles: A Beginner's Guide to AI Stacked-Classifiers', ODSC East -- Jason Merwin

In [1]:
import openai
import time
import ipywidgets as widgets
from IPython.display import display
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
from io import StringIO
import io
import json
import warnings

from config import OPENAI_API_KEY

# Suppress every FutureWarning for the rest of the session; the filter is
# process-wide, so it affects all later cells, not just this one.
warnings.filterwarnings('ignore', category=FutureWarning)

# define functions

def delete_all_agents():
    ''' Deletes all existing Assistants reachable through the module-level ``client``.

    Prints how many assistants were found, one line per deleted assistant,
    and a final confirmation line. Returns None.
    '''
    # Iterate the list result instead of reading ``.data``: ``.data`` only holds
    # the first page (20 entries here), while iterating lets the SDK follow the
    # pagination cursor so that "all" really means all.
    # The IDs are collected before anything is deleted so that deletion never
    # happens while a page is still being walked.
    asst_ids = [asst.id for asst in client.beta.assistants.list(order="desc", limit=20)]
    print(f'Deleting {len(asst_ids)} assistants.')
    # Delete each assistant
    for asst_id in asst_ids:
        client.beta.assistants.delete(asst_id)
        print(f"Deleted assistant with ID: {asst_id}")
    print('Finished deleting all assistants')
    
def delete_all_assistant_files():
    ''' Removes every file listed by the module-level ``client``.

    Prints the number of files found, one line per deleted file, and a final
    confirmation line. Sleeps one second after each deletion. Returns None.
    '''
    # Snapshot the IDs first so the listing is not consulted again mid-deletion
    pending_ids = []
    for entry in client.files.list().data:
        pending_ids.append(entry.id)
    print(f'Deleting {len(pending_ids)} files.')

    # Remove the files one at a time, pausing between requests
    for current_id in pending_ids:
        client.files.delete(current_id)
        print(f"Deleted file with ID: {current_id}")
        time.sleep(1)

    print('Finished deleting all files')

def upload_csv(file_name):
    ''' Uploads a local file to the module-level ``client`` for use by Assistants.

    Args:
        file_name: path of the file to upload; it is opened in binary mode.

    Returns:
        The ``id`` attribute of the object returned by ``client.files.create``.
    '''
    # The context manager closes the handle as soon as the upload call returns
    # (or raises); a bare open() here would leave it open until garbage collection.
    with open(file_name, "rb") as csv_file:
        response = client.files.create(
            file=csv_file,
            purpose="assistants")
    print(response)
    file_id = response.id
    return file_id

def spin_up(target, base_instructions, file_id, model="gpt-4-turbo-preview"):
    ''' Creates one assistant, gives it a single user message, and starts a run.

    Args:
        target: name of the target field; appended to the user message.
        base_instructions: instruction text passed to the assistant.
        file_id: value forwarded unchanged as the assistant's ``file_ids``
            (the notebook passes a list of uploaded file IDs).
        model: model name for the assistant. Defaults to the value that was
            previously hard-coded, so existing calls behave the same.

    Returns:
        Tuple ``(assistant, thread, run)`` exactly as returned by the client.
        The run is only started here; it is not awaited.
    '''
    # create assistant
    my_assistant = client.beta.assistants.create(
        instructions=base_instructions,
        name="agent",
        tools=[{"type": "code_interpreter"}],
        model=model,
        file_ids=file_id)
    message_string = "Please execute your ACTIONS on the csv file, the target field is " + target
    # Create a Thread
    thread = client.beta.threads.create()
    # Add a Message to a Thread (the returned message object is not needed)
    client.beta.threads.messages.create(
        thread_id=thread.id,
        role="user",
        content=message_string)
    # Run the Assistant
    run = client.beta.threads.runs.create(
        thread_id=thread.id,
        assistant_id=my_assistant.id)
    # A print used to follow this return and could never execute; it was removed
    # rather than moved so the cell output stays unchanged.
    return my_assistant, thread, run
    
def catch_response(assistant, thread, run):
    ''' Checks a run once and, if it has completed, collects the thread's text messages.

    Args:
        assistant: accepted for call-site symmetry; not used by this function.
        thread: object whose ``id`` identifies the thread.
        run: object whose ``id`` identifies the run.

    Returns:
        ``(messages, contents)`` when the run status is ``'completed'``, where
        ``messages`` is the object returned by the message listing call and
        ``contents`` is the list of text values found in the first content
        part of each message. ``(None, None)`` in every other case.
    '''
    # Retrieve the run status
    run_status = client.beta.threads.runs.retrieve(
        thread_id=thread.id,
        run_id=run.id)
    print('########################')
    print('Checking for response...')
    # Handle None response
    if run_status is None:
        print("No response yet")
        return None, None  # Return a tuple of None values to match the expected return type
    # Handle non-completed response
    if run_status.status != 'completed':
        print("Response status is not 'completed'")
        return None, None

    # From here on the run is known to be completed.
    messages = client.beta.threads.messages.list(
        thread_id=thread.id)
    contents = []  # Text of every message that has a text part first
    for msg in messages.data:
        role = msg.role
        try:
            content = msg.content[0].text.value
        except (AttributeError, IndexError):
            # AttributeError: the first part has no .text (e.g. an image).
            # IndexError: the message has no content parts at all.
            print(f"{role.capitalize()}: [Non-text content, possibly an image or other file type]")
            continue
        print(f"{role.capitalize()}: {content}")
        contents.append(content)
    return messages, contents  # Return messages and a list of contents

def create_dataframes_from_messages(messages, client):
    ''' Loads the files attached to the first listed message as DataFrames.

    Args:
        messages: message listing with a ``data`` sequence, or None.
        client: API client used to download file content.

    Returns:
        A list with one DataFrame per downloadable file ID found on
        ``messages.data[0]``, in the order the IDs appear. Empty when there
        are no messages. Files whose content comes back as None are skipped.
    '''
    # Nothing to do without at least one message
    if messages is None or not messages.data:
        print("No messages data found.")
        return []

    frames = []
    # Only the first entry of the listing is inspected for attachments
    for attachment_id in messages.data[0].file_ids:
        payload = client.files.content(attachment_id)

        if payload is None:
            print(f"No content found for file_id: {attachment_id}")
            continue

        # Wrap the downloaded bytes so pandas can parse them as a CSV file
        frames.append(pd.read_csv(io.BytesIO(payload.read())))

    return frames

def calculate_model_accuracies(df, target=None):
    ''' Computes the accuracy of every probability column in ``df``.

    Args:
        df: DataFrame containing the label column and one or more columns
            whose name contains ``"_prob"``.
        target: name of the label column. When omitted, the module-level
            variable ``target`` is used, which is how this function behaved
            before the parameter existed.

    Returns:
        Dict mapping the part of each column name before ``"_prob"`` to the
        accuracy obtained by thresholding that column at 0.5.

    Raises:
        NameError: if ``target`` is omitted, no module-level ``target``
            exists, and there is at least one probability column to score.
    '''
    model_accuracy_dict = {}
    # Filter columns that contain probability predictions
    prediction_columns = [col for col in df.columns if "_prob" in col]

    if prediction_columns and target is None:
        # Backward compatibility: fall back to the notebook-level variable.
        try:
            target = globals()['target']
        except KeyError:
            raise NameError("name 'target' is not defined") from None

    for col in prediction_columns:
        # Assuming binary classification with 0.5 threshold; a missing
        # probability compares False and therefore becomes class 0.
        predicted_classes = (df[col] >= 0.5).astype(int)
        actual_classes = df[target]
        # Calculate accuracy
        accuracy = accuracy_score(actual_classes, predicted_classes)
        # Extract model name from column name
        model_name = col.split("_prob")[0]
        model_accuracy_dict[model_name] = accuracy

    return model_accuracy_dict

def calculate_model_metrics(df, target):
    ''' Scores every probability column in ``df`` against the label column.

    Args:
        df: DataFrame holding the label column and the columns whose name
            contains ``"_prob"``.
        target: name of the label column.

    Returns:
        Dict keyed by the part of each column name before ``"_prob"``; each
        value is a dict with the keys 'accuracy', 'precision', 'recall' and
        'f1_score', computed after thresholding the column at 0.5.
    '''
    report = {}

    for prob_col in df.columns:
        # Only probability columns are scored
        if "_prob" not in prob_col:
            continue

        # Binary decision at 0.5; anything that does not compare >= 0.5 is class 0
        hard_labels = df[prob_col].map(lambda p: int(p >= 0.5))
        truth = df[target]

        acc_value = accuracy_score(truth, hard_labels)
        prec_value = precision_score(truth, hard_labels, zero_division=0)
        rec_value = recall_score(truth, hard_labels, zero_division=0)
        f1_value = f1_score(truth, hard_labels, zero_division=0)

        # The text before "_prob" identifies the model
        prefix = prob_col.split("_prob")[0]
        report[prefix] = dict(
            accuracy=acc_value,
            precision=prec_value,
            recall=rec_value,
            f1_score=f1_value,
        )

    return report

# Initialize API Session

In [2]:
# Instantiate the OpenAI client
# The key comes from the local config module imported at the top of the notebook.
# Several helper functions defined earlier read this module-level `client`
# directly instead of taking it as an argument, so this cell must run before them.
client = openai.OpenAI(api_key=OPENAI_API_KEY)

# check training and testing data

In [3]:
# Name of the label column used throughout the rest of the notebook
target = 'Quality'

# Load the feature engineer output and expose the positional index as an
# explicit 'row_id' column so predictions can be joined back later.
#optional: use the original dataset instead
#encoded_train = pd.read_csv('pre_assistant_train.csv')
encoded_train = (
    pd.read_csv('feature_engineer_output_1b.csv')
    .reset_index()
    .rename(columns={'index': 'row_id'})
)
encoded_train

Unnamed: 0,row_id,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Sweetness*Ripeness,Size*Ripeness,Size*Acidity,Quality
0,0,-3.970049,-2.512336,5.346330,-1.012009,1.844900,0.329840,-0.491590,1.763432,-1.309480,1.951638,1
1,1,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.867530,-0.722809,3.178681,-1.036887,0.863914,1
2,2,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636,0.066118,0.011107,-0.765580,0
3,3,-0.657196,-2.271627,1.324874,-0.097875,3.637970,-3.413761,0.790723,-4.522803,2.243510,-0.519660,1
4,4,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984,0.501536,-1.778733,0.684815,1
...,...,...,...,...,...,...,...,...,...,...,...,...
3995,3995,0.059386,-1.067408,-3.714549,0.473052,1.697986,2.244055,0.137784,-8.335650,0.133266,0.008183,0
3996,3996,-0.293118,1.949253,-0.204020,-0.640196,0.024523,-1.087900,1.854235,0.221953,0.318883,-0.543510,1
3997,3997,-2.634515,-2.138247,-2.440461,0.657223,2.199709,4.763859,-1.334611,-11.626014,-12.550460,3.516054,0
3998,3998,-4.008004,-1.779337,2.366397,-0.200329,2.161435,0.214488,-2.229720,0.507565,-0.859670,8.936725,1


# Create the Swarm

In [4]:
# first make sure any existing bots and files are cleaned up
# NOTE: both calls act on everything the API key can see, not only on
# objects created by this notebook.
delete_all_agents()   
delete_all_assistant_files()

Deleting 1 assistants.
Deleted assistant with ID: asst_WXbgpKUFktsfMO84lXhz6NIQ
Finished deleting all assistants
Deleting 2 files.
Deleted file with ID: file-8grttuOZwzwJj3Vz94864D0A
Deleted file with ID: file-dpRMdyvBAbZU0So974wbXYAM
Finished deleting all files


In [6]:
#reserve 20% of training data to be used as "inference" data
# random_state is fixed so the same rows land in each split on every run.
train_set, val_set = train_test_split(encoded_train, test_size=0.2, random_state=42)

#save the files
# index=False keeps the pandas index out of the CSVs; row_id is already a regular column.
train_set.to_csv('encoded_train.csv', index=False)
val_set.to_csv('encoded_val.csv', index=False)

In [7]:
#define the model types here by description
# The first four characters of each name become the prediction column prefix
# (e.g. 'Logi_prob'), so the names must stay unique in their first four characters.
model_types = ['Logistic_Regression', 'DecisionTreeClassifier', 'KNeighborsClassifier', 'Random_Forest', 'Extra_Trees_Random_Forest', 'Support Vector Machine']

# Upload the two splits once; every assistant receives the same pair of files.
train_id = upload_csv('encoded_train.csv')
val_id = upload_csv('encoded_val.csv')
file_ids = [train_id, val_id]

# One (assistant, thread, run, model) tuple per base model
agents = []

for model in model_types:
    print(f'Creating {model} assistant')
    
    print('provided these files')
    print(file_ids)
    # The prompt text is kept exactly as written, including its spelling,
    # because it is what the assistants actually receive.
    instructions = f'''
    You are a data scientist who will build and test a predictive model with data from the provided csv file.
    This model will be base model for a stacked model ensemble, thus the predictions on the training data will be used as input for a meta model. 
    When the user asks you to perform your ACTIONS, carry out the described ACTIONS on the provided files.
    The target variable is '{target}'.
    There is an id column to be maintained, unaltered and returned in the output called "row_id". This column should be excluded when training the model.

    ACTIONS:

    1.The data has been prepared for training a {model} classification model to predict the target variable '{target}'.
    2.Split the training data in the file {train_id} into 5 K-folds for cross-validation. Each fold should serve once as a validation set while the remaining folds serve as training sets.
    3.Train a {model} classification model using default hyper-parameter values on each training set derived from the K folds, ensuring the target variable is '{target}'.
    4.For each fold, use the trained {model} to predict the '{target}' on its corresponding validation set. Ensure the predictions are probabilities.
    5.Compile the out-of-fold predictions into a single dataset. This dataset should include the 'row_id' from the testing set and the predicted probabilities. Name the columns as follows: 'row_id' and '{model[:4]}_prob'.
    6.Save this compiled dataset as a CSV file named 'out_of_fold_predictions.csv' and prepare it for the user to download. This file will be used for training the meta-model.
    7.Now use the trained models to score the validation data in the file {val_id} containing the same target column '{target}'. Average their scores for each row in the validation data and compile the results in the same way as before and prepare it for the user to download as a CSV file names 'valiation_predicitons.csv'
    8.Both tables should contain 2 columns: row_id and '{model[:4]}_prob'.
    9.Please only respond once, with both tables once they are ready for download.
    
    DO NOT:
    1. Do not return any images.
    2. Do not return any other tables besides the tables 'out_of_fold_predictions.csv' and 'valiation_predicitons.csv'
    3. Do not include row_id as a feature in the training of the model.
    4. Do not respond before both tables are ready for download.

    '''

    # spin up for each model type and store return object
    assistant, thread, run = spin_up(target, instructions, file_ids)
    agents.append((assistant, thread, run, model))
    print()
    # Short pause between assistant creations
    time.sleep(5)

FileObject(id='file-SZHbZR6fxxdD5Obv88BdcbGV', bytes=485371, created_at=1714062846, filename='encoded_train.csv', object='file', purpose='assistants', status='processed', status_details=None)
FileObject(id='file-D6VAYUYWFjY4unDFRTWvoXde', bytes=121451, created_at=1714062847, filename='encoded_val.csv', object='file', purpose='assistants', status='processed', status_details=None)
Creating Logistic_Regression assistant
provided these files
['file-SZHbZR6fxxdD5Obv88BdcbGV', 'file-D6VAYUYWFjY4unDFRTWvoXde']

Creating DecisionTreeClassifier assistant
provided these files
['file-SZHbZR6fxxdD5Obv88BdcbGV', 'file-D6VAYUYWFjY4unDFRTWvoXde']

Creating KNeighborsClassifier assistant
provided these files
['file-SZHbZR6fxxdD5Obv88BdcbGV', 'file-D6VAYUYWFjY4unDFRTWvoXde']

Creating Random_Forest assistant
provided these files
['file-SZHbZR6fxxdD5Obv88BdcbGV', 'file-D6VAYUYWFjY4unDFRTWvoXde']

Creating Extra_Trees_Random_Forest assistant
provided these files
['file-SZHbZR6fxxdD5Obv88BdcbGV', 'file-D6

# Catch the Responses

In [8]:
# run a loop to catch the Agent responses
# Instead of sleeping a fixed five minutes and checking each run exactly once,
# poll each run until it leaves the pending states or the shared time budget
# is used up. Runs that finish early are collected early, and a run that is
# still working when its turn comes gets the remaining budget.
MAX_WAIT_SECONDS = 300       # same overall budget the fixed sleep used to spend
POLL_INTERVAL_SECONDS = 5
# States in which waiting longer can still change the outcome; every other
# state is handed to catch_response, which reports anything but 'completed'.
PENDING_RUN_STATES = {'queued', 'in_progress', 'cancelling'}

deadline = time.monotonic() + MAX_WAIT_SECONDS

agent_responses = []
for assistant, thread, run, model in agents:
    while time.monotonic() < deadline:
        current_run = client.beta.threads.runs.retrieve(
            thread_id=thread.id,
            run_id=run.id)
        if current_run.status not in PENDING_RUN_STATES:
            break
        time.sleep(POLL_INTERVAL_SECONDS)

    # catch_response returns (None, None) when the run did not complete
    messages, content = catch_response(assistant, thread, run)
    agent_responses.append((messages, content, model, assistant))
    # Brief pause between assistants, as before
    time.sleep(5)

########################
Checking for response...
Assistant: Both tables are ready for download:

- For the out-of-fold predictions, download [out_of_fold_predictions.csv](sandbox:/mnt/data/out_of_fold_predictions.csv).
- For the validation predictions, download [validation_predictions.csv](sandbox:/mnt/data/validation_predictions.csv).
User: Please execute your ACTIONS on the csv file, the target field is Quality
########################
Checking for response...
Assistant: I have completed the requested actions and generated the two CSV files:

- The out-of-fold predictions from the training data: [Download out_of_fold_predictions.csv](sandbox:/mnt/data/out_of_fold_predictions.csv)
- The averaged scores for each row in the validation data: [Download validation_predictions.csv](sandbox:/mnt/data/validation_predictions.csv)
User: Please execute your ACTIONS on the csv file, the target field is Quality
########################
Checking for response...
Assistant: I have completed the task

In [9]:
#extract dataframes and compile
# Each df_list entry is [list of DataFrames parsed from the reply's files,
# model name, assistant id]. When a run did not complete, messages is None
# and the list of DataFrames is empty.
df_list = []
for messages, content, model, assistant in agent_responses:
    dataframes = create_dataframes_from_messages(messages, client)
    assistant_id = assistant.id
    df_list.append([dataframes, model, assistant_id])

In [10]:
# Capture the validation data scores
val_data_df_dict = {}   # model name -> validation prediction DataFrame
val_failures = []       # names of models that returned no usable file

# Loop through and capture validation data output
for item in df_list:
    model = item[1]
    try:
        # NOTE(review): the first returned file is assumed to be the validation
        # predictions; this relies on the order in which files are attached.
        df1 = pd.DataFrame(item[0][0])
    except IndexError:
        # The assistant attached no files (or its run never completed).
        # Only this expected case is caught so real bugs still surface.
        val_failures.append(model)
        continue

    print(df1)
    if 'row_id' not in df1.columns:
        df1 = df1.reset_index().rename(columns={'index': 'row_id'})
    # Key the DataFrame by the full model name
    val_data_df_dict[model] = df1

# Display failed data returns
print('assistants which failed to return a scored validation data dataframe:')
print(val_failures)

     row_id  Logi_prob
0       555   0.641120
1      3491   0.666901
2       527   0.603839
3      3925   0.524097
4      2989   0.015525
..      ...        ...
795    1922   0.358516
796     865   0.884039
797    3943   0.934317
798    1642   0.662319
799    2483   0.862121

[800 rows x 2 columns]
     row_id  Deci_prob
0       555        1.0
1      3491        0.0
2       527        1.0
3      3925        1.0
4      2989        0.0
..      ...        ...
795    1922        0.0
796     865        1.0
797    3943        1.0
798    1642        1.0
799    2483        1.0

[800 rows x 2 columns]
     row_id  KNei_prob
0       555       0.72
1      3491       0.76
2       527       0.68
3      3925       1.00
4      2989       0.00
..      ...        ...
795    1922       0.00
796     865       1.00
797    3943       1.00
798    1642       0.76
799    2483       0.96

[800 rows x 2 columns]
     row_id  Rand_prob
0       555      0.686
1      3491      0.862
2       527      0.570
3      3

In [11]:
# Capture the meta training data
test_data_df_dict = {}   # model name -> out-of-fold prediction DataFrame
test_failures = []       # names of models that returned fewer than two files

# Loop through and capture the out-of-fold (meta training) output
for item in df_list:
    model = item[1]
    try:
        # NOTE(review): the second returned file is assumed to be the
        # out-of-fold predictions; this relies on the order in which files are attached.
        df1 = pd.DataFrame(item[0][1])
    except IndexError:
        # Fewer than two files came back (or the run never completed).
        # Only this expected case is caught so real bugs still surface.
        test_failures.append(model)
        continue

    print(df1)
    if 'row_id' not in df1.columns:
        df1 = df1.reset_index().rename(columns={'index': 'row_id'})
    # Key the DataFrame by the full model name
    test_data_df_dict[model] = df1

# Display failed data returns
print('assistants which failed to return a scored training data dataframe:')
print(test_failures)

      row_id  Logi_prob
0          0   0.807077
1          1   0.888458
2          2   0.509400
3          3   0.707898
4          4   0.829310
...      ...        ...
3195    3994   0.907927
3196    3995   0.059556
3197    3996   0.587951
3198    3998   0.885267
3199    3999   0.639848

[3200 rows x 2 columns]
      row_id  Deci_prob
0       3994          0
1        423          1
2       2991          0
3       1221          0
4        506          1
...      ...        ...
3195    1130          1
3196    1294          0
3197     860          1
3198    3507          1
3199    3174          1

[3200 rows x 2 columns]
      row_id  KNei_prob
0       3994        1.0
1       1601        1.0
2       1779        1.0
3       3323        1.0
4       2059        0.6
...      ...        ...
3195    1685        0.4
3196    2135        0.6
3197    1482        0.0
3198    2169        0.4
3199    3174        1.0

[3200 rows x 2 columns]
      row_id  Rand_prob
0       3994       0.84
1       1601 

# Prepare Scored Training Data

In [12]:
# create a target df to join everything to
if not val_data_df_dict:
    raise ValueError('No assistant returned validation predictions; nothing to join.')

list_of_val_keys = list(val_data_df_dict.keys())
first_val_key = list_of_val_keys[0]
meta_val_data = val_data_df_dict[first_val_key]

# Join every remaining model's predictions onto the first one.
# Only models that returned a DataFrame are in the dictionary, so no separate
# failure check is needed here.
for key in list_of_val_keys[1:]:
    # Join with the initial DataFrame on 'row_id'; merge returns a new frame,
    # so the per-model DataFrames in the dictionary stay untouched.
    meta_val_data = meta_val_data.merge(val_data_df_dict[key], on='row_id', how='left')
    print(f'joined to {key}')

# add back label (uses the configured target column rather than a literal name)
val_label_df = encoded_train[['row_id', target]]
meta_val_data = meta_val_data.merge(val_label_df, on='row_id', how='left')

display(meta_val_data)

joined to DecisionTreeClassifier
joined to KNeighborsClassifier
joined to Random_Forest
joined to Extra_Trees_Random_Forest
joined to Support Vector Machine


Unnamed: 0,row_id,Logi_prob,Deci_prob,KNei_prob,Rand_prob,Extr_prob,Supp_prob,Quality
0,555,0.641120,1.0,0.72,0.686,0.644,0.829235,1
1,3491,0.666901,0.0,0.76,0.862,0.714,0.834491,0
2,527,0.603839,1.0,0.68,0.570,0.528,0.378980,0
3,3925,0.524097,1.0,1.00,0.732,0.788,0.936887,1
4,2989,0.015525,0.0,0.00,0.022,0.040,0.009707,0
...,...,...,...,...,...,...,...,...
795,1922,0.358516,0.0,0.00,0.266,0.272,0.278218,0
796,865,0.884039,1.0,1.00,0.982,0.960,0.974413,1
797,3943,0.934317,1.0,1.00,0.976,0.960,0.998720,1
798,1642,0.662319,1.0,0.76,0.772,0.728,0.868540,1


In [13]:
# create a target df to join everything to
if not test_data_df_dict:
    raise ValueError('No assistant returned out-of-fold predictions; nothing to join.')

list_of_keys = list(test_data_df_dict.keys())
first_key = list_of_keys[0]
meta_training_data = test_data_df_dict[first_key]

# Join every remaining model's predictions onto the first one.
# Only models that returned a DataFrame are in the dictionary, so no separate
# failure check is needed here.
for key in list_of_keys[1:]:
    # Join with the initial DataFrame on 'row_id'; merge returns a new frame,
    # so the per-model DataFrames in the dictionary stay untouched.
    meta_training_data = meta_training_data.merge(test_data_df_dict[key], on='row_id', how='left')
    print(f'joined to {key}')

# add back label (uses the configured target column rather than a literal name)
label_df = encoded_train[['row_id', target]]
meta_training_data = meta_training_data.merge(label_df, on='row_id', how='left')

display(meta_training_data)

joined to DecisionTreeClassifier
joined to KNeighborsClassifier
joined to Random_Forest
joined to Extra_Trees_Random_Forest
joined to Support Vector Machine


Unnamed: 0,row_id,Logi_prob,Deci_prob,KNei_prob,Rand_prob,Extr_prob,Supp_prob,Quality
0,0,0.807077,1,1.0,0.81,0.82,0.976517,1
1,1,0.888458,1,1.0,0.97,0.97,0.977460,1
2,2,0.509400,0,0.0,0.28,0.28,0.125335,0
3,3,0.707898,1,1.0,0.90,0.93,0.963616,1
4,4,0.829310,0,1.0,0.84,0.81,0.898430,1
...,...,...,...,...,...,...,...,...
3195,3994,0.907927,0,1.0,0.84,0.89,0.956977,1
3196,3995,0.059556,0,0.0,0.00,0.00,0.014631,0
3197,3996,0.587951,1,0.8,0.70,0.78,0.717308,1
3198,3998,0.885267,1,1.0,0.88,0.83,0.880914,1


In [None]:
# clean up join (if necessary)
# meta_training_data = meta_training_data.dropna()
# display(meta_training_data)

In [14]:
# save the meta training file
# Each frame is written to CSV and immediately read back, so the *_df
# variables hold exactly what a later reader of the files would load.
meta_training_data.to_csv('meta_train_df.csv', index=False)
meta_train_df = pd.read_csv('meta_train_df.csv')

# save the meta validation file (acting as inference data)
meta_val_data.to_csv('meta_val_df.csv', index=False)
meta_val_df = pd.read_csv('meta_val_df.csv')

In [15]:
# calculate model metrics
# Despite its name, accuracy_dict maps each model prefix to a dict holding
# accuracy, precision, recall and f1_score on the validation split.
accuracy_dict = calculate_model_metrics(meta_val_df, f'{target}')
accuracy_dict

{'Logi': {'accuracy': 0.79,
  'precision': 0.7783132530120482,
  'recall': 0.8095238095238095,
  'f1_score': 0.7936117936117937},
 'Deci': {'accuracy': 0.81375,
  'precision': 0.8272251308900523,
  'recall': 0.7919799498746867,
  'f1_score': 0.8092189500640204},
 'KNei': {'accuracy': 0.89,
  'precision': 0.8897243107769424,
  'recall': 0.8897243107769424,
  'f1_score': 0.8897243107769424},
 'Rand': {'accuracy': 0.905,
  'precision': 0.8968058968058968,
  'recall': 0.9147869674185464,
  'f1_score': 0.9057071960297767},
 'Extr': {'accuracy': 0.90875,
  'precision': 0.9034653465346535,
  'recall': 0.9147869674185464,
  'f1_score': 0.9090909090909092},
 'Supp': {'accuracy': 0.875,
  'precision': 0.8804071246819338,
  'recall': 0.8671679197994987,
  'f1_score': 0.8737373737373737}}

In [16]:
# Convert the dictionary to a DataFrame
# Columns are renamed by key rather than by position, so each label always
# matches its metric even if the order of the metrics changes, and an empty
# dictionary no longer fails on a column-count mismatch.
base_model_accuracy_df = (
    pd.DataFrame.from_dict(accuracy_dict, orient='index')
    .reset_index()
    .rename(columns={
        'index': 'Model',
        'accuracy': 'Accuracy_base',
        'precision': 'precision_base',
        'recall': 'recall_base',
        'f1_score': 'f1_score_base',
    })
)
base_model_accuracy_df.to_csv('base_model_accuracy.csv', index=False)
base_model_accuracy_df

Unnamed: 0,Model,Accuracy_base,precision_base,recall_base,f1_score_base
0,Logi,0.79,0.778313,0.809524,0.793612
1,Deci,0.81375,0.827225,0.79198,0.809219
2,KNei,0.89,0.889724,0.889724,0.889724
3,Rand,0.905,0.896806,0.914787,0.905707
4,Extr,0.90875,0.903465,0.914787,0.909091
5,Supp,0.875,0.880407,0.867168,0.873737
