# This notebook uploads all the files in the af_knowledge_base directory to OpenAI and then creates a vector store and an assistant linked to that store.



# First, upload all files to OpenAI and collect their IDs. This could probably be sped up a lot by using the 'batch' feature, but I haven't tried that yet for file uploading.



In [None]:
import os
from openai import OpenAI

# Initialize the OpenAI client. The API key is read from the OPENAI_API_KEY
# environment variable so that no secret is stored in the notebook itself.
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

# Directory containing the files to upload
directory_path = '/Users/adamhunter/Documents/misc/actualism_chat/af_knowledge_base'

# Function to upload a file to OpenAI and return the file ID
def upload_file(file_path):
    """Upload the file at ``file_path`` to OpenAI and return its file ID.

    The file is opened in binary mode and sent through
    ``client.files.create`` with ``purpose='assistants'``. The value
    returned is the ``id`` attribute of the response object.
    """
    with open(file_path, 'rb') as handle:
        uploaded = client.files.create(purpose='assistants', file=handle)
    # The handle is closed at this point; only the ID from the response is needed.
    return uploaded.id

# Upload every regular file in directory_path and collect the returned file IDs
# in file_ids, which the vector-store batching step consumes later on.

# Safety cap on the number of uploads; 1300 should be enough to get all the
# .html files.
MAX_UPLOADS = 1300

file_ids = []
count = 0
# os.listdir returns entries in arbitrary order; sorting makes the upload
# order (and which files fall under the cap) reproducible between runs.
for filename in sorted(os.listdir(directory_path)):
    if count >= MAX_UPLOADS:
        # Cap reached: stop instead of walking the rest of the listing.
        break
    if filename.startswith('.'):
        # Skip hidden files such as macOS's .DS_Store; they are not part of
        # the knowledge base.
        continue
    file_path = os.path.join(directory_path, filename)
    if not os.path.isfile(file_path):
        # Sub-directories and other non-file entries are not uploaded.
        continue
    file_id = upload_file(file_path)
    file_ids.append(file_id)
    print(f"Uploaded {filename} with ID {file_id}")
    count += 1


# Create an empty vector store

In [None]:
# Start with an empty vector store; files are attached to it in a later step.
vector_store = client.beta.vector_stores.create(name="AF Knowledge Base Vector Store")

# Keep the ID in its own variable, since the following cells refer to the store by ID.
vector_store_id = vector_store.id
print("Vector store created with ID:", vector_store_id)


# Connect files to the vector store in batches

In [None]:
# The maximum number of file IDs allowed per batch
MAX_BATCH_SIZE = 100

# Function to create batches of file IDs
def create_batches(file_ids, batch_size):
    """Yield consecutive slices of ``file_ids`` holding at most ``batch_size`` items.

    Args:
        file_ids: Sequence of file IDs; it must support ``len()`` and slicing.
        batch_size: Maximum number of items per batch; must be at least 1.

    Yields:
        Slices of ``file_ids`` in their original order. The last slice may be
        shorter than ``batch_size``; an empty ``file_ids`` yields nothing.

    Raises:
        ValueError: If ``batch_size`` is less than 1. As this is a generator,
            the error is raised when iteration starts.
    """
    # A negative step would make range() empty and silently produce no batches
    # (so no files would ever be attached); fail loudly instead.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")
    for start in range(0, len(file_ids), batch_size):
        yield file_ids[start:start + batch_size]

# Attach the uploaded files to the vector store, one batch of IDs at a time.
for batch in create_batches(file_ids, MAX_BATCH_SIZE):
    vector_store_file_batch = client.beta.vector_stores.file_batches.create(
        file_ids=batch,
        vector_store_id=vector_store_id,
    )
    # The status shown is the one reported in the creation response; this
    # loop does not wait for a batch to finish before starting the next one.
    print(f"Batch created with ID: {vector_store_file_batch.id}, Status: {vector_store_file_batch.status}")

print("All batches have been created and are being processed.")

# Create an assistant linked to the vector store

In [None]:
# Instructions given to the assistant; kept in a variable so the create call stays compact.
assistant_instructions = """You answer user queries using your knowledge base of the actual freedom trust website. You are sure to avoid filling in gaps with your own reasoning and stick to what is in the files."""

# Create the assistant with the file_search tool enabled.
my_assistant = client.beta.assistants.create(
    name="AF Knowledge Base Assistant",
    model="gpt-3.5-turbo-0125",
    instructions=assistant_instructions,
    tools=[{"type": "file_search"}],
    # Link file_search to the vector store identified by vector_store_id.
    tool_resources={"file_search": {"vector_store_ids": [vector_store_id]}},
)

print("Assistant created:")
print(my_assistant)