# This notebook uploads all the files in the af_knowledge_base directory to OpenAI and then creates a vector store and an assistant linked to that store.



# First upload all files to OpenAI and collect their IDs, I think this could be sped up a ton by using the 'batch' feature but haven't tried that yet for file uploading.



In [1]:
import os
from openai import OpenAI

# Initialize the OpenAI client
import os
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
# Directory containing the files
directory_path = '/Users/adamhunter/Documents/misc/actualism_chat/af_knowledge_base'

# Function to upload a file to OpenAI and return the file ID
def upload_file(file_path):
    with open(file_path, 'rb') as file:
        response = client.files.create(
            file=file,
            purpose='assistants'
        )
        return response.id  # Access the 'id' attribute directly

file_ids = []
count = 0
for filename in os.listdir(directory_path):
    if count < 1300: # Count for safety... but 1300 should be enough get all the .html files
        file_path = os.path.join(directory_path, filename)
        if os.path.isfile(file_path):
            file_id = upload_file(file_path)
            file_ids.append(file_id)
            print(f"Uploaded {filename} with ID {file_id}")
            count += 1


Uploaded richard---listdcorrespondence---listdrick.html with ID file-B9Wlgt1g2OYglZF9nxHu41fu
Uploaded actualism---peter---selected-correspondence---corr-compassion.html with ID file-sJgt2HiTyYBkgDkd7bVOHEaH
Uploaded richard---selectedcorrespondence---sc-science.html with ID file-HoQ4jl0cZsacOHJrSlX8iq6Q
Uploaded library---topics---i.html with ID file-l3ZVK2yucRrC4qkT6OkZyJdh
Uploaded sundry---frequentquestions---FAQ58.html with ID file-psYRXy72Yxsx0wg8aqgpx1m9
Uploaded actualism---vineeto---selected-correspondence---corr-serendipity.html with ID file-qJo00jgbHVCcuxzuv42rgWOY
Uploaded richard---selectedcorrespondence---sc-ego.html with ID file-NdDlQ571JvSf4UQNc5NrYMeW
Uploaded richard---selectedcorrespondence---sc-renedescartes.html with ID file-v2bgcTbsAC5imFT4ob872Gdi
Uploaded actualism---vineeto---selected-correspondence---corr-happy.html with ID file-jtuO52wZpBewmHFWwzDEW7L3
Uploaded actualism---peter---selected-correspondence---corr-belief2.html with ID file-oEajHZC4f8CT58qCA066f7

# Create an empty vector store

In [2]:
# Create the vector store first without any files
vector_store = client.beta.vector_stores.create(
    name="AF Knowledge Base Vector Store 2.0"
)
vector_store_id = vector_store.id
print("Vector store created with ID:", vector_store_id)


Vector store created with ID: vs_0TFpFogwGB85KgHKtvPeNz8s


# Connect files to the vector store in batches

In [3]:
# The maximum number of file IDs allowed per batch
MAX_BATCH_SIZE = 100

# Function to create batches of file IDs
def create_batches(file_ids, batch_size):
    for i in range(0, len(file_ids), batch_size):
        yield file_ids[i:i + batch_size]

# Create and upload batches
for batch in create_batches(file_ids, MAX_BATCH_SIZE):
    vector_store_file_batch = client.beta.vector_stores.file_batches.create(
        vector_store_id=vector_store_id,
        file_ids=batch
    )
    print(f"Batch created with ID: {vector_store_file_batch.id}, Status: {vector_store_file_batch.status}")

print("All batches have been created and are being processed.")

Batch created with ID: vsfb_564558327128406cb13ecd238c9a1106, Status: in_progress
Batch created with ID: vsfb_7922131c531e46308d8b1d1b68aa5ffe, Status: in_progress
Batch created with ID: vsfb_562e521aa2d448bbb12b32b47170c3e6, Status: in_progress
Batch created with ID: vsfb_daf6af4f4f834c6d9e5027864bf118c0, Status: in_progress
Batch created with ID: vsfb_5bdbfedd79c34ebb8d90ae84b97a6f2d, Status: in_progress
Batch created with ID: vsfb_23d3229e3ece4791a0ded65117606955, Status: in_progress
Batch created with ID: vsfb_258b811315c44ac09f202f7c63037bde, Status: in_progress
Batch created with ID: vsfb_fe496328d42c4f8aab6da71b177079d6, Status: in_progress
Batch created with ID: vsfb_a126a7929ff7411091e3cc2051e5f04c, Status: in_progress
Batch created with ID: vsfb_bc4bc42e6c6046c286526b17732d687f, Status: in_progress
Batch created with ID: vsfb_8cfc2568c37e4b51b3b56e65f662b2be, Status: in_progress
Batch created with ID: vsfb_6db8fb5b53f145d0ad54f87a61f52dd8, Status: in_progress
Batch created wi

# Create an assistant linked to the vector store

In [4]:
# Create an assistant with the file_search tool linked to the vector store
my_assistant = client.beta.assistants.create(
    model="gpt-3.5-turbo-0125",
    name="AF Knowledge Base Assistant 2.0",
    instructions="""You answer user queries using your knowledge base of the actual freedom trust website. You are sure to avoid filling in gaps with your own reasoning and stick to what is in the files.""",
    tools=[{"type": "file_search"}],
    tool_resources={
        "file_search": {
            "vector_store_ids": [vector_store_id]
        }
    }
)

print("Assistant created:")
print(my_assistant)

Assistant created:
Assistant(id='asst_CGsw3FBqnnszrWtygzXArN8e', created_at=1714947955, description=None, instructions='You answer user queries using your knowledge base of the actual freedom trust website. You are sure to avoid filling in gaps with your own reasoning and stick to what is in the files.', metadata={}, model='gpt-3.5-turbo-0125', name='AF Knowledge Base Assistant 2.0', object='assistant', tools=[FileSearchTool(type='file_search')], response_format='auto', temperature=1.0, tool_resources=ToolResources(code_interpreter=None, file_search=ToolResourcesFileSearch(vector_store_ids=['vs_0TFpFogwGB85KgHKtvPeNz8s'])), top_p=1.0)
