# This notebook uploads all the files in the af_knowledge_base directory to OpenAI and then creates a vector store and an assistant linked to that store.



# First upload all files to OpenAI and collect their IDs

In [1]:
import os
from openai import OpenAI

# Initialize the OpenAI client
import os
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
# Directory containing the files
directory_path = '/Users/adamhunter/Documents/becca things/mca_chatbot/mca_knowledge_base2'

# Function to upload a file to OpenAI and return the file ID
def upload_file(file_path):
    with open(file_path, 'rb') as file:
        response = client.files.create(
            file=file,
            purpose='assistants'
        )
        return response.id  # Access the 'id' attribute directly

file_ids = []
count = 0
for filename in os.listdir(directory_path):
    if count < 5000: # Count for safety, since you are charged by storage you dont want this to go wild
        file_path = os.path.join(directory_path, filename)
        if os.path.isfile(file_path):
            file_id = upload_file(file_path)
            file_ids.append(file_id)
            print(f"Uploaded {filename} with ID {file_id}")
            count += 1


Uploaded title_0250---chapter_0030---part_0020.md with ID file-lvZNkJFmiIXoEtHEnoOC7aBw
Uploaded title_0200---chapter_0060---part_0040.md with ID file-MX8HYXDtE16je9p7IsdPxPgh
Uploaded title_0870---chapter_0030---part_0030.md with ID file-ZZXocS9GdXyAeNa4lzG6lUKF
Uploaded title_0020---chapter_0090---part_0010.md with ID file-fYWmfCtCk3ofQsoYdU21kQCN
Uploaded title_0760---chapter_0250---part_0030.md with ID file-9tw7aRg3Krx3JTE2OzfUXGLJ
Uploaded title_0500---chapter_0610---part_0010.md with ID file-Xq6iadAMa37mWa3vnmbT5MYn
Uploaded title_0760---chapter_0010---part_0020.md with ID file-YokU99QI3nZb5kkERXr8O3hD
Uploaded title_0720---chapter_0260---part_0060.md with ID file-GvwCBj8ePdC9PHkKW56oxIGD
Uploaded title_0720---chapter_0020---part_0070.md with ID file-TZ8eMlpFbR1nfvQcQa6Oiwrv
Uploaded title_0070---chapter_0020---part_0480.md with ID file-qqo1waADuQb447TXWXWMBGth
Uploaded title_0530---chapter_0240---part_0020.md with ID file-wChkEJW7QzFNCMKFpth6y81m
Uploaded title_0700---chapter_00

# Create an empty vector store

In [2]:
# Create the vector store first without any files
vector_store = client.beta.vector_stores.create(
    name="granular_mca"
)
vector_store_id = vector_store.id
print("Vector store created with ID:", vector_store_id)


Vector store created with ID: vs_sKPakxN728GMzOMsfmoCVe2F


# Connect files to the vector store in batches

In [3]:
# The maximum number of file IDs allowed per batch
MAX_BATCH_SIZE = 100

# Function to create batches of file IDs
def create_batches(file_ids, batch_size):
    for i in range(0, len(file_ids), batch_size):
        yield file_ids[i:i + batch_size]

# Create and upload batches
for batch in create_batches(file_ids, MAX_BATCH_SIZE):
    vector_store_file_batch = client.beta.vector_stores.file_batches.create(
        vector_store_id=vector_store_id,
        file_ids=batch
    )
    print(f"Batch created with ID: {vector_store_file_batch.id}, Status: {vector_store_file_batch.status}")

print("All batches have been created and are being processed.")

Batch created with ID: vsfb_dc25798022f840e8abaaa55c94d1caa0, Status: in_progress
Batch created with ID: vsfb_fa67043ad927453398dc31dd2e61654f, Status: in_progress
Batch created with ID: vsfb_dd2e7d5883904c14a24b2194a3e57336, Status: in_progress
Batch created with ID: vsfb_53118929b6a047d68048b1ccb4c0eb24, Status: in_progress
Batch created with ID: vsfb_42cdee6f5a84402a9a6db04f2249e555, Status: in_progress
Batch created with ID: vsfb_a89dd80112fe4ef5b4ae0f2ce67f1b21, Status: in_progress
Batch created with ID: vsfb_3624c70d0df04034a19bbfd5602d78c6, Status: in_progress
Batch created with ID: vsfb_8b2511152ddf4216b4b3c51ef908218e, Status: in_progress
Batch created with ID: vsfb_a1d979ae19804ae7a29646d4d830bcd5, Status: in_progress
Batch created with ID: vsfb_f5040b426e2a4ffc9b48725ef1aa8fd2, Status: in_progress
Batch created with ID: vsfb_1d085f79d96849ec996d88d94d5936c5, Status: in_progress
Batch created with ID: vsfb_e872dbe415944cdfa98d8bc3bea4e148, Status: in_progress
Batch created wi

# Create an assistant linked to the vector store

In [4]:
# Create an assistant with the file_search tool linked to the vector store
my_assistant = client.beta.assistants.create(
    model="gpt-3.5-turbo-0125",
    name="AF Knowledge Base Assistant 3",
    instructions="""You answer user queries using your knowledge base of the actual freedom trust website. You are sure to avoid filling in gaps with your own reasoning and stick to what is in the files.""",
    tools=[{"type": "file_search"}],
    tool_resources={
        "file_search": {
            "vector_store_ids": [vector_store_id]
        }
    }
)

print("Assistant created:")
print(my_assistant)

Assistant created:
Assistant(id='asst_mZAeytPh7TSa4Y6g6QsEtRJd', created_at=1715025807, description=None, instructions='You answer user queries using your knowledge base of the actual freedom trust website. You are sure to avoid filling in gaps with your own reasoning and stick to what is in the files.', metadata={}, model='gpt-3.5-turbo-0125', name='AF Knowledge Base Assistant 3', object='assistant', tools=[FileSearchTool(type='file_search')], response_format='auto', temperature=1.0, tool_resources=ToolResources(code_interpreter=None, file_search=ToolResourcesFileSearch(vector_store_ids=['vs_fTUlICTQMr24XmQ4jnar3Atg'])), top_p=1.0)
