# This notebook uploads all the files in the af_knowledge_base directory to OpenAI and then creates a vector store and an assistant linked to that store.



# First upload all files to OpenAI and collect their IDs

In [1]:
import os
from openai import OpenAI

# Initialize the OpenAI client
import os
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
# Directory containing the files
directory_path = '/Users/adamhunter/Documents/misc/actualism_chat/scrapy_spider/output_files'

# Function to upload a file to OpenAI and return the file ID
def upload_file(file_path):
    with open(file_path, 'rb') as file:
        response = client.files.create(
            file=file,
            purpose='assistants'
        )
        return response.id  # Access the 'id' attribute directly

file_ids = []
count = 0
for filename in os.listdir(directory_path):
    if count < 5000: # Count for safety, since you are charged by storage you dont want this to go wild
        file_path = os.path.join(directory_path, filename)
        if os.path.isfile(file_path):
            file_id = upload_file(file_path)
            file_ids.append(file_id)
            print(f"Uploaded {filename} with ID {file_id}")
            count += 1


Uploaded www.actualfreedom.com.au---richard---catalogue---psittacisms.htm.txt with ID file-YVMXDbpabMdSzQ0cZcQImpqR
Uploaded actualfreedom.com.au---richard---listbcorrespondence---listb07.htm.txt with ID file-j10dtasYvwOabjtuqDldgIVE
Uploaded actualfreedom.com.au---actualism---peter---list-c---pc-corr07.htm.txt with ID file-sfpBA1tNkHmbgwcus8wfiU2e
Uploaded actualfreedom.com.au---actualism---peter---selected-correspondence---corr-seth.htm.txt with ID file-hZXSyIccS30fKFFfQAV9M62l
Uploaded actualfreedom.com.au---richard---listbcorrespondence---listb17.htm.txt with ID file-nq3QE8xLdccKdwMWAU3pWgGj
Uploaded actualfreedom.com.au---actualism---peter---list-c---pc-corr17.htm.txt with ID file-TK6lXtuRqg4fuAaVTgC7Q7UF
Uploaded actualfreedom.com.au---richard---listafcorrespondence---listaf44g.htm.txt with ID file-wy5REBIDlACGCCThEUsRxVaZ
Uploaded www.actualfreedom.com.au---actualism---peter---selected-correspondence---corr-time.htm.txt with ID file-9yDGSy3iB64ijgmVRIRt7o07
Uploaded www.actualfr

# Create an empty vector store

In [2]:
# Create the vector store first without any files
vector_store = client.beta.vector_stores.create(
    name="AF Knowledge Base Vector Store 3"
)
vector_store_id = vector_store.id
print("Vector store created with ID:", vector_store_id)


Vector store created with ID: vs_gxuCAeLFq7nKVoo5MO6LXJLx


# Connect files to the vector store in batches

In [3]:
# The maximum number of file IDs allowed per batch
MAX_BATCH_SIZE = 100

# Function to create batches of file IDs
def create_batches(file_ids, batch_size):
    for i in range(0, len(file_ids), batch_size):
        yield file_ids[i:i + batch_size]

# Create and upload batches
for batch in create_batches(file_ids, MAX_BATCH_SIZE):
    vector_store_file_batch = client.beta.vector_stores.file_batches.create(
        vector_store_id=vector_store_id,
        file_ids=batch
    )
    print(f"Batch created with ID: {vector_store_file_batch.id}, Status: {vector_store_file_batch.status}")

print("All batches have been created and are being processed.")

Batch created with ID: vsfb_6cb95791c30346eda47bf679f7474833, Status: in_progress
Batch created with ID: vsfb_e748b53c9cf74c4a84fc2ac83cbb1377, Status: in_progress
Batch created with ID: vsfb_00a95c1c89e94243a807b35ca613f70a, Status: in_progress
Batch created with ID: vsfb_7fd8b6e2e6db42fd8b54e5a7ccd635c1, Status: in_progress
Batch created with ID: vsfb_b94081df6f654083be4b786825a11843, Status: in_progress
Batch created with ID: vsfb_74e9984074be42c3b2b710e8251c38c4, Status: in_progress
Batch created with ID: vsfb_ec125b031cf7477fa096ef5a7f49b1f2, Status: in_progress
Batch created with ID: vsfb_c110627201f347348878ce62a119cf4b, Status: in_progress
Batch created with ID: vsfb_d69eb90034864c4f9abfb8ec83fccabf, Status: in_progress
Batch created with ID: vsfb_b46f378aeae14c31b27ed93b6bcf1379, Status: in_progress
Batch created with ID: vsfb_df14e50e8f8440c18e483893033523ae, Status: in_progress
Batch created with ID: vsfb_d51f490777ff416990e9cae920434a04, Status: in_progress
Batch created wi

# Create an assistant linked to the vector store

In [4]:
# Create an assistant with the file_search tool linked to the vector store
my_assistant = client.beta.assistants.create(
    model="gpt-3.5-turbo-0125",
    name="AF Knowledge Base Assistant 3",
    instructions="""You answer user queries using your knowledge base of the actual freedom trust website. You are sure to avoid filling in gaps with your own reasoning and stick to what is in the files.""",
    tools=[{"type": "file_search"}],
    tool_resources={
        "file_search": {
            "vector_store_ids": [vector_store_id]
        }
    }
)

print("Assistant created:")
print(my_assistant)

Assistant created:
Assistant(id='asst_mZAeytPh7TSa4Y6g6QsEtRJd', created_at=1715025807, description=None, instructions='You answer user queries using your knowledge base of the actual freedom trust website. You are sure to avoid filling in gaps with your own reasoning and stick to what is in the files.', metadata={}, model='gpt-3.5-turbo-0125', name='AF Knowledge Base Assistant 3', object='assistant', tools=[FileSearchTool(type='file_search')], response_format='auto', temperature=1.0, tool_resources=ToolResources(code_interpreter=None, file_search=ToolResourcesFileSearch(vector_store_ids=['vs_fTUlICTQMr24XmQ4jnar3Atg'])), top_p=1.0)
