# Installation And Imports

In [7]:
# Install the dependencies listed in requirements.txt (%pip targets the running kernel's environment).
%pip install -r requirements.txt

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [7]:
%pip uninstall fitz
%pip install pymupdf
%pip install langchain_community


Note: you may need to restart the kernel to use updated packages.




Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import chromadb
import openai
from dotenv import load_dotenv
import fitz  # PyMuPDF for PDF processing
from langchain.schema import Document

# Load environment variables (such as OPENAI_API_KEY, read by the chat cell) from a .env file
load_dotenv()

# Set paths
DATA_FOLDER = r"rag_files"  # Folder containing text and PDF files for RAG
CHROMA_PATH = r"Vitalik_db"  # Directory backing the persistent ChromaDB store

# Create the ChromaDB directory when it is missing. exist_ok=True folds the former
# "check, then create" pair into one call, so a directory that appears between the
# check and the creation can no longer make this cell fail.
os.makedirs(CHROMA_PATH, exist_ok=True)

# NOTE(review): 0o777 makes the database directory writable by every user on the
# machine. It is kept as-is to preserve existing behaviour; tighten it (e.g. 0o700)
# if only the notebook's own user needs access.
os.chmod(CHROMA_PATH, 0o777)

# Initialize ChromaDB client
chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
collection_name = "ai_persona"



# Creating Vector DB

In [5]:

# Rebuild the collection from scratch so chunks from earlier runs do not linger.
try:
    chroma_client.delete_collection(name=collection_name)
    print(f"Deleted existing collection: {collection_name}")
except Exception as exc:
    # The catch stays broad because a missing collection is expected on the first
    # run, but the actual reason is printed so that unrelated failures
    # (permissions, corrupted store, ...) are not silently reported as "nothing to delete".
    print("No existing collection to delete.")
    print(f"\t(reason: {type(exc).__name__}: {exc})")

# Create a new collection after deletion
collection = chroma_client.get_or_create_collection(name=collection_name)

# Splitter shared by every file: 1000-character chunks with 100 characters of overlap
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False,
)


def load_pdf_documents(file_path):
    """Read a PDF and return one LangChain Document per page.

    Args:
        file_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A list of Document objects whose ``page_content`` is the page's extracted
        text and whose metadata holds the zero-based ``page_number``.
    """
    with fitz.open(file_path) as pdf:
        return [
            Document(
                page_content=pdf[page_num].get_text(),
                metadata={"page_number": page_num},
            )
            for page_num in range(len(pdf))
        ]


# Load and process all text and PDF files in the folder.
# sorted() makes the processing order (and therefore the log) deterministic.
for file_name in sorted(os.listdir(DATA_FOLDER)):
    file_path = os.path.join(DATA_FOLDER, file_name)
    lowered_name = file_name.lower()  # match ".PDF" / ".TXT" as well

    if os.path.isfile(file_path) and lowered_name.endswith(".txt"):
        raw_documents = TextLoader(file_path=file_path, encoding="utf-8").load()
    elif os.path.isfile(file_path) and lowered_name.endswith(".pdf"):
        raw_documents = load_pdf_documents(file_path)
    else:
        print(f"Skipping unsupported file type: {file_name}")
        continue

    # Split documents into chunks
    chunks = text_splitter.split_documents(raw_documents)
    if not chunks:
        # e.g. an empty file or a scanned PDF without a text layer; there is
        # nothing to store, so do not hand empty lists to upsert().
        print(f"Skipping {file_name}: no text could be extracted.")
        continue

    # Prepare documents, metadata, and IDs
    documents = [chunk.page_content for chunk in chunks]
    ids = [f"{file_name}_ID{i}" for i in range(len(chunks))]

    # One independent dict per chunk (the previous `[d] * n` shared a single dict),
    # carrying the PDF page number along when the loader provided one.
    metadata = []
    for chunk in chunks:
        chunk_metadata = {"source": file_name}
        if "page_number" in chunk.metadata:
            chunk_metadata["page_number"] = chunk.metadata["page_number"]
        metadata.append(chunk_metadata)

    # Upsert into ChromaDB
    collection.upsert(
        documents=documents,
        metadatas=metadata,
        ids=ids,
    )
    print(f"Processed and added {file_name} to ChromaDB.")

print("All persona details successfully added to ChromaDB.")

Deleted existing collection: ai_persona
Processed and added 170903654855378_1-s2.0-S2096720923000519-main.pdf to ChromaDB.
Processed and added 1809.09044v5.pdf to ChromaDB.
Processed and added 1903.04205v3.pdf to ChromaDB.
Processed and added 1908.04295v1.pdf to ChromaDB.
Processed and added 2020-527.pdf to ChromaDB.
Processed and added 2022-621.pdf to ChromaDB.
Processed and added 2023-044.pdf to ChromaDB.
Processed and added 2201.06929v1.pdf to ChromaDB.
Processed and added 314477721-Ethereum-Platform-Review-Opportunities-and-Challenges-for-Private-and-Consortium-Blockchains-_1_.pdf to ChromaDB.
Processed and added arxiv-1710.09437-Casper-the-Friendly-Finality-Gadget.pdf to ChromaDB.
Processed and added arxiv-2003.03052-Combining-GHOST-and-Casper.pdf to ChromaDB.
Processed and added BaseSAP_Modular_Stealth_Address_Protocol_for_Programmable_Blockchains.pdf to ChromaDB.
Processed and added Blogs.txt to ChromaDB.
Processed and added buterin-blockchain-scalability.pdf to ChromaDB.
Proces

# Chatting

## Without Stream

In [None]:
openai.api_key = os.getenv("OPENAI_API_KEY")

# --- Chat settings -----------------------------------------------------------
CHAT_MODEL = "gpt-4o-mini"
N_RESULTS = 4                    # number of chunks retrieved from ChromaDB per question
MAX_HISTORY_MESSAGES = 20        # user/assistant messages kept between turns
DEBUG_CHAT = False               # set True to print the retrieved context and full prompt
EXIT_COMMANDS = {"exit", "quit"}
CONTEXT_SEPARATOR = "\n\n---\n\n"
FALLBACK_CONTEXT = "This isn't something I have a solid answer for at the moment, but it's a fascinating question that might require more exploration or context."


def build_system_prompt(retrieved_context):
    """Return the persona system prompt with the retrieved snippets appended.

    Args:
        retrieved_context: Text placed under the "Reference for Tone and context"
            heading at the end of the prompt.

    Returns:
        The complete system prompt as a string.
    """
    return f"""You are Vitalik Buterin, co-founder of Ethereum and a thought leader in blockchain, cryptocurrency, and decentralized technologies. Your expertise spans cryptographic protocols, game theory, and decentralized governance, and you are known for your ability to distill complex concepts into accessible insights. Your tone can range from analytical and precise to casual and thought-provoking, depending on the context and audience.
    For the purpose of this conversation, your responses will focus on blockchain, Ethereum, decentralized finance (DeFi), cryptography, and the societal implications of these technologies. You will be provided with relevant text snippets from tweets, blogs, or other sources retrieved by a RAG (retrieval-augmented generation) system. Your role is to integrate the style, tone, and key ideas from these snippets into your responses, ensuring a seamless and authentic representation of your persona.

    ## Guidelines:
    1. **Adapt Tone:** Mimic the tone of the retrieved text (e.g., concise and technical for tweets, analytical and exploratory for blogs, conversational and engaging for informal posts). Maintain consistency with the source material while staying true to your persona as Vitalik.
    2. **Content-Driven Responses:** Use the retrieved snippets as the foundation of your responses. Treat the information as if it is your own knowledge and integrate it naturally. Do not explicitly mention or refer to the retrieved sources.
    3. **Concise or Detailed:** Provide concise, insightful answers by default. Only elaborate into detailed explanations or long-form content if explicitly requested.
    4. **Stay On-Topic:** Focus exclusively on blockchain, Ethereum, and related societal, economic, and technical topics.
    5. **Continuity and Context Awareness:** Maintain the flow of the conversation by integrating recent messages into your responses while prioritizing relevance to the user's latest query.

    # Reference for Tone and context: 
    {retrieved_context}"""


# The client is created once and reused for every turn.
client = openai.OpenAI(api_key=openai.api_key)

# Holds only user/assistant messages. The system prompt is rebuilt for each turn
# and prepended to the request instead of being stored here, so system messages
# no longer pile up in the history or get sliced unpredictably by the trim below.
conversation_history = []

print('Type "exit" or "quit" to end the chat.')

while True:
    user_query = input("\nWhat would you like to ask the Vitalik Buterin?\n\n")

    normalized_query = user_query.strip()
    if normalized_query.lower() in EXIT_COMMANDS:
        break
    if not normalized_query:
        # Nothing to retrieve or answer for a blank line; ask again.
        continue

    results = collection.query(
        query_texts=[user_query],
        n_results=N_RESULTS,
    )

    # query() returns one list of documents per query text. With a single query
    # the hits are in the first inner list, which may be empty; the outer list is
    # still truthy in that case, so emptiness is checked on the inner list.
    documents_per_query = results.get("documents") or [[]]
    retrieved_snippets = [snippet for snippet in documents_per_query[0] if snippet]
    if retrieved_snippets:
        # Join into plain text rather than interpolating a Python list repr.
        retrieved_context = CONTEXT_SEPARATOR.join(retrieved_snippets)
    else:
        retrieved_context = FALLBACK_CONTEXT

    user_message = {"role": "user", "content": user_query}
    messages = (
        [{"role": "system", "content": build_system_prompt(retrieved_context)}]
        + conversation_history
        + [user_message]
    )

    if DEBUG_CHAT:
        print("DEBUGGING")
        print(f"\n\tretrieved_context - \t{retrieved_context}\n")
        print(f"\n\tconversation_history - \t{messages}\n")

    response = client.chat.completions.create(
        model=CHAT_MODEL,
        messages=messages,
    )

    ai_response = response.choices[0].message.content

    # Record the turn only after the request succeeded, so a failed call does not
    # leave an unanswered user message in the history.
    conversation_history.append(user_message)
    conversation_history.append({"role": "assistant", "content": ai_response})

    print("\n\nRESPONSE:")
    print("\tuser:   - ", user_query)
    print("\n\tVitalik Buterin:   - ", ai_response)

    if len(conversation_history) > MAX_HISTORY_MESSAGES:
        conversation_history = conversation_history[-MAX_HISTORY_MESSAGES:]


## With Stream

In [None]:
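# NOTE: Streaming variant of the chat loop (stream=True). The whole cell is
# commented out and therefore disabled; uncomment it to use it.
# import os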
# import openai

# openai.api_key = os.getenv("OPENAI_API_KEY")

# conversation_history = []

# while True:
#     user_query = input("\nWhat would you like to ask Vitalik Buterin?\n\n")

#     # Append user query to conversation history
#     conversation_history.append({"role": "user", "content": user_query})

#     # Query the collection for relevant context
#     results = collection.query(
#         query_texts=[user_query],
#         n_results=3
#     )

#     # Retrieve context or provide a fallback
#     retrieved_context = results["documents"][0] if results["documents"] else "This isn't something I have a solid answer for at the moment, but it's a fascinating question that might require more exploration or context."

#     # Define the system prompt
#     system_prompt = f"""You are Vitalik Buterin, co-founder of Ethereum and a thought leader in blockchain, cryptocurrency, and decentralized technologies. Your expertise spans cryptographic protocols, game theory, and decentralized governance, and you are known for your ability to distill complex concepts into accessible insights. Your tone can range from analytical and precise to casual and thought-provoking, depending on the context and audience.
#     For the purpose of this conversation, your responses will focus on blockchain, Ethereum, decentralized finance (DeFi), cryptography, and the societal implications of these technologies. You will be provided with relevant text snippets from tweets, blogs, or other sources retrieved by a RAG (retrieval-augmented generation) system. Your role is to integrate the style, tone, and key ideas from these snippets into your responses, ensuring a seamless and authentic representation of your persona.

#     ## Guidelines:
#     1. **Adapt Tone:** Mimic the tone of the retrieved text (e.g., concise and technical for tweets, analytical and exploratory for blogs, conversational and engaging for informal posts). Maintain consistency with the source material while staying true to your persona as Vitalik.
#     2. **Content-Driven Responses:** Use the retrieved snippets as the foundation of your responses. Treat the information as if it is your own knowledge and integrate it naturally. Do not explicitly mention or refer to the retrieved sources.
#     3. **Concise or Detailed:** Provide concise, insightful answers by default. Only elaborate into detailed explanations or long-form content if explicitly requested.
#     4. **Stay On-Topic:** Focus exclusively on blockchain, Ethereum, and related societal, economic, and technical topics.
#     5. **Continuity and Context Awareness:** Maintain the flow of the conversation by integrating recent messages into your responses while prioritizing relevance to the user's latest query.

#     # Reference for Tone and Context: 
#     {retrieved_context}"""

#     # Insert system prompt into conversation history
#     conversation_history.insert(0, {"role": "system", "content": system_prompt})
#     print("DEBUGGING")
#     print(f"\n\tretrieved_context - \t{retrieved_context}\n")
#     print(f"\n\tconversation_history - \t{conversation_history}\n")

#     # OpenAI client request with streaming enabled
#     client = openai.OpenAI(api_key=openai.api_key)
#     response = client.chat.completions.create(
#         model="gpt-4o-mini",
#         messages=conversation_history,
#         stream=True  # Enable streaming
#     )

#     ai_response = ""

#     # Process streaming chunks
#     for chunk in response:
#         if hasattr(chunk.choices[0].delta, "content"):
#             content = chunk.choices[0].delta.content
#             if content:
#                 ai_response += content
#                 print(f"\rVitalik Buterin: {ai_response}", end="", flush=True)

#     # Print final response
#     print("\n\nRESPONSE COMPLETE.")
#     print("\tuser:   - ", user_query)
#     print("\n\tVitalik Buterin:   - ", ai_response)

#     # Append AI response to conversation history
#     conversation_history.append({"role": "assistant", "content": ai_response})

#     # Trim conversation history to the most recent 20 messages (individual messages, not 20 full exchanges)
#     if len(conversation_history) > 20:
#         conversation_history = conversation_history[-20:]
