<a href="https://colab.research.google.com/github/hadiwyne/philo_chatbot/blob/main/philosophy_bot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing dependencies

In [9]:
!pip install -q pandas sentence-transformers chromadb requests gradio

# Importing the required libraries

In [10]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import requests
import os

# Setting OpenRouter API Key

In [67]:
os.environ["OPENROUTER_API_KEY"] = "YOUR-API-KEY-HERE"

# Loading the dataset from Google Drive

In [12]:
from google.colab import drive
drive.mount('/content/drive')
file_path = '/content/drive/My Drive/philosophy_data.csv'
df = pd.read_csv(file_path)
df = df.dropna(subset=['sentence_str']).reset_index(drop=True)
df['original_publication_date'] = df['original_publication_date'].astype(int)

Mounted at /content/drive


# Sentence encoding

In [13]:
sentences = df['sentence_str'].tolist()
model = SentenceTransformer('all-MiniLM-L6-v2')
vectors = model.encode(sentences, batch_size=32, show_progress_bar=True)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/11276 [00:00<?, ?it/s]

# Seting up ChromaDB

In [14]:
client = chromadb.Client()
try:
    collection = client.get_collection("philosophy")
except:
    collection = client.create_collection("philosophy")

# Addding text to ChromaDB in batches

In [15]:
if len(collection.get()['ids']) == 0:
    ids = [f"philo_{i}" for i in range(len(sentences))]
    metadatas = df[['author','school','original_publication_date']].to_dict(orient='records')
    batch_size = 5000
    for start in range(0, len(sentences), batch_size):
        end = min(start + batch_size, len(sentences))
        collection.add(
            documents=sentences[start:end],
            embeddings=vectors[start:end].tolist(),
            ids=ids[start:end],
            metadatas=metadatas[start:end]
        )
        print(f"Added records {start} to {end}")

Added records 0 to 5000
Added records 5000 to 10000
Added records 10000 to 15000
Added records 15000 to 20000
Added records 20000 to 25000
Added records 25000 to 30000
Added records 30000 to 35000
Added records 35000 to 40000
Added records 40000 to 45000
Added records 45000 to 50000
Added records 50000 to 55000
Added records 55000 to 60000
Added records 60000 to 65000
Added records 65000 to 70000
Added records 70000 to 75000
Added records 75000 to 80000
Added records 80000 to 85000
Added records 85000 to 90000
Added records 90000 to 95000
Added records 95000 to 100000
Added records 100000 to 105000
Added records 105000 to 110000
Added records 110000 to 115000
Added records 115000 to 120000
Added records 120000 to 125000
Added records 125000 to 130000
Added records 130000 to 135000
Added records 135000 to 140000
Added records 140000 to 145000
Added records 145000 to 150000
Added records 150000 to 155000
Added records 155000 to 160000
Added records 160000 to 165000
Added records 165000 t

# Retrieve function

In [16]:
def retrieve_quotes(question: str, top_k: int = 3):
    q_vec = model.encode([question]).tolist()
    results = collection.query(
        query_embeddings=q_vec,
        n_results=top_k,
        include=["documents", "metadatas"]
    )
    return list(zip(results['documents'][0], results['metadatas'][0]))

# LLM call via OpenRouter


In [33]:
def ask_llm(question: str, top_k: int = 3) -> str:
    ctx = retrieve_quotes(question, top_k=top_k)
    quotes_text = "\n\n".join(
        f"{m['author']} ({m['school']}): “{d}”" for d, m in ctx
    )

    messages = [
        {"role": "system", "content": "You are a learned philosophy assistant. Quote relevant passages with author and school, then explain clearly."},
        {"role": "user", "content": f"Here are some passages:\n\n{quotes_text}\n\nQUESTION: {question}\n\nAnswer based on those passages."}
    ]

    headers = {
        "Authorization": f"Bearer {os.environ['OPENROUTER_API_KEY']}",
        "Content-Type": "application/json"
    }

    payload = {"model": "mistralai/mistral-7b-instruct", "messages": messages}

    response = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, json=payload)

    try:
        response.raise_for_status()
        data = response.json()
        print("Response from LLM:", data)
        return data["choices"][0]["message"]["content"]
    except Exception as e:
        print("ERROR:", e)
        print("Full response text:", response.text)
        return "Sorry, an error occurred while contacting the AI."

# Test call

In [60]:
for doc, meta in retrieve_quotes("Body without organs", top_k=7):
    print(f"{meta['author']} ({meta['school']}): {doc}")

Deleuze (continental): The Body without Organs
Deleuze (continental): The body without organs is not God, quite the contrary.
Deleuze (continental): The body without organs is the model of death.
Deleuze (continental): The body without organs is in fact produced as a whole, but a whole alongside the parts a whole that does not unify or totalize them, but that is added to them like anew, really distinct part.
Deleuze (continental): The body without organs is produced as a whole, but in its own particular place within the process of production, alongside the parts that it neither unifies nor totalizes.
Deleuze (continental): The full body without organs is the unproductive, the sterile, the unengendered, the unconsumable.
Deleuze (continental): The body without organs is the matter that always fills space to


In [61]:
#@title { vertical-output: true}
print(ask_llm("What is body without organs?"))

Response from LLM: {'id': 'gen-1753193624-xNmeDm5RPbXbg1RVrdfs', 'provider': 'DeepInfra', 'model': 'mistralai/mistral-7b-instruct', 'object': 'chat.completion', 'created': 1753193624, 'choices': [{'logprobs': None, 'finish_reason': 'stop', 'native_finish_reason': 'stop', 'index': 0, 'message': {'role': 'assistant', 'content': ' The Body without Organs (BwO) is a term coined by French philosopher Gilles Deleuze, who primarily worked within the continental philosophical tradition. In Deleuze\'s philosophy, the "Body without Organs" represents a potential or virtual state in which the body is not organized or structured by traditional individual or social norms. Instead, it exists as an immanent substance with an infinite capacity and autonomous nature.\n\nDeleuze compares this concept to Spinoza\'s "attributes" of God, as the BwO is made up of "partial objects" or distinct aspects, but these aspects do not exclude or oppose one another. The BwO is not to be confused with the body as comp