In [10]:
# !pip install pymongo langchain-groq

In [1]:
from pymongo import MongoClient
import os

from sentence_transformers import SentenceTransformer
from langchain_core.prompts import (
    ChatPromptTemplate,
    FewShotChatMessagePromptTemplate,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
atlas_connection_string = os.getenv('ATLAS_CONNECTION_STRING')
atlas_cluster_password = os.getenv('ATLAS_CLUSTER_PASSWORD')
MONGODB_ATLAS_CLUSTER_URI = atlas_connection_string.replace("<password>", atlas_cluster_password)

# initialize MongoDB python client
client = MongoClient(MONGODB_ATLAS_CLUSTER_URI)

In [3]:
DB_NAME = "the-communist-bot"
COLLECTION_NAME = "manifesto"
ATLAS_VECTOR_SEARCH_INDEX_NAME = "communist-manifesto"

MONGODB_COLLECTION = client[DB_NAME][COLLECTION_NAME]

In [4]:
def generate_embeddings(text: str) -> list[float]:
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    embeddings = model.encode(text)

    return embeddings.tolist()

In [5]:
def vector_search(user_query, collection):
    """
    Perform a vector search in the MongoDB collection based on the user query.

    Args:
    user_query (str): The user's query string.
    collection (MongoCollection): The MongoDB collection to search.

    Returns:
    list: A list of matching documents.
    """

    # Generate embedding for the user query
    query_embedding = generate_embeddings(user_query)

    if query_embedding is None:
        return "Invalid query or embedding generation failed."

    # Define the vector search pipeline
    pipeline = [
        {
            "$vectorSearch": {
                "index": "communist-manifesto",  # Search index name
                "queryVector": query_embedding,  # Embedding representation of the use query
                "path": "embedding",             # Document field containing the embeddings
                "numCandidates": 150,            # Number of candidate matches to consider (Limits on the number of results to return)
                "limit": 2,                      # Return top 2 matches
            }
        },
        {
            "$project": {
                "_id": 0,                                 # Exclude the _id field
                "documentID": 1,                          # Include the documentID field
                "file_path": 1,                           # Include the file_path field
                "file_name": 1,                           # Include the file_name field
                "page_number": 1,
                "content": 1,
                "embedding": 1,
                "score": {"$meta": "vectorSearchScore"},  # Include the search score
            }
        },
    ]

    # Execute the search
    results = collection.aggregate(pipeline)
    return list(results)

In [6]:
relevant_contexts = vector_search("Who are Proletarians?", MONGODB_COLLECTION)



In [7]:
relevant_contexts

[{'documentID': '7d6d0282-eec7-4b73-9f13-3c20d97c233c',
  'file_path': 'manifesto\\12-proletarians-and-communists.pdf',
  'file_name': '12-proletarians-and-communists.pdf',
  'page_number': '1',
  'content': 'II. Proletarians and Communists \nIn what relation do the Communists stand to the proletarians as a whole?  \nThe Communists do not form a separate party opposed to the other working -class parties.  \nThey have no interests separate and apart from those of  the proletariat as a whole.  \nThey do not set up any sectarian principles of their own, by which to shape and mould the \nproletarian movement.  \nThe Communists are distinguished from the other working- class parties by this only: 1. In the \nnational struggles of the proletarians of the different countries, they point out and bring to the \nfront the common interests of the entire proletariat, independently of all nationality. 2. In the \nvarious stages of development which the struggle of the working class against the bour

## Chatbot

In [8]:
groq_api_key = os.getenv('GROQ_API_KEY')

In [9]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from langchain_community.llms import Ollama

In [10]:
model = ChatGroq(model_name="mixtral-8x7b-32768")

In [11]:
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

In [25]:
# Define your desired data structure.
class ModelOutput(BaseModel):
    response: str = Field(description="the response to the query")
    documentID: list[str] = Field(description="documentIDs of documents used to form response")

In [26]:
# Set up a parser + inject instructions into the prompt template.
parser = JsonOutputParser(pydantic_object=ModelOutput)

In [33]:
# Creating the prompt
examples = [
    {"question": "Who is the current best F1 Driver?", 
     "context": "[{{'documentID': '<Document ID>', 'file_path': '<File Path>', 'file_name': '<File Name>', 'page_number': '<Page Number>', 'embedding': '<Embedding>', 'score': '<Score>'}}]",
     "answer": "{{'response': 'The current best F1 driver is Fernando Alonso', 'documentID': ['d381adfb-d0b6-46ef-8e00-1c44c90ac14b', 'd381adfb-d0b6-46ef-8e00-1c44c90ac39b']}}",
    },
    {"question": "Name four F1 Drivers", 
     "context": "[{{'documentID': '<Document ID>', 'file_path': '<File Path>', 'file_name': '<File Name>', 'page_number': '<Page Number>', 'embedding': '<Embedding>', 'score': '<Score>'}}]",
     "answer": "{{'response': 'Four F1 drivers are:\n Max Verstappen\n Fernando Alonso\n Lewis Hamilton\n Charles LeClark', 'documentID': ['d381thfb-d0b6-46ef-8e00-1c44c90ac14b', 'd381adfb-d0b6-47sf-8e00-1c44c90ac39b']}}",
    },
    {"question": "Who is the founder of Ferrari?", 
     "context": "[{{'documentID': '<Document ID>', 'file_path': '<File Path>', 'file_name': '<File Name>', 'page_number': '<Page Number>', 'embedding': '<Embedding>', 'score': '<Score>'}}]",
     "answer": "{{'response': 'I do not know the answer to your question', 'documentID': []}}",
    },
]

In [34]:
bot_instructions = """1. Respond with 'I do not know the answer to your question' if the relevant answer is not in the context
                      2. Do not form answers on your own. If an answer is not found on the context reply with 'I do not know the answer to your question'
                      3. Mention the documentIDs of the contexts where you fetched information from
                      4. REMEMBER not to form answers of your own
                      5. If the answer is not in the relevant context, keep citation as empty list
"""

In [35]:
# This is a prompt template used to format each individual example.
example_prompt = ChatPromptTemplate.from_messages(
    [
        ("human", "{question}\n{context}"),
        ("ai", "{answer}"),
    ]
)
few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt=example_prompt,
    examples=examples,
)

print(few_shot_prompt.format())

Human: Who is the current best F1 Driver?
[{{'documentID': '<Document ID>', 'file_path': '<File Path>', 'file_name': '<File Name>', 'page_number': '<Page Number>', 'embedding': '<Embedding>', 'score': '<Score>'}}]
AI: {{'response': 'The current best F1 driver is Fernando Alonso', 'documentID': ['d381adfb-d0b6-46ef-8e00-1c44c90ac14b', 'd381adfb-d0b6-46ef-8e00-1c44c90ac39b']}}
Human: Name four F1 Drivers
[{{'documentID': '<Document ID>', 'file_path': '<File Path>', 'file_name': '<File Name>', 'page_number': '<Page Number>', 'embedding': '<Embedding>', 'score': '<Score>'}}]
AI: {{'response': 'Four F1 drivers are:
 Max Verstappen
 Fernando Alonso
 Lewis Hamilton
 Charles LeClark', 'documentID': ['d381thfb-d0b6-46ef-8e00-1c44c90ac14b', 'd381adfb-d0b6-47sf-8e00-1c44c90ac39b']}}
Human: Who is the founder of Ferrari?
[{{'documentID': '<Document ID>', 'file_path': '<File Path>', 'file_name': '<File Name>', 'page_number': '<Page Number>', 'embedding': '<Embedding>', 'score': '<Score>'}}]
AI: {{'

In [36]:
final_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You're an expert AI assistant specializing in advanced question answering.\
         Generate answers STRICTLY following the below instructions: \n{bot_instructions}and for the answer as \n{format_instructions}"),
        few_shot_prompt,
        ("human", "{question}\n{context}"),
    ]
)

In [40]:
chain = final_prompt | model

chain.invoke({"question": "What are the principles of communism?", "context": relevant_contexts, "bot_instructions": bot_instructions, "format_instructions": parser.get_format_instructions()})

AIMessage(content="{{'response': 'The principles of communism include the abolition of private property, the idea that the proletariat must settle matters with its own bourgeoisie, and the belief that the proletarian movement is the self-conscious, independent movement of the immense majority, in the interest of the immense majority.', 'documentID': ['7d6d0282-eec7-4b73-9f13-3c20d97c233c', 'f93d0913-d16b-42c4-9373-8a6094e9f8f8']}}", response_metadata={'token_usage': {'completion_tokens': 147, 'prompt_tokens': 17935, 'total_tokens': 18082, 'completion_time': 0.234070274, 'prompt_time': 2.9977016, 'queue_time': None, 'total_time': 3.231771874}, 'model_name': 'mixtral-8x7b-32768', 'system_fingerprint': 'fp_c5f20b5bb1', 'finish_reason': 'stop', 'logprobs': None}, id='run-4d914998-d515-4318-84e1-ce3cc4f2b0d0-0')