In [None]:
! pip install pinecone-client pymongo transformers sentence_transformers

In [None]:
from pymongo.mongo_client import MongoClient
from pinecone import Pinecone
import pinecone
# The class exported by sentence_transformers is ``SentenceTransformer`` (singular);
# importing ``SentenceTransformers`` raises ImportError.
from sentence_transformers import SentenceTransformer, util

# Backward-compatible alias for code that refers to the class by the old misspelled name.
SentenceTransformers = SentenceTransformer

In [None]:
import os

# Read the connection string from the environment so real credentials never live in
# the notebook; the placeholder below is only used when MONGODB_URI is not set.
uri = os.environ.get(
    "MONGODB_URI",
    "mongodb+srv://password:password@cluster0.u141hkk.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0",
)

# Create a new client and connect to the server
client = MongoClient(uri)

# Send a ping to confirm a successful connection
try:
    client.admin.command("ping")
    print("Pinged your deployment. You successfully connected to MongoDB")
except Exception as e:
    # Best-effort connectivity check: report the failure but let the notebook continue.
    print(e)

In [None]:
import os

# Prefer the PINECONE_KEY environment variable so the real key is never saved in the
# notebook; the literal is only a placeholder fallback.
PINECONE_KEY = os.environ.get("PINECONE_KEY", "PINECONE_KEY")

In [None]:
# Create the Pinecone client, then open a handle to the index that receives the vectors.
pc = Pinecone(api_key=PINECONE_KEY)
index = pc.Index("index_name")  # replace with the name of your Pinecone index

In [None]:
# Select the "testdb" database on the connected client.
db = client["testdb"]

In [None]:
# Collection that the change stream below watches.
collection = db["testcollection"]

In [None]:
# Local import keeps this cell runnable on its own; note the class name is singular.
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to vectorise each document's "fullplot" text.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Alternative model:
# embedding_model = SentenceTransformer("thenlper/gte-large")

In [None]:
def _embed_text(text):
    """Return the embedding of ``text`` as a flat list of Python floats."""
    # ``tolist()`` turns the numpy array returned by ``encode`` into plain Python floats.
    return embedding_model.encode(text).tolist()


def _sync_change(change):
    """Apply a single change-stream event to the Pinecone ``index``.

    * insert -> upsert the embedding of the new document's "fullplot"
    * update -> re-embed and upsert only when "fullplot" itself was updated
    * delete -> delete the vector stored under the document's id

    Any other operation type is ignored.
    """
    operation = change["operationType"]
    if operation not in ("insert", "update", "delete"):
        return

    # ``documentKey`` is carried by insert, update and delete events, whereas
    # ``fullDocument`` is absent on deletes and can be None on an update whose
    # document was removed before the lookup ran.
    document_id = str(change["documentKey"]["_id"])

    if operation == "delete":
        index.delete(ids=[document_id])
        return

    if operation == "insert":
        text = change["fullDocument"].get("fullplot")
    else:
        text = change["updateDescription"]["updatedFields"].get("fullplot")

    # Skip events without usable plot text instead of crashing the watcher.
    if isinstance(text, str) and text:
        # Pinecone upsert takes a list of (id, vector) tuples.
        index.upsert([(document_id, _embed_text(text))])


# Open the change stream; the context manager closes it when the loop is interrupted.
with collection.watch(full_document="updateLookup") as cursor:
    print("Change stream is now open.")

    # Iteration blocks until the next change arrives, so this runs until interrupted.
    for change in cursor:
        _sync_change(change)


Testing the server and inserting data into Pinecone (you must run the cells below from a separate script or notebook, because the change-stream cell above keeps running)

In [None]:
# Example: uncomment to insert a single document into the collection.
# collection.insert_one({
#     "name":"3 Idiots",
#     "genres" : "Comedy",
#     "rating": 8.4,
#     "fullplot": "Two friends search for their lost companion and revisit their college days, revealing a bet, a wedding, and secrets"
# })

In [None]:
! pip install datasets pandas sentence_transformers pymongo pinecone-client 

In [None]:
! pip install --upgrade langchain-google-genai

In [None]:
from datasets import load_dataset
import pandas as pd
# The exported class is ``SentenceTransformer`` (singular); the plural name raises ImportError.
from sentence_transformers import SentenceTransformer
# pymongo's client class is spelled ``MongoClient`` (capital C).
from pymongo.mongo_client import MongoClient
from pinecone import Pinecone
from bson.objectid import ObjectId
import os
from google.colab import userdata
from langchain_google_genai import ChatGoogleGenerativeAI

# Backward-compatible alias for code that refers to the class by the old misspelled name.
SentenceTransformers = SentenceTransformer


In [None]:
# Load the "MongoDB/embedded_movies" dataset with `datasets.load_dataset`.
dataset = load_dataset("MongoDB/embedded_movies")

In [None]:
# Convert the "train" split to a pandas DataFrame.
# Note: this rebinds `dataset`, so re-running this cell requires re-running the load cell first.
dataset = pd.DataFrame(dataset["train"])

In [None]:
# Work on a small random subset. A fixed seed keeps the sample reproducible across runs,
# and the size is capped so a frame with fewer than 80 rows does not raise.
dataset = dataset.sample(n=min(80, len(dataset)), random_state=42)
dataset.shape

In [None]:
# Inspect the available column names.
dataset.columns

In [None]:
# Count missing values per column.
dataset.isnull().sum()

In [None]:
# Keep only the rows that actually have a "fullplot" value.
has_fullplot = dataset["fullplot"].notna()
dataset = dataset[has_fullplot]

In [None]:
# Remove the "plot_embedding" column from the frame.
# errors="ignore" makes the cell safe to re-run once the column is already gone.
dataset = dataset.drop(columns=["plot_embedding"], errors="ignore")

In [None]:
# Local import keeps this cell runnable on its own; note the class name is singular.
from sentence_transformers import SentenceTransformer

# Queries must be embedded with the same model that produced the vectors stored in the
# Pinecone index; the change-stream watcher in this notebook encodes documents with
# all-MiniLM-L6-v2, so the same model is used here.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Alternative (only valid if the index was populated with this model as well):
# embedding_model = SentenceTransformer("thenlper/gte-large")

In [None]:
# Read the connection string from the environment so real credentials never live in
# the notebook; the placeholder below is only used when MONGODB_URI is not set.
uri = os.environ.get(
    "MONGODB_URI",
    "mongodb+srv://password:password@cluster0.u141hkk.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0",
)

# Create a new client and connect to the server
client = MongoClient(uri)

# Send a ping to confirm a successful connection
try:
    client.admin.command("ping")
    print("Pinged your deployment. You successfully connected to MongoDB")
except Exception as e:
    # Best-effort connectivity check: report the failure but let the notebook continue.
    print(e)

In [None]:
# Select the "testdb" database on the connected client.
db = client["testdb"]

# Collection that receives the dataset rows and is read back after the Pinecone query.
collection = db["testcollection"]

In [None]:
# Turn the frame into a list with one dict per row, ready for insertion into MongoDB.
document = dataset.to_dict(orient="records")

In [None]:
# Bulk-insert all records into the collection.
# NOTE(review): pymongo is expected to add an "_id" to each dict in place, so re-running this
# cell with the same list would likely fail with duplicate-key errors — confirm before re-running.
collection.insert_many(document)

In [None]:
# Prefer the PINECONE_API_KEY environment variable so the real key is never saved in
# the notebook; the literal is only a placeholder fallback.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "PINECONE_API_KEY")
pc = Pinecone(api_key=PINECONE_API_KEY)

index = pc.Index("index_name")  # replace with the name of your Pinecone index

In [None]:
def get_result(query, similar_result=2):
    """Return the Pinecone matches most similar to ``query``.

    Args:
        query: Text to embed with ``embedding_model``.
        similar_result: Number of nearest matches to request (passed as ``top_k``).

    Returns:
        The response of ``index.query`` for the query embedding.
    """
    # Pinecone expects a plain list, so convert the encoded array before querying.
    query_vector = embedding_model.encode(query).tolist()
    return index.query(vector=query_vector, top_k=similar_result)

In [None]:
# Question used both for the vector search and for the final LLM prompt.
query = "What is the best horror movie to watch and why?"

In [None]:
# Run the similarity search with the default number of matches.
result = get_result(query)

In [None]:
# Fetch the MongoDB document behind every Pinecone match; match ids are passed to
# ObjectId to rebuild the document's _id.
my_list = []

for match in result["matches"]:
    matched_document = collection.find_one({"_id": ObjectId(match["id"])})
    # find_one returns None when no document has that _id (e.g. it was deleted after
    # being indexed); skip those so every entry of my_list can be subscripted safely.
    if matched_document is not None:
        my_list.append(matched_document)

In [None]:
# Display the retrieved documents.
my_list

In [None]:
# Build the context block for the prompt: one "Title: ..., fullplot: ..." line per document.
context_lines = []

for movie in my_list:
    # Skip failed lookups (None) and documents without plot text rather than raising.
    if not movie or "fullplot" not in movie:
        continue
    # Some documents store the title under "name"; fall back to it, then to a placeholder.
    title = movie.get("title", movie.get("name", "Unknown"))
    context_lines.append(f"Title: {title}, fullplot: {movie['fullplot']}\n")

# A single join avoids repeated string concatenation inside the loop.
combined_information = "".join(context_lines)

In [None]:
# Show the context that will be embedded in the prompt.
print(combined_information)

In [None]:
# Combine the user query with the retrieved plots into a single prompt string.
prompt = f"Query: {query}\nContinue to answer the query by using the fullplot only: \n{combined_information}"

In [None]:
# Show the final prompt before sending it to the model.
print(prompt)

In [None]:
# Read the Google API key from Colab's `userdata` store and export it as an environment variable.
# NOTE(review): presumably the langchain Google client reads GOOGLE_API_KEY from the environment — confirm.
GOOGLE_API_KEY = userdata.get("GOOGLE_API_KEY")
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

In [None]:
def load_model(model_name):
    """Return a ``ChatGoogleGenerativeAI`` chat model.

    Args:
        model_name: ``"gemini-pro"`` selects the "gemini-pro" model; any other
            value selects "gemini-pro-vision".

    Returns:
        The constructed ``ChatGoogleGenerativeAI`` instance.
    """
    selected_model = "gemini-pro" if model_name == "gemini-pro" else "gemini-pro-vision"
    return ChatGoogleGenerativeAI(model=selected_model)

In [None]:
# Instantiate the "gemini-pro" chat model.
model_text = load_model("gemini-pro")

In [None]:
# Send the prompt to the model and display the `content` of its response.
model_text.invoke(prompt).content