In [5]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
import os
import time
import torch
import DLAIUtils
import numpy as np
from tqdm.auto import tqdm
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
from DLAIUtils import Utils

In [7]:
dataset = load_dataset('quora', split='train[240000:290000]')

In [8]:
dataset[:5]

{'questions': [{'id': [207550, 351729],
   'text': ['What is the truth of life?', "What's the evil truth of life?"]},
  {'id': [33183, 351730],
   'text': ['Which is the best smartphone under 20K in India?',
    'Which is the best smartphone with in 20k in India?']},
  {'id': [351731, 351732],
   'text': ['Steps taken by Canadian government to improve literacy rate?',
    'Can I send homemade herbal hair oil from India to US via postal or private courier services?']},
  {'id': [37799, 94186],
   'text': ['What is a good way to lose 30 pounds in 2 months?',
    'What can I do to lose 30 pounds in 2 months?']},
  {'id': [351733, 351734],
   'text': ['Which of the following most accurately describes the translation of the graph y = (x+3)^2 -2 to the graph of y = (x -2)^2 +2?',
    'How do you graph x + 2y = -2?']}],
 'is_duplicate': [False, True, False, True, False]}

In [9]:
questions = []
for record in dataset['questions']:
    questions.extend(record['text'])
question = list(set(questions))
print('\n'.join(questions[:10]))
print('-' * 50)
print(f"Number of questions: {len(questions)}")

What is the truth of life?
What's the evil truth of life?
Which is the best smartphone under 20K in India?
Which is the best smartphone with in 20k in India?
Steps taken by Canadian government to improve literacy rate?
Can I send homemade herbal hair oil from India to US via postal or private courier services?
What is a good way to lose 30 pounds in 2 months?
What can I do to lose 30 pounds in 2 months?
Which of the following most accurately describes the translation of the graph y = (x+3)^2 -2 to the graph of y = (x -2)^2 +2?
How do you graph x + 2y = -2?
--------------------------------------------------
Number of questions: 100000


In [10]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device != 'cuda':
    print("Sorry bro no cuda.")
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

Sorry bro no cuda.


In [11]:
query = "Which city is the most populated in the world?"
xq = model.encode(query)
xq.shape

(384,)

In [22]:
utils = Utils()
PINECONE_API_KEY = utils.get_pinecone_api_key()

pinecone = Pinecone(api_key=PINECONE_API_KEY) 
INDEX_NAME = utils.create_dlai_index_name('dl-ai')

if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
    pinecone.delete_index(INDEX_NAME)
#print(INDEX_NAME)

pinecone.create_index(name=INDEX_NAME, dimension=model.get_sentence_embedding_dimension(), metric='cosine', 
                      spec=ServerlessSpec(cloud='aws', region='us-east-1'))
index = pinecone.Index(INDEX_NAME)
print(index)

<pinecone.data.index.Index object at 0x7f9723287e50>


In [14]:
batch_size = 200
vector_limit = 10000

questions = question[:vector_limit]

# Process in batches
for i in tqdm(range(0, len(questions), batch_size), desc="Processing Batches"):
    i_end = min(i + batch_size, len(questions))   # Finding the end of the batch
    batch_questions = questions[i:i_end]  # Slice batch

    ids = [str(x) for x in range(i, i_end)]
    metadatas = [{'text': text} for text in batch_questions]

    # Generating embeddings (Enable multiprocessing)
    xc = model.encode(batch_questions, batch_size=32, show_progress_bar=False, convert_to_numpy=True)

    # Convert to list for upsert
    records = list(zip(ids, xc.tolist(), metadatas))
    
    try:
        index.upsert(vectors=records)
    except Exception as e:
        print(f"Error during upsert at batch {i}-{i_end}: {e}")

Processing Batches: 100%|██████████| 50/50 [04:10<00:00,  5.02s/it]


In [15]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 10000}},
 'total_vector_count': 10000,
 'vector_type': 'dense'}

In [19]:
def run_query(query):
    embedding = model.encode(query).tolist()
    results = index.query(top_k=10, vector=embedding, include_metadata=True, include_values=False)
    for result in results['matches']:
        print(f"{round(result['score'], 2)}: {result['metadata']['text']}")
        
run_query("Which country has the largest zoo in the world?")

0.86: Where is largest zoo in india?
0.59: What is your opinion on zoos?
0.5: Which is the biggest football stadium in the world?
0.47: What country has the fastest growing population and why?
0.47: What is the most beautiful city in the world?
0.46: What are the coldest countries in the world?
0.45: Which is the safest country in the world?
0.45: What is the most racist country?
0.45: What is the greatest, most beautiful city in the world?
0.43: What is the most interesting and exciting country?


In [20]:
query = "How to prepare scrambled eggs?"
run_query(query)

0.62: What're the differences between cooking eggs certain ways?
0.53: What should I eat for breakfast?
0.48: Is raw egg good for health?
0.48: What are some Atkins meals for which I don't have to deal with raw meat or raw eggs?
0.47: Why egg becomes hard when we boil it?
0.46: Is it safe to eat raw unpasteurized eggs?
0.42: If I were to eat the same thing for breakfast every day, what should that food item be?
0.4: What is the best way to cook spaghetti?
0.38: How do I sleep early and wake up early?
0.38: What is the recipe for vattalappam?
