In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
#!pip install sentence-transformers pinecone

In [3]:
#!pip install datasets

In [4]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
import os
import time
import torch
from tqdm.auto import tqdm
from DLAIUtils import Utils
import DLAIUtils

In [5]:
dataset = load_dataset('quora', split='train[240000:245000]')


In [6]:
dataset[:5]

{'questions': [{'id': [207550, 351729],
   'text': ['What is the truth of life?', "What's the evil truth of life?"]},
  {'id': [33183, 351730],
   'text': ['Which is the best smartphone under 20K in India?',
    'Which is the best smartphone with in 20k in India?']},
  {'id': [351731, 351732],
   'text': ['Steps taken by Canadian government to improve literacy rate?',
    'Can I send homemade herbal hair oil from India to US via postal or private courier services?']},
  {'id': [37799, 94186],
   'text': ['What is a good way to lose 30 pounds in 2 months?',
    'What can I do to lose 30 pounds in 2 months?']},
  {'id': [351733, 351734],
   'text': ['Which of the following most accurately describes the translation of the graph y = (x+3)^2 -2 to the graph of y = (x -2)^2 +2?',
    'How do you graph x + 2y = -2?']}],
 'is_duplicate': [False, True, False, True, False]}

In [7]:
questions = []
for record in dataset['questions']:
    questions.extend(record['text'])
questions = list(set(questions))
print('\n'.join(questions[:10]))
print('-'*50)
print('Number of questions:', len(questions))


What does a scientist do?
What should you do if you tell your girlfriend you love her and she's not ready to say it back?
What's the best restaurant in Paris?
Can host be shown as pregnant in the westworld?
Where can you find the best hotels in Nainital?
Should I learn TypeScript or Elm as an alternative to JavaScript for large scale frontend development?
Is it normal to be asked only math problems in a front-end developer interview?
What are some baby shower games that are actually fun?
How did Argentina gain its independence?
I would like to become a lead developer for Google or another large company in the future, what steps must I take with having no current experience?
--------------------------------------------------
Number of questions: 9814


### Check cuda and Setup the model

**Note**: "Checking cuda" refers to checking if you have access to GPUs (faster compute). In this course, we are using CPUs. So, you might notice some code cells taking a little longer to run.

We are using *all-MiniLM-L6-v2* sentence-transformers model that maps sentences to a 384 dimensional dense vector space.

In [8]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device != 'cuda':
    print('No GPU available, using the CPU instead.')

model = SentenceTransformer('all-MiniLM-L6-v2', device=device)


No GPU available, using the CPU instead.


In [9]:
query = 'which city is the most populated in the world?'
xq = model.encode(query)
xq.shape

(384,)

### Setup Pinecone

In [10]:
import importlib
import DLAIUtils
importlib.reload(DLAIUtils)

from DLAIUtils import Utils

utils = Utils()
PINECONE_API_KEY = utils.get_pinecone_api_key()
pinecone = Pinecone(api_key=PINECONE_API_KEY)

INDEX_NAME = utils.create_dlai_index_name(pinecone, 'quora-index')


Deleted existing index: quora-index
Creating index: quora-index
Created index: quora-index


In [11]:
# Check if the index exists and delete it

# if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
#     pinecone.delete_index(INDEX_NAME)
# print(INDEX_NAME)

# # Create a Pinecone index with the name INDEX_NAME
# pinecone.create_index(name=INDEX_NAME,
#                       dimension=model.get_sentence_embedding_dimension(),
#                       metric='cosine',
#                       spec=ServerlessSpec(cloud='aws', region='us-east-1'))

index = pinecone.Index(INDEX_NAME) # Load the index
print(index)

<pinecone.data.index.Index object at 0x7f90c00b14f0>


In [12]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

### Create Embeddings and Upsert to Pinecone

In [13]:
#len(questions) # 9814
batch_size = 200
vector_limit = 1000

questions = questions[:vector_limit]
print('Number of questions:', len(questions))

Number of questions: 1000


In [14]:
import json

for i in tqdm(range(0, len(questions), batch_size)): # 0, 200, 400, 600, 800
    # find end of batch
    i_end = min(i + batch_size, len(questions))
    # create IDs for the batch
    ids = [str(x) for x in range(i, i_end)] # 0-199, 200-399, 400-599, 600-799, 800-999
    # create metadata for the batch
    metadatas = [{'text': text} for text in questions[i:i_end]]
    # create embeddings for the batch
    embeddings = model.encode(questions[i:i_end])
    # insert the batch into the index
    records = zip(ids, embeddings, metadatas)
    index.upsert(vectors=records)



  0%|          | 0/5 [00:00<?, ?it/s]

In [15]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1000}},
 'total_vector_count': 1000}

### Run Your Query

In [16]:
def run_query(query, top_k=5):
    embedding = model.encode(query).tolist()
    results = index.query(vector=embedding,
                          top_k=top_k,
                          include_metadata=True,
                          include_vector=False)
    
    for result in results['matches']:
        print(f"{ round(result['score'],2)}  {result['metadata']['text']}")

In [17]:
run_query('what is a good programming language?')

0.68  I want to make Hacks, bots, cheats for games. I know 0 about programming. What programming language should I learn as a beginning?
0.6  What is the best way to learn a computer Language?
0.45  What is the difference between scripting and programming?
0.41  What is the recommended book to learn python for beginner?
0.41  Which book to buy to learn java?


In [18]:
query = 'how do i make chocolate cake?'
run_query(query)

0.35  What is the "shake n' bake" meth recipe?
0.33  What are some good mixes for Bailey's Irish Cream?
0.31  How do you draw cartoons?
0.31  How do you get inspiration to write a poem?
0.3  What is the best biryani masala powder?
