In [1]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
from utils import Utils
import os
import time
import torch

In [6]:
from tqdm.auto import tqdm

In [7]:
dataset = load_dataset('quora', split='train[250000:290000]')

Downloading builder script:   0%|          | 0.00/2.38k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.69k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/58.2M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/404290 [00:00<?, ? examples/s]

In [8]:
dataset[:5]

{'questions': [{'id': [152498, 313546],
   'text': ['Who would win in a war between Russia and the US?',
    'Who would win a war out of the United States and Russia?']},
  {'id': [363759, 363760],
   'text': ['When planning for retirement, what rate of inflation should I assume to play it safe?',
    'What does it feel like to retire extremely early?']},
  {'id': [363761, 363762],
   'text': ['What is space time yield in chemical reactor?',
    'When did the Greeks conquer Egypt?']},
  {'id': [363763, 363764],
   'text': ['What are unknown facts about lord Shiva?',
    'What are some interesting facts about Shiva?']},
  {'id': [8880, 363765],
   'text': ['Is Donald Trump likely to win the 2016 election (late 2015 / early 2016)?',
    "What will Donald Trump's response be if he doesn't win the 2016 presidential election?"]}],
 'is_duplicate': [True, False, False, False, False]}

In [10]:
questions = []
for record in dataset['questions']:
    questions.extend(record['text'])
question = list(set(questions))
print('\n'.join(questions[:10]))
print("*" * 50)
print("Number of questions:", len(questions))

Who would win in a war between Russia and the US?
Who would win a war out of the United States and Russia?
When planning for retirement, what rate of inflation should I assume to play it safe?
What does it feel like to retire extremely early?
What is space time yield in chemical reactor?
When did the Greeks conquer Egypt?
What are unknown facts about lord Shiva?
What are some interesting facts about Shiva?
Is Donald Trump likely to win the 2016 election (late 2015 / early 2016)?
What will Donald Trump's response be if he doesn't win the 2016 presidential election?
**************************************************
Number of questions: 80000


In [14]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device != 'cuda':
    print('Sorry no cuda.')
model = SentenceTransformer('all-miniLM-L6-v2', device=device)


In [15]:
query = 'Which city is the most populated in the world?'
xq = model.encode(query)
xq.shape

(384,)

In [25]:
utils = Utils()
PINECONE_API_KEY = utils.get_pinecone_api_key()
pinecone = Pinecone(api_key=PINECONE_API_KEY)
INDEX_NAME = "quickstart"
if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
    pinecone.delete_index(INDEX_NAME)
pinecone.create_index(
    name=INDEX_NAME,
    dimension=model.get_sentence_embedding_dimension(),
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)
index = pinecone.Index(INDEX_NAME)
print(index)

<pinecone.data.index.Index object at 0x7fb5d1584640>


In [27]:
batch_size=200
vector_limit=10000

questions = question[:vector_limit]

import json

for i in tqdm(range(0, len(questions), batch_size)):
    # find end of batch
    i_end=min(i+batch_size, len(questions))
    # create IDs batch
    ids = [str(x) for x in range(i, i_end)]
    # create metadata batch
    metadatas = [{"text": text} for text in questions[i:i_end]]
    # create embeddings
    xc = model.encode(questions[i:i_end])
    # create records list for upset
    records = zip(ids, xc, metadatas)
    # upsert to pinecone
    index.upsert(vectors=records)

  0%|          | 0/50 [00:00<?, ?it/s]

In [28]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 10000}},
 'total_vector_count': 10000}

In [30]:
# helper function to reapeat queries later
def run_query(query):
    embedding = model.encode(query).tolist()
    results = index.query(top_k=10, vector=embedding, include_metadata=True, include_values=False)
    for result in result['matches']:
        print(f"{round(result['score'], 2)}: {result['metadaat']['text']}
        

SyntaxError: unterminated string literal (detected at line 6) (3509834976.py, line 6)