In [19]:
import pinecone
import pinecone.graph
import pinecone.service
import pinecone.connector
import pandas as pd

from sentence_transformers import SentenceTransformer, util

# initialize pinecone api key
pinecone.init(api_key="")

# download or load distelbert model
model = SentenceTransformer('msmarco-distilbert-base-v2')

In [13]:
# create questions service
Qservice_name = 'google-questions-service'
# create Answers service
Aservice_name = 'google-answers-service'

# create graphs
# Qgraph = pinecone.graph.IndexGraph()  # create a graph
# Qgraph.view() 
# Agraph = pinecone.graph.IndexGraph()  # create a graph
# Agraph.view()

# deploy services
# pinecone.service.deploy(Qservice_name, Qgraph)
# pinecone.service.deploy(service_name=Aservice_name, graph=Agraph)

# stops pinecone service
# pinecone.service.stop(service_name=Aservice_name)

# connect to service
question_conn = pinecone.connector.connect(Qservice_name)
answer_conn = pinecone.connector.connect(Aservice_name)

# lists all deployed services or the ones that are currently running
print(pinecone.service.ls())

['answers-service', 'google-answers-service', 'google-questions-service', 'hello-pinecone1', 'knn-classifier', 'questions-service']


In [25]:
# grab the dataset and convert to dataframe
google_dataset_df = pd.read_json('https://raw.githubusercontent.com/google-research-datasets/natural-questions/master/nq_open/NQ-open.train.jsonl', lines=True)
google_dataset_df

Unnamed: 0,answer,question
0,[Fernie Alpine Resort],where did they film hot tub time machine
1,[Neither vessel],who has the right of way in international waters
2,[Marley],who does annie work for attack on titan
3,"[November 6, 1986]",when was the immigration reform and control ac...
4,[1950],when was puerto rico added to the usa
...,...,...
87920,[Rashidi Kawawa],who was the first prime minister of tanzania
87921,[the Eagles],what is the name of the football team in facin...
87922,[Paul McCartney],who sang lead on when i'm 64
87923,[doo-wop],what is the style of this boy by the beatles


In [26]:
# remove brackets from answer, run only once
google_dataset_df['answer'] = google_dataset_df['answer'].str.get(0)
google_dataset_df['answer']

0        Fernie Alpine Resort
1              Neither vessel
2                      Marley
3            November 6, 1986
4                        1950
                 ...         
87920          Rashidi Kawawa
87921              the Eagles
87922          Paul McCartney
87923                 doo-wop
87924       Flora Louise Shaw
Name: answer, Length: 87925, dtype: object

In [27]:
# For each row update the 'Questions' and 'answer' value to a vector
for index_label, row_series in google_dataset_df.iterrows():
    google_dataset_df.at[index_label , 'question'] = model.encode(row_series['question'])
    google_dataset_df.at[index_label , 'answer'] = model.encode(row_series['answer'])

Total Time Execution is 0.0002166748046875


In [28]:
google_dataset_df

Unnamed: 0,answer,question
0,"[0.1765306, -0.0128194345, 1.5167723, -0.11197...",where did they film hot tub time machine
1,"[-0.53276587, -0.23448008, -0.13825296, 0.0506...",who has the right of way in international waters
2,"[0.028528154, 0.2927364, -0.14695545, -0.74160...",who does annie work for attack on titan
3,"[-0.7147557, 0.14656873, -0.24601491, 0.697288...",when was the immigration reform and control ac...
4,"[-0.74161214, 0.02487731, 0.011852339, 0.35666...",when was puerto rico added to the usa
...,...,...
87920,"[-0.46302348, -0.619929, -1.0059278, 0.0910818...",who was the first prime minister of tanzania
87921,"[0.033811085, 0.7217571, -1.6357528, -1.175526...",what is the name of the football team in facin...
87922,"[1.072259, 0.9951943, -0.27271816, -1.0339307,...",who sang lead on when i'm 64
87923,"[-0.5942096, -0.055734705, 0.05951788, -0.3431...",what is the style of this boy by the beatles


In [29]:
acks_q = question_conn.upsert(items=zip(google_dataset_df.index, google_dataset_df.question)).collect()
acks_a = answer_conn.upsert(items=zip(google_dataset_df.index, google_dataset_df.answer)).collect()

In [32]:
question_conn.info()
answer_conn.info()

InfoResult(index_size=87925)

In [75]:
# vectorize a question
q1 = "capital of united states?"
embedded_question = model.encode(q1)
# q2 = model.encode("what is the temperature of chicken?")
# # questions = ['q1','q2','q3']
# # question_embeddings = model.encode(questions)



In [76]:
# get similar questions from vectorized question
query_results = question_conn.query([embedded_question], batch_size=100, top_k=10).collect()

# get similar answers from vectorized question
# query_results = answer_conn.query([q1], batch_size=100, top_k=10).collect()

print(query_results)

[QueryResult(ids=['11548', '60022', '26998', '70443', '51715', '32669', '53954', '6945', '45584', '52973'], scores=[0.8777937889099121, 0.8450073003768921, 0.8254081010818481, 0.7595561146736145, 0.7496448755264282, 0.7495452165603638, 0.7211736440658569, 0.7150613069534302, 0.6897961497306824, 0.6801357865333557], data=None)]


In [77]:
google_dataset_df = pd.read_json('https://raw.githubusercontent.com/google-research-datasets/natural-questions/master/nq_open/NQ-open.train.jsonl', lines=True)

# grab all the ids
indexes = query_results[0].ids
# print(indexes)

# grab all the scores
scores = query_results[0].scores
# print(scores)

# create a list of tuples of [(id, score)]
index_scores_tple = []
for (id, score) in zip(indexes, scores):
    index_scores_tple.append((id,score))
# print(index_scores_tple)

answerResults = google_dataset_df.iloc[indexes]
# view results
answerResults

Unnamed: 0,answer,question
11548,"[Washington, D.C.]",what is the name of the capital of usa
60022,"[Washington, D.C., Philadelphia, New York City]",what cities have been capital of the united st...
26998,"[Washington, D.C.]",what is the name of america capital city
70443,"[Washington, D.C.]",what was the united states first national capital
51715,[Philadelphia],where was the capital of the united states fir...
32669,[Phoenix],what is the most populous capital city in the us
53954,[Philadelphia],where was the first capital of the united stat...
6945,"[New York City, Philadelphia, Lancaster, Penns...",name 3 cities where the us capital was located
45584,[Albany],what's the state capital of new york
52973,[1791],when did washington dc become the us capital


In [78]:
# if using question service to get top similar questions
# for answer, score in zip(answerResults["question"], scores):
#     print("*" * 20)
#     print("Question:", q1)
#     print("Answer: ", answer)
#     print("Score: ", score)

# if using answer service to get top similar answers    
for answer, score in zip(answerResults["answer"], scores):
    print("*" * 20)
    print("Question:", q1)
    print("Answer:", answer)
    print("Score:", score)

********************
Question: capital of united states?
Answer: ['Washington, D.C.']
Score: 0.8777937889099121
********************
Question: capital of united states?
Answer: ['Washington, D.C.', 'Philadelphia', 'New York City']
Score: 0.8450073003768921
********************
Question: capital of united states?
Answer: ['Washington, D.C.']
Score: 0.8254081010818481
********************
Question: capital of united states?
Answer: ['Washington, D.C.']
Score: 0.7595561146736145
********************
Question: capital of united states?
Answer: ['Philadelphia']
Score: 0.7496448755264282
********************
Question: capital of united states?
Answer: ['Phoenix']
Score: 0.7495452165603638
********************
Question: capital of united states?
Answer: ['Philadelphia']
Score: 0.7211736440658569
********************
Question: capital of united states?
Answer: ['New York City', 'Philadelphia', 'Lancaster, Pennsylvania']
Score: 0.7150613069534302
********************
Question: capital of united