In [1]:
# %pip install cohere pinecone-client annoy pandas numpy

# Semantic search over Quran verses with Cohere embeddings, Annoy, and Pinecone


In [None]:
import cohere
import os
import random
import pandas as pd
from annoy import AnnoyIndex
import numpy as np

In [None]:
# The Cohere client needs a Cohere key, not a Pinecone one. Prefer
# COHERE_API_KEY; fall back to PINECONE_API_KEY (the variable this cell read
# originally) so environments configured for the old behaviour keep working.
api_key = os.getenv('COHERE_API_KEY') or os.getenv('PINECONE_API_KEY')
co = cohere.Client(api_key)

Embed the text data and store the embeddings in an Annoy index

In [None]:
# Embed all documents with the multilingual model and stack the vectors
# into a 2-D array (one row per text).
response = co.embed(texts=texts, model='multilingual-22-12').embeddings
embeds = np.array(response)

# Angular-distance Annoy index sized to the embedding width.
search_index = AnnoyIndex(embeds.shape[1], 'angular')

# Register each embedding under its row position.
for item_id, vector in enumerate(embeds):
    search_index.add_item(item_id, vector)

# Build the forest with 100 trees, then persist it to disk.
search_index.build(100)
search_index.save('quran_index.ann')

In [None]:
# Display the dimensions of the embedding array built in the previous cell.
embeds.shape

Query

In [None]:
# Queries to run against the search index. The first entry is a placeholder
# to be replaced with a real question.
queries = ["query here", "What does the quran say about the dis-believers"]

# Language label for each query, positionally paired with `queries`.
# It must have one entry per query because labels are looked up by the
# query's position.
queries_lang = ["English", "English"]

Most similar results for each query

In [None]:
# Collects one DataFrame of nearest neighbours per query.
results_list = []

for idx, q in enumerate(queries):

    # Embed the query with the same model used for the documents and
    # retrieve the ids of its 5 nearest neighbours from the Annoy index.
    query_embed = co.embed(texts=[q], model='multilingual-22-12').embeddings
    similar_item_ids, _ = search_index.get_nns_by_vector(query_embed[0], 5, include_distances=True)

    if not similar_item_ids:
        print("Not enough similar items found for query:", q)
        continue

    # Build one row per neighbour (previously only the first neighbour was
    # kept even though five were retrieved). Rows are ordered nearest-first.
    results = (
        df.iloc[similar_item_ids][['Surah', 'Ayat', 'Arabic', 'Translation1']]
        .rename(columns={'Ayat': 'Ayah', 'Translation1': 'Translation'})
        .reset_index(drop=True)
    )
    results_list.append(results)

    # Guard the language lookup so a shorter `queries_lang` cannot raise
    # IndexError part-way through the loop.
    lang = queries_lang[idx] if idx < len(queries_lang) else "unknown"

    print(f"Query:'{q}'\nNearest neighbors:")
    print(lang)
    print(results)
    print("\n")

Initialize the Pinecone index

In [None]:
from pinecone import Pinecone, ServerlessSpec

# Read the key from the environment rather than hard-coding it in the notebook.
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))

index_name = 'quranic'

# Create the index only if it does not exist yet. The dimension is taken
# directly from the embedding matrix so this cell does not depend on a
# variable defined in a later cell.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=int(embeds.shape[1]),
        metric='cosine',
        # NOTE(review): cloud/region are assumed defaults — adjust to match
        # your Pinecone project before running.
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )

# Connect to the index.
index = pc.Index(index_name)

In [None]:
import numpy as np

# Dimensions of the embedding matrix; later cells read `shape[0]` to
# generate ids and to step through the upsert batches.
shape = np.asarray(embeds).shape
shape

In [None]:
# Number of vectors sent to Pinecone per upsert request.
batch_size = 128

# String ids "0", "1", ... — one per row of the embedding matrix.
ids = list(map(str, range(shape[0])))

# One metadata dict per verse, built from the matching DataFrame columns.
meta_columns = ['Arabic', 'Surah', 'Ayat', 'Translation1', 'Tafaseer1']
meta = [
    dict(zip(meta_columns, row))
    for row in zip(*(df[column] for column in meta_columns))
]

# (id, vector, metadata) tuples in row order, ready to be upserted.
to_upsert = list(zip(ids, embeds, meta))

# Send the tuples in consecutive slices of `batch_size`; slicing past the
# end is clamped, so the last batch is simply shorter.
for start in range(0, shape[0], batch_size):
    index.upsert(vectors=to_upsert[start:start + batch_size])

# Show the index statistics after the upload.
index.describe_index_stats()

Results from Pinecone most similar to the query

In [None]:
query = "What does the quran say about the dis-believers?"

# Create the query embedding. `co.embed` takes a list of texts, so `xq`
# holds a single embedding wrapped in an outer list.
xq = co.embed(
    texts=[query],
    model='multilingual-22-12',
    truncate='NONE'
).embeddings

print(np.array(xq).shape)

# Query with the single embedding (a flat list of floats) rather than the
# whole 1 x dim batch, returning the top 10 most similar results.
res = index.query(vector=xq[0], top_k=10, include_metadata=True)
res

In [None]:
# For every match, print its score followed by the Arabic text, the
# translation, and the tafseer stored in the match metadata.
for match in res['matches']:
    for field in ('Arabic', 'Translation1', 'Tafaseer1'):
        print(f"{match['score']:.2f}: {match['metadata'][field]}")