In [1]:
import os
from itertools import islice

import pinecone
from datasets import load_dataset
from dotenv import load_dotenv, find_dotenv
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Open the "sample-10BT" configuration of FineWeb in streaming mode, so the
# train split is iterated lazily instead of being downloaded up front.
fw = load_dataset(
    "HuggingFaceFW/fineweb",
    name="sample-10BT",
    split="train",
    streaming=True,
)

Resolving data files: 100%|████████████████████████████████████████████████████| 23781/23781 [00:15<00:00, 1583.49it/s]


In [3]:
# Inspect the streaming dataset object (its feature names and shard count).
fw

IterableDataset({
    features: ['text', 'id', 'dump', 'url', 'date', 'file_path', 'language', 'language_score', 'token_count'],
    n_shards: 15
})

In [4]:
# Show each column of the dataset together with its declared dtype.
fw.features

{'text': Value(dtype='string', id=None),
 'id': Value(dtype='string', id=None),
 'dump': Value(dtype='string', id=None),
 'url': Value(dtype='string', id=None),
 'date': Value(dtype='string', id=None),
 'file_path': Value(dtype='string', id=None),
 'language': Value(dtype='string', id=None),
 'language_score': Value(dtype='float64', id=None),
 'token_count': Value(dtype='int64', id=None)}

In [5]:
# Load the sentence-embedding model used to vectorize the documents.
embedding_model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(embedding_model_name)

In [6]:
# Locate a .env file and load its variables into the process environment;
# override=True lets values from the file replace variables that are already set.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path, override=True)

True

In [7]:
# Build the Pinecone client from settings read out of the environment.
# The values are passed straight through so the API key is never bound to a
# notebook-level variable.
# NOTE(review): `environment` may be unnecessary for serverless indexes with
# this client version - confirm before removing.
pc = Pinecone(
    api_key=os.environ.get("PINECONE_API_KEY"),
    environment=os.environ.get("PINECONE_ENV"),
)

In [8]:
# List the indexes currently visible to this client, with their dimension,
# metric, spec and readiness status.
pc.list_indexes()

{'indexes': [{'dimension': 3,
              'host': 'my-index-bbthoyp.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'my-index',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}},
             {'dimension': 1536,
              'host': 'my-index-2-bbthoyp.svc.aped-4627-b74a.pinecone.io',
              'metric': 'euclidean',
              'name': 'my-index-2',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [9]:
# Create the "text" index only when it does not exist yet, so the notebook can
# be re-run from a fresh kernel without failing on an already-existing index.
index_name = "text"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # The index dimension must match the size of the vectors produced by
        # the embedding model.
        dimension=model.get_sentence_embedding_dimension(),
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

In [10]:
# Get a handle to the "text" index; the upsert cell below writes through it.
index = pc.Index(name = "text")

In [11]:


# Embed a fixed-size subset of the streamed dataset and upsert it to Pinecone.
#
# Documents are handled one batch at a time: each batch is encoded with a
# single model.encode() call (instead of one call per document) and upserted
# right away, so at most batch_size records are held in memory and a failure
# part-way through does not discard the batches already written.

subset_size = 10000  # Number of documents to take from the stream
batch_size = 1000    # Records per encode + upsert round trip; adjust as needed

# islice stops after exactly subset_size items, without pulling an extra
# record from the stream just to detect the cut-off.
documents = islice(fw, subset_size)

while True:
    batch_items = list(islice(documents, batch_size))
    if not batch_items:
        break

    # Encode all texts of the batch in one call; one embedding per text,
    # returned in input order.
    embeddings = model.encode(
        [item['text'] for item in batch_items],
        show_progress_bar=False,
    ).tolist()

    # Each record is an (id, embedding, metadata) tuple.
    batch = [
        (str(item['id']), embedding, {'language': item['language']})
        for item, embedding in zip(batch_items, embeddings)
    ]
    index.upsert(vectors=batch)

print("Subset of data upserted to Pinecone index.")


Subset of data upserted to Pinecone index.
