### Connect to pinecone and create the index if it doesn't exist



In [1]:
import os
from pinecone import Pinecone

# initialize connection to pinecone (get API key at app.pinecone.io)
api_key = os.environ.get('PINECONE_API_KEY') or 'PINECONE_API_KEY'

# configure client
pc = Pinecone(api_key=api_key)

In [2]:
import time
from pinecone import ServerlessSpec

cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'

spec = ServerlessSpec(cloud=cloud, region=region)
# check if index already exists (it shouldn't if this is first time)
if 'actualism-website' not in pc.list_indexes().names():
    # if does not exist, create index
    pc.create_index(
        'actualism-website',
        dimension=1024,
        metric='dotproduct',
        spec=spec
    )
    # wait for index to be initialized
    while not pc.describe_index('actualism-website').status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index('actualism-website')
# view index stats
index.describe_index_stats()



{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

# Load chunks from jsonl file, with sparse and dense vectors plus metadata including the raw text



In [7]:
import json

# Load chunks from updated chunks jsonl file
updated_chunks_path = 'embedded_chunks.jsonl'
chunks = []
with open(updated_chunks_path, 'r') as file:
    for line in file:
        chunks.append(json.loads(line))

### convert sparse vector to pinecone expected format, and fill in empty ones to avoid error, should only be a small number of these

In [10]:
# Function to convert sparse data to dictionary format
def sparse_to_dict(data):
    if data and 'indices' in data[0] and 'values' in data[0] and data[0]['indices'] and data[0]['values']:
        return {"indices": data[0]['indices'], "values": data[0]['values']}
    else:
        return {"indices": [0], "values": [0.0]} # avoid pinecone error by filling in empty sparse vectors

# Modify the sparse vectors in all chunks using sparse_to_dict
for chunk in chunks:
    if 'sparse_values' in chunk:
        chunk['sparse_values'] = sparse_to_dict(chunk['sparse_values'])

### upload to pinecone

In [11]:
from tqdm import tqdm

batch_size = 100  # Define the batch size
index = pc.Index("actualism-website")  # Connect to the correct Pinecone index

# Assuming 'chunks' is your list of chunk dictionaries loaded from 'embedded_chunks.jsonl'
total_chunks = len(chunks)  # Total number of chunks

# Proceed with uploading the modified chunks to Pinecone
for i in tqdm(range(9300, total_chunks, batch_size)):
    i_end = min(i + batch_size, total_chunks)
    chunks_batch = chunks[i: i_end]

    upserts = []
    for chunk in chunks_batch:

        upserts.append({
            "id": chunk['id'],
            "sparse_values": chunk['sparse_values'],
            "values": chunk['values'],
            "metadata": {
                'filename': chunk['filename'],
                'chunk_start_index': chunk['chunk_start_index'],
                'chunk_end_index': chunk['chunk_end_index'],
                'raw_string': chunk['raw_string']
            }
        })

    # Upsert the modified chunks to Pinecone
    index.upsert(upserts)

# Optionally, check if the vectors have been upserted
index.describe_index_stats()

100%|██████████| 198/198 [09:17<00:00,  2.81s/it]


{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 29000}},
 'total_vector_count': 29000}