In [1]:
import os
import json
import pandas as pd
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Build the Pinecone client.
# The API key is read from a JSON secrets file one directory above the notebook,
# under the "pinecone_api" key, so it never appears in the notebook itself.
with open('../secret.json') as secrets_file:
    data = json.load(secrets_file)

pinecone_key = data["pinecone_api"]
pc = Pinecone(api_key=pinecone_key)

In [4]:
# Create the serverless index on first run; do nothing if it already exists.
index_name = 'python-18k-instructions-codeonly'
existing_indexes = pc.list_indexes().names()

if index_name not in existing_indexes:
    # dimension 384 for all-miniLM-L12-v2 (must equal the embedding width)
    pc.create_index(
        index_name,
        dimension=384,
        metric='cosine',
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )


In [14]:
! pip install pyarrow --break-system-packages

Collecting pyarrow
  Downloading pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (3.3 kB)
Downloading pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl (27.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.2/27.2 MB[0m [31m904.8 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pyarrow
Successfully installed pyarrow-17.0.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


In [15]:
# Handle used for all subsequent reads/writes against the Pinecone index
index = pc.Index(index_name)

# Load the python_code_instructions_18k_alpaca train split from the Hugging Face Hub
dataset_url = "hf://datasets/iamtarun/python_code_instructions_18k_alpaca/data/train-00000-of-00001-8b6e212f3e1ece96.parquet"
df = pd.read_parquet(dataset_url)


In [17]:
# Embedding model. If this is swapped for another model, the index dimension
# (384, set when the index is created) has to be changed to match.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')

# Only the code column ('output') is embedded
codes = df['output'].to_list()

# Encode all samples, then store one embedding (as a plain list) per row
embeddings = model.encode(codes, show_progress_bar=True)
df['embeddings'] = embeddings.tolist()

Batches: 100%|██████████| 582/582 [01:39<00:00,  5.83it/s]


In [19]:
# One (row id, embedding, code text) tuple per DataFrame row
data_to_upload = [
    (row_id, embedding, code)
    for row_id, embedding, code in zip(df.index, df['embeddings'], df['output'])
]
len(data_to_upload)

18612

In [20]:
# Function to divide data into chunks for batch processing
def chunked_data(data, chunk_size):
    """Yield successive chunk_size chunks from data.

    Args:
        data: A sequence supporting ``len()`` and slicing (e.g. a list).
        chunk_size: Maximum number of items per chunk; must be a positive
            integer.

    Yields:
        Consecutive slices of ``data``. Every slice has ``chunk_size`` items
        except possibly the last, which holds the remainder. Empty ``data``
        yields nothing.

    Raises:
        ValueError: If ``chunk_size`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A negative step would make range() empty and silently drop all data,
    # so reject non-positive sizes explicitly.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    for start in range(0, len(data), chunk_size):
        yield data[start:start + chunk_size]

# Upload data in batches
batch_size = 100
# (batch number, exception) for every batch that could not be upserted.
# Batch n covers data_to_upload[n * batch_size:(n + 1) * batch_size].
failed_batches = []
for batch_number, batch in enumerate(chunked_data(data_to_upload, batch_size)):
    # Pinecone tuple form: (string id, vector values, metadata dict)
    batch_to_upsert = [
        (str(row_id), vec, {'code': code}) for row_id, vec, code in batch
    ]
    try:
        index.upsert(vectors=batch_to_upsert)
    except Exception as e:
        # Best-effort upload: keep going, but remember the failure so the
        # missing rows are not lost silently and can be retried.
        print('error, ', e)
        failed_batches.append((batch_number, e))

if failed_batches:
    print(
        f"{len(failed_batches)} batch(es) failed to upsert: "
        f"{[number for number, _ in failed_batches]}"
    )