In [10]:
!pip install pinecone-client

Collecting pinecone-client
  Downloading pinecone_client-6.0.0-py3-none-any.whl.metadata (3.4 kB)
Downloading pinecone_client-6.0.0-py3-none-any.whl (6.7 kB)
Installing collected packages: pinecone-client
Successfully installed pinecone-client-6.0.0


#02_pinecone_setup
- Initializes Pinecone with API keys
- Creates or connects to an existing index
- Sets up the proper dimensions (1024) for Jina embeddings
- Configures metadata fields for efficient filtering
- Tests basic vector operations (insert, query, filter, delete)
- Creates a helper function for use in other notebooks

In [11]:
import json
import uuid
import numpy as np
import pandas as pd

In [12]:
from pinecone import pinecone

PINECONE_API_KEY = ""
PINECONE_ENVIRONMENT = "us-east-1"


In [None]:
# Define index parameters
INDEX_NAME = "mirra-embeddings"
DIMENSION = 1024  
METRIC = "cosine"

# Define metadata fields to index for efficient filtering
INDEXED_METADATA_FIELDS = [
    "source_type",        # job_description, resume
    "requirement_level",  # mandatory, preferred, responsibility
    "job_id",             # For grouping by job
    "resume_id"           # For grouping by resume
    "location"
]

# Initialize Pinecone with API key and environment
pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)

# List existing indexes
existing_indexes = pc.list_indexes().names()
print(f"Existing indexes: {existing_indexes}")

print(f"Index name: {INDEX_NAME}")
print(f"Vector dimension: {DIMENSION}")
print(f"Similarity metric: {METRIC}")
print(f"Indexed metadata fields: {INDEXED_METADATA_FIELDS}")

In [16]:
# Check if the index already exists
if INDEX_NAME in existing_indexes:
    print(f"Index '{INDEX_NAME}' already exists. Connecting to existing index.")
    # Connect to the existing index
    index = pc.Index(INDEX_NAME)

    # Get and display index statistics
    index_stats = index.describe_index_stats()
    print(f"Index statistics:")
    print(index_stats)
else:
    print(f"Creating new index '{INDEX_NAME}'...")

    # Create the index with specifications
    pc.create_index(
        name=INDEX_NAME,
        dimension=DIMENSION,
        metric=METRIC,
        metadata_config={
            "indexed": INDEXED_METADATA_FIELDS
        }
    )

    print(f"Index '{INDEX_NAME}' created successfully.")

    # Connect to the newly created index
    index = pc.Index(INDEX_NAME)

    # Verify index creation
    new_indexes = pc.list_indexes().names()
    print(f"Updated index list: {new_indexes}")

Index 'mirra-embeddings' already exists. Connecting to existing index.
Index statistics:
{'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}


In [18]:
# Create a test vector
def create_test_vector():
    # Generate a random vector of the correct dimension
    # Convert NumPy floats to regular Python floats
    vector_values = [float(val) for val in np.random.rand(DIMENSION)]

    # Create a unique ID for the test vector
    vector_id = f"test_{uuid.uuid4()}"

    # Create metadata for the test vector
    metadata = {
        "source_type": "test",
        "requirement_level": "mandatory",
        "job_id": "test_job",
        "skill_name": "Python programming",
        "string_text": "Required skill: Python programming with 3 years experience"
    }

    # Create the vector object
    vector = {
        "id": vector_id,
        "values": vector_values,
        "metadata": metadata
    }

    return vector, vector_id

In [20]:
# Query the test vector
query_results = index.query(
    vector=test_vector["values"],
    top_k=1,
    include_metadata=True
)

print("Query results:")
# Convert to dictionary if possible or print directly
try:
    query_results_dict = query_results.to_dict() if hasattr(query_results, 'to_dict') else vars(query_results)
    print(json.dumps(query_results_dict, indent=2))
except:
    # Fallback to direct printing
    print(query_results)

# Verify the top result is our test vector (adjust this based on the actual structure)
matches = getattr(query_results, 'matches', [])
if matches and matches[0].id == test_vector_id:
    print("Vector query successful! Retrieved the test vector correctly.")
else:
    print("Vector query issue: Test vector not retrieved as expected.")

Query results:
{
  "matches": [],
  "namespace": "",
  "usage": {
    "read_units": 1
  }
}
Vector query issue: Test vector not retrieved as expected.


In [22]:
# Query with metadata filter
filtered_results = index.query(
    vector=test_vector["values"],
    filter={"source_type": "test"},
    top_k=10,
    include_metadata=True
)

print("\nFiltered query results:")
# Convert to dictionary if possible or print directly
try:
    filtered_results_dict = filtered_results.to_dict() if hasattr(filtered_results, 'to_dict') else vars(filtered_results)
    print(json.dumps(filtered_results_dict, indent=2))
except:
    # Fallback to direct printing
    print(filtered_results)

# Check matches using attributes instead of dictionary access
matches = getattr(filtered_results, 'matches', [])
if matches:
    print(f"Filter query successful! Retrieved {len(matches)} vectors.")
else:
    print("Filter query issue: No vectors retrieved with filter.")


Filtered query results:
{
  "matches": [],
  "namespace": "",
  "usage": {
    "read_units": 1
  }
}
Filter query issue: No vectors retrieved with filter.


In [24]:
# Delete the test vector
index.delete(ids=[test_vector_id])

# Verify deletion
index_stats_after_delete = index.describe_index_stats()
# Convert to dictionary if possible or print directly
try:
    stats_dict = index_stats_after_delete.to_dict() if hasattr(index_stats_after_delete, 'to_dict') else vars(index_stats_after_delete)
    print(f"Index statistics after deletion: {json.dumps(stats_dict, indent=2)}")
except:
    # Fallback to direct printing
    print(f"Index statistics after deletion:")
    print(index_stats_after_delete)

Index statistics after deletion: {
  "namespaces": {},
  "index_fullness": 0.0,
  "total_vector_count": 0,
  "dimension": 1024,
  "metric": "cosine",
  "vector_type": "dense"
}


In [27]:
def get_pinecone_index(index_name=INDEX_NAME, api_key=PINECONE_API_KEY):
    """
    Helper function to initialize Pinecone and return the index.

    Args:
        index_name: Name of the Pinecone index
        api_key: Pinecone API key

    Returns:
        Pinecone index object
    """
    # Initialize Pinecone with specified credentials
    pc = Pinecone(api_key=api_key)

    # Return the index
    return pc.Index(index_name)

# Save this function for use in other notebooks
# %store get_pinecone_index
# print("Stored get_pinecone_index function for use in other notebooks.")

In [28]:
print("""
Pinecone Setup Complete.

Successfully:
1. Initialized Pinecone with credentials
2. Created a vector index for resume-job matching
3. Configured the index for jina-embeddings-v3 (1024 dimensions)
4. Set up metadata indexing for efficient filtering
5. Tested basic vector operations (insert, query, filter, delete)

The vector database is now ready for storing job description and resume embeddings.
""")


Pinecone Setup Complete.

Successfully:
1. Initialized Pinecone with credentials
2. Created a vector index for resume-job matching
3. Configured the index for jina-embeddings-v3 (1024 dimensions)
4. Set up metadata indexing for efficient filtering
5. Tested basic vector operations (insert, query, filter, delete)

The vector database is now ready for storing job description and resume embeddings.

