# Chunking 

In [1]:
#imports
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter

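We are going to chunk the text-based columns:
- Description (`description`)
- Skills (`skills_desc`)

Note: only `description` is currently listed in `text_columns` below, so `skills_desc` is not chunked yet.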

In [3]:
# Load the job-postings dataset from the local CSV file into a DataFrame.
data = pd.read_csv('data.csv')

In [4]:
# Free-text columns whose content gets split into chunks.
text_columns = [
    'description',
]

# Structured (non-chunked) columns that describe each posting.
metadata_columns = [
    'title',
    'company_name',
    'location',
    'formatted_experience_level',
    'work_type',
    'currency',
    'normalized_salary',
]

In [None]:
# Store the character count of every description as a helper column.
# Values are stringified first, so a missing description counts as the text "nan".
data['description_length'] = data['description'].astype(str).map(len)

# Summary statistics over the helper column.
length_column = data['description_length']
average_length = length_column.mean()
max_length = length_column.max()
min_length = length_column.min()

print(
    f"Average number of characters per description: {average_length:.2f}",
    f"Maximum number of characters in a description: {max_length}",
    f"Minimum number of characters in a description: {min_length}",
    sep="\n",
)



Drop rows whose description is shorter than 800 characters

In [None]:
# Keep only postings whose description has at least 800 characters, then
# discard the helper length column, which is no longer needed.
is_long_enough = data['description_length'] >= 800
data = data.loc[is_long_enough].drop(columns=['description_length'])

print(f"Number of rows after filtering: {data.shape[0]}")
data.head()

In [None]:
# Splitter configuration: chunks of up to 1500 characters, with 100 characters
# shared between consecutive chunks so context is not lost at the boundaries.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)

# One dict per chunk; every chunk carries the metadata of the posting it came from.
chunked_data = []

for _, posting in data.iterrows():
    # Metadata is identical for all chunks of a posting, so build it once per row.
    posting_metadata = {
        "title": posting['title'],
        "company_name": posting['company_name'],
        "location": posting['location'],
        "experience_level": posting['formatted_experience_level'],
        "work_type": posting['work_type'],
        "salary": posting['normalized_salary'],
    }
    for col in text_columns:
        text = posting[col]
        if pd.isna(text):
            # Nothing to split for this column.
            continue
        # The chunk text is always stored under the "description" key,
        # regardless of which text column it came from.
        chunked_data.extend(
            {"description": piece, **posting_metadata}
            for piece in text_splitter.split_text(text)
        )

chunked_df = pd.DataFrame(chunked_data)
chunked_df.head()

In [None]:
# Visual sanity check of the chunk overlap: for each pair of consecutive rows,
# print the tail of the first chunk next to the head of the second.
# NOTE(review): consecutive rows may belong to different postings, in which
# case no overlap should be expected for that pair.
for i in range(len(chunked_df) - 1):
    current_chunk = chunked_df['description'][i]
    following_chunk = chunked_df['description'][i + 1]

    end_overlap = current_chunk[-100:]      # last 100 characters of the current chunk
    start_overlap = following_chunk[:100]   # first 100 characters of the next chunk

    print(f"End of Chunk {i+1}: {end_overlap}")
    print(f"Start of Chunk {i+2}: {start_overlap}")
    print('-' * 50)


In [None]:
# Display (rows, columns) of the chunk table; each row is one chunk.
chunked_df.shape

## Embedding


In [None]:
#imports
from fastembed.embedding import TextEmbedding

In [11]:
# Ensure all descriptions are strings
chunked_df['description'] = chunked_df['description'].astype(str)

# Convert descriptions to a list
text_list = chunked_df['description'].tolist()

In [None]:
# Print the models that TextEmbedding reports as supported.
supported_models = TextEmbedding.list_supported_models()
print("Supported models:", supported_models)

In [13]:
# Embedding model used for both the chunks and the test queries.
# NOTE(review): the Pinecone index in this notebook is created with
# dimension=384, so this model's output size has to match that -- confirm
# before swapping in a different model.
model = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [None]:
# WARNING: takes about an hour to run -- do not re-run unless absolutely needed.
# Embeds every chunk in text_list. list() materialises the output of embed()
# so that individual vectors can be looked up by position when building the
# upsert records.
embeddings = list(model.embed(text_list))

## Upsert to Pinecone

In [14]:
from pinecone import Pinecone, ServerlessSpec

In [None]:
import os

# Prefer a key supplied through the PINECONE_API_KEY environment variable so
# the real secret never has to be pasted into (and saved with) the notebook.
# The original placeholder string is kept as the fallback, so a notebook whose
# key was edited in place keeps behaving as before.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY", "API KEY"))

# Get a list of existing indexes
existing_indexes = pc.list_indexes()
print("Existing indexes:", existing_indexes)  # Debugging step

In [None]:
index_name = "job-postings"

# Ask Pinecone for the index list again instead of reusing `existing_indexes`
# from the previous cell: that snapshot goes stale as soon as this cell creates
# the index, and re-running with a stale list would attempt a duplicate create.
index_exists = any(idx["name"] == index_name for idx in pc.list_indexes())

if index_exists:
    print(f"Connecting to existing index: {index_name}")
else:
    print(f"Index '{index_name}' does not exist. Creating a new one...")
    pc.create_index(
        name=index_name,
        dimension=384,  # has to match the length of the vectors upserted later
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

# Single connection point shared by both branches.
index = pc.Index(index_name)

print("Successfully connected to Pinecone index:", index_name)



In [None]:
def _metadata_value(column, position, missing=""):
    """Return chunked_df[column] at `position`, or `missing` when the cell is NA."""
    value = chunked_df[column].iloc[position]
    return missing if pd.isna(value) else value


def _build_record(position):
    """Build the (id, vector, metadata) tuple for the chunk at `position`."""
    record_id = str(position)
    vector = embeddings[position].tolist()  # Ensure embedding is in list format

    # Salary is stored as text; a missing salary becomes the literal "N/A".
    salary = _metadata_value("salary", position, missing=None)
    metadata = {
        "title": _metadata_value("title", position),
        "company_name": _metadata_value("company_name", position),
        "location": _metadata_value("location", position),
        "experience_level": _metadata_value("experience_level", position),
        "work_type": _metadata_value("work_type", position),
        "salary": "N/A" if salary is None else str(salary),
    }
    return (record_id, vector, metadata)


# One record per chunk; the record id is the chunk's row position as a string.
upsert_data = [_build_record(position) for position in range(len(chunked_df))]

In [None]:
# `json` is not imported anywhere else in the notebook, so import it here to
# keep this cell runnable on its own.
import json

# Request-size budget assumed by this notebook (4 MiB).
# NOTE(review): confirm this value against the current Pinecone upsert limits.
MAX_REQUEST_BYTES = 4194304

# Serialise one record and measure its encoded length: this is the number of
# payload bytes, whereas sys.getsizeof() would report the size of the Python
# string object (header included).
sample_record = json.dumps(upsert_data[0])
record_size = len(sample_record.encode("utf-8"))
print(f"Size of a single record: {record_size} bytes")

# Estimate how many records of this size fit in a single request.
max_records_per_request = MAX_REQUEST_BYTES // record_size
print(f"Max records per request: {max_records_per_request}")

In [None]:
# Send the records in slices of at most 100, or fewer if the size estimate
# says a request cannot hold that many.
batch_size = min(100, max_records_per_request)
batch_starts = range(0, len(upsert_data), batch_size)

for start in batch_starts:
    stop = start + batch_size
    index.upsert(vectors=upsert_data[start:stop])


## Testing queries

In [None]:
# Peek at one description by position. `data` was filtered earlier without
# resetting its index, so the label-based lookup `data['description'][5]`
# raises KeyError whenever the row labelled 5 was dropped by the filter.
# `.iloc[5]` returns the sixth remaining row instead.
data['description'].iloc[5]

In [None]:
# Test query 1: embed the text with the same model used for the chunks, then
# fetch the five nearest chunks together with their metadata.
query_text = "Machine Learning Engineer with Python skills"
embedded_queries = list(model.embed([query_text]))  # one vector per input text
query_embedding = embedded_queries[0].tolist()

results = index.query(vector=query_embedding, top_k=5, include_metadata=True)

if 'matches' not in results or not results['matches']:
    print("No matching results found.")
else:
    for match in results['matches']:
        print(f"Job Title: {match['metadata'].get('title', 'N/A')}")
        print(f"Company: {match['metadata'].get('company_name', 'N/A')}")
        print(f"Location: {match['metadata'].get('location', 'N/A')}")
        print(f"Score: {match.get('score', 0):.2f}")
        print("-" * 40)



In [None]:
# Test query 2: same search flow with a job-title style query.
query_text2 = "Senior Associate Attorney"
embedded_queries = list(model.embed([query_text2]))
query_embedding = embedded_queries[0].tolist()  # Ensure list format

results = index.query(
    vector=query_embedding,
    top_k=5,
    include_metadata=True,
)

# Treat a missing 'matches' entry the same way as an empty one.
matches = results['matches'] if 'matches' in results else []

if matches:
    for match in matches:
        print(f"Job Title: {match['metadata'].get('title', 'N/A')}")
        print(f"Company: {match['metadata'].get('company_name', 'N/A')}")
        print(f"Location: {match['metadata'].get('location', 'N/A')}")
        print(f"Score: {match.get('score', 0):.2f}")
        print("-" * 40)
else:
    print("No matching results found.")

In [None]:
# Test query 3: same search flow with a topical, keyword-style query.
query_text3 = "economic development, city planning"
query_embedding = list(model.embed([query_text3]))[0].tolist()

# Query with a single vector; ask for the top five hits and their metadata.
query_options = {
    "vector": query_embedding,
    "top_k": 5,
    "include_metadata": True,
}
results = index.query(**query_options)

has_matches = 'matches' in results and results['matches']
if has_matches:
    for match in results['matches']:
        print(f"Job Title: {match['metadata'].get('title', 'N/A')}")
        print(f"Company: {match['metadata'].get('company_name', 'N/A')}")
        print(f"Location: {match['metadata'].get('location', 'N/A')}")
        print(f"Score: {match.get('score', 0):.2f}")
        print("-" * 40)
else:
    print("No matching results found.")