In [None]:
import os
from PyPDF2 import PdfReader
from docx import Document
from pinecone import Pinecone, ServerlessSpec
import requests  # Use requests for making API calls to Azure OpenAI

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text of all pages, one page after another, separated
        by a newline so the last word of a page does not fuse with the
        first word of the next one.
    """
    with open(pdf_path, 'rb') as file:
        reader = PdfReader(file)
        # `or ""` tolerates a page for which extraction yields nothing
        # (e.g. an image-only page) instead of failing on concatenation.
        return "\n".join(page.extract_text() or "" for page in reader.pages)

# Function to extract text from DOCX
def extract_text_from_docx(docx_path):
    """Return the text of every paragraph of a DOCX file as a single string.

    Args:
        docx_path: Path of the DOCX file to read.

    Returns:
        The paragraph texts in document order, separated by a newline.
        Without a separator the end of one paragraph runs straight into the
        start of the next, merging unrelated words.
    """
    doc = Document(docx_path)
    return "\n".join(para.text for para in doc.paragraphs)

# Function to extract text from any CV file (PDF or DOCX)
def extract_text_from_file(file_path):
    """Extract the text of a CV file, choosing the reader by file extension.

    The extension is compared case-insensitively, so ``CV.PDF`` and
    ``Resume.Docx`` are handled like their lower-case counterparts.

    Args:
        file_path: Path of the CV file (``.pdf`` or ``.docx``).

    Returns:
        The text extracted from the file.

    Raises:
        ValueError: If the path ends with neither ``.pdf`` nor ``.docx``.
    """
    lowered = file_path.lower()
    if lowered.endswith('.pdf'):
        return extract_text_from_pdf(file_path)
    if lowered.endswith('.docx'):
        return extract_text_from_docx(file_path)
    # Report the path exactly as the caller passed it, not the lowered copy.
    raise ValueError(f"Unsupported file format: {file_path}")

# Initialize the Pinecone client. The key is read from the PINECONE_API_KEY
# environment variable so the secret is not stored in the notebook; the
# empty-string fallback matches the original placeholder.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY", ""))

# Create the index on first run. `dimension` must equal the length of the
# vectors that get upserted; 768 is the value this notebook assumes, so
# adjust it to the embedding model that is actually deployed.
index_name = "cv-analyzer-index"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,  # Adjust based on your model's embedding dimension
        metric='cosine',  # Use cosine similarity for example
        spec=ServerlessSpec(
            # azure/eastus2 is rejected on the free plan with
            # 400 INVALID_ARGUMENT ("Your free plan does not support indexes
            # in the eastus2 region of azure"). Default to aws/us-east-1 and
            # let paid accounts override through the environment.
            cloud=os.environ.get("PINECONE_CLOUD", "aws"),
            region=os.environ.get("PINECONE_REGION", "us-east-1")
        )
    )

# Connect to the Pinecone index. The client method is `Index` (capital I);
# `pc.index` does not exist and raises AttributeError.
index = pc.Index(index_name)

# Function to get embeddings from Azure OpenAI
def get_embeddings_from_azure(text):
    """Return the embedding vector for ``text`` from Azure OpenAI.

    The request goes to an *embeddings* deployment: the body carries the
    text under ``"input"`` and the vector is read from
    ``data[0]["embedding"]`` in the response. A chat-completions call would
    return generated prose, which cannot be stored as vector values.

    Configuration is read from the environment:
        AZURE_OPENAI_EMBEDDINGS_URL: Full URL of the embeddings deployment,
            including the ``api-version`` query parameter.
        AZURE_OPENAI_API_KEY: Key for the Azure OpenAI resource.

    Args:
        text: The text to embed.

    Returns:
        The embedding as returned by the service (a list of floats).

    Raises:
        ValueError: If the URL or API key is not configured.
        Exception: If the service answers with a non-200 status code.
    """
    url = os.environ.get("AZURE_OPENAI_EMBEDDINGS_URL", "")
    api_key = os.environ.get("AZURE_OPENAI_API_KEY", "")
    if not url or not api_key:
        raise ValueError(
            "Set AZURE_OPENAI_EMBEDDINGS_URL and AZURE_OPENAI_API_KEY "
            "before requesting embeddings."
        )

    # Headers for authentication
    headers = {
        "Content-Type": "application/json",
        "api-key": api_key  # Use 'api-key' for Azure OpenAI
    }

    # Payload for the embeddings request
    data = {"input": text}

    # A timeout keeps one stalled request from hanging the whole upload run.
    response = requests.post(url, headers=headers, json=data, timeout=30)

    if response.status_code != 200:
        raise Exception(f"Error from Azure OpenAI API: {response.status_code}, {response.text}")

    return response.json()["data"][0]["embedding"]

# Walk the CV folder and collect the extracted text of each regular file,
# keyed by its file name. A file that cannot be processed is reported and
# skipped so the remaining files are still handled.
cv_folder = 'cv_folder'
cv_texts = {}

for filename in os.listdir(cv_folder):
    candidate = os.path.join(cv_folder, filename)
    if not os.path.isfile(candidate):
        # Sub-directories and other non-file entries are ignored.
        continue
    try:
        cv_texts[filename] = extract_text_from_file(candidate)
    except Exception as e:
        print(f"Error processing {filename}: {e}")

# Embed each extracted CV and store the vector in Pinecone under the file
# name. A failure for one CV is reported and the loop moves on to the next.
for filename, text in cv_texts.items():
    try:
        # One record per CV: the file name is the vector id.
        record = {
            "id": filename,
            "values": get_embeddings_from_azure(text),
        }
        # Upsert inserts the record or overwrites an existing one with the same id.
        index.upsert(vectors=[record])
        print(f"Successfully uploaded {filename} embeddings to Pinecone.")
    except Exception as e:
        print(f"Error uploading {filename} to Pinecone: {e}")

print("CV embeddings have been uploaded to Pinecone.")

PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'x-pinecone-api-version': '2025-01', 'x-cloud-trace-context': '2a48cdc53354a5ac14704d0bb3d5f85e', 'date': 'Wed, 19 Mar 2025 07:52:20 GMT', 'server': 'Google Frontend', 'Content-Length': '200', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"INVALID_ARGUMENT","message":"Bad request: Your free plan does not support indexes in the eastus2 region of azure. To create indexes in this region, upgrade your plan."},"status":400}
