In [27]:
import os
import shutil
import docx2txt
import fitz
import openai
import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from docx import Document
from docx.oxml.ns import nsdecls
from docx.oxml import parse_xml
from PyPDF2 import PdfReader,PdfWriter
import numpy as np
from sentence_transformers import SentenceTransformer
import torch
from dotenv import load_dotenv
import json


In [13]:
# Load variables from a '.env' file, if one can be found in the current directory
# or one of its parents (handy for local development). Variables that are already
# set in the environment are not overridden by load_dotenv().
dotenv_loaded_successfully = load_dotenv()

print(
    "INFO: '.env' file found and loaded successfully."
    if dotenv_loaded_successfully
    else "INFO: No '.env' file found, or 'python-dotenv' is not installed. "
         "Will rely on system environment variables for 'OPENAI_API_KEY'."
)

# os.getenv() yields None when the variable is not set.
api_key_from_env = os.getenv("OPENAI_API_KEY")

if not api_key_from_env:
    # No key available: print setup instructions. Execution is deliberately allowed
    # to continue, so OpenAI API calls made by later cells will fail until the key
    # is configured. Raise here instead if the key is mandatory for every later step.
    print("\n".join((
        "\nCRITICAL ERROR: The 'OPENAI_API_KEY' environment variable is not set or not found.",
        "Please ensure you have an OpenAI API key and have set it either as a system environment variable",
        "or in a '.env' file in your project's root directory (e.g., OPENAI_API_KEY='sk-yourkey').",
        "If using a .env file, ensure 'python-dotenv' is installed and 'load_dotenv()' is called.",
        "WARNING: OpenAI API calls will fail until the API key is correctly configured.\n",
    )))
else:
    # Key found: hand it to the openai module. The key itself is never printed.
    openai.api_key = api_key_from_env
    print("INFO: OpenAI API key has been successfully set from an environment variable.")


INFO: '.env' file found and loaded successfully.
INFO: OpenAI API key has been successfully set from an environment variable.


In [14]:
# --- Path Definitions ---
# Input folders, relative to the notebook's working directory.
job_description_folder = "JD"
original_resumes_folder = "original_resume"
sample_folder = "sample_resume"

# Output location: a sub-folder named after today's ISO date (YYYY-MM-DD) inside
# the main results folder. It is created on first use and reused afterwards.
selected_resume_main_folder = "selected_resume"
destination_folder = os.path.join(
    selected_resume_main_folder,
    str(datetime.date.today()),  # str(date) is the same text as date.isoformat()
)
os.makedirs(destination_folder, exist_ok=True)

In [15]:
# Define file reading function
def read_file(path: str) -> str:
    """
    Extract the text content of a .pdf or .docx file.

    The extension check is case-insensitive, matching the way the callers in this
    notebook select files (``f.lower().endswith((".pdf", ".docx"))``), so a file
    such as ``Resume.PDF`` is read rather than reported as unsupported.

    Errors are reported with ``print`` and never raised.

    Args:
        path (str): The full path to the file.

    Returns:
        str: The extracted text. An empty string is returned when the file type is
             not supported or when any error occurs while reading.
    """
    text = ""  # Guarantees a string is returned on every path.
    is_pdf = False  # Set inside the try block; read by the error handler below.

    try:
        lowered_path = path.lower()
        is_pdf = lowered_path.endswith(".pdf")

        if is_pdf:
            # The 'with' statement closes the document even if a page fails to extract.
            with fitz.open(path) as doc:
                text = "\n".join(page.get_text() for page in doc)
        elif lowered_path.endswith(".docx"):
            text = docx2txt.process(path)
        else:
            print(f"Warning: Unsupported file type for: {os.path.basename(path)}. Only .pdf and .docx are currently processed.")

    except FileNotFoundError:
        print(f"Error: File not found at '{path}'")
    except Exception as e:
        # PyMuPDF-specific exception classes are deliberately not named in an
        # 'except' clause: such an expression is only evaluated while an error is
        # being handled, and if the attribute is missing in the installed PyMuPDF
        # the resulting AttributeError would escape this function. The message is
        # chosen from the file type instead.
        if is_pdf:
            print(f"Error processing PDF file '{path}': {e}")
        else:
            print(f"An unexpected error occurred while reading file '{path}': {e}")

    return text

In [16]:
# --- Cell: Read and Validate Job Description (JD) ---
#
# Locates the JD file and loads its text into 'jd_full_text'.
# Depends on 'job_description_folder' and 'read_file' from earlier cells.
# No exception is raised on failure: problems are printed and 'jd_full_text'
# stays empty, so any cell that uses it should first check that it is non-empty.

# Name of the JD file inside 'job_description_folder'; change it here if the file is renamed.
JOB_DESCRIPTION_FILENAME = "job_description.docx" # Or "JD_Edited.docx" if that's your active file

# Platform-independent full path to the JD file.
jd_path = os.path.join(job_description_folder, JOB_DESCRIPTION_FILENAME)

# Defined up front so the name exists even when every check below fails.
jd_full_text = ""

print(f"INFO: Attempting to read Job Description from: {os.path.abspath(jd_path)}")

if not os.path.exists(jd_path):
    # Nothing at that path at all.
    error_message = (f"CRITICAL ERROR: Job Description file not found at '{jd_path}'. "
                     f"Please ensure the file '{JOB_DESCRIPTION_FILENAME}' exists in the "
                     f"'{job_description_folder}' directory.")
    print(error_message)
elif not os.path.isfile(jd_path):
    # The path exists but is not a regular file (e.g. it is a directory).
    error_message = (f"CRITICAL ERROR: The path specified for the Job Description '{jd_path}' "
                     f"points to a directory, not a file. Please ensure '{JOB_DESCRIPTION_FILENAME}' "
                     f"is a file within the '{job_description_folder}' directory.")
    print(error_message)
else:
    # 'read_file' reports its own errors and returns "" on failure; the try/except
    # is only a safeguard against anything it lets through.
    try:
        jd_full_text = read_file(jd_path)
    except Exception as e:
        print(f"ERROR: An error occurred while calling read_file for '{jd_path}': {e}")
        jd_full_text = ""

    if not (jd_full_text and jd_full_text.strip()):
        # Empty or whitespace-only result: the file is empty, has no extractable
        # text, or 'read_file' failed.
        error_message = (f"CRITICAL ERROR: Job Description file '{jd_path}' was found, "
                         f"but no text could be extracted, or the file is empty. "
                         f"Please check the file content and the 'read_file' function.")
        print(error_message)
    else:
        print(f"INFO: Successfully read Job Description: '{JOB_DESCRIPTION_FILENAME}'")

# --- End of Job Description Reading Cell ---

INFO: Attempting to read Job Description from: c:\Users\harry\OneDrive\Desktop\AI RESUME SCANNER\AI_RESUME_SCANNER\JD\job_description.docx
INFO: Successfully read Job Description: 'job_description.docx'


In [28]:
# --- Cell: Process Sample Resumes to Generate Structured JSON and Extract Ideal Match Summaries ---

# This cell reads the sample resumes, sends each one to the OpenAI API with a prompt
# that asks for a structured JSON object (extracted info plus an ideal match summary),
# parses the returned JSON and keeps the "ideal_match_summary" text.
# The summaries are collected in 'sample_outputs'; per the prompt text below they are
# intended to be the input for the sample embedding.

# --- Prerequisites ---
# From earlier cells: 'sample_folder', 'jd_full_text', 'read_file'.
# Imported modules: 'os', 'openai', 'datetime', 'json'.
# 'client' is reused if it already exists; otherwise this cell creates it.

# --- Configuration for OpenAI API Call for Sample Resumes (JSON Output) ---
# IMPORTANT: Verify that the model ID is valid and accessible for your account.
OPENAI_MODEL_FOR_SAMPLES_JSON = "gpt-4.1-2025-04-14" # Replace with a model ID you can access.
# OPENAI_MODEL_FOR_SAMPLES_JSON = "gpt-3.5-turbo-0125" # Alternative for testing

OPENAI_TEMPERATURE_FOR_SAMPLES_JSON = 0.2 # Low temperature for more structured, deterministic JSON.
OPENAI_MAX_TOKENS_FOR_SAMPLES_JSON = 1000 # Upper bound on the response length; raise it if the JSON gets truncated.
                                       # This plus the prompt tokens must fit in the model's context window.

# Today's date (YYYY-MM-DD); interpolated into the system prompt as the end date for current roles.
current_date_for_ai_instruction = datetime.date.today().isoformat()

# System prompt for the sample resumes. It instructs the model to answer with a single
# JSON object that has the top-level keys "extracted_info" and "ideal_match_summary".
# It is an f-string: {current_date_for_ai_instruction} is filled in when this cell runs.
SYSTEM_MESSAGE_FOR_SAMPLES_JSON = f"""
You are an expert AI assistant specialized in identifying and deconstructing high-quality resumes for the travel industry.
You will be provided with a job description (JD) and a sample resume that is considered an EXCELLENT match for that role.
Your task is to extract key structured information from this excellent sample resume and then, based on this extracted information and the JD, generate a concise profile summary highlighting why it's an excellent match.

Please structure your ENTIRE output as a single, valid JSON object.
The JSON object MUST have the following top-level keys: "extracted_info", "ideal_match_summary".
Do NOT include any text outside of this JSON object (no greetings, no explanations, no ```json ... ``` markdown).

1.  "extracted_info":
    * This MUST be a JSON object containing structured details from the sample resume.
    * It MUST include keys like:
        * "job_history": (list of objects, each with "role", "company", "start_date", "end_date", "raw_tenure_text". Use "{current_date_for_ai_instruction}" for current roles' end_date). If no history, use [].
        * "skills": (list of strings) Key skills listed or inferred that are relevant to the JD.
        * "education": (list of strings) Relevant education entries.
        * "location_info": (object, with "city", "state", "country", "raw_location_text").
        * "quantifiable_achievements": (list of strings) Any quantifiable achievements mentioned.
    * For date fields ("start_date", "end_date"), attempt "YYYY-MM" or "YYYY-MM-DD". If ambiguous, use "Unknown".
    * If a field within "extracted_info" cannot be found, use "Not Specified" for strings or null for other types where appropriate, or an empty list [] for lists.

2.  "ideal_match_summary":
    * (string) Based SOLELY on the "extracted_info" from this sample resume AND the provided Job Description (JD), generate a concise (target 3-5 sentences, around 250-350 tokens) summary.
    * This summary should articulate the CORE reasons and key characteristics that make this specific sample resume an IDEAL benchmark candidate for the JD.
    * Focus on the most impactful alignments (e.g., "Possesses X+ years in luxury travel sales directly matching JD requirements, demonstrated by consistently exceeding targets. Key skills include advanced objection handling and CRM proficiency, vital for this role.")
    * This summary will be the primary text used for generating the 'sample_embedding'.
"""

# Result containers (reset every time this cell runs).
sample_paths_processed = [] # Paths of the sample files from which text could be read.
sample_texts_processed = [] # Extracted text of those files, index-aligned with 'sample_paths_processed'.
sample_outputs = []         # One entry per readable sample: its "ideal_match_summary", or an error/placeholder string on failure.
sample_extracted_json_data = [] # Optional: full parsed JSON per sample. Appended only when parsing succeeds, so it is NOT index-aligned with 'sample_outputs'.

# --- Ensure OpenAI Client is Initialized ---
# Reuse a 'client' left behind by an earlier cell; otherwise build one here.
# On any failure 'client' is set to None, which makes the processing stage below skip itself.
if 'client' not in locals() or client is None:
    try:
        client = openai.OpenAI()
        if not client.api_key:
            # openai.AuthenticationError is an API status error and, in SDK v1.x, cannot be
            # constructed from a message alone (it also needs the HTTP response), so the
            # plain base class openai.OpenAIError is raised to carry the intended message.
            raise openai.OpenAIError("OpenAI API key not found by client.")
        print("INFO: OpenAI client (re)initialized successfully for processing samples (JSON output).")
    except Exception as e:
        print(f"CRITICAL ERROR: Failed to initialize OpenAI client for samples (JSON output). Error: {e}")
        client = None

# --- Stage 1: Discover and Read Sample Resume Files ---
# Runs only when an OpenAI client is available. Stage 1 lists the .pdf/.docx files in
# 'sample_folder' and reads their text; Stage 2 sends each readable sample to the model
# and stores its "ideal_match_summary" (or an error/placeholder string) in 'sample_outputs'.
if client:
    print(f"INFO: Looking for sample resumes (for JSON processing) in directory: {os.path.abspath(sample_folder)}")
    try:
        found_sample_files = [f for f in os.listdir(sample_folder) if f.lower().endswith((".pdf", ".docx"))]

        if not found_sample_files:
            print(f"WARNING: No .pdf or .docx sample resumes found in '{sample_folder}'.")
        else:
            all_initial_sample_paths = [os.path.join(sample_folder, filename) for filename in found_sample_files]
            print(f"INFO: Found {len(all_initial_sample_paths)} potential sample resume file(s): {found_sample_files}")

            temp_texts_for_json_processing = []
            valid_paths_for_json_processing_samples = []

            # Keep only the files that yield non-blank text; paths and texts stay index-aligned.
            for path_to_file in all_initial_sample_paths:
                text_content = read_file(path_to_file)
                if text_content and text_content.strip():
                    temp_texts_for_json_processing.append(text_content)
                    valid_paths_for_json_processing_samples.append(path_to_file)
                else:
                    print(f"WARNING: Skipped (no text): {os.path.basename(path_to_file)} for JSON processing.")

            sample_texts_processed = temp_texts_for_json_processing
            sample_paths_processed = valid_paths_for_json_processing_samples

            if not sample_texts_processed:
                print(f"WARNING: No text extracted from any sample files. 'sample_outputs' will be empty.")
            else:
                # --- Stage 2: Generate Structured JSON and Extract Ideal Match Summary for Samples ---
                print(f"\nINFO: Generating structured JSON for {len(sample_texts_processed)} sample resume(s)...")

                for i, individual_sample_text in enumerate(sample_texts_processed):
                    current_sample_file = os.path.basename(sample_paths_processed[i])
                    print(f"INFO: Processing sample {i+1}/{len(sample_texts_processed)} ('{current_sample_file}') with OpenAI model '{OPENAI_MODEL_FOR_SAMPLES_JSON}' for JSON output...")

                    ai_response_json_str = None
                    parsed_json_output = None
                    ideal_summary_for_embedding = "" # Default to empty string

                    try:
                        response = client.chat.completions.create(
                            model=OPENAI_MODEL_FOR_SAMPLES_JSON,
                            messages=[
                                {"role": "system", "content": SYSTEM_MESSAGE_FOR_SAMPLES_JSON},
                                {"role": "user", "content": f"JOB DESCRIPTION:\n```\n{jd_full_text}\n```"},
                                {"role": "user", "content": f"SAMPLE RESUME TEXT TO ANALYZE AND STRUCTURE:\n```\n{individual_sample_text}\n```"}
                            ],
                            temperature=OPENAI_TEMPERATURE_FOR_SAMPLES_JSON,
                            max_tokens=OPENAI_MAX_TOKENS_FOR_SAMPLES_JSON,
                            # response_format={ "type": "json_object" } # For newer models that support strict JSON mode
                        )
                        ai_response_json_str = response.choices[0].message.content.strip()

                        # Attempt to parse the JSON string
                        try:
                            # The model may wrap its answer in a Markdown code fence despite the
                            # instructions. Remove an opening fence with or without the "json"
                            # tag, and a closing fence if present.
                            if ai_response_json_str.startswith("```"):
                                ai_response_json_str = ai_response_json_str[3:]
                                if ai_response_json_str[:4].lower() == "json":
                                    ai_response_json_str = ai_response_json_str[4:]
                                if ai_response_json_str.endswith("```"):
                                    ai_response_json_str = ai_response_json_str[:-3]
                            ai_response_json_str = ai_response_json_str.strip() # Ensure no leading/trailing whitespace

                            parsed_json_output = json.loads(ai_response_json_str)
                            ideal_summary_for_embedding = parsed_json_output.get("ideal_match_summary", "").strip()
                            if not ideal_summary_for_embedding:
                                print(f"WARNING: AI returned JSON for '{current_sample_file}', but 'ideal_match_summary' was empty or missing.")
                                ideal_summary_for_embedding = f"Error extracting ideal_match_summary for {current_sample_file}" # Placeholder

                            sample_extracted_json_data.append(parsed_json_output) # Optional: store full JSON

                        except json.JSONDecodeError as je:
                            print(f"ERROR (JSON Decode): Failed to parse AI's JSON response for sample '{current_sample_file}'. Error: {je}")
                            print(f"AI Raw Response was: {ai_response_json_str}")
                            ideal_summary_for_embedding = f"Error parsing JSON for {current_sample_file}" # Placeholder
                        except Exception as e_parse: # e.g. the JSON is not an object, or the summary is not a string
                            print(f"ERROR (Post-JSON Parse): Error processing parsed JSON for sample '{current_sample_file}'. Error: {e_parse}")
                            ideal_summary_for_embedding = f"Error processing parsed JSON for {current_sample_file}" # Placeholder

                    # Errors from the OpenAI API call itself. Each handler stores an error marker
                    # in place of the summary so 'sample_outputs' stays index-aligned with the
                    # sample paths. The specific classes are listed before openai.APIStatusError
                    # and the generic Exception fallback.
                    except openai.APIConnectionError as e:
                        print(f"ERROR (API Connection) for sample '{current_sample_file}': {e}")
                        ideal_summary_for_embedding = f"API_CONNECTION_ERROR_SAMPLE_JSON: {e}"
                    except openai.RateLimitError as e:
                        print(f"ERROR (Rate Limit / Quota): For sample '{current_sample_file}': {e}")
                        ideal_summary_for_embedding = f"API_RATE_LIMIT_ERROR_SAMPLE_JSON: {e}"
                    except openai.AuthenticationError as e:
                        print(f"ERROR (Authentication): For sample '{current_sample_file}': {e}")
                        ideal_summary_for_embedding = f"API_AUTHENTICATION_ERROR_SAMPLE_JSON: {e}"
                    except openai.NotFoundError as e:
                        print(f"ERROR (Not Found - e.g., Model ID): For sample '{current_sample_file}': {e}")
                        ideal_summary_for_embedding = f"API_NOT_FOUND_ERROR_SAMPLE_JSON: Model '{OPENAI_MODEL_FOR_SAMPLES_JSON}'?"
                    except openai.APIStatusError as e:
                        error_msg_detail = str(e)
                        try:
                            error_msg_detail = e.response.json().get('error', {}).get('message', str(e))
                        except Exception:
                            # Best effort only: the body was not JSON of the expected shape,
                            # so keep str(e) as the detail.
                            pass
                        print(f"ERROR (API Status): For sample '{current_sample_file}': HTTP Status {e.status_code}, Msg: {error_msg_detail}")
                        ideal_summary_for_embedding = f"API_STATUS_ERROR_SAMPLE_JSON: HTTP {e.status_code}"
                    except Exception as e:
                        print(f"ERROR (Unexpected) calling OpenAI for sample '{current_sample_file}': {type(e).__name__} - {e}")
                        ideal_summary_for_embedding = f"UNEXPECTED_PROCESSING_ERROR_SAMPLE_JSON: {e}"

                    sample_outputs.append(ideal_summary_for_embedding) # The extracted summary, or an error/placeholder string.

                print(f"INFO: Finished generating structured JSON and extracting summaries for sample resumes. 'sample_outputs' now has {len(sample_outputs)} items.")

    except FileNotFoundError:
        print(f"CRITICAL ERROR: 'sample_folder' ({os.path.abspath(sample_folder)}) not found.")
    except Exception as e:
        print(f"CRITICAL ERROR: Unexpected error during sample file discovery/reading: {type(e).__name__} - {e}")
else:
    print("CRITICAL ERROR: OpenAI client not initialized. Skipping processing of sample resumes for JSON output.")

INFO: Looking for sample resumes (for JSON processing) in directory: c:\Users\harry\OneDrive\Desktop\AI RESUME SCANNER\AI_RESUME_SCANNER\sample_resume
INFO: Found 5 potential sample resume file(s): ['Carlos A. Martinez.pdf', 'Emily S. Turner.pdf', 'John R. Williams.pdf', 'Linda J. Park.pdf', 'Michael D. Lee.pdf']

INFO: Generating structured JSON for 5 sample resume(s)...
INFO: Processing sample 1/5 ('Carlos A. Martinez.pdf') with OpenAI model 'gpt-4.1-2025-04-14' for JSON output...
INFO: Processing sample 2/5 ('Emily S. Turner.pdf') with OpenAI model 'gpt-4.1-2025-04-14' for JSON output...
INFO: Processing sample 3/5 ('John R. Williams.pdf') with OpenAI model 'gpt-4.1-2025-04-14' for JSON output...
INFO: Processing sample 4/5 ('Linda J. Park.pdf') with OpenAI model 'gpt-4.1-2025-04-14' for JSON output...
INFO: Processing sample 5/5 ('Michael D. Lee.pdf') with OpenAI model 'gpt-4.1-2025-04-14' for JSON output...
INFO: Finished generating structured JSON and extracting summaries for sam

In [29]:
# --- Optional: Verification (Adjust to show extracted summaries) ---
# Prints every entry of 'sample_outputs' next to the sample file it came from and
# flags the entries that are error markers or placeholders rather than real summaries.
INSPECT_EXTRACTED_SAMPLE_SUMMARIES = True # Set to True to print.

# The sample-processing cell stores two kinds of failure text in 'sample_outputs':
#   * API error markers, which all contain the upper-case word "ERROR"
#     (e.g. "API_CONNECTION_ERROR_SAMPLE_JSON: ...");
#   * JSON placeholders, which start with a capitalised "Error ..." and are therefore
#     NOT matched by a case-sensitive search for "ERROR".
# Both are checked below; the placeholders are matched by their exact prefixes so that
# a genuine summary that merely contains the word "error" is not flagged.
SAMPLE_SUMMARY_PLACEHOLDER_PREFIXES = (
    "Error extracting ideal_match_summary",
    "Error parsing JSON",
    "Error processing parsed JSON",
)

if INSPECT_EXTRACTED_SAMPLE_SUMMARIES and sample_outputs:
    print(f"\n--- Extracted 'ideal_match_summary' for Sample Resumes ({len(sample_outputs)} total) ---")
    for i, summary_text in enumerate(sample_outputs):
        # Guard against the two lists having different lengths.
        original_filename = os.path.basename(sample_paths_processed[i]) if i < len(sample_paths_processed) else "Unknown Original"
        print(f"\nFor Sample File: {original_filename} (Sample {i+1}/{len(sample_outputs)})")
        is_error_or_placeholder = (
            "ERROR" in summary_text
            or summary_text.startswith(SAMPLE_SUMMARY_PLACEHOLDER_PREFIXES)
            or not summary_text.strip()
        )
        if is_error_or_placeholder:
            print(f"   [AI Processing resulted in an error/placeholder or empty summary]: {summary_text}")
        else:
            print(f"Extracted Ideal Match Summary:\n{summary_text}")
        print("-" * 40)
    print("--- End of Sample Summaries Inspection ---")
elif INSPECT_EXTRACTED_SAMPLE_SUMMARIES:
    print("\nINFO: 'sample_outputs' is empty or inspection is disabled.")


--- Extracted 'ideal_match_summary' for Sample Resumes (5 total) ---

For Sample File: Carlos A. Martinez.pdf (Sample 1/5)
Extracted Ideal Match Summary:
Carlos A. Martinez is an exemplary fit for the Inbound Sales Consultant role at ABC Cruises, bringing over 8 years of luxury travel sales experience with a proven track record in remote, phone-based closing. His tenure includes consistently ranking #1 for customer conversion and upsell, closing more than 500 bookings, and developing sales scripts that increased close rates by 17%—demonstrating mastery in objection handling and exceeding sales KPIs, both of which are core requirements in the JD. Carlos’s expertise in CRM systems, pipeline management, and post-sale follow-up aligns perfectly with the need for accurate record-keeping and relationship building with high-end clients. His bilingual proficiency (English and Spanish) and background in hospitality management further enhance his ability to deliver personalized, high-touch serv

In [30]:
# --- Cell: Process Candidate Resumes for Structured Info Extraction and Preliminary Analysis (JSON Output) ---

# This cell reads candidate resumes from 'original_resumes_folder'.
# For each resume it is meant to call the OpenAI API with the system message
# 'SYSTEM_MESSAGE_FOR_CANDIDATES_JSON', which instructs the model to return a JSON object containing:
#   1. Extracted job history (for Python-based tenure calculation).
#   2. Extracted location information (for Python-based location check).
#   3. A brief qualitative JD alignment summary (to be used for embedding if the rules pass).
# NOTE(review): parsing of this JSON, the rule-based filtering (tenure, location) and the
# embedding step are described as happening in later cells - confirm against those cells.
# The lists this cell fills are 'raw_ai_json_outputs_for_candidates' (raw JSON strings or
# error markers) and 'candidate_paths_for_processing'.

# --- Prerequisites ---
# Expected variables from previous cells:
#   - 'original_resumes_folder': Path to the directory containing candidate resumes.
#   - 'jd_full_text': String containing the full text of the job description.
#   - 'read_file': Function to extract text from .pdf and .docx files.
#   - 'os', 'openai', 'datetime', 'json' modules imported.
#   - 'client': An initialized OpenAI client object for SDK v1.x.x (created below if missing).

# --- Configuration for OpenAI API Call for Candidate Resumes (JSON Output) ---
# IMPORTANT: Verify that the model ID is valid and accessible for your account.
OPENAI_MODEL_FOR_CANDIDATES_JSON = "gpt-4.1-2025-04-14" # Replace with a model ID you can access.
# OPENAI_MODEL_FOR_CANDIDATES_JSON = "gpt-3.5-turbo-0125" # Alternative for testing

OPENAI_TEMPERATURE_FOR_CANDIDATES_JSON = 0.1 # Very low temperature for structured, factual JSON output.
OPENAI_MAX_TOKENS_FOR_CANDIDATES_JSON = 1500 # Upper bound on the response length; detailed resumes produce long JSON.
                                           # Raise it if responses get truncated.
                                           # This plus the prompt tokens must fit in the model's context window.

# Today's date (YYYY-MM-DD), interpolated into the system prompt as the end date for current
# roles. The sample-resume cell assigns the same name; it is recomputed here so this cell
# does not depend on that one having run.
current_date_for_ai_instruction = datetime.date.today().isoformat()

# System Prompt instructing AI to output structured JSON for candidate resumes.
# This is the 'NEW_SYSTEM_MESSAGE_FOR_CANDIDATES' prompt discussed previously.
SYSTEM_MESSAGE_FOR_CANDIDATES_JSON = f"""
You are an expert AI assistant specialized in meticulous resume data extraction and preliminary analysis for the travel industry.
You will be provided with a job description (JD) and a candidate's resume text.
Your task is to extract specific pieces of information from the resume and provide a brief qualitative summary based on the JD.

Please structure your ENTIRE output as a single, valid JSON object.
The JSON object MUST have the following top-level keys: "job_history", "location_info", "jd_alignment_summary".
Do NOT include any text outside of this JSON object (no greetings, no explanations, no ```json ... ``` markdown).

1.  "job_history":
    * This MUST be a list of JSON objects (dictionaries).
    * For each distinct job position you can identify in the resume, create one JSON object.
    * Each job object MUST contain the following keys:
        * "role": (string) The job title or role as stated in the resume. If not found, use "Not Specified".
        * "company": (string) The company name as stated. If not found, use "Not Specified".
        * "start_date": (string) The start date of the role. Attempt to parse and provide in "YYYY-MM" format (e.g., "2019-03"). If only year is available, use "YYYY" (e.g., "2019"). If a more precise date like "YYYY-MM-DD" is clearly stated, use that. If a start date cannot be reliably determined, use "Unknown".
        * "end_date": (string) The end date of the role, following the same formatting rules as "start_date". If the role is clearly stated as current (e.g., "Present", "Current", "To Date", "Now"), use the exact string "{current_date_for_ai_instruction}" for this field. If an end date cannot be reliably determined, use "Unknown".
        * "raw_tenure_text": (string) The exact, verbatim text snippet from the resume that describes the dates or tenure for this specific role (e.g., "Jan 2020 - Present", "2018 to 2019", "3 years relevant experience"). If no such snippet is found for a role, use "Not Specified".
    * If no job history is found or discernible, "job_history" should be an empty list: [].

2.  "location_info":
    * This MUST be a JSON object (dictionary).
    * It MUST contain the following keys:
        * "city": (string or null) The candidate's primary city, if discernible from contact information or summary. If not found, use null.
        * "state": (string or null) The candidate's state or province, if discernible. If not found, use null.
        * "country": (string or null) The candidate's country. Attempt to determine if it is "USA". If clearly another country, state that country name. If ambiguous or not found, use null.
        * "raw_location_text": (string) The verbatim text snippet from the resume where primary location information was found. If not found, use "Not Specified".

3.  "jd_alignment_summary":
    * (string) Based on the provided Job Description (JD), provide a VERY BRIEF qualitative summary (strictly 1 to 3 sentences, target around 50-100 words) indicating the overall apparent alignment of the candidate's skills and experiences with the JD.
    * Focus on key aspects such as relevant industry experience, core skills mentioned in the JD, and overall suitability from a quick review.
    * If the resume seems completely irrelevant to the JD based on a high-level assessment, state that clearly and briefly.
    * Avoid making definitive hiring decisions; this is a preliminary alignment assessment. This summary will be used for further semantic matching if other criteria are met.
"""

# --- Outputs of this cell ---
# All three lists are index-aligned: entry i of each refers to the same candidate resume.
candidate_paths_for_processing = [] # Stores paths of successfully read candidate resumes.
candidate_texts_for_processing = [] # Stores text of successfully read candidate resumes.
# This list will store the RAW JSON strings (or error markers) returned by the AI for each candidate.
# Parsing and rule-based filtering will happen in a SUBSEQUENT cell.
raw_ai_json_outputs_for_candidates = [] 

# --- Ensure OpenAI Client is Initialized ---
# Reuse a client created by an earlier cell if there is one; otherwise build it here.
if 'client' not in locals() or client is None:
    try:
        client = openai.OpenAI()
        if not client.api_key:
            # Raise a plain ValueError: the SDK's status-error classes (AuthenticationError
            # included) require `response`/`body` keyword arguments, so constructing one from
            # just a message fails with a TypeError and hides the message below.
            raise ValueError("OpenAI API key not found by client.")
        print("INFO: OpenAI client (re)initialized successfully for processing candidate resumes (JSON output).")
    except Exception as e:
        print(f"CRITICAL ERROR: Failed to initialize OpenAI client for candidate resumes (JSON output). Error: {e}")
        client = None

# --- Stage 1: Discover and Read Candidate Resume Files ---
if client:
    print(f"INFO: Looking for candidate resumes (for JSON processing) in directory: {os.path.abspath(original_resumes_folder)}")
    try:
        # Only .pdf and .docx files are considered (extension match is case-insensitive).
        found_candidate_files_list = [f for f in os.listdir(original_resumes_folder) if f.lower().endswith((".pdf", ".docx"))]
        
        if not found_candidate_files_list:
            print(f"WARNING: No .pdf or .docx candidate resumes found in '{original_resumes_folder}'.")
        else:
            all_initial_candidate_paths = [os.path.join(original_resumes_folder, filename) for filename in found_candidate_files_list]
            print(f"INFO: Found {len(all_initial_candidate_paths)} potential candidate resume file(s): {found_candidate_files_list}")

            temp_texts_from_candidates = []
            valid_paths_for_candidate_processing = []

            # Keep only files that yield non-blank text; paths and texts are appended together
            # so the two lists stay index-aligned.
            for path_to_candidate_file in all_initial_candidate_paths:
                text_content = read_file(path_to_candidate_file)
                if text_content and text_content.strip():
                    temp_texts_from_candidates.append(text_content)
                    valid_paths_for_candidate_processing.append(path_to_candidate_file)
                else:
                    print(f"WARNING: Skipped (no text): {os.path.basename(path_to_candidate_file)} for candidate JSON processing.")
            
            candidate_texts_for_processing = temp_texts_from_candidates
            candidate_paths_for_processing = valid_paths_for_candidate_processing

            if not candidate_texts_for_processing:
                print(f"WARNING: No text extracted from any candidate files. 'raw_ai_json_outputs_for_candidates' will be empty.")
            else:
                # --- Stage 2: Generate Structured JSON for Successfully Read Candidate Resumes ---
                print(f"\nINFO: Generating structured JSON for {len(candidate_texts_for_processing)} candidate resume(s)...")
                
                for i, individual_candidate_text in enumerate(candidate_texts_for_processing):
                    current_candidate_file = os.path.basename(candidate_paths_for_processing[i])
                    print(f"INFO: Processing candidate {i+1}/{len(candidate_texts_for_processing)} ('{current_candidate_file}') with OpenAI model '{OPENAI_MODEL_FOR_CANDIDATES_JSON}' for JSON output...")
                    
                    # Exactly one string is appended per candidate (the AI answer or an error marker),
                    # which keeps the output list aligned with 'candidate_paths_for_processing'.
                    ai_response_str_for_candidate = f"ERROR_AI_CALL_NOT_ATTEMPTED_FOR_{current_candidate_file}" # Default error
                    try:
                        response = client.chat.completions.create(
                            model=OPENAI_MODEL_FOR_CANDIDATES_JSON,
                            messages=[
                                {"role": "system", "content": SYSTEM_MESSAGE_FOR_CANDIDATES_JSON},
                                {"role": "user", "content": f"JOB DESCRIPTION:\n```\n{jd_full_text}\n```"},
                                {"role": "user", "content": f"CANDIDATE RESUME TEXT TO ANALYZE AND STRUCTURE:\n```\n{individual_candidate_text}\n```"}
                            ],
                            temperature=OPENAI_TEMPERATURE_FOR_CANDIDATES_JSON,
                            max_tokens=OPENAI_MAX_TOKENS_FOR_CANDIDATES_JSON,
                            # response_format={ "type": "json_object" } # Add this if your model version supports strict JSON mode
                            # timeout=60 # Optional: Longer timeout for complex JSON generation
                        )
                        ai_response_str_for_candidate = response.choices[0].message.content.strip()
                        # Basic check if the response looks like JSON (starts with { ends with })
                        # More robust JSON validation will happen in the next cell during parsing.
                        if not (ai_response_str_for_candidate.startswith("{") and ai_response_str_for_candidate.endswith("}")):
                            print(f"WARNING: AI response for '{current_candidate_file}' does not look like a JSON object. Raw response: {ai_response_str_for_candidate[:200]}...")
                            # Still append it, parsing errors will be handled in the next cell.
                    
                    # Error handling for the OpenAI API call. The specific subclasses are listed
                    # before the more general APIStatusError / Exception handlers.
                    except openai.APIConnectionError as e:
                        print(f"ERROR (API Connection) for candidate '{current_candidate_file}': {e}")
                        ai_response_str_for_candidate = f"API_CONNECTION_ERROR_CANDIDATE: {e}"
                    except openai.RateLimitError as e:
                        print(f"ERROR (Rate Limit/Quota) for candidate '{current_candidate_file}': {e}")
                        ai_response_str_for_candidate = f"API_RATE_LIMIT_ERROR_CANDIDATE: {e}"
                    except openai.AuthenticationError as e:
                        print(f"ERROR (Authentication) for candidate '{current_candidate_file}': {e}")
                        ai_response_str_for_candidate = f"API_AUTHENTICATION_ERROR_CANDIDATE: {e}"
                    except openai.NotFoundError as e: 
                        print(f"ERROR (Not Found - e.g., Model ID) for candidate '{current_candidate_file}': {e}")
                        ai_response_str_for_candidate = f"API_NOT_FOUND_ERROR_CANDIDATE: Model '{OPENAI_MODEL_FOR_CANDIDATES_JSON}'?"
                    except openai.APIStatusError as e: 
                        error_msg_detail = str(e)
                        try:
                            error_msg_detail = e.response.json().get('error', {}).get('message', str(e))
                        except Exception:
                            # Best effort only: the error body may not be JSON or may have another
                            # shape. Keep str(e), but do not swallow KeyboardInterrupt/SystemExit
                            # the way a bare 'except:' would.
                            pass
                        print(f"ERROR (API Status) for candidate '{current_candidate_file}': HTTP Status {e.status_code}, Msg: {error_msg_detail}")
                        ai_response_str_for_candidate = f"API_STATUS_ERROR_CANDIDATE: HTTP {e.status_code}"
                    except Exception as e: 
                        print(f"ERROR (Unexpected) calling OpenAI for candidate '{current_candidate_file}': {type(e).__name__} - {e}")
                        ai_response_str_for_candidate = f"UNEXPECTED_PROCESSING_ERROR_CANDIDATE: {e}"
                    
                    raw_ai_json_outputs_for_candidates.append(ai_response_str_for_candidate)
                
                print(f"INFO: Finished generating raw JSON outputs for candidate resumes. 'raw_ai_json_outputs_for_candidates' now has {len(raw_ai_json_outputs_for_candidates)} items.")

    except FileNotFoundError:
        print(f"CRITICAL ERROR: 'original_resumes_folder' ({os.path.abspath(original_resumes_folder)}) not found.")
    except Exception as e:
        print(f"CRITICAL ERROR: Unexpected error during candidate file discovery/reading: {type(e).__name__} - {e}")
else:
    print("CRITICAL ERROR: OpenAI client not initialized. Skipping processing of candidate resumes for JSON output.")


INFO: Looking for candidate resumes (for JSON processing) in directory: c:\Users\harry\OneDrive\Desktop\AI RESUME SCANNER\AI_RESUME_SCANNER\original_resume
INFO: Found 21 potential candidate resume file(s): ['Alexandra N. White.pdf', 'Amanda B. Chen.pdf', 'Brian J. Walker.pdf', 'Chloe R. Johnson.pdf', 'Christopher N. Patel.pdf', 'David S. King.pdf', 'Derek J. Nelson.pdf', 'Elena R. Castillo.pdf', 'Jason T. Kim.pdf', 'Jessica L. Morgan.pdf', 'Kevin J. Stone.pdf', 'Lauren K. Smith.pdf', 'Mark L. Thompson.pdf', 'Morgan L. Wright.pdf', 'Omar S. Farouk.pdf', 'Rebecca D. Miller.pdf', 'Samantha Q. Evans.pdf', 'Samuel D. Reed.pdf', 'Sophia E. Lopez.pdf', 'Tyler M. Harris.pdf', 'Wei Lin.pdf']

INFO: Generating structured JSON for 21 candidate resume(s)...
INFO: Processing candidate 1/21 ('Alexandra N. White.pdf') with OpenAI model 'gpt-4.1-2025-04-14' for JSON output...
INFO: Processing candidate 2/21 ('Amanda B. Chen.pdf') with OpenAI model 'gpt-4.1-2025-04-14' for JSON output...
INFO: Process

In [31]:
# --- Optional: Verification of Raw AI JSON Outputs ---
# Debug switch: when True, dump every raw AI answer (or error marker) next to its source file name.
INSPECT_RAW_JSON_CANDIDATE_OUTPUTS = True

if INSPECT_RAW_JSON_CANDIDATE_OUTPUTS:
    if 'raw_ai_json_outputs_for_candidates' in locals() and raw_ai_json_outputs_for_candidates:
        print(f"\n--- Raw AI JSON Outputs for Candidate Resumes ({len(raw_ai_json_outputs_for_candidates)} total) ---")
        for i, json_str_output in enumerate(raw_ai_json_outputs_for_candidates):
            # Fall back to a placeholder name if the paths list is shorter than the outputs list.
            if i < len(candidate_paths_for_processing):
                original_filename = os.path.basename(candidate_paths_for_processing[i])
            else:
                original_filename = "Unknown Original"
            print(f"\nFor Candidate File: {original_filename} (Candidate {i+1}/{len(raw_ai_json_outputs_for_candidates)})")
            print(f"Raw AI JSON String (or error message):\n{json_str_output}")
            print("-" * 40)
        print("--- End of Raw AI JSON Output Inspection ---")
    else:
        # Reached when the outputs list is missing or empty while inspection is switched on.
        print("\nINFO: 'raw_ai_json_outputs_for_candidates' is empty or inspection is disabled.")


--- Raw AI JSON Outputs for Candidate Resumes (21 total) ---

For Candidate File: Alexandra N. White.pdf (Candidate 1/21)
Raw AI JSON String (or error message):
{
  "job_history": [
    {
      "role": "Sales Consultant",
      "company": "Vista Dream Vacations",
      "start_date": "2019-01",
      "end_date": "2025-05-18",
      "raw_tenure_text": "Jan 2019 – Present"
    },
    {
      "role": "Sales Coordinator",
      "company": "Sunrise Luxury Travel",
      "start_date": "2016-06",
      "end_date": "2018-12",
      "raw_tenure_text": "Jun 2016 – Dec 2018"
    }
  ],
  "location_info": {
    "city": "Charlotte",
    "state": "NC",
    "country": "USA",
    "raw_location_text": "Charlotte, NC"
  },
  "jd_alignment_summary": "The candidate demonstrates over 6 years of remote travel sales experience, including direct sales, KPI achievement, and pipeline management in the travel industry. Their background in high-volume sales, objection handling, and client retention aligns closely

In [49]:
# --- Cell: Parse AI's JSON Output, Apply Python Rules, and Prepare Data for Embedding ---

# This cell takes the raw JSON string outputs from the AI (for candidate resumes),
# parses them, applies hard-coded rules (like tenure and location) using Python logic,
# and then extracts the 'jd_alignment_summary' for those candidates who pass the rules.
# These extracted summaries are then prepared for the subsequent embedding step.

# --- Prerequisites ---
# Expected variables from the PREVIOUS cell:
#   - 'raw_ai_json_outputs_for_candidates': A list of strings. Each string is expected to be
#     a JSON object returned by the AI, or an error marker string if the API call failed.
#   - 'candidate_paths_for_processing': A list of strings, holding the original file paths
#     for the candidate resumes. This list MUST be synchronized with 'raw_ai_json_outputs_for_candidates'
#     in terms of order and length.
#
# Required imported modules (ensure these are imported in one of the first cells of your notebook):
#   - import json # For json.loads
#   - import datetime # For datetime, date, timedelta - used in tenure calculation
#   - import os # For os.path.basename - used in logging/error messages

# --- Configuration for Rule-Based Filtering ---
# Threshold in years for the candidate's AVERAGE tenure across "relevant" roles (see
# MINIMUM_TENURE_PER_ROLE_MONTHS); candidates whose average is below it are filtered out.
MINIMUM_AVERAGE_TENURE_YEARS = 1.0
# Country the AI-reported "country" field must equal; the comparison upper-cases both sides.
REQUIRED_COUNTRY = "USA"
# A role counts as "relevant" for the average above only if it lasted at least this many months;
# shorter roles still count toward the informational "all roles" average.
MINIMUM_TENURE_PER_ROLE_MONTHS = 6

# Helper function: Parses date strings from AI output (YYYY-MM, YYYY-MM-DD, YYYY) and handles "Unknown"
def parse_date_from_ai(date_str):
    """
    Parse a date value taken from the AI's JSON output.

    Accepted formats (after trimming surrounding whitespace):
      - "YYYY-MM-DD" -> that exact day
      - "YYYY-MM"    -> first day of that month
      - "YYYY"       -> January 1st of that year (a bare JSON number such as 2019 is accepted too)

    Args:
        date_str: The raw value of a "start_date"/"end_date" field. Normally a string, but the
            AI is not guaranteed to respect the schema, so any type is tolerated.

    Returns:
        A datetime.date, or None when the value is missing, is "Unknown"/"Not Specified"
        (case-insensitive), has an unexpected type, or cannot be parsed. Never raises.
    """
    # bool is a subclass of int; reject it explicitly so True/False are not read as years.
    if date_str is None or isinstance(date_str, bool):
        return None
    # A year emitted as a JSON number (2019) instead of a string ("2019").
    if isinstance(date_str, int):
        date_str = str(date_str)
    if not isinstance(date_str, str):
        return None

    # Format detection below is length-based, so stray whitespace must be removed first.
    date_str = date_str.strip()
    if not date_str or date_str.lower() in ("unknown", "not specified"):
        return None

    try:
        if len(date_str) == 10 and date_str.count('-') == 2: # YYYY-MM-DD
            return datetime.datetime.strptime(date_str, "%Y-%m-%d").date()
        if len(date_str) == 7 and date_str.count('-') == 1: # YYYY-MM
            # Assume first day of the month for calculation if only YYYY-MM is given
            return datetime.datetime.strptime(date_str + "-01", "%Y-%m-%d").date()
        if len(date_str) == 4 and date_str.isdigit(): # YYYY
            # Assume Jan 1st of the year if only YYYY is given
            return datetime.date(int(date_str), 1, 1)
    except ValueError:
        # Right shape but not a real date (e.g. month 13, year 0000).
        return None
    # Unrecognized format.
    return None

# Helper function: Calculates tenure in years for a single role
def calculate_single_role_tenure_years(start_date_obj, end_date_obj):
    """
    Return the length of one role in (fractional) years.

    The span is measured in whole days and divided by 365.25, so the result is an
    approximation rather than a calendar-exact year count.

    Returns 0.0 when either date is missing/falsy or when the end date precedes the start date.
    """
    if start_date_obj and end_date_obj and not (end_date_obj < start_date_obj):
        elapsed = end_date_obj - start_date_obj
        return elapsed.days / 365.25
    return 0.0

# Initialize output lists for THIS cell.
# These will be used as inputs for the NEXT cell (Embedding Generation).
# 'final_resume_summaries_for_embedding' and 'final_resume_paths_for_embedding' are index-aligned.
final_resume_summaries_for_embedding = []
final_resume_paths_for_embedding = []
# Optional: For tracking which resumes were filtered out and why (one dict per rejected candidate).
filtered_out_candidates_info = []

# --- Check if input lists are available and synchronized ---
if not ('raw_ai_json_outputs_for_candidates' in locals() and \
        'candidate_paths_for_processing' in locals() and \
        isinstance(raw_ai_json_outputs_for_candidates, list) and \
        isinstance(candidate_paths_for_processing, list) and \
        len(raw_ai_json_outputs_for_candidates) == len(candidate_paths_for_processing)):
    
    print("CRITICAL ERROR: Input lists ('raw_ai_json_outputs_for_candidates' or 'candidate_paths_for_processing') "
          "are missing, not lists, or their lengths do not match. Cannot proceed with filtering.")
    print(f"  Length of 'raw_ai_json_outputs_for_candidates' (if exists): "
          f"{len(raw_ai_json_outputs_for_candidates) if 'raw_ai_json_outputs_for_candidates' in locals() and isinstance(raw_ai_json_outputs_for_candidates, list) else 'Not a list or not defined'}")
    print(f"  Length of 'candidate_paths_for_processing' (if exists): "
          f"{len(candidate_paths_for_processing) if 'candidate_paths_for_processing' in locals() and isinstance(candidate_paths_for_processing, list) else 'Not a list or not defined'}")
else:
    print(f"INFO: Starting JSON parsing and rule-based filtering for {len(raw_ai_json_outputs_for_candidates)} AI outputs...")
    for i, raw_json_str in enumerate(raw_ai_json_outputs_for_candidates):
        original_path = candidate_paths_for_processing[i]
        original_filename = os.path.basename(original_path)
        candidate_passed_rules = False 
        extracted_summary_for_embedding = ""
        filter_reason = ""
        
        # Per-candidate state. Everything is reset on every iteration so that a candidate whose
        # AI output could not be parsed never inherits values computed for the previous one.
        total_tenure_months_relevant_roles = 0.0
        count_relevant_roles = 0
        total_tenure_months_all_roles = 0.0
        count_all_roles = 0
        average_tenure_relevant_roles_years = 0.0
        average_tenure_all_roles_years = 0.0
        is_usa_based = False # Default to False
        country = "Not detected" # Replaced only once the AI output has been parsed as JSON

        try:
            raw_json_stripped = raw_json_str.strip() if isinstance(raw_json_str, str) else ""
            looks_like_json_object = raw_json_stripped.startswith("{") and raw_json_stripped.endswith("}")

            # The JSON test must come BEFORE the error-marker test: a perfectly valid answer may
            # contain the word "error" (e.g. in the summary text), while the error markers written
            # by the previous cell never start with "{".
            if looks_like_json_object:
                ai_data = json.loads(raw_json_stripped) # Attempt to parse

                # --- Rule 1 & 2: Tenure Calculation (in Python) ---
                job_history = ai_data.get("job_history", [])
                
                if isinstance(job_history, list):
                    for job in job_history:
                        if isinstance(job, dict):
                            start_date_str = job.get("start_date")
                            end_date_str = job.get("end_date")
                            
                            start_date = parse_date_from_ai(start_date_str)
                            end_date = parse_date_from_ai(end_date_str)

                            # Roles with a missing/unparseable date or reversed dates are ignored.
                            if start_date and end_date and end_date >= start_date:
                                tenure_years_this_role = calculate_single_role_tenure_years(start_date, end_date)
                                tenure_months_this_role_float = tenure_years_this_role * 12.0
                                
                                # Accumulate all valid tenures
                                if tenure_months_this_role_float > 0: # Count as long as it's a valid tenure
                                    total_tenure_months_all_roles += tenure_months_this_role_float
                                    count_all_roles += 1
                                
                                # Accumulate "relevant" tenures (meeting minimum single role duration)
                                if tenure_months_this_role_float >= MINIMUM_TENURE_PER_ROLE_MONTHS:
                                    total_tenure_months_relevant_roles += tenure_months_this_role_float
                                    count_relevant_roles += 1
                
                if count_relevant_roles > 0:
                    average_tenure_relevant_roles_years = (total_tenure_months_relevant_roles / count_relevant_roles) / 12.0
                
                if count_all_roles > 0:
                    average_tenure_all_roles_years = (total_tenure_months_all_roles / count_all_roles) / 12.0
                
                # --- Rule 3: Location Check (in Python) ---
                location_info = ai_data.get("location_info", {})
                country = "Unknown" # Default when the AI gave null / blank / non-string country
                if isinstance(location_info, dict):
                    country_from_ai = location_info.get("country") 
                    if isinstance(country_from_ai, str) and country_from_ai.strip(): # Ensure country info is not empty
                        # Strip so that stray whitespace (" USA") does not fail the comparison below.
                        country = country_from_ai.strip()
                
                is_usa_based = (country.upper() == REQUIRED_COUNTRY.upper())

                # --- Rule 4: Filtering Decision (based on Python calculations) ---
                if not is_usa_based:
                    filter_reason = f"Candidate location not {REQUIRED_COUNTRY} (Detected country: {country})." 
                elif average_tenure_relevant_roles_years < MINIMUM_AVERAGE_TENURE_YEARS:
                    filter_reason = (f"Average tenure of relevant roles {average_tenure_relevant_roles_years:.2f} years (only roles >= {MINIMUM_TENURE_PER_ROLE_MONTHS} months counted) " 
                                     f"is less than required {MINIMUM_AVERAGE_TENURE_YEARS} year(s)." 
                                     f" (Average tenure of all roles: {average_tenure_all_roles_years:.2f} years)") 
                else:
                    candidate_passed_rules = True
                    # The AI may return null or a non-string here; treat that the same as "missing"
                    # instead of crashing on .strip().
                    summary_from_ai = ai_data.get("jd_alignment_summary")
                    summary_from_ai = summary_from_ai.strip() if isinstance(summary_from_ai, str) else ""
                    if summary_from_ai:
                        extracted_summary_for_embedding = summary_from_ai
                    else:
                        # Candidate passed rules, but AI didn't provide the summary in JSON.
                        print(f"WARNING: Candidate '{original_filename}' passed Python rules, but 'jd_alignment_summary' was missing or empty in AI's JSON output. Will be filtered out.") 
                        # If summary is crucial for subsequent steps, treat as filtered out even if rules passed
                        filter_reason = "Passed rules but 'jd_alignment_summary' was missing or empty in AI's JSON output." 
                        candidate_passed_rules = False 
            # Not JSON-shaped: is it one of the error markers produced by the previous cell?
            elif isinstance(raw_json_str, str) and "ERROR" in raw_json_str.upper():
                filter_reason = f"AI processing error in the previous step: {raw_json_str}" 
            else: # raw_json_str is not a string or does not look like JSON
                filter_reason = f"Invalid AI output format (not a valid JSON string). Snippet: {str(raw_json_str)[:100]}" 

        except json.JSONDecodeError as je:
            filter_reason = f"JSON Decode Error: {je}. Raw AI output snippet: {raw_json_str[:200]}..." 
        except Exception as e:
            filter_reason = f"Unexpected error processing JSON/rules for '{original_filename}'. Error: {type(e).__name__} - {e}" 

        # --- Populate output lists ---
        if candidate_passed_rules and extracted_summary_for_embedding:
            final_resume_summaries_for_embedding.append(extracted_summary_for_embedding)
            final_resume_paths_for_embedding.append(original_path)
        else:
            if not filter_reason: # If no specific reason was set before
                filter_reason = "Did not pass rule-based filtering or critical summary was missing." 
            
            # Add more detailed tenure and location info when printing filter message
            log_message = f"INFO: Filtering out '{original_filename}'. Reason: {filter_reason}" 
            if "tenure" in filter_reason.lower(): # If reason is tenure-related (check in lowercase for robustness)
                 log_message += (f" (Relevant roles average: {average_tenure_relevant_roles_years:.2f} years, " 
                                 f"All roles average: {average_tenure_all_roles_years:.2f} years)") 
            if "location" in filter_reason.lower() and not is_usa_based: # If reason is location-related and not USA based
                 log_message += f" (Determined location: {country})" 

            print(log_message)
            filtered_out_candidates_info.append({
                "path": original_path,
                "filename": original_filename,
                "reason": filter_reason,
                "avg_tenure_relevant_roles": average_tenure_relevant_roles_years,
                "avg_tenure_all_roles": average_tenure_all_roles_years,
                "is_usa_based": is_usa_based,
                "country_detected": country
            })

    print(f"\nINFO: JSON parsing and rule-based filtering completed.") 
    print(f"INFO: {len(final_resume_summaries_for_embedding)} candidate(s) passed all rules and have valid summaries prepared for embedding.") 
    print(f"INFO: {len(filtered_out_candidates_info)} candidate(s) were filtered out due to rules or processing/data errors.") 

# --- Optional: Verification of outputs of this cell ---
# Debug switch: when True, preview at most three accepted and three rejected candidates.
INSPECT_FILTERING_RESULTS = True

if INSPECT_FILTERING_RESULTS:
    # Part 1: candidates that passed the rules and have a summary ready for embedding.
    if not final_resume_summaries_for_embedding:
        print("\nINFO: No summaries prepared for embedding (list is empty).") 
    else:
        print(f"\n--- Summaries Prepared for Embedding ({len(final_resume_summaries_for_embedding)} total) ---") 
        # Only the leading entries are shown to keep the cell output short.
        for i, summary_text in enumerate(final_resume_summaries_for_embedding[:3]):
            original_filename = os.path.basename(final_resume_paths_for_embedding[i])
            print(f"\nFor Embedding - File: {original_filename} (Item {i+1})") 
            print(f"Summary to be embedded (snippet):\n{summary_text[:200]}...") 
            print("-" * 30)
        if len(final_resume_summaries_for_embedding) > 3:
            print(f"  ... and {len(final_resume_summaries_for_embedding) - 3} more summaries.") 

    # Part 2: candidates that were rejected, with the recorded reason and metrics.
    if not filtered_out_candidates_info:
        print("\nINFO: No candidates were filtered out by rules in this run (or list is empty).") 
    else:
        print(f"\n--- Candidates Filtered Out ({len(filtered_out_candidates_info)} total) ---") 
        for i in range(min(3, len(filtered_out_candidates_info))):
            filtered_info = filtered_out_candidates_info[i]
            print(f"\nFiltered Out - File: {filtered_info.get('filename', 'N/A')} (Item {i+1})") 
            print(f"  Reason: {filtered_info.get('reason', 'N/A')}") 
            print(f"  Average tenure relevant roles: {filtered_info.get('avg_tenure_relevant_roles', 0.0):.2f} years") 
            print(f"  Average tenure all roles: {filtered_info.get('avg_tenure_all_roles', 0.0):.2f} years") 
            print(f"  Is USA based: {'Yes' if filtered_info.get('is_usa_based') else 'No'}") 
            print(f"  Detected country: {filtered_info.get('country_detected', 'Not detected')}") 
            print("-" * 30)
        if len(filtered_out_candidates_info) > 3:
            print(f"  ... and {len(filtered_out_candidates_info) - 3} more filtered out candidates.") 

    print("--- End of Filtering Results Inspection ---") 


INFO: Starting JSON parsing and rule-based filtering for 21 AI outputs...
INFO: Filtering out 'Lauren K. Smith.pdf'. Reason: Average tenure of relevant roles 0.00 years (only roles >= 6 months counted) is less than required 1.0 year(s). (Average tenure of all roles: 0.00 years) (Relevant roles average: 0.00 years, All roles average: 0.00 years)

INFO: JSON parsing and rule-based filtering completed.
INFO: 20 candidate(s) passed all rules and have valid summaries prepared for embedding.
INFO: 1 candidate(s) were filtered out due to rules or processing/data errors.

--- Summaries Prepared for Embedding (20 total) ---

For Embedding - File: Alexandra N. White.pdf (Item 1)
Summary to be embedded (snippet):
The candidate demonstrates over 6 years of remote travel sales experience, including direct sales, KPI achievement, and pipeline management in the travel industry. Their background in high-volume sale...
------------------------------

For Embedding - File: Amanda B. Chen.pdf (Item 2)
Su

In [50]:
# --- Initialize the Sentence Transformer Model ---

# This cell loads a Sentence Transformer model and binds it to the notebook-level name 'model'.
# The model converts text into semantic vector embeddings, which are then used for calculating
# similarity between resume summaries and sample summaries.
# On failure nothing is raised: 'model' is set to None, so later cells must check for None.

# Current choice: 'all-mpnet-base-v2' - a general-purpose model from the sentence-transformers library.
# Author's rationale: it balances embedding quality with local inference speed and model size.
SENTENCE_TRANSFORMER_MODEL_NAME = 'all-mpnet-base-v2'
# Previous model tried: 'paraphrase-MiniLM-L6-v2' # This was the model used before switching to all-mpnet-base-v2

# Alternative approach for generating embeddings:
# OpenAI's Embedding API (e.g., "text-embedding-3-small" or "text-embedding-3-large")
# could also be used here. This would involve:
#   1. Making API calls to OpenAI for each text to be embedded.
#   2. Handling potential API costs and rate limits.
#   3. Benefits might include potentially higher quality embeddings from larger, regularly updated models,
#      and no need to download/manage local model files.
#   This project currently uses a local SentenceTransformer model for ease of setup and cost control.

try:
    print(f"INFO: Loading Sentence Transformer model: '{SENTENCE_TRANSFORMER_MODEL_NAME}'...")
    # 'SentenceTransformer' must already be imported from the sentence_transformers library.
    model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL_NAME)
    print(f"INFO: Sentence Transformer model '{SENTENCE_TRANSFORMER_MODEL_NAME}' loaded successfully.")
    
    # Optional: Specify device for the model (e.g., 'cuda' for GPU, 'cpu' for CPU)
    # model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL_NAME, device='cuda')
    
except Exception as e:
    # Deliberately broad: any loading problem is reported and converted into model = None
    # so the notebook keeps running.
    print(f"CRITICAL ERROR: Failed to load the Sentence Transformer model '{SENTENCE_TRANSFORMER_MODEL_NAME}'.")
    print(f"Error details: {e}")
    print("Please ensure the 'sentence-transformers' library and its dependencies (like PyTorch/TensorFlow) are installed correctly.")
    print("An internet connection might be required for the first-time model download.")
    model = None # Indicate model loading failure
    # Consider raising the exception or exiting if the model is essential for subsequent steps:
    # raise RuntimeError(f"Could not load Sentence Transformer model: {e}")

INFO: Loading Sentence Transformer model: 'all-mpnet-base-v2'...
INFO: Sentence Transformer model 'all-mpnet-base-v2' loaded successfully.


In [45]:
# Print the loaded model's repr to inspect its pipeline stages and their settings.
# Prints "None" if the loading cell failed, because that cell sets model = None on error.
print(model)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)


In [None]:
# # --- Initialize OpenAI Client (if not already done) ---
# # Make sure your API key is set, e.g., via environment variable
# # client = openai.OpenAI() # client.api_key will be read from OPENAI_API_KEY env var by default
# # Or if you had openai.api_key = "..." from old SDK style, you might need to adapt.
# # For this example, let's assume 'openai' module is configured with the key.

# # --- Configuration for OpenAI Embedding ---
# OPENAI_EMBEDDING_MODEL = "text-embedding-3-small" # Or "text-embedding-3-large", "text-embedding-ada-002"

# def get_openai_embeddings_batch(texts: list[str], model_name: str = OPENAI_EMBEDDING_MODEL) -> np.ndarray:
#     """
#     Gets embeddings for a batch of texts using OpenAI's API.
#     Handles potential errors and returns a NumPy array of embeddings.
#     """
#     if not texts:
#         return np.array([])
    
#     # OpenAI recommends replacing newlines with spaces for their embedding models
#     # and notes that models might perform worse on inputs with many newlines.
#     processed_texts = [text.replace("\n", " ") for text in texts]
    
#     try:
#         # Note: The exact call depends on the installed openai library version.
#         # The call below uses the legacy (pre-1.0) SDK interface, openai.Embedding.create(...).
#         # With the v1.x SDK, use client.embeddings.create(...) instead (see the commented lines below the call).
#         response = openai.Embedding.create(input=processed_texts, model=model_name) # Older SDK style
#         # For newer SDK (v1.0+):
#         # from openai import OpenAI
#         # client = OpenAI()
#         # response = client.embeddings.create(input=processed_texts, model=model_name)
        
#         # Extract embeddings from the response
#         # The structure of 'response' depends on the SDK version.
#         # For older versions (like 0.28.0), it's often like this:
#         embeddings = [item['embedding'] for item in response['data']]
#         return np.array(embeddings)
#     except openai.error.OpenAIError as e: # Catch specific OpenAI errors
#         print(f"ERROR: OpenAI API error while getting embeddings: {e}")
#         # Return an empty array or an array of zeros/NaNs matching expected shape if possible
#         return np.array([]) 
#     except Exception as e:
#         print(f"ERROR: An unexpected error occurred while getting OpenAI embeddings: {e}")
#         return np.array([])

# # --- Replace previous model.encode() calls ---

# # It's assumed that 'sample_outputs' and 'resume_outputs' (lists of strings)
# # have been generated by the OpenAI ChatCompletion API in previous cells.

# if sample_outputs:
#     print(f"INFO: Generating OpenAI embeddings for {len(sample_outputs)} sample outputs using model '{OPENAI_EMBEDDING_MODEL}'...")
#     sample_embeddings = get_openai_embeddings_batch(sample_outputs)
#     if sample_embeddings.size > 0:
#         print(f"INFO: Generated {sample_embeddings.shape[0]} OpenAI embeddings for sample outputs, each with dimension {sample_embeddings.shape[1]}.")
#     else:
#         print("WARNING: Failed to generate OpenAI embeddings for sample outputs or list was empty.")
# else:
#     print("WARNING: 'sample_outputs' list is empty. No sample embeddings will be generated.")
#     sample_embeddings = np.array([])

# if resume_outputs:
#     print(f"INFO: Generating OpenAI embeddings for {len(resume_outputs)} resume outputs using model '{OPENAI_EMBEDDING_MODEL}'...")
#     # Before sending resume_outputs to embedding, you might need to filter out
#     # those that are just "tenure too short" or "not from USA" messages,
#     # as embedding these messages might not be meaningful for similarity comparison.
#     # Example of filtering (you'll need to define what constitutes a "valid" summary):
#     # valid_resume_outputs_for_embedding = [
#     #     out for out in resume_outputs 
#     #     if not out.startswith("Candidate not based in USA") and \
#     #        not out.startswith("Average tenure less than 1 year") and \
#     #        not out.startswith("AI_PROCESSING_ERROR") # and other error markers
#     # ]
#     # print(f"INFO: Found {len(valid_resume_outputs_for_embedding)} valid resume outputs to embed out of {len(resume_outputs)} total.")
#     # if valid_resume_outputs_for_embedding:
#     #    resume_embeddings = get_openai_embeddings_batch(valid_resume_outputs_for_embedding)
#     #    # ... (rest of the logic, be mindful that resume_embeddings might now be shorter than resume_outputs/resume_paths)
#     # else:
#     #    resume_embeddings = np.array([])
#     # For simplicity in this direct replacement example, we'll encode all:
#     resume_embeddings = get_openai_embeddings_batch(resume_outputs)
#     if resume_embeddings.size > 0:
#         print(f"INFO: Generated {resume_embeddings.shape[0]} OpenAI embeddings for resume outputs, each with dimension {resume_embeddings.shape[1]}.")
#     else:
#         print("WARNING: Failed to generate OpenAI embeddings for resume outputs or list was empty.")
# else:
#     print("WARNING: 'resume_outputs' list is empty. No resume embeddings will be generated.")
#     resume_embeddings = np.array([])


In [51]:
# --- Cell: Generate Embeddings for Processed Summaries ---

# This cell converts two lists of AI-generated text summaries into dense vector
# embeddings using the loaded Sentence Transformer model:
#   - 'sample_outputs': summaries of the "good" sample resumes.
#   - 'final_resume_summaries_for_embedding': summaries of the candidate resumes
#     that passed the Python-based rule filtering.
# The results are stored in 'sample_embeddings' and 'resume_embeddings'.

# --- Prerequisites ---
# Expected variables from previous cells:
#   - 'model': A loaded SentenceTransformer model object (or None if loading failed).
#   - 'SENTENCE_TRANSFORMER_MODEL_NAME': Name of the loaded model (string, used for logging only).
#   - 'sample_outputs': A list of strings, one summary per sample resume.
#   - 'final_resume_summaries_for_embedding': A list of strings, one summary per
#     candidate resume that passed filtering.
#
# Required imported modules:
#   - 'numpy' (as np)


def _encode_summaries(encoder, texts, list_name, subject, generated_label, aborted_label, empty_warning):
    """Validate a list of summaries and encode it into embeddings.

    Args:
        encoder: Object exposing ``encode(texts, show_progress_bar=...)`` (the loaded model).
        texts: The value to encode; anything other than a non-empty list of strings is rejected.
        list_name: Name of the source variable, used in log messages.
        subject: Phrase describing what is being encoded (used in the "Encoding ..." message).
        generated_label: Phrase describing the result (used in the "Generated ..." message).
        aborted_label: Phrase used in the "Encoding aborted for ..." message.
        empty_warning: Full warning text printed when ``texts`` is missing, not a list, or empty.

    Returns:
        The array returned by ``encoder.encode`` on success; otherwise an empty NumPy array.
        Problems are reported with ``print`` rather than raised, so the cell always completes.
    """
    vectors = np.array([])

    if not (isinstance(texts, list) and len(texts) > 0):
        print(empty_warning)
        return vectors

    # Defensive check: the encoder is only ever handed plain strings.
    if not all(isinstance(text, str) for text in texts):
        print(f"ERROR: '{list_name}' contains non-string elements. Encoding aborted for {aborted_label}.")
        return vectors

    print(f"INFO: Encoding {len(texts)} AI-generated summaries for {subject}...")
    try:
        vectors = encoder.encode(texts, show_progress_bar=True)
        print(f"INFO: Generated {vectors.shape[0]} embeddings for {generated_label}, "
              f"each with dimension {vectors.shape[1]}.")
    except Exception as e:
        print(f"ERROR: Failed to encode '{list_name}'. Error: {e}")
    return vectors


# Initialize both outputs so they exist even if an input list is empty or the model is unavailable.
sample_embeddings = np.array([])
resume_embeddings = np.array([])

print(f"INFO: Preparing to generate embeddings using model '{SENTENCE_TRANSFORMER_MODEL_NAME}'.")

# globals().get(...) yields None for names that were never defined, so a skipped
# upstream cell is reported through the messages below instead of a NameError.
if globals().get('model') is not None:

    # --- Encode Sample Output Summaries ---
    sample_embeddings = _encode_summaries(
        model,
        globals().get('sample_outputs'),
        list_name="sample_outputs",
        subject="sample resumes",
        generated_label="sample outputs",
        aborted_label="samples",
        empty_warning=("WARNING: 'sample_outputs' list is empty or not valid. No sample embeddings will be generated. "
                       "This will critically affect scoring."),
    )

    # --- Encode Filtered Candidate Resume Summaries ---
    resume_embeddings = _encode_summaries(
        model,
        globals().get('final_resume_summaries_for_embedding'),
        list_name="final_resume_summaries_for_embedding",
        subject="filtered candidate resumes",
        generated_label="filtered candidate resume summaries",
        aborted_label="candidate resumes",
        empty_warning=("WARNING: 'final_resume_summaries_for_embedding' list is empty or not valid. "
                       "No candidate resume embeddings will be generated. This means no candidates passed prior filtering "
                       "or there were issues generating their summaries."),
    )

else:
    # The model from the previous cell is missing or None, so nothing can be encoded.
    print("CRITICAL ERROR: Sentence Transformer model ('model') is None. Cannot generate embeddings.")
    print("              Please ensure the model was loaded correctly in the preceding cell.")

# --- Optional: Verification of Embedding Shapes ---
# Useful during development to confirm embeddings were generated and have the expected dimensions.
INSPECT_EMBEDDING_SHAPES = True # Set to True to print shapes.

if INSPECT_EMBEDDING_SHAPES:
    for _label, _vectors in (("sample_embeddings", sample_embeddings),
                             ("resume_embeddings", resume_embeddings)):
        if isinstance(_vectors, np.ndarray) and _vectors.size > 0:
            print(f"DEBUG: Shape of {_label}: {_vectors.shape}")
        else:
            print(f"DEBUG: '{_label}' is empty or was not generated.")

INFO: Preparing to generate embeddings using model 'all-mpnet-base-v2'.
INFO: Encoding 5 AI-generated summaries for sample resumes...


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.29it/s]


INFO: Generated 5 embeddings for sample outputs, each with dimension 768.
INFO: Encoding 20 AI-generated summaries for filtered candidate resumes...


Batches: 100%|██████████| 1/1 [00:00<00:00,  2.59it/s]

INFO: Generated 20 embeddings for filtered candidate resume summaries, each with dimension 768.
DEBUG: Shape of sample_embeddings: (5, 768)
DEBUG: Shape of resume_embeddings: (20, 768)





In [53]:
# --- Cell: Calculate Comprehensive Similarity Scores for Candidate Resumes ---

# This cell calculates a "best match" similarity score for each candidate resume
# against all available sample resume profiles. It populates a list called
# 'processed_candidate_data' with one scoring record (dict) per candidate.

# --- Prerequisites ---
# Expected variables from previous cells:
#   - 'model': The loaded SentenceTransformer model object (only checked for existence).
#   - 'sample_embeddings': 2-D NumPy array of embeddings for 'sample_outputs'.
#     Its row count and order MUST correspond to 'sample_paths_processed'.
#   - 'sample_paths_processed': List of file paths for the original sample resumes.
#   - 'resume_embeddings': 2-D NumPy array of embeddings for 'final_resume_summaries_for_embedding'.
#     Its row count and order MUST correspond to 'final_resume_paths_for_embedding'.
#   - 'final_resume_paths_for_embedding': List of file paths for candidate resumes that passed filtering.
#
# Required imported modules/functions:
#   - 'cosine_similarity' from 'sklearn.metrics.pairwise'
#   - 'os' (for os.path.basename)
#   - 'np' (NumPy) module.

# Initialize the list to store processed candidate data
processed_candidate_data = []

# Default score information for a candidate if scoring cannot be performed properly
DEFAULT_SCORE_INFO_FOR_CANDIDATE = {
    "best_score": 0.0,
    "best_matching_sample_path": "N/A",
    "best_matching_sample_filename": "N/A"
}


def _default_candidate_record(candidate_path):
    """Build a scoring record that carries the default (no match) score for one candidate path."""
    return {
        "candidate_path": candidate_path,
        "candidate_filename": os.path.basename(candidate_path),
        **DEFAULT_SCORE_INFO_FOR_CANDIDATE
    }


def _embeddings_match_paths(embeddings, paths):
    """Return True when `embeddings` is a non-empty 2-D ndarray with one row per entry of the list `paths`."""
    return (
        isinstance(embeddings, np.ndarray) and embeddings.ndim == 2 and embeddings.size > 0 and
        isinstance(paths, list) and
        len(embeddings) == len(paths)
    )


# --- Sanity Checks for Inputs ---
# globals().get(...) yields None for names that were never defined, so a skipped
# upstream cell is reported through the messages below instead of a NameError.
model_loaded_successfully = globals().get('model') is not None

# Check sample embeddings and their corresponding paths
sample_data_valid = _embeddings_match_paths(
    globals().get('sample_embeddings'), globals().get('sample_paths_processed')
)

# Check candidate embeddings and their corresponding paths
candidate_data_valid = _embeddings_match_paths(
    globals().get('resume_embeddings'), globals().get('final_resume_paths_for_embedding')
)

if not model_loaded_successfully:
    print("CRITICAL ERROR: Sentence Transformer model ('model') is not loaded. Scoring aborted.")
elif not sample_data_valid:
    print("CRITICAL ERROR: 'sample_embeddings' is invalid, empty, or not synchronized with 'sample_paths_processed'. "
          "Cannot perform meaningful scoring. All candidates will receive default scores if candidate data is present.")
    # Without usable samples there is nothing to compare against: give every known candidate the default score.
    if isinstance(globals().get('final_resume_paths_for_embedding'), list):
        processed_candidate_data = [
            _default_candidate_record(path) for path in final_resume_paths_for_embedding
        ]
elif not candidate_data_valid:
    print("WARNING: 'resume_embeddings' is invalid, empty, or not synchronized with 'final_resume_paths_for_embedding'. "
          "No candidate resumes to score. 'processed_candidate_data' will be empty.")
else:
    # Proceed with scoring if all checks pass
    print(f"INFO: Starting similarity score calculation for {len(final_resume_paths_for_embedding)} candidate resume(s) "
          f"against {len(sample_paths_processed)} sample resume profile(s)...")

    try:
        # A single pairwise call returns the full (n_candidates, n_samples) similarity matrix,
        # replacing one cosine_similarity call per (candidate, sample) pair.
        similarity_matrix = cosine_similarity(resume_embeddings, sample_embeddings)
    except Exception as e:
        print(f"ERROR: Cosine similarity calculation failed. "
              f"All candidates will receive default scores. Error: {e}")
        similarity_matrix = None

    if similarity_matrix is None:
        # A failed computation must not look like a real 0.0 match against some sample,
        # so every candidate gets the explicit "N/A" default record.
        processed_candidate_data = [
            _default_candidate_record(path) for path in final_resume_paths_for_embedding
        ]
    else:
        # argmax returns the first index of the row maximum, so ties resolve to the earliest sample.
        best_sample_indices = similarity_matrix.argmax(axis=1)
        for candidate_original_path, candidate_scores, best_sample_idx in zip(
                final_resume_paths_for_embedding, similarity_matrix, best_sample_indices):
            best_sample_path = sample_paths_processed[int(best_sample_idx)]
            processed_candidate_data.append({
                "candidate_path": candidate_original_path,
                "candidate_filename": os.path.basename(candidate_original_path),
                "best_score": candidate_scores[best_sample_idx],
                "best_matching_sample_path": best_sample_path,
                "best_matching_sample_filename": os.path.basename(best_sample_path)
            })

    print(f"INFO: Finished calculating similarity scores. "
          f"Generated detailed scoring data for {len(processed_candidate_data)} candidate resume(s).")

# --- Optional: Verification of 'processed_candidate_data' ---
INSPECT_PROCESSED_CANDIDATE_DATA_OUTPUT = True # Set to True for debugging

if INSPECT_PROCESSED_CANDIDATE_DATA_OUTPUT:
    if processed_candidate_data:
        # Prints every entry, not just a sample.
        print(f"\n--- Detailed Scores for All Processed Candidates ({len(processed_candidate_data)} Total) ---")
        for i, data_item in enumerate(processed_candidate_data):
            print(f"\nCandidate File: {data_item.get('candidate_filename', 'N/A')} (Item {i+1})")
            print(f"  Best Score: {data_item.get('best_score', 0.0):.4f}") # Format score to 4 decimal places
            print(f"  Best Matching Sample: {data_item.get('best_matching_sample_filename', 'N/A')}")
            print("-" * 40) # Separator for readability
        print(f"--- End of Detailed Scores for All Processed Candidates ---")
    else:
        print("\nINFO: 'processed_candidate_data' is empty. Nothing to inspect from scoring.")




INFO: Starting similarity score calculation for 20 candidate resume(s) against 5 sample resume profile(s)...
INFO: Finished calculating similarity scores. Generated detailed scoring data for 20 candidate resume(s).

--- Detailed Scores for All Processed Candidates (20 Total) ---

Candidate File: Alexandra N. White.pdf (Item 1)
  Best Score: 0.9225
  Best Matching Sample: Michael D. Lee.pdf
----------------------------------------

Candidate File: Amanda B. Chen.pdf (Item 2)
  Best Score: 0.8806
  Best Matching Sample: Emily S. Turner.pdf
----------------------------------------

Candidate File: Brian J. Walker.pdf (Item 3)
  Best Score: 0.8776
  Best Matching Sample: Emily S. Turner.pdf
----------------------------------------

Candidate File: Chloe R. Johnson.pdf (Item 4)
  Best Score: 0.7327
  Best Matching Sample: Emily S. Turner.pdf
----------------------------------------

Candidate File: Christopher N. Patel.pdf (Item 5)
  Best Score: 0.8687
  Best Matching Sample: Emily S. Turne

In [58]:
# --- Cell: Filter Candidate Resumes and Copy Selected Ones with Enriched Filenames ---

# This cell takes the 'processed_candidate_data' list (generated by the previous
# "scoring" cell) and filters candidates based on their 'best_score'.
# For selected candidates, it constructs a new filename that includes the score
# and an identifier of the best matching sample resume. It then copies the original
# candidate resume file to the 'destination_folder' using this new filename.

# --- Prerequisites ---
# Expected variables from previous cells:
#   - 'processed_candidate_data': A list of dictionaries. Each dictionary must contain:
#       - 'candidate_path': Full path to the original candidate resume file.
#       - 'candidate_filename': The original filename of the candidate resume.
#       - 'best_score': The calculated best similarity score (float, e.g., 0.0 to 1.0).
#       - 'best_matching_sample_filename': The filename of the sample resume that yielded the best_score.
#   - 'destination_folder': The path to the directory where selected resumes will be copied
#                           (e.g., "AI_RESUME_SCANNER/selected_resume/YYYY-MM-DD").
#     This folder should have been created by 'os.makedirs(destination_folder, exist_ok=True)' previously.
#
# Required imported modules:
#   - 'os' (for os.path.join, os.path.splitext, os.path.exists, os.path.abspath)
#   - 'shutil' (for shutil.copy)

# --- Configuration for Selection and Naming ---
# Minimum 'best_score' (cosine similarity, higher is better) a candidate needs to be selected.
# Adjust based on your scoring scale and desired selectivity.
SELECTION_THRESHOLD = 0.88

# Number of candidates whose 'best_score' met the threshold.
selected_resumes_count = 0
# Number of selected candidates whose file was actually copied. Tracked separately so the
# final summary never reports a copy that failed or was skipped because the source was missing.
copied_resumes_count = 0

# Keys every scoring record must provide before it can be processed.
_REQUIRED_CANDIDATE_KEYS = ("candidate_path",
                            "candidate_filename",
                            "best_score",
                            "best_matching_sample_filename")

print(f"\nINFO: Starting selection and copying of candidate resumes to '{os.path.abspath(destination_folder)}'.")
print(f"INFO: Selection threshold for 'best_score' is >= {SELECTION_THRESHOLD:.2f} (i.e., {SELECTION_THRESHOLD*100:.0f} on a 0-100 scale for filename).")

# globals().get(...) yields None when the scoring cell was never run, which the
# first branch below reports instead of raising a NameError.
_scored_candidates = globals().get('processed_candidate_data')

if not isinstance(_scored_candidates, list):
    print("ERROR: 'processed_candidate_data' list is not defined or not a list. "
          "Cannot perform resume selection and copying.")
elif not _scored_candidates:
    print("INFO: 'processed_candidate_data' is empty. "
          "No candidate resumes were scored in the previous step, so no files to select or copy.")
else:
    for candidate_info in _scored_candidates:

        # Validate the structure of the record before reading from it.
        if not isinstance(candidate_info, dict) or \
           not all(key in candidate_info for key in _REQUIRED_CANDIDATE_KEYS):
            print(f"WARNING: Skipping an invalid or incomplete data item in 'processed_candidate_data': {candidate_info}")
            continue

        original_candidate_file_path = candidate_info["candidate_path"]
        original_candidate_filename = candidate_info["candidate_filename"]
        candidate_best_score = candidate_info["best_score"]
        matched_sample_filename_raw = candidate_info["best_matching_sample_filename"]

        # Written as "not (score >= threshold)" so a NaN score is skipped rather than selected.
        if not (candidate_best_score >= SELECTION_THRESHOLD):
            continue
        selected_resumes_count += 1

        # --- Construct the new, informative filename ---
        # Score as a 0-100 integer string (truncated) for readability in the filename.
        score_for_filename = str(int(candidate_best_score * 100))

        # Identifier of the matched sample: its filename without extension,
        # or a placeholder when no sample was matched ('N/A').
        matched_sample_identifier = "NoMatchingSample"
        if matched_sample_filename_raw != "N/A" and isinstance(matched_sample_filename_raw, str):
            matched_sample_identifier = os.path.splitext(matched_sample_filename_raw)[0]

        # New filename format: "{Score}_matchedWith_{SampleNameIdentifier}_{OriginalCandidateFilename}"
        # Example: "75_matchedWith_SampleResumeA_JohnDoe_Resume.pdf"
        new_filename = f"{score_for_filename}_matchedWith_{matched_sample_identifier}_{original_candidate_filename}"
        destination_file_path = os.path.join(destination_folder, new_filename)

        # --- Copy the original resume file to the destination with the new name ---
        try:
            if os.path.exists(original_candidate_file_path):
                shutil.copy(original_candidate_file_path, destination_file_path)
                copied_resumes_count += 1  # Count only after the copy has actually succeeded.
            else:
                print(f"ERROR: Source resume file not found, cannot copy: '{original_candidate_file_path}' "
                      f"for new file '{new_filename}'.")
        except Exception as e:
            print(f"ERROR: Could not copy file '{original_candidate_filename}' to '{destination_file_path}'. "
                  f"Error: {e}")

    # --- Final Summary ---
    if selected_resumes_count == 0:
        print(f"INFO: Finished selection process. No candidate resumes met the selection threshold "
              f"of >= {SELECTION_THRESHOLD:.2f} (score {SELECTION_THRESHOLD*100:.0f}).")
    elif copied_resumes_count == selected_resumes_count:
        print(f"INFO: Finished selection process. Successfully copied {copied_resumes_count} resume(s) "
              f"to '{os.path.abspath(destination_folder)}'.")
    else:
        print(f"WARNING: Finished selection process. {selected_resumes_count} resume(s) met the selection threshold, "
              f"but only {copied_resumes_count} could be copied to '{os.path.abspath(destination_folder)}'. "
              f"See the errors above.")

# --- End of File Selection, Renaming, and Copying Cell ---


INFO: Starting selection and copying of candidate resumes to 'c:\Users\harry\OneDrive\Desktop\AI RESUME SCANNER\AI_RESUME_SCANNER\selected_resume\2025-05-18'.
INFO: Selection threshold for 'best_score' is >= 0.88 (i.e., 88 on a 0-100 scale for filename).
INFO: Finished selection process. Successfully copied 5 resume(s) to 'c:\Users\harry\OneDrive\Desktop\AI RESUME SCANNER\AI_RESUME_SCANNER\selected_resume\2025-05-18'.
