<a href="https://colab.research.google.com/github/hent0mi/resume-scorer-colab-analysis/blob/main/ResumeAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')
# Optional: Verify the mount by listing your app folder
# !ls /content/drive/MyDrive/ResumeScorerApp

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install pandas scikit-learn nltk spacy PyPDF2 python-docx google-generativeai
!python -m spacy download en_core_web_sm # Download a small English model for spaCy

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m96.4 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
import json
import pandas as pd
from collections import Counter
import re # Import the regular expression module
import asyncio # For async API calls
import os
import PyPDF2
from docx import Document
import spacy
from google.colab import userdata
import google.generativeai as genai

# Install nest_asyncio if not already installed
!pip install nest_asyncio
import nest_asyncio
nest_asyncio.apply() # Apply this at the beginning of your script/notebook

# --- Existing setup from previous steps ---
# Load a small English language model for spaCy
nlp = spacy.load("en_core_web_sm")

# Configure Gemini API (assuming GEMINI_API_KEY is set in Colab Secrets)
api_key = userdata.get('GEMINI_API_KEY')
if api_key is None:
    raise ValueError("GEMINI_API_KEY secret not found or not enabled for this notebook.")
genai.configure(api_key=api_key)

# Define a broad list of common IT skills
common_skills = [
    # Core Programming & Data (Existing)
    "python", "java", "javascript", "c++", "c#", "go", "ruby", "php", "swift", "kotlin",
    "sql", "nosql", "mongodb", "postgresql", "mysql", "redis",
    "data analysis", "pandas", "numpy", "matplotlib", "seaborn",

    # Cloud & DevOps (Existing)
    "aws", "azure", "google cloud", "docker", "kubernetes", "terraform",
    "git", "jira", "agile", "scrum", "devops", "ci/cd",

    # General Business & Soft Skills (Existing)
    "project management", "leadership", "communication", "teamwork", "problem-solving",
    "microsoft office", "excel", "word", "powerpoint", "outlook",
    "customer service",

    # Web Development (Existing)
    "react", "angular", "vue.js", "node.js", "django", "flask", "spring", "laravel",
    "web development", "frontend", "backend", "fullstack", "api development", "restful apis",
    "ui/ux design", "figma", "sketch", "adobe xd",

    # Generative AI Specific Skills
    "generative ai", "genai", "artificial intelligence", "large language models", "llms", "ai models",
    "prompt engineering", "prompting", "ai ethics", "model evaluation", "fine-tuning",
    "retrieval augmented generation", "rag", "embeddings", "vector databases",

    # Process Optimization & Automation
    "process optimization", "business process automation", "bpa", "workflow automation",
    "process analysis", "lean six sigma", "efficiency improvement", "automation",
    "robotics process automation", "rpa", "business analysis",

    # Low-Code/No-Code Platforms
    "low-code", "no-code", "power platform", "microsoft power automate", "zapier", "make.com",
    "bubble.io", "webflow", "appian", "outsystems", "mendix", "salesforce flow",

    # Analytical & Troubleshooting
    "troubleshooting", "root cause analysis", "diagnostic skills", "analytical thinking",
    "data interpretation", "critical thinking", "problem diagnosis",

    # AI/ML Fundamentals (Expanded from generic ML)
    "machine learning", "deep learning", "nlp", "natural language processing", "computer vision",
    "tensorflow", "pytorch", "scikit-learn", "data science", "model deployment",
    "model monitoring", "feature engineering", "statistical analysis",

    # Communication & Training (for the analyst role)
    "training", "user training", "documentation", "technical writing", "presentations",
    "stakeholder management", "user adoption", "change management"
]

# Define base paths for Google Drive folders
DRIVE_BASE_PATH = '/content/drive/MyDrive/ResumeScorerApp/'
JD_FOLDER = os.path.join(DRIVE_BASE_PATH, 'JobDescriptions')
RESUMES_FOLDER = os.path.join(DRIVE_BASE_PATH, 'Resumes')
RESULTS_FOLDER = os.path.join(DRIVE_BASE_PATH, 'Results')
RESULTS_FILE_NAME = 'analysis_results.json' # Fixed name for easy retrieval by n8n

# --- HELPER FUNCTIONS ---

# File Reading Functions: Create functions to read content from PDF, DOCX, and TXT files (read_text_from_pdf, read_text_from_docx, read_file_content).
def read_text_from_pdf(filepath):
    """Extracts text from a PDF file."""
    text = ""
    try:
        with open(filepath, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page_num in range(len(reader.pages)):
                text += reader.pages[page_num].extract_text()
    except Exception as e:
        print(f"Error reading PDF {filepath}: {e}")
        text = "" # Return empty string on error
    return text

def read_text_from_docx(filepath):
    """Extracts text from a DOCX file."""
    text = ""
    try:
        doc = Document(filepath)
        full_text = []
        for para in doc.paragraphs:
            full_text.append(para.text)
        text = '\n'.join(full_text)
    except Exception as e:
        print(f"Error reading DOCX {filepath}: {e}")
        text = "" # Return empty string on error
    return text

def read_file_content(filepath):
    """Determines file type and calls appropriate reader function."""
    if filepath.endswith('.pdf'):
        return read_text_from_pdf(filepath)
    elif filepath.endswith('.docx'):
        return read_text_from_docx(filepath)
    elif filepath.endswith('.txt'):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                return f.read()
        except Exception as e:
            print(f"Error reading TXT {filepath}: {e}")
            return ""
    return "" # Return empty string for unsupported types

# Skill Extraction: spaCy will be used for basic keyword matching against a comprehensive list of skills. (extract_skills)
def extract_skills(text, skill_keywords_list):
    """Extracts skills from text based on a predefined keyword list."""
    if nlp is None: # Check if nlp model loaded successfully
        print("Warning: spaCy NLP model not loaded. Skill extraction will be limited.")
        return [] # Return empty list if NLP is not available

    doc = nlp(text.lower())
    found_skills = set()
    for skill in skill_keywords_list:
        if skill.lower() in doc.text:
            found_skills.add(skill)
    return list(found_skills)

#  Scoring Algorithm: A basic Jaccard similarity based on shared skills. (calculate_score)
def calculate_score(jd_skills, resume_skills):
    """Calculates a compatibility score based on shared skills."""
    if not jd_skills or not resume_skills:
        return 0.0 # No skills to compare
    jd_set = set(jd_skills)
    resume_set = set(resume_skills)
    intersection = len(jd_set.intersection(resume_set))
    union = len(jd_set.union(resume_set))
    return (intersection / union) * 100 if union > 0 else 0.0 # Score out of 100

# Resume Summarization: Use the Gemini API to generate concise summaries.(summarize_resume)
async def summarize_resume(resume_text, job_description_text):
    """Generates a concise summary of a resume relevant to a job description using Gemini."""
    model = genai.GenerativeModel('gemini-1.5-flash') # Use gemini-1.5-flash for speed and cost-effectiveness
    prompt = f"""Summarize the following resume in 2-3 concise sentences, focusing on skills, experience, and qualifications that are most relevant to the provided job description.

    Job Description:
    {job_description_text}

    Resume:
    {resume_text}
    """
    try:
        response = await model.generate_content_async(prompt)
        return response.text
    except Exception as e:
        print(f"Error summarizing resume with Gemini: {e}")
        return "Summary unavailable due to processing error."

# Extract Contact Information: Extracting personal information like name, email, and address (extract_contact_info)
def extract_contact_info(text):
    """
    Extracts candidate name, email, and a simplified address from resume text.
    This is a heuristic approach and may not be 100% accurate.
    """
    name = "N/A"
    email = "N/A"
    address = "N/A"

    # 1. Extract Email (most reliable with regex)
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    emails = re.findall(email_pattern, text)
    if emails:
        email = emails[0] # Take the first email found

    # 2. Extract Name (very challenging without advanced NLP)
    # This is a very basic heuristic: often the first few lines of a resume contain the name.
    # We'll try to find a capitalized phrase at the beginning.
    lines = text.strip().split('\n')
    if lines:
        first_line = lines[0].strip()
        # Look for a line that seems like a name (e.g., "John Doe" or "J. Doe")
        # This regex looks for 2-4 capitalized words, potentially with periods.
        name_pattern = r'^[A-Z][a-z]+(?: [A-Z][a-z]+){1,3}$|^[A-Z]\. [A-Z][a-z]+(?: [A-Z][a-z]+)?$'
        if re.match(name_pattern, first_line):
            name = first_line
        else:
            # Fallback: try to find a capitalized phrase in the first few lines
            for i in range(min(len(lines), 3)): # Check first 3 lines
                line = lines[i].strip()
                # Simple check for multiple capitalized words
                potential_name_parts = [word for word in line.split() if word and word[0].isupper()]
                if len(potential_name_parts) >= 2 and len(" ".join(potential_name_parts)) < 40: # Avoid long sentences
                    name = " ".join(potential_name_parts)
                    break # Found a plausible name, stop searching

    # 3. Extract Address (also challenging, highly variable formats)
    # This is a very simplified approach, looking for common address components.
    # It will likely extract only parts of an address or common city/state/zip patterns.
    address_keywords = ['street', 'avenue', 'road', 'lane', 'drive', 'boulevard',
                        'st', 'ave', 'rd', 'ln', 'dr', 'blvd',
                        'city', 'state', 'zip', 'postal code', 'united states', 'usa',
                        r'\d{5}(-\d{4})?', # US Zip code
                        r'[A-Z]{2}\s+\d{5}', # State Abbreviation + Zip
                        r'[A-Z][a-z]+,?\s+[A-Z]{2}\s+\d{5}' # City, State Zip
                       ]
    found_address_parts = []
    for keyword in address_keywords:
        if isinstance(keyword, str):
            if keyword.lower() in text.lower():
                # Find the first occurrence of a keyword and try to extract surrounding text
                match = re.search(r'(.{0,50}' + re.escape(keyword) + r'.{0,50})', text, re.IGNORECASE | re.DOTALL)
                if match:
                    found_address_parts.append(match.group(1).strip())
        else: # It's a regex pattern
            matches = re.findall(keyword, text)
            if matches:
                found_address_parts.extend(matches)

    if found_address_parts:
        # Try to combine unique parts, but this is still very rough
        address = ", ".join(list(set(found_address_parts)))
        # Further refine by trying to get a full line that contains an address component
        for line in lines:
            if any(re.search(re.escape(k), line, re.IGNORECASE) if isinstance(k, str) else re.search(k, line) for k in address_keywords):
                if len(line) < 150: # Avoid very long lines
                    address = line.strip()
                    break # Take the first plausible full line

    return {"name": name, "email": email, "address": address}


# --- Main Analysis Function ---
async def run_analysis(jd_file_name, resume_file_names):
    """
    Main function to run the resume analysis.
    Args:
        jd_file_name (str): The filename of the job description in the JobDescriptions folder.
        resume_file_names (list): A list of filenames of resumes in the Resumes folder.
    """
    print(f"Starting analysis for JD: {jd_file_name}, Resumes: {resume_file_names}")

    # 1. Read Job Description
    jd_path = os.path.join(JD_FOLDER, jd_file_name)
    jd_text = read_file_content(jd_path)
    if not jd_text:
        print(f"Error: Could not read job description from {jd_path}. Aborting analysis.")
        return {"error": "Could not read job description."}

    jd_skills = extract_skills(jd_text, common_skills)
    print(f"Extracted JD skills: {jd_skills}")

    candidates_data = []
    all_extracted_skills_overall = [] # To collect all skills for the general word cloud

    # 2. Process Each Resume
    for resume_name in resume_file_names:
        resume_path = os.path.join(RESUMES_FOLDER, resume_name)
        resume_text = read_file_content(resume_path)
        if not resume_text:
            print(f"Warning: Could not read resume from {resume_path}. Skipping.")
            continue

        # --- Extract Contact Info ---
        contact_info = extract_contact_info(resume_text)
        candidate_name = contact_info["name"]
        candidate_email = contact_info["email"]
        candidate_address = contact_info["address"]
        print(f"Extracted info for {resume_name}: Name='{candidate_name}', Email='{candidate_email}', Address='{candidate_address}'")


        resume_skills = extract_skills(resume_text, common_skills)
        score = calculate_score(jd_skills, resume_skills)
        summary = await summarize_resume(resume_text, jd_text)

        candidates_data.append({
            "resume_name": resume_name,
            "candidate_name": candidate_name, # Name
            "email": candidate_email,         # Email
            "address": candidate_address,     # Address
            "score": round(score, 2), # Round score for cleaner display
            "summary": summary,
            "skills": resume_skills # Skills extracted from this specific resume
        })
        all_extracted_skills_overall.extend(resume_skills) # Add to overall list

    # 3. Sort Candidates and Select Top 3
    candidates_data.sort(key=lambda x: x['score'], reverse=True)
    top_3_candidates = candidates_data[:3]
    print(f"Top 3 candidates: {top_3_candidates}")

    # 4. Prepare Data for Visualizations

    # For Word Cloud: Count frequencies of all extracted skills across all resumes
    skill_counts_for_wordcloud = Counter(all_extracted_skills_overall)
    word_cloud_data = [{"text": skill, "value": count} for skill, count in skill_counts_for_wordcloud.items()]
    print(f"Word cloud data generated.")

    # For Radar Chart: Skill presence for top 3 candidates
    radar_chart_data = []
    for candidate in top_3_candidates:
        candidate_skill_presence = {skill: 1 if skill in candidate['skills'] else 0 for skill in common_skills}
        radar_chart_data.append({
            "name": candidate['candidate_name'], # Use extracted name for radar chart
            "skills": candidate_skill_presence # Dictionary of skill: 0/1 presence
        })
    print(f"Radar chart data generated.")

    # 5. Compile and Save Results
    results = {
        "top_candidates": top_3_candidates,
        "skill_word_cloud": word_cloud_data,
        "radar_chart_data": radar_chart_data
    }

    results_file_path = os.path.join(RESULTS_FOLDER, RESULTS_FILE_NAME)
    os.makedirs(RESULTS_FOLDER, exist_ok=True) # Ensure results folder exists
    with open(results_file_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=4)
    print(f"Analysis complete and results saved to {results_file_path}")

    return results # Return results for potential direct use or logging


# --- Entry Point for n8n (unchanged, as it receives parameters from n8n) ---
# This block will be executed by n8n.
# n8n's Colab node allows you to pass parameters as key-value pairs.
# These parameters will be available as variables in the Colab environment.

# Ensure these variables are defined when n8n runs the notebook
# If running locally for testing, define them here:
if 'jd_file_name' not in locals():
    print("Running in local test mode. Please ensure 'jd_file_name' and 'resume_file_names' are defined.")
    # Example for local testing:
    # Create dummy files in your Google Drive folders for testing
    # with open(os.path.join(JD_FOLDER, 'example_job_description.txt'), 'w') as f:
    #     f.write('We are looking for a Python developer with strong data analysis skills and experience in machine learning.')
    # with open(os.path.join(RESUMES_FOLDER, 'example_resume_1.pdf'), 'w') as f:
    #     f.write('This is a dummy PDF content for Resume 1. Python, data analysis, machine learning experience. John Doe, john.doe@example.com, 123 Main St, Anytown, CA 90210.')
    # with open(os.path.join(RESUMES_FOLDER, 'example_resume_2.docx'), 'w') as f:
    #     f.write('This is a dummy DOCX content for Resume 2. Java developer, some Python. Jane Smith, jane.smith@email.net, 456 Oak Ave, Otherville, NY 10001.')
    jd_file_name = 'example_job_description.pdf'
    resume_file_names = ['example_resume_1.pdf', 'example_resume_2.pdf', 'example_resume_3.pdf', 'example_resume_4.pdf', 'example_resume_5.pdf', 'example_resume_6.pdf']


# Run the analysis
if 'jd_file_name' in locals() and 'resume_file_names' in locals():
    try:
        asyncio.run(run_analysis(jd_file_name, resume_file_names))
    except Exception as e:
        print(f"An error occurred during analysis execution: {e}")
else:
    print("Colab notebook executed without required parameters (jd_file_name, resume_file_names).")

Running in local test mode. Please ensure 'jd_file_name' and 'resume_file_names' are defined.
Starting analysis for JD: example_job_description.pdf, Resumes: ['example_resume_1.pdf', 'example_resume_2.pdf', 'example_resume_3.pdf', 'example_resume_4.pdf', 'example_resume_5.pdf', 'example_resume_6.pdf']
Extracted JD skills: ['excel', 'aws', 'low-code', 'documentation', 'git', 'azure', 'process optimization', 'java', 'rag', 'go', 'php', 'python', 'google cloud', 'artificial intelligence', 'ai models']
Extracted info for example_resume_1.pdf: Name='TejaswiSaiKumar Parepalli', Email='N/A', Address='Results -driven Data Scientist  with expertise in architecting scalable ML systems and developing production data'
Error summarizing resume with Gemini: object GenerateContentResponse can't be used in 'await' expression
Extracted info for example_resume_2.pdf: Name='VASANTHA LAKSHMI EDA', Email='vasanthaeda982@gmail.com', Address='I am a Data Analyst with 4 years of experience in transforming raw