<a href="https://colab.research.google.com/github/hent0mi/resume-scorer-colab-analysis/blob/main/ResumeAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')
# Optional: Verify the mount by listing your app folder
# !ls /content/drive/MyDrive/ResumeScorerApp

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Optional: Verify the mount by listing your app folder
!ls /content/drive/MyDrive/ResumeScorerApp

JobDescriptions  Results  Resumes  test.gdoc


In [3]:
!pip install pandas scikit-learn nltk spacy PyPDF2 python-docx google-generativeai
!python -m spacy download en_core_web_sm # Download a small English model for spaCy

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m71.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# !pip install google-generativeai

In [5]:
# Google Gemini API Setup:

import google.generativeai as genai
from google.colab import userdata # This library helps access Colab secrets

# Retrieve the API key from Colab Secrets
# The string 'GEMINI_API_KEY' must match the name you gave your secret in step 2.
api_key = userdata.get('GEMINI_API_KEY')

# Check if the API key was retrieved successfully (important for debugging)
if api_key is None:
    raise ValueError("GEMINI_API_KEY secret not found or not enabled for this notebook.")
else:
    print("Gemini API key loaded successfully.") # You can remove this line in production

# Configure the google-generativeai library with your API key
genai.configure(api_key=api_key)

# Optional: Test a simple interaction to verify the setup
try:
    model = genai.GenerativeModel('gemini-1.5-flash') # Using the faster flash model
    response = model.generate_content("Hello, Gemini!")
    print("Gemini says:", response.text)
except Exception as e:
    print(f"Error testing Gemini API: {e}")
    print("Please ensure your API key is correct and has the necessary permissions.")

Gemini API key loaded successfully.
Gemini says: Hello there!  How can I help you today?



In [6]:
# File Reading Functions: Create functions to read content from PDF, DOCX, and TXT files.

import os
import PyPDF2
from docx import Document

def read_text_from_pdf(filepath):
    """Extracts text from a PDF file."""
    text = ""
    try:
        with open(filepath, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page_num in range(len(reader.pages)):
                text += reader.pages[page_num].extract_text()
    except Exception as e:
        print(f"Error reading PDF {filepath}: {e}")
        text = "" # Return empty string on error
    return text

def read_text_from_docx(filepath):
    """Extracts text from a DOCX file."""
    text = ""
    try:
        doc = Document(filepath)
        full_text = []
        for para in doc.paragraphs:
            full_text.append(para.text)
        text = '\n'.join(full_text)
    except Exception as e:
        print(f"Error reading DOCX {filepath}: {e}")
        text = "" # Return empty string on error
    return text

def read_file_content(filepath):
    """Determines file type and calls appropriate reader function."""
    if filepath.endswith('.pdf'):
        return read_text_from_pdf(filepath)
    elif filepath.endswith('.docx'):
        return read_text_from_docx(filepath)
    elif filepath.endswith('.txt'):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                return f.read()
        except Exception as e:
            print(f"Error reading TXT {filepath}: {e}")
            return ""
    return "" # Return empty string for unsupported types

In [7]:
# Skill Extraction: Use spaCy for basic keyword matching. You'll need a comprehensive list of skills.

import spacy
# Load a small English language model
nlp = spacy.load("en_core_web_sm")

def extract_skills(text, skill_keywords_list):
    """Extracts skills from text based on a predefined keyword list."""
    doc = nlp(text.lower())
    found_skills = set()
    # Simple keyword matching: checks if a skill keyword is present in the text
    for skill in skill_keywords_list:
        if skill.lower() in doc.text:
            found_skills.add(skill)
    # For more advanced extraction, you could use spaCy's entity recognition
    # or train a custom NER model.
    return list(found_skills)

# Define a broad list of common skills. Expand this significantly for better results.
common_skills = [
    # Core Programming & Data (Existing)
    "python", "java", "javascript", "c++", "c#", "go", "ruby", "php", "swift", "kotlin",
    "sql", "nosql", "mongodb", "postgresql", "mysql", "redis",
    "data analysis", "pandas", "numpy", "matplotlib", "seaborn",

    # Cloud & DevOps (Existing)
    "aws", "azure", "google cloud", "docker", "kubernetes", "terraform",
    "git", "jira", "agile", "scrum", "devops", "ci/cd",

    # General Business & Soft Skills (Existing)
    "project management", "leadership", "communication", "teamwork", "problem-solving",
    "microsoft office", "excel", "word", "powerpoint", "outlook",
    "customer service",

    # Web Development (Existing)
    "react", "angular", "vue.js", "node.js", "django", "flask", "spring", "laravel",
    "web development", "frontend", "backend", "fullstack", "api development", "restful apis",
    "ui/ux design", "figma", "sketch", "adobe xd",

    # New: Generative AI Specific Skills
    "generative ai", "genai", "artificial intelligence" "large language models", "llms", "ai models",
    "prompt engineering", "prompting", "ai ethics", "model evaluation", "fine-tuning",
    "retrieval augmented generation", "rag", "embeddings", "vector databases",

    # New: Process Optimization & Automation
    "process optimization", "business process automation", "bpa", "workflow automation",
    "process analysis", "lean six sigma", "efficiency improvement", "automation",
    "robotics process automation", "rpa", "business analysis",

    # New: Low-Code/No-Code Platforms
    "low-code", "no-code", "power platform", "microsoft power automate", "zapier", "make.com",
    "bubble.io", "webflow", "appian", "outsystems", "mendix", "salesforce flow",

    # New: Analytical & Troubleshooting
    "troubleshooting", "root cause analysis", "diagnostic skills", "analytical thinking",
    "data interpretation", "critical thinking", "problem diagnosis",

    # New: AI/ML Fundamentals (Expanded from generic ML)
    "machine learning", "deep learning", "nlp", "natural language processing", "computer vision",
    "tensorflow", "pytorch", "scikit-learn", "data science", "model deployment",
    "model monitoring", "feature engineering", "statistical analysis",

    # New: Communication & Training (for the analyst role)
    "training", "user training", "documentation", "technical writing", "presentations",
    "stakeholder management", "user adoption", "change management"
]

In [8]:
# Scoring Algorithm: A basic Jaccard similarity based on shared skills.

def calculate_score(jd_skills, resume_skills):
    """Calculates a compatibility score based on shared skills."""
    if not jd_skills or not resume_skills:
        return 0.0 # No skills to compare
    jd_set = set(jd_skills)
    resume_set = set(resume_skills)
    intersection = len(jd_set.intersection(resume_set))
    union = len(jd_set.union(resume_set))
    return (intersection / union) * 100 if union > 0 else 0.0 # Score out of 100

In [17]:
# Resume Summarization (Google Gemini): Use the Gemini API to generate concise summaries.

# import asyncio # For async API calls - no longer needed for synchronous version

def summarize_resume(resume_text, job_description_text):
    """Generates a concise summary of a resume relevant to a job description using Gemini."""
    model = genai.GenerativeModel('gemini-1.5-flash') # Use gemini-1.5-flash for speed and cost-effectiveness
    prompt = f"""Summarize the following resume in 2-3 concise sentences, focusing on skills, experience, and qualifications that are most relevant to the provided job description.

    Job Description:
    {job_description_text}

    Resume:
    {resume_text}
    """
    try:
        # Use the synchronous generate_content instead of generate_content_async
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        print(f"Error summarizing resume with Gemini: {e}")
        return "Summary unavailable due to processing error."

In [18]:
# Main Analysis Function: This function will orchestrate the entire process, accept file names as input, and save the results.

import json
import pandas as pd
from collections import Counter # For word cloud skill counts
import asyncio # Keep asyncio import for potential future use if needed, but remove await from run_analysis

# Define base paths for Google Drive folders
DRIVE_BASE_PATH = '/content/drive/MyDrive/ResumeScorerApp/'
JD_FOLDER = os.path.join(DRIVE_BASE_PATH, 'JobDescriptions')
RESUMES_FOLDER = os.path.join(DRIVE_BASE_PATH, 'Resumes')
RESULTS_FOLDER = os.path.join(DRIVE_BASE_PATH, 'Results')
RESULTS_FILE_NAME = 'analysis_results.json' # Fixed name for easy retrieval by n8n

# Remove async from the function definition
def run_analysis(jd_file_name, resume_file_names):
    """
    Main function to run the resume analysis.
    Args:
        jd_file_name (str): The filename of the job description in the JobDescriptions folder.
        resume_file_names (list): A list of filenames of resumes in the Resumes folder.
    """
    print(f"Starting analysis for JD: {jd_file_name}, Resumes: {resume_file_names}")

    # 1. Read Job Description
    jd_path = os.path.join(JD_FOLDER, jd_file_name)
    jd_text = read_file_content(jd_path)
    if not jd_text:
        print(f"Error: Could not read job description from {jd_path}. Aborting analysis.")
        return {"error": "Could not read job description."}

    jd_skills = extract_skills(jd_text, common_skills)
    print(f"Extracted JD skills: {jd_skills}")

    candidates_data = []
    all_extracted_skills_overall = [] # To collect all skills for the general word cloud

    # 2. Process Each Resume
    for resume_name in resume_file_names:
        resume_path = os.path.join(RESUMES_FOLDER, resume_name)
        resume_text = read_file_content(resume_path)
        if not resume_text:
            print(f"Warning: Could not read resume from {resume_path}. Skipping.")
            continue

        resume_skills = extract_skills(resume_text, common_skills)
        score = calculate_score(jd_skills, resume_skills)
        # Call the synchronous summarize_resume function
        summary = summarize_resume(resume_text, jd_text)

        candidates_data.append({
            "resume_name": resume_name,
            "score": round(score, 2), # Round score for cleaner display
            "summary": summary,
            "skills": resume_skills # Skills extracted from this specific resume
        })
        all_extracted_skills_overall.extend(resume_skills) # Add to overall list

    # 3. Sort Candidates and Select Top 3
    candidates_data.sort(key=lambda x: x['score'], reverse=True)
    top_3_candidates = candidates_data[:3]
    print(f"Top 3 candidates: {top_3_candidates}")

    # 4. Prepare Data for Visualizations

    # For Word Cloud: Count frequencies of all extracted skills across all resumes
    skill_counts_for_wordcloud = Counter(all_extracted_skills_overall)
    # Convert to a list of objects for easier JS consumption if needed, or keep as dict
    # Example: [{"text": "python", "value": 10}, ...]
    word_cloud_data = [{"text": skill, "value": count} for skill, count in skill_counts_for_wordcloud.items()]
    print(f"Word cloud data generated.")

    # For Radar Chart: Skill presence for top 3 candidates
    radar_chart_data = []
    for candidate in top_3_candidates:
        candidate_skill_presence = {skill: 1 if skill in candidate['skills'] else 0 for skill in common_skills}
        radar_chart_data.append({
            "name": candidate['resume_name'],
            "skills": candidate_skill_presence # Dictionary of skill: 0/1 presence
        })
    print(f"Radar chart data generated.")

    # 5. Compile and Save Results
    results = {
        "top_candidates": top_3_candidates,
        "skill_word_cloud": word_cloud_data,
        "radar_chart_data": radar_chart_data
    }

    results_file_path = os.path.join(RESULTS_FOLDER, RESULTS_FILE_NAME)
    os.makedirs(RESULTS_FOLDER, exist_ok=True) # Ensure results folder exists
    with open(results_file_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=4)
    print(f"Analysis complete and results saved to {results_file_path}")

    return results # Return results for potential direct use or logging

In [19]:
# Entry Point for n8n: The n8n Google Colaboratory node allows you to pass parameters to your notebook.
# You'll need a way for your notebook to receive these. A common pattern is to use sys.argv for command-line arguments,
# but for Colab, parameters are often set as variables directly in a cell that n8n executes.

# This block will be executed by n8n.
# n8n's Colab node allows you to pass parameters as key-value pairs.
# These parameters will be available as variables in the Colab environment.
# Example: If n8n passes 'jd_file_name' and 'resume_file_names'
# you can access them directly here.

# --- Placeholder for n8n-injected parameters ---
# For local testing in Colab, you can uncomment and set these:
# jd_file_name = 'example_job_description.pdf' # Make sure this file exists in your Drive JD folder
# resume_file_names = ['example_resume_1.pdf', 'example_resume_2.pdf', 'example_resume_3.pdf', 'example_resume_4.pdf', 'example_resume_5.pdf', 'example_resume_6.pdf'] # Make sure these exist in your Drive Resumes folder
# -------------------------------------------------

# Ensure these variables are defined when n8n runs the notebook
# If running locally for testing, define them here:
if 'jd_file_name' not in locals():
    print("Running in local test mode. Please ensure 'jd_file_name' and 'resume_file_names' are defined.")
    # Example for local testing:
    # Create dummy files in your Google Drive folders for testing
    # with open(os.path.join(JD_FOLDER, 'example_job_description.txt'), 'w') as f:
    #     f.write('We are looking for a Python developer with strong data analysis skills and experience in machine learning.')
    # with open(os.path.join(RESUMES_FOLDER, 'example_resume_1.pdf'), 'w') as f:
    #     f.write('This is a dummy PDF content for Resume 1. Python, data analysis, machine learning experience.')
    # with open(os.path.join(RESUMES_FOLDER, 'example_resume_2.docx'), 'w') as f:
    #     f.write('This is a dummy DOCX content for Resume 2. Java developer, some Python.')
    jd_file_name = 'example_job_description.pdf'
    resume_file_names = ['example_resume_1.pdf', 'example_resume_2.pdf', 'example_resume_3.pdf', 'example_resume_4.pdf', 'example_resume_5.pdf', 'example_resume_6.pdf']


# Run the analysis
if 'jd_file_name' in locals() and 'resume_file_names' in locals():
    try:
        # Use await instead of asyncio.run() when running within Colab
        import nest_asyncio
        nest_asyncio.apply()
        await run_analysis(jd_file_name, resume_file_names)
    except Exception as e:
        print(f"An error occurred during analysis execution: {e}")
else:
    print("Colab notebook executed without required parameters (jd_file_name, resume_file_names).")

Starting analysis for JD: example_job_description.pdf, Resumes: ['example_resume_1.pdf', 'example_resume_2.pdf', 'example_resume_3.pdf', 'example_resume_4.pdf', 'example_resume_5.pdf', 'example_resume_6.pdf']
Extracted JD skills: ['php', 'go', 'google cloud', 'excel', 'rag', 'low-code', 'documentation', 'java', 'ai models', 'aws', 'git', 'azure', 'python', 'process optimization']
Top 3 candidates: [{'resume_name': 'example_resume_2.pdf', 'score': 29.03, 'summary': 'Vasantha Lakshmi EDA is a Data Analyst with 4+ years of experience leveraging Python, SQL, AWS, and various visualization tools (Tableau, Power BI) to create impactful dashboards and optimize data workflows.  Her expertise includes developing and deploying machine learning models (TensorFlow, scikit-learn), ETL processes, and data cleaning/processing of large datasets (over 10 million records).  Proven success in improving data accuracy, reducing operational costs, and enhancing decision-making across multiple business units