In [None]:
import os
import PyPDF2
from tqdm import tqdm
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# --- Extract Resume Texts ---
# Root folder that is walked recursively (os.walk) for resume PDFs; the path is
# relative to the notebook's working directory.
RESUME_FOLDER = "data/data"


In [None]:
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page of the PDF at ``file_path``.

    Best-effort: any exception raised while opening or parsing the file is
    printed, and whatever text was collected up to that point (possibly "")
    is returned instead of raising.
    """
    collected = ""
    try:
        with open(file_path, 'rb') as handle:
            pages = PyPDF2.PdfReader(handle).pages
            for page in pages:
                # `or ""` keeps the concatenation valid when a page yields nothing.
                collected = collected + (page.extract_text() or "")
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
    return collected


In [None]:
resume_texts = []
resume_files = []

# os.walk yields entries in arbitrary, filesystem-dependent order. Sorting the
# paths makes the row order of df_resumes (and therefore the seeded train/test
# split further down) reproducible across machines and runs.
all_files = sorted(
    os.path.join(root, file)
    for root, dirs, files in os.walk(RESUME_FOLDER)
    for file in files if file.lower().endswith(".pdf")
)

for file_path in tqdm(all_files, desc="Extracting PDFs"):
    text = extract_text_from_pdf(file_path)
    # Skip files with empty/whitespace-only text; extract_text_from_pdf also
    # returns "" after a read error, so failed files are dropped here too.
    if text.strip():
        resume_texts.append(text)
        resume_files.append(file_path)

# One row per readable resume: source path plus extracted text.
df_resumes = pd.DataFrame({"file": resume_files, "text": resume_texts})
print(f"Loaded {len(df_resumes)} resumes")

In [None]:
# --- Sample Job Description ---
# Free-text posting that every resume is compared against in the TF-IDF cells below.
job_description = """
Looking for a Data Scientist with Python, Machine Learning,
Deep Learning, and NLP experience.
"""

In [None]:
# --- TF-IDF Vectorization ---
# The job description is placed at row 0 and the resumes follow in df_resumes
# row order, so a single fitted vocabulary covers both.
vectorizer = TfidfVectorizer(stop_words='english')
all_texts = [job_description] + df_resumes['text'].tolist()
tfidf_matrix = vectorizer.fit_transform(all_texts)


In [None]:
# --- Compute similarity scores ---
# Row 0 (job description) is compared with rows 1.. (the resumes); flatten()
# turns the result into a 1-D array that lines up with df_resumes' rows.
similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
df_resumes['score'] = similarity_scores

In [None]:
# Preview only the first rows: the frame carries the full text of every resume,
# so rendering all of it bloats the notebook output.
df_resumes.head()

In [None]:
# --- Optional: Train a model to predict scores ---
# NOTE(review): the target (similarity_scores) was computed from these same
# TF-IDF rows, so the regressor only approximates a value that is already known
# exactly — confirm this cell is meant purely as a demonstration.
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix[1:], similarity_scores, test_size=0.2, random_state=42)
regressor = RandomForestRegressor(n_estimators=100, random_state=42)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred):.4f}")

In [None]:
# --- Rank resumes ---
# Highest TF-IDF similarity first; only the five best rows are printed.
ranked_resumes = df_resumes.sort_values(by='score', ascending=False)
print("\nTop 5 resumes:")
top_resumes = ranked_resumes.head(5)
for path, score_value in zip(top_resumes['file'], top_resumes['score']):
    print(f"{path} — Score: {score_value:.2f}")

In [None]:
def calculate_teacher_score(resume_text):
    """Return a heuristic teacher-fit score for ``resume_text``.

    The score is a weighted sum of three parts:
      * skills (weight 0.4): fraction of six teaching keywords found as
        case-insensitive substrings;
      * experience (weight 0.4): sum of every "<n> year(s)" mention divided
        by 20 and capped at 1;
      * education (weight 0.2): fraction of five education keywords found.
    """
    # Kept local: `re` is not among the notebook's top-of-file imports.
    import re

    lowered = resume_text.lower()

    def keyword_fraction(keywords):
        # Share of `keywords` that occur somewhere in the lowercased resume.
        hits = [kw for kw in keywords if kw.lower() in lowered]
        return len(hits) / len(keywords)

    # --- Skills match ---
    skills_score = keyword_fraction(
        ["teaching", "classroom management", "lesson planning", "curriculum", "education", "pedagogy"]
    )

    # --- Experience match ---
    total_years = sum(map(int, re.findall(r'(\d+)\s+years?', lowered)))
    exp_score = min(total_years / 20, 1)  # normalize to 0-1, max 20 years

    # --- Education match ---
    edu_score = keyword_fraction(
        ["bachelor", "master", "phd", "education degree", "teaching certification"]
    )

    # --- Weighted final score ---
    return 0.4*skills_score + 0.4*exp_score + 0.2*edu_score

# Score every resume with the keyword heuristic, then reorder best-first.
# Note that this rebinds df_resumes to the sorted frame.
df_resumes['teacher_score'] = df_resumes['text'].apply(calculate_teacher_score)
df_resumes = df_resumes.sort_values(by='teacher_score', ascending=False)

print("\nTop 5 Teacher resumes:")
top_teachers = df_resumes.head(5)
for path, teacher_score in zip(top_teachers['file'], top_teachers['teacher_score']):
    print(f"{path} — Teacher Score: {teacher_score:.2f}")


In [None]:
from sentence_transformers import SentenceTransformer, util
import PyPDF2

# --- Extract text from a single PDF ---
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page of the PDF at ``file_path``.

    Best-effort: any exception raised while opening or parsing the file is
    printed, and the text gathered so far (possibly "") is returned.
    """
    collected = ""
    try:
        with open(file_path, 'rb') as handle:
            pages = PyPDF2.PdfReader(handle).pages
            for page in pages:
                # `or ""` keeps the concatenation valid when a page yields nothing.
                collected = collected + (page.extract_text() or "")
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
    return collected


# --- Load pre-trained transformer ---
model = SentenceTransformer("all-MiniLM-L6-v2")

# --- Job description + Resume ---
resume_file = "data/data/TEACHER/90363254.pdf"
# resume_text is "" if the PDF could not be read (extract_text_from_pdf prints
# the error and returns what it has instead of raising).
resume_text = extract_text_from_pdf(resume_file)

# NOTE: rebinds job_description, replacing the Data Scientist posting that the
# TF-IDF cells above were built from.
job_description = """
Looking for a Teacher with strong classroom management, lesson planning,
and education background. Teaching certification is a plus.
"""

# --- Create embeddings ---
# Both texts are encoded in one call; index 0 is the job, index 1 the resume.
embeddings = model.encode([job_description, resume_text], convert_to_tensor=True)

# --- Cosine similarity ---
similarity = util.cos_sim(embeddings[0], embeddings[1]).item()

print(f"Resume: {resume_file}")
print(f"Deep NLP Similarity Score: {similarity:.2f}")


In [None]:
import os
import PyPDF2
from sentence_transformers import SentenceTransformer, util

# --- Load model (PyTorch only) ---
# Re-creates the same "all-MiniLM-L6-v2" encoder that an earlier cell already bound to `model`.
model = SentenceTransformer('all-MiniLM-L6-v2')  # small, fast, good for semantic similarity

# --- Function: Extract text from a PDF ---
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at ``pdf_path``.

    Handles failures the same way as the earlier definitions of this function
    in the notebook (which this one shadows): an exception while opening or
    parsing the file is printed and the text gathered so far (possibly "") is
    returned. Without this, one unreadable PDF aborts the whole batch loop.
    """
    text = ""
    try:
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            for page in reader.pages:
                # `or ""` keeps the concatenation valid when a page yields nothing.
                text += page.extract_text() or ""
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
    return text

# --- Job description (your target text) ---
job_description = """
We are seeking a passionate and creative Arts Teacher to inspire students through visual and performing arts. 
The ideal candidate will have strong skills in painting, drawing, sculpture, or digital media, along with experience 
in art history and contemporary practices. Responsibilities include developing engaging lesson plans, encouraging 
creative expression, and helping students build technical and conceptual skills. Strong classroom management, 
curriculum planning, and the ability to foster an inclusive and motivating environment are essential.

"""

# --- Encode job description once ---
job_embedding = model.encode(job_description, convert_to_tensor=True)

# --- Folder containing resumes ---
resume_folder = "data/data/ARTS"   # change to your folder path

# --- Loop through all PDFs in the folder ---
results = []
# sorted(): os.listdir returns names in arbitrary order; a fixed order keeps the
# ranking of equal-scoring resumes stable between runs (list.sort is stable).
for file_name in sorted(os.listdir(resume_folder)):
    # Case-insensitive extension check, matching the os.walk loader earlier in
    # the notebook, so "CV.PDF" is not silently skipped.
    if not file_name.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(resume_folder, file_name)
    resume_text = extract_text_from_pdf(pdf_path)

    # A PDF that produced no text would still be embedded and ranked on an
    # empty string; leave it out, as the TF-IDF loader does.
    if not resume_text.strip():
        continue

    # Encode resume text
    resume_embedding = model.encode(resume_text, convert_to_tensor=True)

    # Compute cosine similarity (same helper the single-resume cell uses)
    score = util.cos_sim(job_embedding, resume_embedding).item()

    results.append((file_name, score))

# --- Sort results by score (highest first) ---
results.sort(key=lambda x: x[1], reverse=True)

# --- Print top matches ---
print("Top matching resumes:")
for file_name, score in results[:5]:
    print(f"{file_name} — Score: {score:.2f}")


In [None]:
!pip uninstall tensorflow tensorflow-macos tensorflow-metal -y


In [None]:
# Hand-written sample resume used as plain-text test input.
# NOTE(review): no other cell visible in this notebook references sample_text.
sample_text = """
John Doe
123 Main Street, New York, NY 10001
john.doe@example.com | (123) 456-7890

Career Objective:
Motivated data scientist with a strong background in computer science and statistics, seeking to contribute to innovative data projects in a dynamic organization.

Education:
Bachelor of Science in Computer Science, NYU, 2018 – 2022
High School Diploma, Lincoln High School, 2014 – 2018

Skills:
Python, Java, SQL, Data Analysis, Machine Learning, Deep Learning, TensorFlow, PyTorch, Excel, HTML, CSS, JavaScript

Professional Experience:
Data Science Intern – DataWorks Inc. (June 2021 – August 2021)
- Built predictive models using Python and scikit-learn
- Cleaned and analyzed large datasets with pandas and NumPy
- Collaborated with cross-functional teams to deliver insights

Junior Data Analyst – Insight Analytics (July 2022 – Present)
- Developed dashboards in Power BI and Tableau
- Automated ETL pipelines using Python scripts
- Conducted A/B tests to optimize marketing strategies

Certifications:
Google Data Analytics Certificate
AWS Certified Data Practitioner

Projects:
- Customer Churn Prediction using Logistic Regression
- Movie Recommendation System with Collaborative Filtering

Languages:
English (Fluent), Spanish (Conversational)

"""

In [None]:
import re
import spacy
from nltk.corpus import stopwords
import pdfplumber  # <-- new import

# Load English NLP model
# NOTE(review): neither `nlp` nor `stop_words` is used by the functions defined in this cell.
nlp = spacy.load('en_core_web_sm')
stop_words = set(stopwords.words('english'))

# Load PDF and extract text
def load_resume_text(file_path):
    """Return the text of the PDF at ``file_path``, one newline after each page.

    Pages for which pdfplumber returns no text are skipped. Errors raised
    while opening or parsing the file propagate to the caller.
    """
    with pdfplumber.open(file_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    return "".join(chunk + "\n" for chunk in page_texts if chunk)

# Keywords
# All entries are lowercase; the extractors below compare them against
# lowercased resume text.
# NOTE(review): very short entries such as 'bs' and 'ms' are also substrings of
# ordinary words ("jobs", "systems"), so matching must respect word boundaries.
EDU_KEYWORDS = [
    'bachelor', 'master', 'phd', 'high school', 'secondary school',
    'bs', 'ms', 'b.sc', 'm.sc', 'mba', 'b.e', 'm.e', 'm.tech', 'b.tech'
]

# Skills looked up in resume text by extract_skills().
SKILL_KEYWORDS = [
    'python', 'java', 'c++', 'machine learning', 'deep learning',
    'sql', 'excel', 'data analysis', 'tensorflow', 'pytorch',
    'html', 'css', 'javascript', 'react', 'node.js'
]

# Phrases whose first occurrence marks the start of the experience snippet.
EXPERIENCE_KEYWORDS = [
    'experience', 'employment history', 'work history', 'professional experience'
]

def _contains_keyword(text_lower, keyword):
    """Return True when ``keyword`` occurs in ``text_lower`` as a whole term.

    A boundary is enforced only on the side(s) where the keyword starts/ends
    with a letter or digit, so 'java' no longer matches inside "javascript"
    and 'ms' no longer matches inside "systems", while 'c++' and 'node.js'
    still match. A trailing plural "s" is tolerated ("masters").
    """
    head = r"(?<!\w)" if keyword[:1].isalnum() else ""
    tail = r"s?(?!\w)" if keyword[-1:].isalnum() else ""
    return re.search(head + re.escape(keyword) + tail, text_lower) is not None


def extract_education(text, keywords=None):
    """Return the stripped lines of ``text`` that mention an education keyword.

    Args:
        text: full resume text.
        keywords: lowercase keywords to look for; defaults to EDU_KEYWORDS.

    Returns:
        Matching lines in document order (original casing, whitespace stripped).
    """
    if keywords is None:
        keywords = EDU_KEYWORDS
    education = []
    for line in text.split('\n'):
        lowered = line.lower()
        if any(_contains_keyword(lowered, keyword) for keyword in keywords):
            education.append(line.strip())
    return education

def extract_skills(text, keywords=None):
    """Return the skills from ``keywords`` that appear in ``text`` as whole terms.

    Args:
        text: full resume text (matched case-insensitively).
        keywords: skills to look for; defaults to SKILL_KEYWORDS.

    Returns:
        Each matching keyword once, in the order it is listed in ``keywords``
        (previously the order was that of an unordered set).
    """
    if keywords is None:
        keywords = SKILL_KEYWORDS
    text = text.lower()
    found_skills = []
    for skill in keywords:
        if skill not in found_skills and _contains_keyword(text, skill.lower()):
            found_skills.append(skill)
    return found_skills

def extract_experience(text):
    """Return a naive experience snippet from ``text``.

    The snippet is the first line containing any EXPERIENCE_KEYWORDS entry plus
    the nine lines after it, taken from the lowercased text and stripped.
    Returns "" when no line matches.
    """
    experience_section = ""
    lines = text.lower().split('\n')
    for i, line in enumerate(lines):
        if any(keyword in line for keyword in EXPERIENCE_KEYWORDS):
            # Grab next 10 lines as naive experience section
            experience_section = "\n".join(lines[i:i+10])
            break
    return experience_section.strip()

# Example usage
if __name__ == "__main__":
    # Relative to the notebook's working directory, like RESUME_FOLDER and the
    # other data paths in this notebook; the previous absolute /Volumes/... path
    # only existed on one machine. Assumes the notebook is run from the project
    # root — adjust if not.
    resume_text = load_resume_text('data/data/INFORMATION-TECHNOLOGY/10265057.pdf')

    education = extract_education(resume_text)
    skills = extract_skills(resume_text)
    experience = extract_experience(resume_text)

    print("📚 Education:")
    for item in education:
        print(" -", item)

    print("\n🛠️ Skills:")
    for skill in skills:
        print(" -", skill)

    print("\n💼 Experience Snippet:")
    print(experience)


In [None]:
import re
import spacy
from nltk.corpus import stopwords
import pdfplumber
from difflib import SequenceMatcher

# Load spaCy NLP model
# `nlp` is used by extract_name() below.
# NOTE(review): `stop_words` is not referenced anywhere else in this cell.
nlp = spacy.load('en_core_web_sm')
stop_words = set(stopwords.words('english'))

# Load PDF and extract text
def load_resume_text(file_path):
    """Return the text of the PDF at ``file_path``, one newline after each page.

    Pages for which pdfplumber returns no text are skipped. Errors raised
    while opening or parsing the file propagate to the caller.
    """
    with pdfplumber.open(file_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    return "".join(chunk + "\n" for chunk in page_texts if chunk)

# Extract person's name using NLP from the top lines of resume
def extract_name(text):
    """Return the first PERSON entity found in the first 10 lines of ``text``.

    Only the top of the (stripped) resume is passed to the spaCy pipeline.
    Falls back to "Candidate" when no PERSON entity is recognised there.
    """
    header_lines = text.strip().split('\n')[:10]
    entities = nlp('\n'.join(header_lines)).ents
    person = next((ent for ent in entities if ent.label_ == 'PERSON'), None)
    if person is None:
        return "Candidate"
    return person.text.strip()

# Keywords
# All entries are lowercase; the extractors below compare them against
# lowercased text.
# NOTE(review): very short entries such as 'bs' and 'ms' are also substrings of
# ordinary words ("jobs", "systems"), so matching must respect word boundaries.
EDU_KEYWORDS = [
    'bachelor', 'master', 'phd', 'high school', 'secondary school',
    'bs', 'ms', 'b.sc', 'm.sc', 'mba', 'b.e', 'm.e', 'm.tech', 'b.tech'
]

# Skills looked up in both the resume and the job description.
# NOTE(review): 'crm' is listed twice (IT and Marketing groups), and one- or
# two-letter entries such as 'r' and 'go' occur inside many unrelated words.
SKILL_KEYWORDS = [
    # 🔧 Technical & IT Skills
    'python', 'java', 'c++', 'c#', 'ruby', 'go', 'r', 'javascript',
    'sql', 'nosql', 'html', 'css', 'react', 'angular', 'vue.js', 'node.js',
    'flask', 'django', 'spring boot', 'dotnet', '.net', 'php',
    'machine learning', 'deep learning', 'artificial intelligence', 'data science',
    'data analysis', 'data engineering', 'big data', 'data visualization',
    'pandas', 'numpy', 'matplotlib', 'tensorflow', 'pytorch',
    'cloud computing', 'aws', 'azure', 'google cloud platform', 'gcp',
    'docker', 'kubernetes', 'jenkins', 'devops', 'ci/cd',
    'linux', 'windows server', 'bash', 'powershell',
    'networking', 'cybersecurity', 'penetration testing', 'firewall', 'vpn',
    'sap', 'abap', 'hana', 'ecc', 'erp', 'oracle', 'crm',
    'sql server', 'postgresql', 'mongodb', 'redis', 'elasticsearch',
    'jira', 'confluence', 'git', 'github', 'gitlab',
    'itil', 'agile', 'scrum', 'kanban',

    # 📊 Business, Finance, and Management
    'accounting', 'bookkeeping', 'finance', 'financial analysis',
    'budgeting', 'forecasting', 'auditing', 'payroll',
    'tax preparation', 'quickbooks', 'xero', 'tally', 'erp systems',
    'microsoft excel', 'microsoft word', 'microsoft powerpoint',
    'data entry', 'ms office', 'spreadsheet analysis',
    'operations management', 'supply chain', 'logistics', 'inventory management',
    'project management', 'program management', 'pmp', 'six sigma', 'lean',
    'procurement', 'vendor management', 'contract negotiation',

    # 🛍️ Marketing, Sales, and Customer Support
    'salesforce', 'hubspot', 'seo', 'sem', 'google analytics',
    'email marketing', 'content marketing', 'copywriting', 'adwords',
    'market research', 'branding', 'social media marketing',
    'customer service', 'crm', 'retail sales', 'upselling',
    'cold calling', 'account management', 'b2b sales', 'lead generation',

    # 🧑‍🏫 Education and Training
    'lesson planning', 'curriculum development', 'teaching',
    'classroom management', 'online teaching', 'tutoring', 'e-learning',
    'blackboard', 'moodle', 'canvas', 'zoom', 'google classroom',
    'student assessment', 'education administration',

    # 🧪 Healthcare, Science, and Research
    'nursing', 'patient care', 'clinical research', 'healthcare management',
    'medical billing', 'medical coding', 'emr', 'ehr',
    'laboratory testing', 'phlebotomy', 'health informatics',
    'data collection', 'data interpretation', 'quantitative research',

    # 🛠️ Trades, Construction, and Engineering
    'autocad', 'solidworks', 'mechanical engineering',
    'civil engineering', 'electrical engineering', 'hvac',
    'blueprint reading', 'welding', 'plumbing', 'carpentry',
    'osha', 'project scheduling', 'site supervision',

    # 🧠 Soft Skills (ATS-friendly)
    'communication', 'teamwork', 'leadership', 'problem solving',
    'time management', 'critical thinking', 'adaptability',
    'creativity', 'collaboration', 'decision making',
    'attention to detail', 'multitasking', 'empathy', 'resilience'
]


# Phrases whose first occurrence marks the start of the experience section.
EXPERIENCE_KEYWORDS = [
    'experience', 'employment history', 'work history', 'professional experience'
]

# Fuzzy matching helper
def fuzzy_match(a, b, threshold=0.85):
    """Return True when ``a`` and ``b`` are at least ``threshold`` similar.

    Similarity is difflib's SequenceMatcher ratio on the lowercased strings.
    The ratio is length-sensitive, so a short keyword compared with a whole
    sentence practically never reaches the threshold; compare like-sized strings.
    """
    return SequenceMatcher(None, a.lower(), b.lower()).ratio() >= threshold


def _contains_keyword(text_lower, keyword):
    """Return True when ``keyword`` occurs in ``text_lower`` as a whole term.

    A boundary is enforced only on the side(s) where the keyword starts/ends
    with a letter or digit, so 'r' no longer matches inside "marketing" and
    'go' no longer matches inside "google", while 'c++', '.net' and 'node.js'
    still match. A trailing plural "s" is tolerated ("masters").
    """
    head = r"(?<!\w)" if keyword[:1].isalnum() else ""
    tail = r"s?(?!\w)" if keyword[-1:].isalnum() else ""
    return re.search(head + re.escape(keyword) + tail, text_lower) is not None


def _line_mentions(line, keyword):
    """Return True when ``line`` names ``keyword`` exactly or as a near-miss token."""
    lowered = line.lower()
    if _contains_keyword(lowered, keyword):
        return True
    if ' ' in keyword:
        # Multi-word keywords cannot be compared against single tokens.
        return False
    # Token-level fuzzy comparison tolerates small spelling variations such as
    # "ph.d" for 'phd'; comparing against the whole line would never match.
    return any(fuzzy_match(keyword, token.strip('.,;:()[]')) for token in lowered.split())


# Extract section from resume
def extract_section(text, keywords, window=10):
    """Return ``window`` lines of ``text`` starting at the first keyword line.

    A line qualifies when its lowercased form contains any of ``keywords`` as a
    substring. Returns "" when no line qualifies.
    """
    lines = text.split('\n')
    for i, line in enumerate(lines):
        line_lower = line.lower()
        if any(k in line_lower for k in keywords):
            return '\n'.join(lines[i:i+window])
    return ""

# Extract education
def extract_education(text, keywords=None):
    """Return the education lines found in the resume ``text``.

    The education section is taken to be the 10 lines starting at the first
    line that mentions an education keyword; within it, every line mentioning
    a keyword is returned (whitespace stripped, original casing).

    The previous implementation fuzzy-compared each keyword with the entire
    line, which fails for any realistic line ("Bachelor of Science in ..."),
    and located the section by plain substring, so 'ms' inside "systems" could
    start it at the wrong place.

    Args:
        text: full resume text.
        keywords: lowercase keywords to look for; defaults to EDU_KEYWORDS.
    """
    if keywords is None:
        keywords = EDU_KEYWORDS
    lines = text.split('\n')
    hits = [any(_line_mentions(line, keyword) for keyword in keywords) for line in lines]
    if True not in hits:
        return []
    start = hits.index(True)
    section = range(start, min(start + 10, len(lines)))
    return [lines[i].strip() for i in section if hits[i]]

# Extract skills
def extract_skills(text, keywords=None):
    """Return the skills from ``keywords`` that appear in ``text`` as whole terms.

    Args:
        text: full resume text (matched case-insensitively).
        keywords: lowercase skills to look for; defaults to SKILL_KEYWORDS.

    Returns:
        Each matching keyword once, in the order it is listed in ``keywords``.
    """
    if keywords is None:
        keywords = SKILL_KEYWORDS
    text_lower = text.lower()
    # dict.fromkeys drops repeated keywords ('crm' is listed twice) but keeps order.
    return [skill for skill in dict.fromkeys(keywords) if _contains_keyword(text_lower, skill)]

# Extract experience section
def extract_experience(text):
    """Return the stripped experience section located via EXPERIENCE_KEYWORDS."""
    exp_section = extract_section(text, EXPERIENCE_KEYWORDS)
    return exp_section.strip()

# Extract skills/edu from job description
def extract_job_requirements(job_desc, skill_keywords=None, edu_keywords=None):
    """Return ``(job_skills, job_edu)`` named in the job description.

    Keywords are matched as whole terms and each is reported once. Reporting a
    keyword twice (as happened with the duplicated 'crm') inflated
    ``len(job_skills)``, which ats_score divides by, so a perfect match could
    not reach 100%.

    Args:
        job_desc: job description text (matched case-insensitively).
        skill_keywords: defaults to SKILL_KEYWORDS.
        edu_keywords: defaults to EDU_KEYWORDS.
    """
    if skill_keywords is None:
        skill_keywords = SKILL_KEYWORDS
    if edu_keywords is None:
        edu_keywords = EDU_KEYWORDS
    job_desc_lower = job_desc.lower()
    job_skills = [skill for skill in dict.fromkeys(skill_keywords) if _contains_keyword(job_desc_lower, skill)]
    job_edu = [edu for edu in dict.fromkeys(edu_keywords) if _contains_keyword(job_desc_lower, edu)]
    return job_skills, job_edu

# Compute ATS score
def ats_score(resume_edu, resume_skills, resume_exp_text, job_edu, job_skills):
    """Return a 0-100 match score rounded to two decimals.

    Components and weights:
      * education (0.3): 1 if any resume education line contains any required
        education keyword, else 0;
      * skills (0.5): share of ``job_skills`` also present in ``resume_skills``;
      * experience (0.2): share of ``job_skills`` mentioned in the lowercased
        experience text.
    The two skill-based parts are 0 when ``job_skills`` is empty.
    """
    edu_match = 0
    for edu in resume_edu:
        if any(job_e in edu.lower() for job_e in job_edu):
            edu_match = 1
            break

    exp_text = resume_exp_text.lower()
    if job_skills:
        required = len(job_skills)
        skill_match = len(set(resume_skills).intersection(set(job_skills))) / required
        exp_match = len([skill for skill in job_skills if skill in exp_text]) / required
    else:
        skill_match = 0
        exp_match = 0

    score = (edu_match * 0.3 + skill_match * 0.5 + exp_match * 0.2) * 100
    return round(score, 2)


# End-to-end demo for a single resume: load the PDF, extract name/education/
# skills/experience, derive requirements from the job description, then print
# the extracted items and the ATS score.
if __name__ == "__main__":
    # Path to your resume PDF
    # NOTE(review): relative to the working directory and names a personal
    # file — confirm it should stay in a shared notebook.
    resume_path = 'surya resume new (1) (1).pdf'  # change as needed
    resume_text = load_resume_text(resume_path)

    # Extract candidate name automatically
    user_name = extract_name(resume_text)

    # Dummy Job Description
    JOB_DESCRIPTION = """
   We are looking for energetic and driven Sales & Marketing Interns to join our team at Alzone Software. This internship offers hands-on experience in various aspects of sales and marketing, including cold calling, lead generation, social media marketing, campaign execution, and CRM management.

Successful interns may be offered a full-time salaried role based on performance after the internship.
    """

    # Extract features from resume
    education = extract_education(resume_text)
    skills = extract_skills(resume_text)
    experience = extract_experience(resume_text)

    # Extract job requirements
    job_skills, job_edu = extract_job_requirements(JOB_DESCRIPTION)

    # Calculate ATS score
    # Note the argument order: job_edu comes before job_skills.
    score = ats_score(education, skills, experience, job_edu, job_skills)

    # Print Output
    print(f"\n👋 Hello {user_name}, here's your resume match analysis:\n")

    print("📚 Education Found in Resume:")
    if education:
        for item in education:
            print(" -", item)
    else:
        print(" - No matching education found.")

    print("\n🛠️ Skills Found in Resume:")
    if skills:
        for skill in skills:
            print(" -", skill)
    else:
        print(" - No matching skills found.")

    print("\n💼 Experience Section Snippet:")
    print(experience if experience else " - No experience section found.")

    print(f"\n✅ ATS Matching Score: {score} / 100")


In [None]:
import os
import re
import spacy
import pdfplumber
from nltk.corpus import stopwords
from difflib import SequenceMatcher

# Load NLP model
# `nlp` is used by extract_name() below.
# NOTE(review): `stop_words` is not referenced anywhere else in this cell.
nlp = spacy.load('en_core_web_sm')
stop_words = set(stopwords.words('english'))

# Skill and education keywords
# All entries are lowercase; the extractors below compare them against
# lowercased text.
# NOTE(review): very short entries such as 'r', 'go' and 'word' are also
# substrings of many unrelated words, so matching must respect word boundaries.
SKILL_KEYWORDS = [
    # IT & Tech
    'python', 'java', 'c++', 'c#', 'ruby', 'go', 'r', 'sql', 'html', 'css',
    'javascript', 'react', 'node.js', 'flask', 'django', 'tensorflow', 'pytorch',
    'machine learning', 'deep learning', 'data science', 'data analysis',
    'big data', 'devops', 'aws', 'azure', 'gcp', 'docker', 'kubernetes', 'git',
    'sap', 'abap', 'hana', 'ecc', 'erp', 'linux', 'agile', 'scrum',

    # Business & Management
    'accounting', 'bookkeeping', 'finance', 'payroll', 'forecasting', 'budgeting',
    'project management', 'pmp', 'six sigma', 'lean', 'inventory', 'logistics',
    'vendor management', 'contract negotiation',

    # Marketing & Sales
    'marketing', 'sales', 'seo', 'sem', 'social media marketing',
    'crm', 'email marketing', 'branding', 'lead generation',

    # Education & Training
    'teaching', 'tutoring', 'curriculum development', 'classroom management',

    # Admin & Support
    'data entry', 'ms office', 'excel', 'powerpoint', 'word',
    'customer service', 'front desk', 'scheduling', 'document handling',

    # Soft Skills
    'communication', 'teamwork', 'leadership', 'problem solving',
    'adaptability', 'multitasking', 'collaboration'
]
# Degree names and abbreviations; dotted forms such as 'b.com' and 'm.com' also
# occur at the end of web addresses (e.g. "github.com").
EDU_KEYWORDS = [
    'bachelor', 'master', 'phd', 'mba', 'high school', 'secondary school',
    'b.sc', 'm.sc', 'bba', 'bca', 'mca', 'b.com', 'm.com'
]

# Phrases whose first occurrence marks the start of the experience section.
EXPERIENCE_KEYWORDS = ['experience', 'employment history', 'work history', 'professional experience']

# Helper functions
def fuzzy_match(a, b, threshold=0.85):
    """Return True when ``a`` and ``b`` are at least ``threshold`` similar.

    Similarity is difflib's SequenceMatcher ratio computed on the lowercased
    strings (1.0 means identical).
    """
    similarity = SequenceMatcher(None, a.lower(), b.lower()).ratio()
    return similarity >= threshold

def load_resume_text(file_path):
    """Return the text of the PDF at ``file_path``, one newline after each page.

    Pages for which pdfplumber returns no text are skipped. Errors raised
    while opening or parsing the file propagate to the caller.
    """
    with pdfplumber.open(file_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    return "".join(chunk + "\n" for chunk in page_texts if chunk)

def _contains_keyword(text_lower, keyword):
    """Return True when ``keyword`` occurs in ``text_lower`` as a whole term.

    A boundary is enforced only on the side(s) where the keyword starts/ends
    with a letter or digit, so 'r' no longer matches inside "marketing",
    'go' inside "google" or 'b.com' inside "github.com", while 'c++' and
    'node.js' still match. A trailing plural "s" is tolerated ("masters").
    """
    head = r"(?<!\w)" if keyword[:1].isalnum() else ""
    tail = r"s?(?!\w)" if keyword[-1:].isalnum() else ""
    return re.search(head + re.escape(keyword) + tail, text_lower) is not None

def extract_section(text, keywords, window=10):
    """Return ``window`` lines of ``text`` starting at the first keyword line.

    A line qualifies when its lowercased form contains any of ``keywords`` as a
    substring. Returns "" when no line qualifies.
    """
    lines = text.split('\n')
    for i, line in enumerate(lines):
        if any(k in line.lower() for k in keywords):
            return '\n'.join(lines[i:i+window])
    return ""

def extract_education(text, keywords=None):
    """Return the education lines found in the resume ``text``.

    The education section is taken to be the 10 lines starting at the first
    line that mentions an education keyword as a whole term; within it, every
    line mentioning a keyword is returned (stripped, original casing).
    Whole-term matching stops contact lines such as "github.com/..." from
    being read as a 'b.com' degree and anchoring the section in the wrong place.

    Args:
        text: full resume text.
        keywords: lowercase keywords to look for; defaults to EDU_KEYWORDS.
    """
    if keywords is None:
        keywords = EDU_KEYWORDS
    lines = text.split('\n')
    hits = [any(_contains_keyword(line.lower(), k) for k in keywords) for line in lines]
    if True not in hits:
        return []
    start = hits.index(True)
    section = range(start, min(start + 10, len(lines)))
    return [lines[i].strip() for i in section if hits[i]]

def extract_skills(text, keywords=None):
    """Return the skills from ``keywords`` that appear in ``text`` as whole terms.

    Args:
        text: full resume text (matched case-insensitively).
        keywords: lowercase skills to look for; defaults to SKILL_KEYWORDS.

    Returns:
        Each matching keyword once, in the order it is listed in ``keywords``.
    """
    if keywords is None:
        keywords = SKILL_KEYWORDS
    text_lower = text.lower()
    return [skill for skill in dict.fromkeys(keywords) if _contains_keyword(text_lower, skill)]

def extract_experience(text):
    """Return the stripped experience section located via EXPERIENCE_KEYWORDS."""
    return extract_section(text, EXPERIENCE_KEYWORDS).strip()

def extract_name(text):
    """Return the first PERSON entity in the first 10 lines of ``text``.

    Only the top of the resume is analysed, as in the single-resume version of
    this function earlier in the notebook: it avoids running the spaCy pipeline
    over every full resume in a batch and avoids picking up a person mentioned
    deep in the document. Falls back to "Candidate" when no PERSON entity is
    found there.
    """
    top_lines = '\n'.join(text.strip().split('\n')[:10])
    doc = nlp(top_lines)
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            return ent.text.strip()
    return "Candidate"

def extract_job_requirements(desc, skill_keywords=None, edu_keywords=None):
    """Return ``(skills, edu)`` named in the job description ``desc``.

    Keywords are matched as whole terms (case-insensitively) and each is
    reported once, in list order.

    Args:
        desc: job description text.
        skill_keywords: defaults to SKILL_KEYWORDS.
        edu_keywords: defaults to EDU_KEYWORDS.
    """
    if skill_keywords is None:
        skill_keywords = SKILL_KEYWORDS
    if edu_keywords is None:
        edu_keywords = EDU_KEYWORDS
    desc_lower = desc.lower()
    skills = [k for k in dict.fromkeys(skill_keywords) if _contains_keyword(desc_lower, k)]
    edu = [e for e in dict.fromkeys(edu_keywords) if _contains_keyword(desc_lower, e)]
    return skills, edu

def ats_score(resume_edu, resume_skills, resume_exp_text, job_edu, job_skills):
    """Return a 0-100 match score rounded to two decimals.

    Weighted sum of: education (0.3) — 1 if any resume education line contains
    any required education keyword; skills (0.5) — share of ``job_skills``
    present in ``resume_skills``; experience (0.2) — share of ``job_skills``
    mentioned in the lowercased experience text. Both skill-based parts are 0
    when ``job_skills`` is empty.
    """
    edu_pairs = ((edu, job_e) for edu in resume_edu for job_e in job_edu)
    edu_match = int(any(job_e in edu.lower() for edu, job_e in edu_pairs))

    skill_match = 0
    exp_match = 0
    if job_skills:
        wanted = len(job_skills)
        exp_lower = resume_exp_text.lower()
        skill_match = len(set(resume_skills) & set(job_skills)) / wanted
        exp_match = sum(skill in exp_lower for skill in job_skills) / wanted

    score = (edu_match * 0.3 + skill_match * 0.5 + exp_match * 0.2) * 100
    return round(score, 2)

# Job Description
# Posting that process_resumes() scores every resume against; required skills
# and education are derived from it via extract_job_requirements().
JOB_DESCRIPTION = """
We are hiring a Marketing and Front Desk Coordinator with strong communication, lead generation, and social media skills.
Experience with CRM tools, Excel, and basic data entry is required.
A bachelor's degree in business or marketing is preferred.
"""

# --- Batch Resume Processing ---
def process_resumes(resume_folder_path, job_description=None):
    """Score every PDF resume in a folder against a job description.

    Args:
        resume_folder_path: directory whose ``*.pdf`` files (case-insensitive,
            non-recursive) are processed.
        job_description: posting text to score against; defaults to the
            module-level JOB_DESCRIPTION so existing callers are unaffected.

    Returns:
        A list of dicts with keys 'Name', 'File', 'Education', 'Skills' and
        'Score', one per resume that could be processed, in filename order.
        A resume that raises during processing is reported on stdout and skipped.
    """
    if job_description is None:
        job_description = JOB_DESCRIPTION
    job_skills, job_edu = extract_job_requirements(job_description)
    results = []

    # sorted(): os.listdir returns names in arbitrary order; a fixed order makes
    # the output (and the order of equal-scoring resumes after a stable sort)
    # reproducible.
    for filename in sorted(os.listdir(resume_folder_path)):
        if not filename.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(resume_folder_path, filename)
        try:
            resume_text = load_resume_text(file_path)
            name = extract_name(resume_text)
            education = extract_education(resume_text)
            skills = extract_skills(resume_text)
            experience = extract_experience(resume_text)
            score = ats_score(education, skills, experience, job_edu, job_skills)

            results.append({
                'Name': name,
                'File': filename,
                'Education': education,
                'Skills': skills,
                'Score': score
            })
        except Exception as e:
            # Best-effort batch: report the failure and continue with the next file.
            print(f"❌ Error reading {filename}: {e}")

    return results

# --- Run the System ---
if __name__ == "__main__":
    # Relative to the notebook's working directory, like RESUME_FOLDER and the
    # other data paths in this notebook; the previous absolute /Volumes/... path
    # only existed on one machine. Assumes the notebook is run from the project
    # root — adjust if not.
    folder = "data/data/BUSINESS-DEVELOPMENT"  # 👈 Replace with your folder path
    match_results = process_resumes(folder)
    # Best match first.
    match_results.sort(key=lambda x: x['Score'], reverse=True)
    for res in match_results:
        print(f"\n👤 {res['Name']} ({res['File']})")
        print("📚 Education:", res['Education'] or "None found")
        print("🛠️ Skills:", ', '.join(res['Skills']) or "None found")
        print(f"✅ ATS Score: {res['Score']} / 100")