In [37]:
!pip install pdfminer.six
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [41]:
import os
import re
from pdfminer.high_level import extract_text
import spacy
from spacy.matcher import Matcher
from google.colab import files
from fuzzywuzzy import fuzz

# Define skills list
skills_list = ['Python', 'Data Analysis', 'Machine Learning', 'Communication', 'Project Management', 'Deep Learning', 'SQL', 'Tableau', 'NLP', 'Computer Vision', 'OpenCV', 'Pyresparser', 'Roberta', 'Spacy', 'Scikit-Learn', 'Scipy', 'Neural Network', 'Docker', 'Jenkins', 'Github', 'AWS', 'Azure', 'GCP', 'BERT', 'Data Scientist', 'ROBOT', 'MySQL',
               'AI-Artificial Intelligence', 'Research & Development', 'Pattern Recognition', 'Problem Solving', 'Debugging', 'Troubleshooting', 'Monitoring', 'Unit Testing', 'Testing', 'Visualization', 'Software Developement',
               'Robotics', 'Raspberry Pi', 'Sentimental Analysis', 'DecisionMaking', 'Dlib', 'HOG']

def extract_text_from_pdf(pdf_path):
    return extract_text(pdf_path)

def extract_contact_number_from_resume(text):
    contact_number = None
    pattern = r"\b(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"
    match = re.search(pattern, text)
    if match:
        contact_number = match.group()
    return contact_number

def extract_email_from_resume(text):
    email = None
    pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
    match = re.search(pattern, text)
    if match:
        email = match.group()
    return email

def extract_skills_from_text(text, skills_list):
    skills = []
    for skill in skills_list:
        pattern = r"\b{}\b".format(re.escape(skill))
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            skills.append(skill)
    return skills

def extract_skills_from_section(section_text, skills_list):
    return extract_skills_from_text(section_text, skills_list)

def extract_experience_skills(text, skills_list):
    # Assuming experience section starts with "EXPERIENCE"
    experience_start = text.find("EXPERIENCE")
    if experience_start != -1:
        # Extract the content of the experience section
        experience_text = text[experience_start:]

        # Extract skills from the experience section
        experience_skills = extract_skills_from_section(experience_text, skills_list)

        return experience_skills

    return []

def extract_project_skills(text, skills_list):
    # Assuming project section starts with "PROJECTS"
    project_start = text.find("PROJECTS")
    if project_start != -1:
        # Extract the content of the project section
        project_text = text[project_start:]

        # Extract skills from the project section
        project_skills = extract_skills_from_section(project_text, skills_list)

        return project_skills

    return []

def extract_education_from_resume(text):
    education = []
    # Pattern with common degree variations in India
    pattern = r"(?i)(?:B\.Tech|B\.E|B\.Sc|B\.A|B\.Com|M\.Tech|M\.E|M\.Sc|M\.A|Ph\.D|Diploma|Bachelors|Master(?:'s)?|Bachelor(?:'s)?|HSC|SSC)\s*(?:\w+\s*)*\w+\s*\d*"
    matches = re.findall(pattern, text)
    for match in matches:
        # Remove digits from the match
        clean_match = re.sub(r'\d', '', match)
        # Remove newline characters and other unwanted characters
        clean_match = clean_match.replace('\n', '').strip()
        education.append(clean_match)
    return education


def extract_name(resume_text):
    nlp = spacy.load('en_core_web_sm')
    matcher = Matcher(nlp.vocab)

    # Define name patterns
    patterns = [
        [{'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}],  # First name, Middle name, and Last name
        [{'POS': 'PROPN'}, {'POS': 'PROPN'}],  # First name and Last name
        [{'POS': 'PROPN'}, {'POS': 'PROPN'}, {'IS_PUNCT': True, 'OP': '?'}, {'POS': 'PROPN'}],  # First name, optional punctuation, Middle name, optional punctuation, and Last name
        [{'POS': 'PROPN'}, {'IS_PUNCT': True, 'OP': '?'}, {'POS': 'PROPN'}],  # First name, optional punctuation, and Last name
    ]

    for pattern in patterns:
        matcher.add('NAME', patterns=[pattern])

    doc = nlp(resume_text)
    matches = matcher(doc)

    for match_id, start, end in matches:
        span = doc[start:end]
        # Adjusting to return the full name
        return " ".join([token.text for token in span])

    return None

# Function to calculate accuracy
def calculate_accuracy(extracted_value, ground_truth):
    if extracted_value and ground_truth:
        # Convert lists to sets for comparison
        extracted_set = set(extracted_value)
        ground_truth_set = set(ground_truth)

        # Check if there are common elements in the sets
        common_elements = extracted_set.intersection(ground_truth_set)

        # Calculate accuracy based on the number of common elements
        accuracy = len(common_elements) / len(ground_truth_set)

        return accuracy
    return 0.0

# Upload the PDF file to Colab
uploaded = files.upload()

uploaded_files = list(uploaded.keys())

if len(uploaded_files) == 0:
    print("No files uploaded. Please upload the PDF file.")
else:
    for resume_path in uploaded_files:
        text = extract_text_from_pdf(resume_path)

        print("Resume:", resume_path)

        name = extract_name(text)
        if name:
            print("Name:", name)
        else:
            print("Name not found")

        contact_number = extract_contact_number_from_resume(text)
        if contact_number:
            print("Contact Number:", contact_number)
        else:
            print("Contact Number not found")

        email = extract_email_from_resume(text)
        if email:
            print("Email:", email)
        else:
            print("Email not found")

        extracted_skills = extract_skills_from_text(text, skills_list)
        if extracted_skills:
            print("Skills:", extracted_skills)
        else:
            print("No skills found")

        extracted_education = extract_education_from_resume(text)
        if extracted_education:
            print("Education:", extracted_education)
        else:
            print("No education information found")

        # Extract skills from the overall text
        overall_skills = extract_skills_from_text(text, skills_list)
        if overall_skills:
          print("Overall Skills:", overall_skills)
        else:
          print("No overall skills found")

        # Extract skills from the experience section
        experience_skills = extract_experience_skills(text, skills_list)
        if experience_skills:
          print("Experience Skills:", experience_skills)
        else:
          print("No experience skills found")

        # Extract skills from the project section
        project_skills = extract_project_skills(text, skills_list)
        if project_skills:
          print("Project Skills:", project_skills)
        else:
          print("No project skills found")

        # Ground truth values (replace these with actual ground truth values)
        ground_truth_name = "KALPESH DAHAKE"
        ground_truth_contact_number = "+91-8329681560"
        ground_truth_email = "kalpeshdahake123@gmail.com"
        ground_truth_skills = ['Python', 'Machine Learning', 'SQL', 'NLP', 'Computer Vision', 'MySQL', 'ROBOT', 'Tableau', 'Docker', 'Jenkins', 'Github']
        ground_truth_education = ['Bachelor of Engineering', 'Diploma in Engineering', 'HSC', 'SSC']
        ground_truth_experience_skills = ['Python', 'AI-Artificial Intelligence', 'Research & Development', 'Pattern Recognition', 'Problem Solving', 'Debugging', 'Troubleshooting', 'Monitoring', 'Unit Testing', 'Testing', 'Visualization', 'Software Developement']
        ground_truth_project_skills = ['Robotics', 'Raspberry Pi', 'Sentimental Analysis', 'DecisionMaking', 'Dlib', 'HOG', 'NLP']

        # Extracted values
        name = extract_name(text)
        contact_number = extract_contact_number_from_resume(text)
        email = extract_email_from_resume(text)
        extracted_skills = extract_skills_from_resume(text, skills_list)
        extracted_education = extract_education_from_resume(text)

        # Calculate accuracy for each field
        accuracy_name = calculate_accuracy(name, ground_truth_name)
        accuracy_contact_number = calculate_accuracy(contact_number, ground_truth_contact_number)
        accuracy_email = calculate_accuracy(email, ground_truth_email)
        accuracy_skills = calculate_accuracy(extracted_skills, ground_truth_skills)
        accuracy_education = calculate_accuracy(extracted_education, ground_truth_education)
        accuracy_experience_skills = calculate_accuracy(experience_skills, ground_truth_experience_skills)
        accuracy_project_skills = calculate_accuracy(project_skills, ground_truth_project_skills)

        # Print accuracy for each field
        print(f"Accuracy - Name: {accuracy_name}")
        print(f"Accuracy - Contact Number: {accuracy_contact_number}")
        print(f"Accuracy - Email: {accuracy_email}")
        print(f"Accuracy - Skills: {accuracy_skills}")
        print(f"Accuracy - Education: {accuracy_education}")
        print(f"Accuracy - Experience Skills: {accuracy_experience_skills}")
        print(f"Accuracy - Project Skills: {accuracy_project_skills}")
        print()


Saving CV_kalpeshdahake.pdf to CV_kalpeshdahake (6).pdf
Resume: CV_kalpeshdahake (6).pdf
Name: KALPESH DAHAKE
Contact Number: 91-8329681560
Email: kalpeshdahake123@gmail.com
Skills: ['Python', 'Machine Learning', 'SQL', 'Tableau', 'NLP', 'Computer Vision', 'OpenCV', 'Docker', 'Jenkins', 'Github', 'ROBOT', 'MySQL', 'Debugging', 'Troubleshooting', 'Monitoring', 'Unit Testing', 'Testing', 'Visualization', 'Raspberry Pi', 'Sentimental Analysis', 'DecisionMaking', 'Dlib', 'HOG']
Education: ['Master in Data Science', 'Bachelor of Engineering', 'Diploma in Engineering', 'HSC', 'SSC']
Overall Skills: ['Python', 'Machine Learning', 'SQL', 'Tableau', 'NLP', 'Computer Vision', 'OpenCV', 'Docker', 'Jenkins', 'Github', 'ROBOT', 'MySQL', 'Debugging', 'Troubleshooting', 'Monitoring', 'Unit Testing', 'Testing', 'Visualization', 'Raspberry Pi', 'Sentimental Analysis', 'DecisionMaking', 'Dlib', 'HOG']
Experience Skills: ['Python', 'Machine Learning', 'SQL', 'Tableau', 'NLP', 'Computer Vision', 'OpenCV',