In [None]:
!pip install pdfminer.six pyparsing spacy
!python -m spacy download en_core_web_sm

In [None]:
# Resume Analyzer App: pdfminer.six for PDF text extraction, regular expressions for field extraction (spaCy and pyparsing are imported as well)
# Step 2: Import Required Libraries
import re
from pdfminer.high_level import extract_text
import spacy
import re
from pyparsing import Word, alphas

In [None]:
# Load SpaCy NLP model
# "en_core_web_sm" is the model downloaded by the install cell at the top of the notebook.
# NOTE(review): `nlp` is not referenced by any of the cells shown here — confirm it is still needed.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Step 3: Define Helper Functions

def extract_text_from_pdf(pdf_path):
    """Return the text that pdfminer's ``extract_text`` reads from ``pdf_path``."""
    pdf_text = extract_text(pdf_path)
    return pdf_text

def extract_email(text):
    """Return every e-mail address found in ``text``.

    Args:
        text: The resume text to scan.

    Returns:
        A list of the matched addresses in order of appearance (empty when
        none is found).
    """
    # The "+" quantifier must apply to the local-part character class, not to
    # "@"; otherwise only the last character before "@" is captured.
    return re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)

def extract_phone_number(text):
    """Return the Tunisian phone numbers (prefixed with +216) found in ``text``.

    The pattern accepts ``+216`` followed by 2, 3 and 3 digits, with an
    optional whitespace, "-" or "." between the groups.

    Args:
        text: The resume text to scan.

    Returns:
        A list of the distinct matched numbers in order of first appearance,
        or ``None`` when no number is found.
    """
    phone_pattern = re.compile(
        r'\+216[\s\-\.]?\d{2}[\s\-\.]?\d{3}[\s\-\.]?\d{3}'
    )

    matches = phone_pattern.findall(text)

    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # result is deterministic (a set would give an arbitrary order).
    return list(dict.fromkeys(matches)) or None


def extract_name(text):
    """Guess the candidate's name as the first non-blank line of ``text``.

    The line is returned with surrounding whitespace removed; the string
    "Unknown" is returned when ``text`` contains no non-blank line.
    """
    for raw_line in text.strip().split('\n'):
        candidate = raw_line.strip()
        if candidate:
            # First line with visible content is assumed to be the name
            return candidate
    return "Unknown"

def extract_job_title(text):
    """Return the job-title keywords that appear in ``text``.

    SpaCy's default model does not label job titles, so a fixed keyword list
    is used instead. Keywords are matched case-insensitively as whole words
    (an optional plural "s" is accepted), so "CTO" is not reported for
    "Director" and "Intern" is not reported for "International".

    Args:
        text: The resume text to scan.

    Returns:
        A list of the matched keywords, in the order of the keyword list
        below, or ``None`` when no keyword is found.
    """
    job_keywords = [
        "Engineer", "Data Scientist", "Manager", "Consultant", "Developer",
        "Analyst", "Intern", "Technician", "Supervisor", "Coordinator",
        "Architect", "Administrator", "Director", "Assistant", "Specialist",
        "Officer", "Trainer", "Executive", "CEO", "CTO", "Founder"
    ]

    job_titles = []
    for title in job_keywords:
        # \b on both sides prevents hits inside longer words
        pattern = r'\b' + re.escape(title) + r's?\b'
        if re.search(pattern, text, flags=re.IGNORECASE):
            job_titles.append(title)

    return job_titles or None
def extract_skills(text, skill_keywords):
    """Return the entries of ``skill_keywords`` that occur in ``text``.

    Matching is case-insensitive and requires that the skill is not embedded
    in a longer word, so "Java" is not reported for "JavaScript". Lookarounds
    are used instead of ``\\b`` so that skills ending in punctuation, such as
    "C++", still match.

    Args:
        text: The resume text to scan.
        skill_keywords: Iterable of skill names to look for.

    Returns:
        A list of the matched skills, spelled and ordered as in
        ``skill_keywords``.
    """
    matched_skills = []
    for skill in skill_keywords:
        pattern = r'(?<!\w)' + re.escape(skill) + r'(?!\w)'
        if re.search(pattern, text, flags=re.IGNORECASE):
            matched_skills.append(skill)
    return matched_skills


def extract_college_names(text):
    """Return the institution names found in ``text``.

    A name starts at one of the keywords below (matched case-insensitively at
    a word start) and continues over word characters, parentheses, hyphens,
    apostrophes and horizontal whitespace. It never crosses a line break, so
    the following lines of the resume are not swallowed into the name.

    Args:
        text: The resume text to scan.

    Returns:
        A list of the matched names in order of appearance, or the string
        "Unknown" when nothing matches.
    """
    # Starting keywords for Tunisian colleges ("École" with and without accent)
    keywords = ["Université", "École", "Ecole", "Institut"]

    # Characters allowed after the keyword; [^\S\r\n\f\v] is "whitespace except
    # line/page breaks", which keeps every match on a single line.
    name_tail = r"(?:[\w()'’-]|[^\S\r\n\f\v])+"
    pattern = r'\b(?:' + '|'.join(re.escape(keyword) for keyword in keywords) + r')' + name_tail

    matches = re.findall(pattern, text, flags=re.IGNORECASE)

    # Trim surrounding whitespace and drop anything that ends up empty
    colleges_found = [match.strip() for match in matches if match.strip()]

    return colleges_found if colleges_found else "Unknown"



def extract_degrees(text):
    """Return the degree mentions found in ``text``.

    A mention starts at one of the keywords below (matched case-insensitively
    at a word start) and continues for up to 100 characters of word
    characters, parentheses, commas, periods, hyphens, apostrophes and
    horizontal whitespace. It never crosses a line break, so each degree line
    is reported separately.

    Args:
        text: The resume text to scan.

    Returns:
        A list of the distinct mentions in order of first appearance, each
        normalised with ``str.capitalize()``, or the string "Unknown" when
        nothing matches.
    """
    # Common degree keywords used in Tunisia
    degree_keywords = [
        "Licence", "Maîtrise", "Master", "Doctorat", "Diplôme", "Ingénieur", "Baccalauréat"
    ]

    # [^\S\r\n\f\v] is "whitespace except line/page breaks"
    degree_tail = r"(?:[\w(),.'’-]|[^\S\r\n\f\v]){0,100}"
    pattern = r'\b(?:' + '|'.join(re.escape(keyword) for keyword in degree_keywords) + r')' + degree_tail

    degrees_found = re.findall(pattern, text, flags=re.IGNORECASE)

    # capitalize() makes the de-duplication below case-insensitive
    degrees = [deg.strip().capitalize() for deg in degrees_found if deg.strip()]

    # dict.fromkeys keeps first-seen order, unlike a set
    return list(dict.fromkeys(degrees)) if degrees else "Unknown"



def extract_languages(text):
    """Return the spoken-language labels detected in ``text``.

    Each label is matched case-insensitively as a whole word, so "Arabic" is
    reported once (not additionally as "Arab") and "Germany" is not reported
    as "German". The French labels also accept the accented and feminine
    spellings ("Français", "Française", "Arabe").

    Args:
        text: The resume text to scan.

    Returns:
        A list of the matched labels, in the order of the table below (empty
        when none is found).
    """
    # (label returned to the caller, regex for the spellings it covers)
    known_languages = [
        ('English', r'english'),
        ('French', r'french'),
        ('Arabic', r'arabic'),
        ('Spanish', r'spanish'),
        ('German', r'german'),
        ('Mandarin', r'mandarin'),
        ("Francais", r'fran[cç]aise?'),
        ("Anglais", r'anglaise?'),
        ("Arab", r'arabe?'),
    ]
    return [
        label
        for label, spelling in known_languages
        if re.search(r'\b(?:' + spelling + r')\b', text, flags=re.IGNORECASE)
    ]


In [None]:
import re
from dateutil import parser
from datetime import datetime

# Parse a string into a date (fuzzy=True allows partial strings like "Jan 2020")
def parse_date(date_str):
    """Parse ``date_str`` into a ``datetime`` using dateutil's fuzzy parser.

    Fields missing from the string are taken from the default
    ``datetime(1900, 1, 1)``.

    Args:
        date_str: The text to parse.

    Returns:
        The parsed ``datetime``, or ``None`` when the string cannot be parsed.
    """
    try:
        return parser.parse(date_str, fuzzy=True, default=datetime(1900, 1, 1))
    except (ValueError, OverflowError, TypeError):
        # Only parse failures are swallowed (dateutil's ParserError is a
        # ValueError); KeyboardInterrupt and other exceptions still propagate.
        return None

# Extract only the "Experience" section
def extract_experience_section(text):
    """Return the lower-cased slice of ``text`` that holds the experience section.

    The section starts right after the first start marker ("experience",
    "expérience", "work history", "professional experience") and ends just
    before the earliest following end marker (education, formation,
    certification, skills, compétences, languages, langues). Markers are
    matched as whole words, so "formation" inside "information" does not end
    the section.

    Args:
        text: The full resume text.

    Returns:
        The section text in lower case; the rest of the document when no end
        marker follows; an empty string when no start marker is found.
    """
    # Normalize
    text = text.lower()

    # Accented French spellings are accepted next to the English ones
    start_pattern = r"\b(?:professional experience|work history|exp[ée]riences?)\b"
    end_pattern = (
        r"\b(?:[ée]ducation|formations?|certifications?|skills"
        r"|comp[ée]tences|languages|langues)\b"
    )

    start_match = re.search(start_pattern, text)
    if start_match is None:
        return ""
    start_idx = start_match.end()

    # A single alternation finds the earliest end marker by position, instead
    # of the first keyword in list order.
    end_match = re.search(end_pattern, text[start_idx:])
    end_idx = start_idx + end_match.start() if end_match else len(text)

    return text[start_idx:end_idx]

# Extract and compute total experience
def extract_total_experience(resume_text):
    """Sum the date ranges found in the experience section of a resume.

    Ranges look like "2020 - 2023", "jan 2021 to present" or
    "2019 à aujourd'hui": an optional month word plus a four-digit year, a
    separator ("-", "to" or "à"), then either another such date or an
    "ongoing" marker, which is replaced by the current date. Ranges are
    simply added up; overlapping ranges are counted more than once.

    Args:
        resume_text: The full resume text.

    Returns:
        A string such as "3 year(s) and 2 month(s)", or "Unknown" when no
        usable range is found.
    """
    experience_text = extract_experience_section(resume_text)

    # Normalise en/em dashes so one separator pattern covers them
    experience_text = experience_text.replace('–', '-').replace('—', '-')

    # Optional month word (letters only, so stray digits are not absorbed)
    # followed by a four-digit year
    date_point = r"(?:[^\W\d_]{3,9}[\s/-]*)?\d{4}"
    ongoing_markers = r"present|présent|current|aujourd['’]hui"
    range_pattern = (
        r"(" + date_point + r")"
        # An explicit alternation: the separator is "-", "to" or "à"
        r"\s*(?:-|to|à)\s*"
        r"(" + date_point + r"|" + ongoing_markers + r")"
    )
    date_ranges = re.findall(range_pattern, experience_text, re.IGNORECASE)

    total_months = 0
    now = datetime.now()

    for start_str, end_str in date_ranges:
        start_date = parse_date(start_str)
        # An end without a four-digit year can only be an "ongoing" marker
        if re.search(r"\d{4}", end_str):
            end_date = parse_date(end_str)
        else:
            end_date = now

        if start_date and end_date and end_date > start_date:
            months = (end_date.year - start_date.year) * 12 + (end_date.month - start_date.month)
            total_months += months

    if total_months == 0:
        return "Unknown"

    years = total_months // 12
    months = total_months % 12
    return f"{years} year(s) and {months} month(s)"

In [None]:
# Step 4: Define Your Skill Keywords (customize as needed)
# Passed to extract_skills() as its `skill_keywords` argument by the analysis cells.
# extract_skills() compares case-insensitively, so the casing used here only
# determines how a matched skill is spelled in the output.
common_skills = [
    "Python", "Java", "C++", "Machine Learning", "Deep Learning",
    "Data Analysis", "SQL", "Pandas", "Numpy", "Django", "Flask",
    "React", "JavaScript", "HTML", "CSS", "TensorFlow", "Keras",
]

In [None]:
from os import name  # NOTE(review): `name` is not used in this cell — looks like a stray auto-import; confirm before removing
# Step 5: Run the Analyzer
# Reads one resume PDF, then prints each extracted field on its own line.
# NOTE(review): the path is hard-coded and contains a space before ".pdf" — confirm it matches the real file name.
pdf_path = '/content/drive/MyDrive/cv/Cv Rjab Jawher .pdf'

# The PDF text is extracted once and reused by every extractor below.
resume_text = extract_text_from_pdf(pdf_path)
languages = extract_languages(resume_text)


print("Extracted Name:", extract_name(resume_text))
print("Extracted job_title:", extract_job_title(resume_text))
print("Extracted Email:", extract_email(resume_text))
print("Extracted Phone Number:", extract_phone_number(resume_text))
print("Extracted Skills:", extract_skills(resume_text, common_skills))
print("Extracted College Name(s):", extract_college_names(resume_text))
print("Extracted Degree(s):", extract_degrees(resume_text))
print("Total Experience:", extract_total_experience(resume_text))
print("Extracted Spoken Language(s):", languages)

In [None]:
def analyze_resume(pdf_path):
    """Extract the resume fields from a PDF, print them, and return them.

    Args:
        pdf_path: Path of the resume PDF to analyze.

    Returns:
        A dictionary with the keys "name", "job_title", "email", "phone",
        "skills", "colleges", "degrees", "experience" and "languages", each
        holding the value returned by the matching ``extract_*`` helper.
    """
    # Step 1: Extract text from the PDF
    resume_text = extract_text_from_pdf(pdf_path)

    # Step 2: Extract data
    results = {
        "name": extract_name(resume_text),
        "job_title": extract_job_title(resume_text),
        "email": extract_email(resume_text),
        "phone": extract_phone_number(resume_text),
        "skills": extract_skills(resume_text, common_skills),
        "colleges": extract_college_names(resume_text),
        "degrees": extract_degrees(resume_text),
        "experience": extract_total_experience(resume_text),
        "languages": extract_languages(resume_text),
    }

    # Step 3: Print results (label text and order are part of the output)
    report_lines = (
        ("Extracted Name:", "name"),
        ("Extracted job_title:", "job_title"),
        ("Extracted Email:", "email"),
        ("Extracted Phone Number:", "phone"),
        ("Extracted Skills:", "skills"),
        ("Extracted College Name(s):", "colleges"),
        ("Extracted Degree(s):", "degrees"),
        ("Total Experience:", "experience"),
        ("Extracted Spoken Language(s):", "languages"),
    )
    for label, key in report_lines:
        print(label, results[key])

    # Return the extracted values as a real dictionary so callers can use
    # them, rather than a set holding only a status message.
    return results


In [None]:
# Analyze the sample resume through the reusable entry point
pdf_path = '/content/drive/MyDrive/cv/Cv Rjab Jawher .pdf'
analyze_resume(pdf_path)