# In [5]:  (Jupyter notebook cell prompt, kept for reference)
import os
import spacy
import json
import csv
import re
import logging
from pyresparser import ResumeParser
from pdfminer.high_level import extract_text  # To extract text from PDF files
from tqdm import tqdm  # For progress tracking

# Logging: only records at ERROR level or above are written, and they go to
# the file resume_extraction.log with a timestamp and the level name.
logging.basicConfig(
    format='%(asctime)s %(levelname)s: %(message)s',
    level=logging.ERROR,
    filename='resume_extraction.log',
)

# Module-level SpaCy pipeline; the name and company extractors below call `nlp`.
nlp = spacy.load("en_core_web_sm")

# Generalized function to extract names using SpaCy NER, regex, and comparison with ResumeParser
import re
import spacy

# The SpaCy pipeline `nlp` is already loaded once near the top of this file and
# is reused here; loading "en_core_web_sm" a second time would only repeat work.

def extract_name_general(resume_text, resume_parser_name=None):
    """Return the most likely candidate name found in ``resume_text``.

    Strategy, in order:

    1. If ``resume_parser_name`` is given, run the module-level SpaCy pipeline
       ``nlp`` and return the first PERSON entity whose text contains that
       name (case-insensitive).
    2. Otherwise, or if no entity matched, scan the text line by line (top to
       bottom) and return the first "First [Middle ...] Last" match.

    The regex is applied per line on purpose: ``\\s`` also matches newlines, so
    searching the whole text at once would glue a name to the capitalised
    words of the following line (e.g. ``"John Smith\\nSoftware Engineer"``).
    Scanning lines in order also means the first line is tried first, which
    is what the old, unreachable "first line" fallback was meant to do.

    Args:
        resume_text: Plain text of the resume. Empty or ``None`` yields ``None``.
        resume_parser_name: Optional name already produced by another parser,
            used to pick among SpaCy's PERSON entities.

    Returns:
        The matched name as a string, or ``None`` when nothing was found.
    """
    if not resume_text:
        return None

    # The NER pass is only useful for cross-checking against the parser's
    # name, so skip it entirely when there is nothing to compare with.
    if resume_parser_name:
        wanted = resume_parser_name.lower()
        for ent in nlp(resume_text).ents:
            if ent.label_ == "PERSON" and wanted in ent.text.lower():
                return ent.text

    # Regex pattern to extract names: First Name, Optional Middle Name(s), Last Name,
    # optionally followed by hyphenated or additional capitalised parts.
    name_pattern = re.compile(
        r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+[A-Z][a-z]+(?:[-\s][A-Z][a-z]+)*)\b'
    )
    for line in resume_text.splitlines():
        name_match = name_pattern.search(line)
        if name_match:
            return name_match.group(0)

    # No line looked like a name.
    return None


# The SpaCy pipeline `nlp` is already loaded once near the top of this file and
# is reused here; loading "en_core_web_sm" again would only repeat work.

def extract_company_names(resume_text):
    """Collect candidate company names from ``resume_text``.

    Candidates come from three sources, gathered in this order:

    1. SpaCy ORG entities longer than two characters (via the module-level
       ``nlp`` pipeline).
    2. Regex matches of capitalised words followed by a company suffix
       (``Inc``, ``Ltd``, ``Technologies`` ...).
    3. Whole lines that contain any of the company keywords.

    Candidates are then filtered: anything containing an "irrelevant" term
    (job titles, skills ...) is dropped, and what remains must either contain
    a company keyword or consist of more than one word.

    NOTE: both keyword and irrelevant-term checks are plain, case-sensitive
    substring tests, so e.g. the term ``'System'`` also rejects names that
    contain ``'Systems'``.

    Args:
        resume_text: Plain text of the resume.

    Returns:
        A list of unique names in first-seen order. (Previously the result
        went through ``set`` and its order changed from run to run.)
    """
    company_keywords = [
        'Inc', 'Corp', 'Ltd', 'Technologies', 'Systems', 'Solutions', 'LLC', 'Pvt', 'Private Limited',
        'Limited', 'Labs', 'Consulting', 'Group', 'Partners', 'Industries', 'Services', 'Networks'
    ]
    # Terms that mark a candidate as a job title / skill rather than a company.
    irrelevant_terms = [
        'Engineer', 'QA', 'SQL', 'Developer', 'Tester', 'Testing', 'Data', 'Planning',
        'Consultant', 'Security', 'Audit', 'Manager', 'Analysis', 'System', 'Service', 'Methods', 'Owner', 'Demo'
    ]

    candidates = []

    # Source 1: organisations detected by SpaCy.
    doc = nlp(resume_text)
    candidates.extend(
        ent.text.strip() for ent in doc.ents
        if ent.label_ == "ORG" and len(ent.text) > 2
    )

    # Source 2: "<Capitalised words> <suffix>" patterns. findall() returns
    # (full_name, suffix) tuples because the pattern has two groups.
    company_pattern = re.compile(
        r'\b([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s+(Inc|Corp|Ltd|LLC|Pvt|Technologies|Solutions|Systems|Private Limited|Limited|Labs|Consulting|Group|Partners|Industries|Services|Networks))\b'
    )
    candidates.extend(match[0].strip() for match in company_pattern.findall(resume_text))

    # Source 3: any line mentioning a company keyword.
    for line in resume_text.split('\n'):
        if any(keyword in line for keyword in company_keywords):
            candidates.append(line.strip())

    # De-duplicate while keeping first-seen order, applying both filters.
    seen = set()
    company_names = []
    for name in candidates:
        if name in seen:
            continue
        seen.add(name)
        if any(term in name for term in irrelevant_terms):
            continue
        if any(keyword in name for keyword in company_keywords) or len(name.split()) > 1:
            company_names.append(name)

    return company_names


# Generalized function to extract college names using regex and keyword-based patterns
def extract_college_names(resume_text):
    """Collect candidate college / university names from ``resume_text``.

    For every line of the text, two kinds of candidates are gathered:

    * regex matches of a capitalised word followed by an institution keyword
      (e.g. ``"Stanford University"``), and
    * the whole stripped line, if it contains one of ``college_keywords``.

    The keyword alternation is a non-capturing group so that the *full* match
    is collected; with a capturing group ``findall`` returned only the bare
    keyword (``"University"``). Matching is done per line because ``\\s`` also
    matches newlines and would otherwise join words from adjacent lines.

    Args:
        resume_text: Plain text of the resume.

    Returns:
        A list of unique candidates in first-seen order (empty if none).
    """
    college_keywords = ['University', 'Institute', 'College', 'Academy', 'School of', 'Faculty of']
    college_pattern = re.compile(
        r'[A-Z][a-zA-Z]+\s+(?:University|Institute|College|Academy|School|Faculty of)'
    )

    college_names = []
    for line in resume_text.split('\n'):
        # Short "<Name> <keyword>" matches inside the line.
        college_names.extend(match.group(0) for match in college_pattern.finditer(line))
        # The full line, which usually carries the complete institution name.
        if any(keyword in line for keyword in college_keywords):
            college_names.append(line.strip())

    # dict.fromkeys removes duplicates while preserving insertion order.
    return list(dict.fromkeys(college_names))

# Generalized function to extract total experience using regex
def extract_total_experience(resume_text):
    """Return the first experience duration mentioned in ``resume_text``.

    Two patterns are tried in order:

    1. ``"X years Y months"`` (the month count is optional, the word
       "month(s)" is not).
    2. A single unit such as ``"5 years"``, ``"10+ years"`` or ``"6 months"``.

    Matching is case-insensitive (``"5 Years"`` is accepted) and the leading
    number may have a decimal part, so ``"3.5 years"`` is returned whole
    instead of being truncated to ``"5 years"``.

    Args:
        resume_text: Plain text of the resume.

    Returns:
        The matched substring exactly as it appears in the text, or the
        string ``"Experience not specified"`` when nothing matches.
    """
    # Pattern matching for 'X years Y months'
    experience_pattern = re.compile(
        r'\d+(?:\.\d+)?\s*years?\s*\d*\s*months?', re.IGNORECASE
    )
    match = experience_pattern.search(resume_text)
    if match:
        return match.group(0)

    # Additional pattern for 'X years', 'X+ years' or 'X months'
    experience_pattern_plus = re.compile(
        r'\d+(?:\.\d+)?\+?\s*(?:years?|months?)', re.IGNORECASE
    )
    match_plus = experience_pattern_plus.search(resume_text)
    if match_plus:
        return match_plus.group(0)

    return "Experience not specified"

# Function to extract text from a PDF file
def extract_resume_text(file_path):
    """Return the text of the PDF at ``file_path``, or ``""`` on any failure.

    Extraction is delegated to ``extract_text``; any exception it raises is
    logged as an error and swallowed so that one bad file does not stop a
    batch run.
    """
    text = ""
    try:
        text = extract_text(file_path)
    except Exception as e:
        logging.error(f"Failed to extract text from {file_path}: {e}")
    return text

# ---------------------------------------------------------------------------
# Batch driver: parse every PDF resume in `folder_path`, enrich the
# ResumeParser output with the custom extractors above, and save the combined
# records as JSON and CSV.
# ---------------------------------------------------------------------------

# Specify the folder path with resume PDF files
folder_path = r"C:\Users\91901\OneDrive\Desktop\resumes"  # Change this to your actual folder path

# Output locations. Both are defined up front so the final status message can
# reference them even when no resume produced any data.
json_output_path = r"C:\Users\91901\OneDrive\Desktop\Saved.json"  # Change this to a valid output path
csv_output_path = r"C:\Users\91901\OneDrive\Desktop\Saved.csv"  # Change this to a valid output path

# List to hold the extracted data from all resumes
extracted_data = []

# Match the extension case-insensitively (".PDF" is common on Windows) and
# sort so that the processing and output order is the same on every run.
resume_files = sorted(f for f in os.listdir(folder_path) if f.lower().endswith(".pdf"))

# Iterate through all the files in the folder (tqdm shows a progress bar)
for filename in tqdm(resume_files, desc="Processing resumes"):
    resume_path = os.path.join(folder_path, filename)

    try:
        # Parse the resume using pyresparser
        data = ResumeParser(resume_path).get_extracted_data()
        if data is None:
            # Nothing to enrich; record it explicitly instead of failing later
            # with an AttributeError on data.get(...).
            logging.error(f"ResumeParser returned no data for {filename}")
            continue

        # Extract text from the PDF for custom extraction ("" if it fails)
        resume_text = extract_resume_text(resume_path)

        # Generalized name extraction (compared against the ResumeParser result)
        extracted_name = extract_name_general(resume_text, data.get('name'))
        if extracted_name:
            data['name'] = extracted_name  # Override the name field with the custom result

        # Generalized company names extraction
        company_names = extract_company_names(resume_text)
        if company_names:
            data['company_names'] = company_names

        # Generalized total experience extraction
        total_experience = extract_total_experience(resume_text)
        if total_experience:
            data['total_experience'] = total_experience

        # Generalized college extraction
        extracted_colleges = extract_college_names(resume_text)
        if extracted_colleges:
            data['college_name'] = extracted_colleges

        # Append the extracted data along with filename for reference
        data['file_name'] = filename
        extracted_data.append(data)

    except Exception as e:
        # Best-effort batch: log the failure (with traceback) and move on to
        # the next file.
        logging.exception(f"An error occurred while processing {filename}: {e}")

# Save the extracted data to a JSON file
with open(json_output_path, 'w', encoding='utf-8') as json_file:
    json.dump(extracted_data, json_file, ensure_ascii=False, indent=4)

# Save the extracted data to a CSV file
if extracted_data:  # Only save if we have data
    # Use the union of keys across all records (first-seen order) as the
    # header: DictWriter raises ValueError if a row has a key that is not in
    # fieldnames, and records are not guaranteed to share the same keys.
    csv_columns = list(dict.fromkeys(key for record in extracted_data for key in record))

    with open(csv_output_path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=csv_columns, restval='')
        writer.writeheader()
        writer.writerows(extracted_data)

    print(f"Data has been saved to {json_output_path} and {csv_output_path}")
else:
    print(f"No resume data was extracted; an empty list was saved to {json_output_path}")










# Output of the cell above:
# Processing resumes: 100%|████████████████████████████████████████████████████████████| 115/115 [06:45<00:00,  1.97s/it]
# Processing resumes: 100%|████████████████████████████████████████████████████████████| 115/115 [06:45<00:00,  3.52s/it]
#
# Data has been saved to C:\Users\91901\OneDrive\Desktop\Saved.json and C:\Users\91901\OneDrive\Desktop\Saved.csv



