In [1]:
import os
import re
import pandas as pd
from glob import glob

In [2]:
def calculate_counts(file_content):
    """Return ``(word_count, special_char_count)`` for a resume's text.

    Words are the whitespace-separated tokens of ``file_content``. A special
    character is any single character that is neither a regex word character
    (letter, digit or underscore) nor whitespace.
    """
    tokens = file_content.split()
    specials = sum(1 for _ in re.finditer(r'[^\w\s]', file_content))
    return len(tokens), specials

In [3]:
def count_keywords(file_content, keywords):
    """Count case-insensitive occurrences of each keyword in a resume's text.

    Every keyword is matched as literal text: it is escaped before being
    handed to the regex engine, so keywords that contain regex metacharacters
    (for example ``C++`` or ``.NET``) are counted correctly instead of raising
    ``re.error`` or matching unintended text. Matching is substring-based, so
    ``Name`` is also counted inside ``Username``.

    Args:
        file_content: Full text of the resume.
        keywords: Iterable of keyword strings to look for.

    Returns:
        A dict mapping each keyword (exactly as given) to the number of
        non-overlapping, case-insensitive matches found in ``file_content``.
    """
    return {
        # re.escape keeps the keyword literal; without it '+', '.', '(' etc.
        # would be interpreted as regex syntax.
        keyword: len(re.findall(re.escape(keyword), file_content, re.IGNORECASE))
        for keyword in keywords
    }

In [4]:
def remove_contact_phrase(file_content):
    """Return ``file_content`` with every exact, case-sensitive occurrence of
    the phrase "Contact this candidate" removed."""
    pieces = file_content.split("Contact this candidate")
    return "".join(pieces)

def count_continued(file_content):
    """Return how many times the marker "continued..." (the word followed by
    three literal dots, matched case-insensitively) occurs in ``file_content``."""
    marker = re.compile(r'continued\.\.\.', re.IGNORECASE)
    return sum(1 for _ in marker.finditer(file_content))

def extract_resume_data(resume_file, folder_name, keywords):
    """Read one resume file and build the row of data describing it.

    The file is read as UTF-8 text and the phrase "Contact this candidate" is
    stripped before any counting takes place.

    Args:
        resume_file: Path of the resume text file.
        folder_name: Name of the folder the resume was found in; stored as the
            job title column.
        keywords: Keywords whose occurrences are counted, one column each.

    Returns:
        A dict with the file name, folder name, word count, special character
        count, "continued..." count and the cleaned resume text, followed by
        one entry per keyword.
    """
    with open(resume_file, 'r', encoding='utf-8') as handle:
        text = remove_contact_phrase(handle.read())

    n_words, n_specials = calculate_counts(text)

    record = {
        'Resume File Name': os.path.basename(resume_file),
        'Resume Folder (Job Title)': folder_name,
        'Word Count': n_words,
        'Special Characters Count': n_specials,
        'Continued Count': count_continued(text),
        'Resume': text,
    }

    # Keyword columns are merged last, so a keyword spelled exactly like one
    # of the fixed column names above would replace that column's value.
    record.update(count_keywords(text, keywords))
    return record

In [5]:
def process_resumes_to_csv(main_folder, output_csv, keywords):
    """Collect data for every resume under ``main_folder`` and write one CSV.

    Each immediate subfolder of ``main_folder`` is treated as a job title and
    every ``*.txt`` file directly inside it as one resume. Folders and files
    are visited in sorted order so the CSV rows come out in the same order on
    every run and every platform.

    Args:
        main_folder: Directory containing one subfolder per job title.
        output_csv: Path of the CSV file to write (overwritten if present).
        keywords: Keywords passed through to ``extract_resume_data``.

    Side effects:
        Writes ``output_csv`` without an index column and prints a
        confirmation line.
    """
    # Local import: the module-level name `glob` is bound to the function,
    # so the module's `escape` helper has to be pulled in separately.
    from glob import escape as _glob_escape

    all_resumes = []

    # os.listdir returns entries in arbitrary, filesystem-dependent order;
    # sorting keeps the output reproducible.
    for folder in sorted(os.listdir(main_folder)):
        folder_path = os.path.join(main_folder, folder)

        # Skip stray files sitting next to the job-title folders.
        if not os.path.isdir(folder_path):
            continue

        # Escape the folder path so characters such as '[' or '*' in a job
        # title are matched literally rather than treated as glob wildcards.
        pattern = os.path.join(_glob_escape(folder_path), "*.txt")
        for resume_file in sorted(glob(pattern)):
            all_resumes.append(extract_resume_data(resume_file, folder, keywords))

    # Build the frame once from the list of row dicts, then save it.
    df = pd.DataFrame(all_resumes)
    df.to_csv(output_csv, index=False)
    print(f"CSV saved as {output_csv}")

In [6]:
# Section headings and profile links to count in each resume (one CSV column per entry)
keywords = [
    'Name',
    'Contact',
    'Professional Summary',
    'Skills',
    'Education',
    'Experience',
    'Certifications',
    'Accomplishments',
    'Achievements',
    'Hobbies',
    'Languages',
    'Linkedin',
    'Github',
]

In [7]:
# Where the combined CSV is written
output_csv = 'resumes_data.csv'
# Root folder holding one subfolder per job title, each containing resume .txt files
main_folder = 'postjobfree'  # Update with your main folder path

In [8]:
# Build the CSV from every resume found under main_folder
process_resumes_to_csv(
    main_folder=main_folder,
    output_csv=output_csv,
    keywords=keywords,
)

CSV saved as resumes_data.csv
