In [5]:
import os
import docx
import pandas as pd

# Function to extract text from a DOCX file
def extract_text_from_docx(file_path):
    doc = docx.Document(file_path)
    full_text = []
    for para in doc.paragraphs:
        full_text.append(para.text)
    return '\n'.join(full_text)

# Function to extract skills and experience from the resume text
def extract_skills_and_experience(resume_text):
    skills = []
    experience = None

    # Define a list of skills you want to search for in the resume
    predefined_skills = ['python', 'java', 'sql', 'html', 'css', 'machine learning', 'data science']

    # Check for skills in the resume text
    for skill in predefined_skills:
        if skill.lower() in resume_text.lower():
            skills.append(skill)

    # Check for years of experience (simple regex for "X years")
    import re
    experience_match = re.search(r'(\d+)\s*years?', resume_text, re.IGNORECASE)
    if experience_match:
        experience = int(experience_match.group(1))

    return skills, experience

# Function to process all DOCX files in the folder
def process_folder(folder_path):
    result = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".docx"):
            file_path = os.path.join(folder_path, filename)
            resume_text = extract_text_from_docx(file_path)
            skills, experience = extract_skills_and_experience(resume_text)
            result.append({
                'Name': filename.replace('.docx', ''),
                'Skills': ', '.join(skills),
                'Years of Experience': experience if experience else 0
            })
    return result

# Function to save the extracted data to an Excel file
def save_to_excel(data, output_file):
    # Ensure the directory exists
    output_dir = os.path.dirname(output_file)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    df = pd.DataFrame(data)
    df.to_excel(output_file, index=False)

# Define folder path and output file path
folder_path = r'C:\Users\harbi\OneDrive\Desktop\resume_screening\Resumes'  # Corrected path
output_file = r'C:\Users\harbi\OneDrive\Desktop\resume_screeningextracted_data.xlsx'  # Path to save the Excel file

# Process all DOCX files in the folder
extracted_data = process_folder(folder_path)

# Save the extracted data to an Excel file
save_to_excel(extracted_data, output_file)

print(f"Data extraction complete. The results have been saved to {output_file}.")


Data extraction complete. The results have been saved to C:\Users\harbi\OneDrive\Desktop\resume_screeningextracted_data.xlsx.


In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the extracted resume data (skills and experience)
df = pd.read_excel(r'C:\Users\harbi\OneDrive\Desktop\Resumes\extracted_data.xlsx')

# Function to take user input for skills and years of experience
def get_best_resumes():
    # Ask the user for required skills and years of experience
    required_skills = input("Enter the required skills (comma separated): ").lower().split(",")
    required_experience = int(input("Enter the required years of experience: "))
    
    # Clean up skills input (remove spaces)
    required_skills = [skill.strip() for skill in required_skills]
    
    # Filter resumes based on required experience
    matching_resumes = df[df['Years of Experience'] >= required_experience]
    
    # Function to check if a resume matches the required skills
    def match_skills(resume_skills):
        # Check if the resume_skills is not NaN or empty
        if isinstance(resume_skills, str):
            # Get skills from the resume (convert to lowercase)
            resume_skills = [skill.strip().lower() for skill in resume_skills.split(",")]
        
            # Check if the required skills are present in the resume
            matched_skills = [skill for skill in required_skills if skill in resume_skills]
        
            return len(matched_skills) == len(required_skills)
        else:
            return False  # If resume_skills is NaN or not a string, return False
    
    # Filter resumes based on skills match
    best_matches = matching_resumes[matching_resumes['Skills'].apply(match_skills)]
    
    # If no matching resumes based on skills, return an empty result
    if best_matches.empty:
        print("No matching resumes found based on skills.")
        return

    # Use cosine similarity to find the best matches based on skills
    vectorizer = TfidfVectorizer(stop_words='english')
    skill_matrix = vectorizer.fit_transform(best_matches['Skills'])
    
    # Create the skill vector for the user's input
    user_skill_input = ", ".join(required_skills)
    user_skill_vector = vectorizer.transform([user_skill_input])
    
    # Calculate cosine similarities
    similarity_scores = cosine_similarity(user_skill_vector, skill_matrix)
    best_matches['Similarity'] = similarity_scores.flatten()
    
    # Sort the resumes by similarity score in descending order
    best_matches_sorted = best_matches.sort_values(by='Similarity', ascending=False)
    
    # Show the best matching resumes
    print("\nBest matching resumes based on your input:")
    result = best_matches_sorted[['Name', 'Skills', 'Years of Experience', 'Similarity']]
    print(result[['Name', 'Skills', 'Years of Experience', 'Similarity']].to_string(index=False))

# Run the function
get_best_resumes()



Best matching resumes based on your input:
                                  Name                                                       Skills  Years of Experience  Similarity
                Prashant Chawda_resume                                                   java, html                   19    0.665155
                  Ashok Jayakumar - PM                                              java, sql, html                   12    0.547158
                         BA - Abhishek                                              java, sql, html                   11    0.547158
              Othman - Project Manager                                              java, sql, html                   10    0.547158
             Amrinder Business Analyst                                              java, sql, html                   10    0.547158
   Ranjan_Project Manager-Scrum Master                                              java, sql, html                   14    0.547158
                         

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_matches['Similarity'] = similarity_scores.flatten()


In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random

# Load the extracted resume data (skills and experience)
df = pd.read_excel(r'C:\Users\harbi\OneDrive\Desktop\Resumes\extracted_data.xlsx')

# Function to take user input for skills and years of experience
def get_best_resumes():
    # Ask the user for required skills and years of experience
    required_skills = input("Enter the required skills (comma separated): ").lower().split(",")
    required_experience = int(input("Enter the required years of experience: "))
    
    # Clean up skills input (remove spaces)
    required_skills = [skill.strip() for skill in required_skills]
    
    # Filter resumes based on required experience
    matching_resumes = df[df['Years of Experience'] >= required_experience]
    
    # Assign random weights to skills and experience
    skill_weight = random.uniform(0.3, 0.7)  # Random weight between 0.3 and 0.7
    experience_weight = 1 - skill_weight  # Experience weight is the complementary value
    
    print(f"Using skill weight: {skill_weight:.2f}")
    print(f"Using experience weight: {experience_weight:.2f}")
    
    # Function to check if a resume matches the required skills
    def match_skills(resume_skills):
        # Check if the resume_skills is not NaN or empty
        if isinstance(resume_skills, str):
            # Get skills from the resume (convert to lowercase)
            resume_skills = [skill.strip().lower() for skill in resume_skills.split(",")]
        
            # Check if the required skills are present in the resume
            matched_skills = [skill for skill in required_skills if skill in resume_skills]
        
            return len(matched_skills) == len(required_skills)
        else:
            return False  # If resume_skills is NaN or not a string, return False
    
    # Filter resumes based on skills match
    best_matches = matching_resumes[matching_resumes['Skills'].apply(match_skills)]
    
    # If no matching resumes based on skills, return an empty result
    if best_matches.empty:
        print("No matching resumes found based on skills.")
        return

    # Use cosine similarity to find the best matches based on skills
    vectorizer = TfidfVectorizer(stop_words='english')
    skill_matrix = vectorizer.fit_transform(best_matches['Skills'])
    
    # Create the skill vector for the user's input
    user_skill_input = ", ".join(required_skills)
    user_skill_vector = vectorizer.transform([user_skill_input])
    
    # Calculate cosine similarities
    similarity_scores = cosine_similarity(user_skill_vector, skill_matrix)
    best_matches['Skill_Similarity'] = similarity_scores.flatten()
    
    # Normalize the Skill_Similarity score (between 0 and 1)
    best_matches['Skill_Similarity'] = (best_matches['Skill_Similarity'] - best_matches['Skill_Similarity'].min()) / (best_matches['Skill_Similarity'].max() - best_matches['Skill_Similarity'].min())
    
    # Calculate experience score (normalized)
    best_matches['Experience_Score'] = best_matches['Years of Experience'].apply(lambda x: min(x / required_experience, 1) if x >= required_experience else 0)
    
    # Calculate final score by combining skill similarity and experience score
    best_matches['Final_Score'] = (best_matches['Skill_Similarity'] * skill_weight) + (best_matches['Experience_Score'] * experience_weight)
    
    # Sort the resumes by final score in descending order
    best_matches_sorted = best_matches.sort_values(by='Final_Score', ascending=False)
    
    # Show the best matching resumes
    print("\nBest matching resumes based on your input:")
    result = best_matches_sorted[['Name', 'Skills', 'Years of Experience', 'Final_Score']]
    print(result[['Name', 'Skills', 'Years of Experience', 'Final_Score']].to_string(index=False))

# Run the function
get_best_resumes()


Using skill weight: 0.41
Using experience weight: 0.59

Best matching resumes based on your input:
                                  Name                                     Skills  Years of Experience  Final_Score
 Resume - Kishore Kotapati - 218E - BA                             sql, html, css                   15     1.000000
                       Jagan S Iyer PM                             sql, html, css                   20     1.000000
                              Rajesh_k                       java, sql, html, css                   25     0.854979
                             avinash G                       java, sql, html, css                   50     0.854979
      B Suresh Kumar_Project Manager_1               python, java, sql, html, css                   16     0.724917
Ravi Pattar- Sr. Agile Program manager python, java, sql, html, css, data science                   17     0.585280


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_matches['Skill_Similarity'] = similarity_scores.flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_matches['Skill_Similarity'] = (best_matches['Skill_Similarity'] - best_matches['Skill_Similarity'].min()) / (best_matches['Skill_Similarity'].max() - best_matches['Skill_Similarity'].min())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_