In [82]:
import json
import os
import re

import nltk
import numpy as np
import openai
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity as sklearn_cosine_similarity

# Read the OpenAI API key from the environment instead of embedding it in the notebook.
# Fails fast with a KeyError when OPENAI_API_KEY is not set.
# NOTE(review): a literal secret key was previously committed on this line; it must be
# treated as leaked and revoked/rotated.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Download required NLTK data (stop-word list and the 'punkt' tokenizer models)
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/parthvinm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/parthvinm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [83]:
# Define text normalization function
def normalize_text(text: str) -> list:
    """Tokenize, clean, stop-word-filter and stem ``text``.

    Steps, in order:
      1. tokenize with ``word_tokenize``;
      2. strip every non-ASCII-letter character from each token and lowercase it;
      3. drop tokens that became empty or that are English stop words;
      4. stem the survivors with ``PorterStemmer``.

    Args:
        text: Raw text to normalize.

    Returns:
        The list of stemmed tokens. When nothing survives the filtering, the
        placeholder list ``["default", "content"]`` is returned so downstream
        vectorizers never receive an empty vocabulary.
    """
    # Materialize the stop-word list once per call as a set. Evaluating
    # stopwords.words('english') inside the filter re-read it for every token.
    english_stopwords = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    stems = []
    for token in word_tokenize(text):
        cleaned = re.sub('[^a-zA-Z]', '', token).lower()
        if cleaned and cleaned not in english_stopwords:
            stems.append(stemmer.stem(cleaned))

    if not stems:
        return ["default", "content"]  # Default text to avoid empty vocabulary issues

    return stems


In [84]:
# Define cosine similarity function for text documents
def cosine_similarity_text(document1: str, document2: str) -> float:
    """Return the TF-IDF cosine similarity of two text documents.

    A fresh ``TfidfVectorizer`` is fitted on just these two documents, so the
    score reflects their overlap relative to each other only.

    Args:
        document1: First document.
        document2: Second document.

    Returns:
        The cosine similarity as a Python float. ``0.0`` is returned when the
        vectorizer cannot build a vocabulary from the two documents (for
        example both are empty), since there is then nothing to compare.
    """
    vectorizer = TfidfVectorizer()
    try:
        vectors = vectorizer.fit_transform([document1, document2])
    except ValueError:
        # fit_transform raises ValueError when no usable token is found in
        # either document; report "no similarity" instead of aborting the
        # caller's row-wise apply.
        return 0.0
    return sklearn_cosine_similarity(vectors[0], vectors[1]).item()


In [85]:
# # Truncate text to a specified token limit
# def truncate_text(text: str, token_limit: int = 512) -> str:
#     """Truncate text to a specified number of tokens."""
#     if isinstance(text, str):
#         words = text.split()
#         return " ".join(words[:token_limit])
#     return ""

In [86]:
# Function to split text into chunks
def split_into_chunks(text, max_tokens=512):
    """Lazily yield consecutive pieces of ``text`` of at most ``max_tokens`` words.

    "Words" are whitespace-separated substrings (``str.split``), and each
    yielded chunk is those words re-joined with single spaces. Text without
    any word yields nothing.
    """
    tokens = text.split()
    chunk_starts = range(0, len(tokens), max_tokens)
    yield from (
        " ".join(tokens[start:start + max_tokens])
        for start in chunk_starts
    )

# Define cosine similarity function using OpenAI embeddings with chunking
def cosine_similarity_openai(document1: str, document2: str) -> float:
    """Return the cosine similarity of two documents using OpenAI embeddings.

    Each document is split into word chunks with ``split_into_chunks``, all
    chunks are embedded in a single ``text-embedding-ada-002`` request, the
    chunk embeddings of each document are averaged, and the cosine similarity
    of the two averaged vectors is returned.

    Args:
        document1: First document.
        document2: Second document.

    Returns:
        The cosine similarity as a Python float, or ``0.0`` when either
        document contains no words.

    NOTE(review): ``openai.Embedding.create`` looks like the pre-1.0 SDK
    interface - confirm the pinned ``openai`` version before upgrading.
    """
    # Split documents into chunks
    chunks1 = list(split_into_chunks(document1))
    chunks2 = list(split_into_chunks(document2))

    # A document without words has no chunks. Averaging zero embeddings would
    # produce NaN, so skip the API call and report no similarity.
    if not chunks1 or not chunks2:
        return 0.0

    # Generate embeddings for all chunks in one request
    response = openai.Embedding.create(
        input=chunks1 + chunks2,
        model="text-embedding-ada-002"
    )
    embeddings = [data['embedding'] for data in response['data']]

    # The first len(chunks1) embeddings belong to document1, the rest to document2
    embeddings1 = np.array(embeddings[:len(chunks1)])
    embeddings2 = np.array(embeddings[len(chunks1):])

    # Average the chunk embeddings (unweighted) into one vector per document
    average_embedding1 = np.mean(embeddings1, axis=0)
    average_embedding2 = np.mean(embeddings2, axis=0)

    # Calculate cosine similarity between the averaged embeddings
    cosine_similarity_score = sklearn_cosine_similarity(
        average_embedding1.reshape(1, -1),
        average_embedding2.reshape(1, -1)
    )
    return cosine_similarity_score.item()

In [87]:
# Function to extract and process fields
def extract_and_process_field(json_data, field_name):
    """Return the text of one field of a parsed JSON object, always as a string.

    Args:
        json_data: Parsed JSON object (a dict). Anything without a ``.get``
            method yields ``""``.
        field_name: Key of the field to extract.

    Returns:
        * ``""`` when the field is missing or is ``None`` (JSON ``null``);
        * the string itself when the field is a string;
        * for a dict, its string-valued entries rendered as ``"key: value"``
          and joined with spaces (non-string values are dropped);
        * for a list, each item rendered (dicts as above, ``None`` skipped,
          everything else via ``str``) and joined with spaces;
        * ``str(field)`` for any other scalar.
    """
    def _flatten(mapping):
        # Only string-valued entries are kept, rendered as "key: value".
        return " ".join(f"{k}: {v}" for k, v in mapping.items() if isinstance(v, str))

    try:
        field = json_data.get(field_name, "")
    except AttributeError:
        # json_data is not a mapping (e.g. None or a top-level JSON list).
        return ""

    if field is None:
        return ""
    if isinstance(field, str):
        return field
    if isinstance(field, dict):
        return _flatten(field)
    if isinstance(field, list):
        processed_items = []
        for item in field:
            if item is None:
                # A JSON null carries no text; do not emit the literal "None".
                continue
            if isinstance(item, dict):
                processed_items.append(_flatten(item))
            else:
                processed_items.append(str(item))
        return " ".join(processed_items)
    # Numbers / booleans: coerce so callers can always tokenize the result.
    return str(field)

In [88]:
# Function to extract and concatenate all text from a resume
def extract_full_resume_text(json_data):
    """Return all text of a parsed resume JSON object as one string.

    This function was a line-for-line copy of ``extract_full_text``; it now
    delegates to it so the two cannot drift apart.

    Args:
        json_data: Parsed JSON object (a dict).

    Returns:
        Whatever ``extract_full_text`` returns for ``json_data``.
    """
    return extract_full_text(json_data)

In [89]:
# Function to extract and concatenate all text from a JSON
def extract_full_text(json_data):
    """Concatenate all text found in a parsed JSON object into one string.

    Top-level values are visited in the object's own order:

    * a list contributes one piece per item (dict items are flattened, other
      items go through ``str``);
    * a dict is flattened;
    * any other value goes through ``str``.

    "Flattened" means the dict's string-valued entries rendered as
    ``"key: value"`` and joined with spaces; non-string values are dropped.
    ``None`` values (JSON ``null``), at top level or inside a list, are
    skipped so they do not add a literal "None" token to the text.

    Args:
        json_data: Parsed JSON object (a dict).

    Returns:
        The space-joined text, or ``""`` when ``json_data`` is not a mapping.
    """
    def _flatten(mapping):
        # Only string-valued entries are kept, rendered as "key: value".
        return " ".join(f"{k}: {v}" for k, v in mapping.items() if isinstance(v, str))

    try:
        entries = json_data.items()
    except AttributeError:
        # Not a mapping (e.g. None or a top-level JSON list).
        return ""

    sections = []
    for _key, value in entries:
        if value is None:
            continue
        if isinstance(value, list):
            for item in value:
                if item is None:
                    continue
                sections.append(_flatten(item) if isinstance(item, dict) else str(item))
        elif isinstance(value, dict):
            sections.append(_flatten(value))
        else:
            sections.append(str(value))
    return " ".join(sections)

In [90]:
# Process each row of the dataframe
def process_resume_row(row, threshold=0.3):
    """Compute the five similarity scores for one row of the resume DataFrame.

    Args:
        row: Row (or mapping) with the keys ``'resume_json'``, ``'new_resume'``
            and ``'job_json'``, each holding a JSON document as a string.
        threshold: Minimum TF-IDF similarity that both the work-experience and
            the projects sections must reach before the OpenAI-based job-match
            scores are computed; below it both job-match scores are 0.0.

    Returns:
        ``pd.Series`` of five floats rounded to 2 decimals, in this order:
        work-experience similarity, projects similarity, full-resume
        similarity (all original vs. new resume, TF-IDF), original-resume vs.
        job similarity, new-resume vs. job similarity (both OpenAI
        embeddings). All five are 0.0 when any of the three inputs is missing
        or cannot be parsed as JSON.
    """
    def _zero_scores():
        # Fresh Series per call so callers never share a mutable object.
        return pd.Series([0.0, 0.0, 0.0, 0.0, 0.0])

    def _normalized(text):
        # normalize_text returns a token list; the vectorizers need a string.
        return " ".join(normalize_text(text))

    # Missing cells are read from CSV as NaN. json.loads(NaN) raises TypeError,
    # so all three inputs are checked, not only new_resume.
    if any(pd.isna(row[column]) for column in ('new_resume', 'resume_json', 'job_json')):
        return _zero_scores()

    try:
        # Load JSON objects
        resume_json = json.loads(row['resume_json'])
        new_resume_json = json.loads(row['new_resume'])
        job_json = json.loads(row['job_json'])
    except (json.JSONDecodeError, TypeError):
        # Malformed JSON, or a cell that is not a string at all.
        return _zero_scores()

    # Extract, then normalize, work experience and projects of both resumes
    work_ex_resume_normalized = _normalized(extract_and_process_field(resume_json, 'work_experience'))
    projects_resume_normalized = _normalized(extract_and_process_field(resume_json, 'projects'))
    work_ex_new_resume_normalized = _normalized(extract_and_process_field(new_resume_json, 'work_experience'))
    projects_new_resume_normalized = _normalized(extract_and_process_field(new_resume_json, 'projects'))

    # Extract, then normalize, the full texts
    full_resume_normalized = _normalized(extract_full_text(resume_json))
    full_new_resume_normalized = _normalized(extract_full_text(new_resume_json))
    full_job_normalized = _normalized(extract_full_text(job_json))

    # Ensure non-empty normalized text for TF-IDF similarity calculation.
    # NOTE(review): normalize_text substitutes placeholder tokens for empty
    # input, so as written this guard does not fire and two empty sections
    # compare as identical - confirm which behavior is intended.
    section_texts = (
        work_ex_resume_normalized,
        work_ex_new_resume_normalized,
        projects_resume_normalized,
        projects_new_resume_normalized,
    )
    if any(not section.strip() for section in section_texts):
        return _zero_scores()

    # Calculate cosine similarities for text
    work_ex_score = cosine_similarity_text(work_ex_resume_normalized, work_ex_new_resume_normalized)
    projects_score = cosine_similarity_text(projects_resume_normalized, projects_new_resume_normalized)
    full_resume_score = cosine_similarity_text(full_resume_normalized, full_new_resume_normalized)

    # Apply threshold: the (paid) OpenAI job-match comparison only runs when
    # both section similarities reach it.
    if work_ex_score < threshold or projects_score < threshold:
        resume_job_score = 0.0
        new_resume_job_score = 0.0
    else:
        resume_job_score = cosine_similarity_openai(full_resume_normalized, full_job_normalized)
        new_resume_job_score = cosine_similarity_openai(full_new_resume_normalized, full_job_normalized)

    # Round similarity scores to 2 decimal places
    return pd.Series([
        round(work_ex_score, 2),
        round(projects_score, 2),
        round(full_resume_score, 2),
        round(resume_job_score, 2),
        round(new_resume_job_score, 2)
    ])


In [91]:
# Score every row of 'new_resumes.csv' with process_resume_row's default threshold.
df = pd.read_csv('new_resumes.csv')

# One output column per score returned by process_resume_row, in the same order.
similarity_columns = [
    'work_ex_similarity',
    'projects_similarity',
    'full_resume_similarity',
    'resume_job_similarity',
    'new_resume_job_similarity',
]
df[similarity_columns] = df.apply(process_resume_row, axis=1)

# Relative job-match gain of the new resume over the original, in percent.
# A zero baseline gives 0 (avoids division by zero); negative changes are clamped to 0.
df['improvement_percentage'] = df.apply(
    lambda row: (
        max(
            ((row['new_resume_job_similarity'] - row['resume_job_similarity']) / row['resume_job_similarity']) * 100,
            0,
        )
        if row['resume_job_similarity']
        else 0
    ),
    axis=1,
)

# Report the mean improvement over all rows
avg_improvement_percentage = df['improvement_percentage'].mean()
print(f"Average Improvement Percentage: {avg_improvement_percentage:.2f}%")

# Persist the scored DataFrame
df.to_csv('new_resumes_wsimgpt.csv', index=False)

Average Improvement Percentage: 0.54%


In [92]:
# Score every row of 'new_resumes_gpt.csv'; threshold=0.0 means the OpenAI job-match
# comparison is never skipped because of a low section similarity.
df_gpt = pd.read_csv('new_resumes_gpt.csv')

gpt_scores = df_gpt.apply(process_resume_row, axis=1, threshold=0.0)
df_gpt[['work_ex_similarity', 'projects_similarity', 'full_resume_similarity', 'resume_job_similarity', 'new_resume_job_similarity']] = gpt_scores

# Percent gain of the new resume's job match over the original's.
# Zero baseline -> 0 (no division by zero); negative changes are clamped to 0.
baseline_col = 'resume_job_similarity'
rewritten_col = 'new_resume_job_similarity'
df_gpt['improvement_percentage'] = df_gpt.apply(
    lambda row: max(((row[rewritten_col] - row[baseline_col]) / row[baseline_col]) * 100, 0) if row[baseline_col] else 0,
    axis=1,
)

# Report the mean improvement over all rows
avg_improvement_percentage = df_gpt['improvement_percentage'].mean()
print(f"Average Improvement Percentage: {avg_improvement_percentage:.2f}%")

# Persist the scored DataFrame
df_gpt.to_csv('new_resumes_gpt_wsimgpt.csv', index=False)

Average Improvement Percentage: 2.23%
