In [1]:
import pandas as pd
import json
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise
from rouge import Rouge
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Download the NLTK resources this notebook relies on:
#   - 'stopwords': the English stop-word list read by normalize_text()
#   - 'punkt': tokenizer data for nltk.tokenize.word_tokenize
# NOTE(review): newer NLTK releases reportedly look for 'punkt_tab' instead of
# 'punkt' when tokenizing - confirm against the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/parthvinm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/parthvinm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Text normalization: tokenization, letter-only filtering, stop-word removal, stemming.
# Lazily-filled cache so the stop-word corpus is read and the stemmer is built
# once per kernel instead of on every normalize_text() call.
_NORMALIZER_RESOURCES = {}


def _get_normalizer_resources():
    """Return the (stop-word set, stemmer) pair, creating it on first use only."""
    if not _NORMALIZER_RESOURCES:
        # Both values are computed before the dict is touched, so a failure
        # cannot leave the cache half-initialised.
        _NORMALIZER_RESOURCES.update(
            stop_words=frozenset(stopwords.words('english')),
            stemmer=PorterStemmer(),
        )
    return _NORMALIZER_RESOURCES['stop_words'], _NORMALIZER_RESOURCES['stemmer']


def normalize_text(text: str) -> str:
    """Normalize the input text.

    Steps, applied to every token produced by ``word_tokenize``:
      1. strip every character that is not an ASCII letter and lowercase the rest,
      2. drop tokens that became empty,
      3. drop English stop words,
      4. reduce the remaining words to their Porter stem.

    Args:
        text: Raw text. Anything that is not a non-blank string (``None``,
            numbers, ``""``, whitespace) is treated as "no text".

    Returns:
        The normalized words joined by single spaces; ``""`` when there is
        nothing to normalize.
    """
    # Blank input already normalized to "" before; returning early also keeps
    # non-string values (e.g. a JSON null) from crashing the tokenizer.
    if not isinstance(text, str) or not text.strip():
        return ""

    stop_words, stemmer = _get_normalizer_resources()
    normalized_words = []
    for token in word_tokenize(text):
        word = re.sub('[^a-zA-Z]', '', token).lower()
        # Stop words are matched after cleaning, exactly as in the multi-pass version.
        if word and word not in stop_words:
            normalized_words.append(stemmer.stem(word))
    return " ".join(normalized_words)

In [3]:
# Define cosine similarity function
def cosine_similarity(document1: str, document2: str) -> float:
    """Calculate the TF-IDF cosine similarity between two documents.

    A fresh ``TfidfVectorizer`` is fitted on just the two documents and the
    cosine of their TF-IDF vectors is returned.

    Args:
        document1: First document (normally already normalized text).
        document2: Second document.

    Returns:
        The cosine similarity as a Python float. ``0.0`` is returned when
        either document is empty/blank or when no token survives
        vectorization, instead of raising.
    """
    # An empty document has an all-zero vector, so its similarity to anything
    # is 0. Short-circuiting also avoids fitting a vectorizer on nothing.
    if not document1 or not document2:
        return 0.0
    if not document1.strip() or not document2.strip():
        return 0.0

    vectorizer = TfidfVectorizer()
    try:
        vectors = vectorizer.fit_transform([document1, document2])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary ...") when neither
        # document yields a token, e.g. text made only of one-letter words,
        # which the default token pattern ignores.
        return 0.0

    cosine_similarity_score = pairwise.cosine_similarity(vectors[0], vectors[1])
    return float(cosine_similarity_score.item())

In [4]:
# NOTE(review): Rouge is already imported in the first cell; this repeated
# import is redundant but harmless.
from rouge import Rouge

# Module-level Rouge scorer. process_resume_row() reads this global to compute
# the ROUGE-L F-scores, so this cell must run before that function is called.
rouge = Rouge()

In [5]:
# Function to extract and process fields
def extract_and_process_field(json_data, field_name):
    """Return the text of one top-level field of a parsed JSON object.

    The result is always a string, so it can be handed to the text
    normalizer without further checks:

      * missing field or JSON ``null``  -> ``""``
      * string                          -> returned unchanged
      * list                            -> items joined with spaces; dict items
                                           are rendered as ``"key: value"`` pairs
                                           (string values only), other items via
                                           ``str()``
      * dict                            -> ``"key: value"`` pairs (string values only)
      * any other scalar                -> ``str(value)``

    Args:
        json_data: Parsed JSON, expected to be a dict.
        field_name: Key of the field to extract.

    Returns:
        The flattened field text, or ``""`` when ``json_data`` has no ``get``
        method (i.e. is not a dict-like object).
    """
    def _pairs_to_text(mapping):
        # Only string values are kept; nested lists/dicts/numbers are skipped.
        return " ".join(f"{k}: {v}" for k, v in mapping.items() if isinstance(v, str))

    try:
        field = json_data.get(field_name, "")
    except AttributeError:
        # json_data is not a dict (e.g. the JSON document was a list or null).
        return ""

    if field is None:
        # An explicit JSON null would otherwise leak out as None and break callers.
        return ""
    if isinstance(field, str):
        return field
    if isinstance(field, dict):
        return _pairs_to_text(field)
    if isinstance(field, list):
        return " ".join(
            _pairs_to_text(item) if isinstance(item, dict) else str(item)
            for item in field
        )
    return str(field)

In [6]:
# Flatten every top-level value of a parsed JSON object into one text string
def extract_full_text(json_data):
    """Concatenate the text of all top-level values of ``json_data``.

    Each value contributes one or more space-separated pieces:
      * dict  -> its ``"key: value"`` pairs, string values only
      * list  -> one piece per item (dict items as above, others via ``str()``)
      * other -> ``str(value)``

    Returns ``""`` when ``json_data`` does not behave like a dict.
    """
    def _render(value):
        # Dicts become "key: value" pairs; everything else is stringified.
        if isinstance(value, dict):
            return " ".join(f"{k}: {v}" for k, v in value.items() if isinstance(v, str))
        return str(value)

    pieces = []
    try:
        for _, value in json_data.items():
            if isinstance(value, list):
                pieces.extend(_render(item) for item in value)
            else:
                pieces.append(_render(value))
    except (json.JSONDecodeError, AttributeError):
        return ""
    return " ".join(pieces)

In [11]:
# Process each row of the dataframe
def _normalized_or_empty(text):
    """Normalize ``text``; anything that is not a string counts as empty text."""
    return normalize_text(text) if isinstance(text, str) else ""


def _safe_cosine(document1, document2):
    """Cosine similarity that yields 0.0 for blank or un-vectorizable text."""
    if not document1.strip() or not document2.strip():
        return 0.0
    try:
        return cosine_similarity(document1, document2)
    except ValueError:
        # TF-IDF found no usable token in either document.
        return 0.0


def _safe_rouge_l(hypothesis, reference):
    """ROUGE-L F-score that yields 0.0 when either text is blank."""
    # Blank text is skipped up front; the scorer is expected to reject an
    # empty hypothesis (see the rouge package documentation).
    if not hypothesis.strip() or not reference.strip():
        return 0.0
    return rouge.get_scores(hypothesis, reference)[0]['rouge-l']['f']


def process_resume_row(row, threshold=0.3):
    """Compute the similarity scores for one row of the resume DataFrame.

    Args:
        row: Mapping/Series with the JSON strings ``'resume_json'``,
            ``'new_resume'`` and ``'job_json'``.
        threshold: Minimum cosine similarity that both the work-experience and
            the projects sections must reach between the original and the new
            resume. Below it, both resume-vs-job scores are reported as 0.0.

    Returns:
        ``pd.Series`` of five floats rounded to 2 decimals, in this order:
        work-experience similarity, projects similarity, full-resume
        similarity (all TF-IDF cosine, original vs. new resume), then the
        ROUGE-L F-score of the original resume vs. the job and of the new
        resume vs. the job. All five are 0.0 when any of the three JSON cells
        is missing or cannot be parsed.
    """
    json_columns = ('resume_json', 'new_resume', 'job_json')

    # A missing value in any JSON column makes the row unscorable; checking
    # only 'new_resume' let json.loads() fail with TypeError on the others.
    if any(pd.isna(row[column]) for column in json_columns):
        return pd.Series([0.0, 0.0, 0.0, 0.0, 0.0])

    try:
        # Load JSON objects
        resume_json, new_resume_json, job_json = [
            json.loads(row[column]) for column in json_columns
        ]
    except (json.JSONDecodeError, TypeError):
        # Malformed JSON, or a cell that is not text at all (e.g. a number).
        return pd.Series([0.0, 0.0, 0.0, 0.0, 0.0])

    # Extract and normalize work experience and projects
    work_ex_resume_normalized = _normalized_or_empty(
        extract_and_process_field(resume_json, 'work_experience'))
    projects_resume_normalized = _normalized_or_empty(
        extract_and_process_field(resume_json, 'projects'))
    work_ex_new_resume_normalized = _normalized_or_empty(
        extract_and_process_field(new_resume_json, 'work_experience'))
    projects_new_resume_normalized = _normalized_or_empty(
        extract_and_process_field(new_resume_json, 'projects'))

    # Extract and normalize the full texts
    full_resume_normalized = _normalized_or_empty(extract_full_text(resume_json))
    full_new_resume_normalized = _normalized_or_empty(extract_full_text(new_resume_json))
    full_job_normalized = _normalized_or_empty(extract_full_text(job_json))

    # Calculate cosine similarities (original vs. new resume)
    work_ex_score = _safe_cosine(work_ex_resume_normalized, work_ex_new_resume_normalized)
    projects_score = _safe_cosine(projects_resume_normalized, projects_new_resume_normalized)
    full_resume_score = _safe_cosine(full_resume_normalized, full_new_resume_normalized)

    # Apply threshold and calculate ROUGE similarities
    if work_ex_score < threshold or projects_score < threshold:
        resume_job_score = 0.0
        new_resume_job_score = 0.0
    else:
        resume_job_score = _safe_rouge_l(full_resume_normalized, full_job_normalized)
        new_resume_job_score = _safe_rouge_l(full_new_resume_normalized, full_job_normalized)

    # Round similarity scores to 2 decimal places
    return pd.Series([
        round(work_ex_score, 2),
        round(projects_score, 2),
        round(full_resume_score, 2),
        round(resume_job_score, 2),
        round(new_resume_job_score, 2),
    ])

In [12]:
# Score the resumes stored in 'new_resumes.csv' (DataFrame 'df')
df = pd.read_csv('new_resumes.csv')

# One score column per value returned by process_resume_row, in the same order
similarity_columns = [
    'work_ex_similarity',
    'projects_similarity',
    'full_resume_similarity',
    'resume_job_similarity',
    'new_resume_job_similarity',
]
df[similarity_columns] = df.apply(process_resume_row, axis=1, threshold=0.3)

# Relative gain of the new resume over the original one, in percent.
# Rows whose original score is 0 get 0 (no division), and losses are clamped to 0.
df['improvement_percentage'] = [
    max(((new_score - old_score) / old_score) * 100, 0) if old_score else 0
    for old_score, new_score in zip(df['resume_job_similarity'], df['new_resume_job_similarity'])
]

# Report the mean gain over all rows
avg_improvement_percentage = df['improvement_percentage'].mean()
print(f"Average Improvement Percentage: {avg_improvement_percentage:.2f}%")

# Persist the scored DataFrame
df.to_csv('new_resumes_wsimrouge-l.csv', index=False)

Average Improvement Percentage: 5.69%


In [13]:
# Score the resumes stored in 'new_resumes_gpt.csv' (DataFrame 'df_gpt')
df_gpt = pd.read_csv('new_resumes_gpt.csv')

# One score column per value returned by process_resume_row, in the same order.
# threshold=0.0 means no row is excluded by the section-similarity check.
similarity_columns = [
    'work_ex_similarity',
    'projects_similarity',
    'full_resume_similarity',
    'resume_job_similarity',
    'new_resume_job_similarity',
]
df_gpt[similarity_columns] = df_gpt.apply(process_resume_row, axis=1, threshold=0.0)

# Relative gain of the new resume over the original one, in percent.
# Rows whose original score is 0 get 0 (no division), and losses are clamped to 0.
df_gpt['improvement_percentage'] = [
    max(((new_score - old_score) / old_score) * 100, 0) if old_score else 0
    for old_score, new_score in zip(df_gpt['resume_job_similarity'], df_gpt['new_resume_job_similarity'])
]

# Report the mean gain over all rows
avg_improvement_percentage = df_gpt['improvement_percentage'].mean()
print(f"Average Improvement Percentage: {avg_improvement_percentage:.2f}%")

# Persist the scored DataFrame
df_gpt.to_csv('new_resumes_gpt_wsimrouge-l.csv', index=False)

Average Improvement Percentage: 39.33%
