In [3]:
import pandas as pd
import json
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Download required NLTK data
nltk.download('stopwords')
# Newer NLTK releases load word_tokenize's sentence models from 'punkt_tab'
# rather than the pickled 'punkt' package; fetch both so the notebook runs on
# either side of that change.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/parthvinm/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /Users/parthvinm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [21]:
# Define text normalization function
def normalize_text(text: str) -> list:
    """Tokenize ``text`` and return its cleaned, stemmed word list.

    Every token has all characters other than ASCII letters removed and is
    lowercased. Tokens that end up empty, and English stop words, are dropped;
    the remaining tokens are reduced to their Porter stems.
    """
    tokens = word_tokenize(text)
    english_stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()

    stems = []
    for token in tokens:
        letters_only = re.sub('[^a-zA-Z]', '', token).lower()
        # Skip tokens that held no letters at all, and common stop words.
        if not letters_only or letters_only in english_stop_words:
            continue
        stems.append(porter.stem(letters_only))
    return stems

In [22]:
# Define cosine similarity function
def cosine_similarity(document1: str, document2: str) -> float:
    """Calculate the TF-IDF cosine similarity between two documents.

    Args:
        document1: First document, as whitespace-separated text.
        document2: Second document, as whitespace-separated text.

    Returns:
        The cosine similarity of the two documents' TF-IDF vectors. Returns
        0.0 when either document is blank or when neither document yields any
        token for the vectorizer's vocabulary.

    Raises:
        ValueError: Propagated from the vectorizer for any failure other than
            an empty vocabulary.
    """
    # A blank document has an all-zero TF-IDF vector, whose cosine similarity
    # with anything is 0.0. Short-circuit so that two blank documents do not
    # reach the vectorizer, which cannot fit an empty vocabulary.
    if not document1.strip() or not document2.strip():
        return 0.0

    vectorizer = TfidfVectorizer()
    try:
        vectors = vectorizer.fit_transform([document1, document2])
    except ValueError as exc:
        # Non-blank text can still produce no vocabulary (the default token
        # pattern ignores single-character tokens). Treat that like the blank
        # case; re-raise anything else.
        if "empty vocabulary" in str(exc):
            return 0.0
        raise
    cosine_similarity_score = pairwise.cosine_similarity(vectors[0], vectors[1])
    return cosine_similarity_score.item()

In [23]:
# Function to extract and process fields
def extract_and_process_field(json_data, field_name):
    """Return the content of ``json_data[field_name]`` flattened to one string.

    Args:
        json_data: Decoded JSON document; expected to be a dict. Anything
            without a ``.get`` method yields "".
        field_name: Key of the field to extract.

    Returns:
        A string, always:
        - "" when the key is missing, the value is null, or ``json_data`` is
          not a mapping;
        - the value itself when it is already a string;
        - for a list, the space-joined items, where each dict item becomes
          "key: value" pairs of its string-valued entries and any other item
          is passed through ``str``;
        - a single dict is treated like a one-item list;
        - any other value is passed through ``str``.
    """
    try:
        field = json_data.get(field_name, "")
    except AttributeError:
        # json_data is not a mapping (e.g. the JSON document was a list or scalar).
        return ""

    # JSON null: return empty text rather than None, which a text-processing
    # caller cannot handle.
    if field is None:
        return ""
    if isinstance(field, str):
        return field
    if isinstance(field, dict):
        field = [field]
    if isinstance(field, list):
        processed_items = []
        for item in field:
            if isinstance(item, dict):
                # Keep only string-valued entries of each dictionary.
                processed_items.append(" ".join(f"{k}: {v}" for k, v in item.items() if isinstance(v, str)))
            else:
                processed_items.append(str(item))
        return " ".join(processed_items)
    return str(field)

In [24]:
# Function to process each row
def process_resume_row(row):
    """Score how similar a row's original and rewritten resumes are.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) whose 'resume_json' and
            'new_resume' entries hold JSON text.

    Returns:
        A two-element ``pd.Series``: the cosine similarity of the
        'work_experience' fields, then of the 'projects' fields. Both elements
        are None when either entry cannot be decoded as JSON.
    """
    try:
        # Load JSON objects
        resume_json = json.loads(row['resume_json'])
        new_resume_json = json.loads(row['new_resume'])
    except (json.JSONDecodeError, TypeError):
        # JSONDecodeError: malformed JSON text. TypeError: the cell is not
        # text at all (pandas reads an empty CSV cell as float NaN).
        return pd.Series([None, None])

    scores = []
    for field_name in ('work_experience', 'projects'):
        # Extract the field from both resumes, normalize it, and compare.
        original_text = " ".join(normalize_text(extract_and_process_field(resume_json, field_name)))
        rewritten_text = " ".join(normalize_text(extract_and_process_field(new_resume_json, field_name)))
        scores.append(cosine_similarity(original_text, rewritten_text))

    # Return similarity scores: [work experience, projects]
    return pd.Series(scores)

In [27]:
# Example usage with a DataFrame 'df'
df = pd.read_csv('new_resumes.csv')  # Assuming you load your DataFrame from a CSV file

# Filter out rows where new_resume is empty. Take an explicit copy so the
# column assignment below writes to an independent frame rather than a slice
# of the loaded one (avoids pandas' SettingWithCopyWarning).
df = df[df['new_resume'].notna()].copy()

# Apply the function to each row
df[['work_ex_similarity', 'projects_similarity']] = df.apply(process_resume_row, axis=1)

# Save the DataFrame to a new CSV file
df.to_csv('new_resumes_wsim.csv', index=False)

In [29]:
# Spot-check: the projects similarity score of the first row in df.
df.iloc[0]['projects_similarity']

0.26909632511407555