# Resume Screening - Similarity Scoring

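Compute similarity scores between resumes and job descriptions using cosine similarity over several embedding models (BERT, TF-IDF, Word2Vec), plus a weighted multi-metric combination of them.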

In [None]:
import sys
# Put the parent directory first on the import path; this must run before the
# resume_screening imports below (presumably so the local package resolves
# when the notebook is run from a subdirectory -- confirm repo layout).
sys.path.insert(0, '..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc, confusion_matrix
import warnings
# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# Project-local components: embedders, similarity scorers, synthetic data.
from resume_screening.embeddings import BERTEmbedder, TFIDFEmbedder, Word2VecEmbedder
from resume_screening.similarity import SimilarityScorer, MultiMetricScorer
from resume_screening.data_loader import SyntheticDataGenerator

print("Libraries imported!")

In [None]:
# Build 50 synthetic resume/job pairs together with their labels
resumes, jobs, labels = SyntheticDataGenerator.generate_matched_pairs(n_pairs=50)

# Report dataset size and how the labels split; the label sum is taken once
# and the remainder is reported as "Unmatched"
n_matched = sum(labels)
n_unmatched = len(labels) - n_matched

print(f"Dataset: {len(resumes)} samples")
print(f"Matched: {n_matched}, Unmatched: {n_unmatched}")

## 1. Single Embedder Similarity

In [None]:
# One embedder instance per text representation
bert = BERTEmbedder()
tfidf = TFIDFEmbedder()
w2v = Word2VecEmbedder()

# TF-IDF and Word2Vec are trained on the concatenated resume + job texts;
# no train() call is made for the BERT embedder here.
for trainable in (tfidf, w2v):
    # A fresh concatenation is passed to each embedder, as before
    trainable.train(resumes + jobs)

print("Embedders initialized and trained!")

In [None]:
# Wrap each embedder in its own scorer
bert_scorer = SimilarityScorer(bert)
tfidf_scorer = SimilarityScorer(tfidf)
w2v_scorer = SimilarityScorer(w2v)

# Display name -> scorer, in the order the scorers are invoked for each pair
scorers_by_name = {
    'BERT': bert_scorer,
    'TF-IDF': tfidf_scorer,
    'Word2Vec': w2v_scorer,
}
pair_scores = {name: [] for name in scorers_by_name}

# Cosine-score every resume against the job at the same position
for resume, job in zip(resumes, jobs):
    for name, scorer in scorers_by_name.items():
        pair_scores[name].append(scorer.score_resume(resume, job, metric='cosine'))

# Expose the results as arrays under the names later cells rely on
bert_scores = np.array(pair_scores['BERT'])
tfidf_scores = np.array(pair_scores['TF-IDF'])
w2v_scores = np.array(pair_scores['Word2Vec'])

print("Similarity scores computed!")
for name, values in (
    ('BERT', bert_scores),
    ('TF-IDF', tfidf_scores),
    ('Word2Vec', w2v_scores),
):
    print(f"{name} - Mean: {values.mean():.4f}, Std: {values.std():.4f}")

## 2. Score Distribution Analysis

In [None]:
# Draw one histogram per embedder, each with a dashed line at the mean score
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# (title prefix, scores, bar colour) for each panel, left to right
histogram_specs = [
    ('BERT', bert_scores, 'skyblue'),
    ('TF-IDF', tfidf_scores, 'lightcoral'),
    ('Word2Vec', w2v_scores, 'lightgreen'),
]

for ax, (title, values, fill) in zip(axes, histogram_specs):
    ax.hist(values, bins=20, color=fill, edgecolor='black', alpha=0.7)
    ax.set_xlabel('Similarity Score')
    ax.set_ylabel('Frequency')
    ax.set_title(f'{title} Similarity Distribution')
    avg = values.mean()
    ax.axvline(avg, color='red', linestyle='--', label=f'Mean: {avg:.3f}')
    ax.legend()

plt.tight_layout()
plt.show()

## 3. Score Comparison by Label

In [None]:
# Split the pair scores by label and compare them visually and numerically
labels_array = np.array(labels)
matched_mask = labels_array == 1
unmatched_mask = labels_array == 0

# (title prefix, scores) per panel and (legend name, mask, colour) per group
score_sets = [
    ('BERT', bert_scores),
    ('TF-IDF', tfidf_scores),
    ('Word2Vec', w2v_scores),
]
label_groups = [
    ('Matched', matched_mask, 'green'),
    ('Unmatched', unmatched_mask, 'red'),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, (title, values) in zip(axes, score_sets):
    for group_name, mask, colour in label_groups:
        # x = position of the pair in the dataset, y = its similarity score
        ax.scatter(np.where(mask)[0], values[mask],
                   alpha=0.6, s=100, color=colour, label=group_name)
    ax.set_ylabel('Score')
    ax.set_title(f'{title} Scores by Label')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Per-embedder mean and standard deviation for each label group
print("\nScore Statistics by Label:")
for title, values in score_sets:
    print(f"\n{title}:")
    for group_name, mask, _ in label_groups:
        subset = values[mask]
        print(f"  {group_name} - Mean: {subset.mean():.4f}, Std: {subset.std():.4f}")

## 4. Multi-Metric Scoring

# Combine the three embedders under a single multi-metric scorer
embedders = dict(bert=bert, tfidf=tfidf, word2vec=w2v)
multi_scorer = MultiMetricScorer(embedders)

# Optional: override the scorer's weights with a custom per-embedder weighting
custom_weights = {'bert': 0.5, 'tfidf': 0.3, 'word2vec': 0.2}
multi_scorer.set_weights(custom_weights)

# Score every resume against the first job only
multi_scores = multi_scorer.score_multiple_resumes(resumes, jobs[0])

print("Multi-Metric Scoring Results (for job 0):")
print("\nTop 5 resumes:")
# Show the first five entries of the returned list
for position, entry in enumerate(multi_scores[:5], start=1):
    print(f"{position}. Index: {entry['index']}, Combined Score: {entry['combined']:.4f}")

## 5. Ranking Quality Assessment

# The first job serves as the query for every ranking below
query_job = jobs[0]

# Score all resumes against the query job with each single-embedder scorer
ranked_bert = bert_scorer.score_multiple_resumes(resumes, query_job)
ranked_tfidf = tfidf_scorer.score_multiple_resumes(resumes, query_job)
ranked_w2v = w2v_scorer.score_multiple_resumes(resumes, query_job)

print("Top 5 Resumes for Query Job\n")

rankings = [
    ('BERT', ranked_bert),
    ('TF-IDF', ranked_tfidf),
    ('Word2Vec', ranked_w2v),
]

# NOTE(review): labels[idx] looks like the label of the generated pair
# (resumes[idx], jobs[idx]), not of (resumes[idx], query_job) -- confirm the
# Matched/Unmatched tag is meaningful for this query before relying on it.
for position, (method, ranking) in enumerate(rankings):
    heading = f"{method} Top 5:"
    # Every heading except the first is preceded by a blank line
    print(heading if position == 0 else "\n" + heading)
    for rank, (idx, score) in enumerate(ranking[:5], 1):
        label = "✓ Matched" if labels[idx] == 1 else "✗ Unmatched"
        print(f"  {rank}. Resume {idx}: {score:.4f} ({label})")