# Resume Screening - Embeddings & Feature Generation

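Generate resume embeddings with TF-IDF, Word2Vec, and BERT, compare their dimensionality, norms, and sparsity, and save the fitted models so they can later be combined into a single feature representation.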

In [None]:
# Put the parent directory first on the module search path. The
# `resume_screening` imports below presumably resolve through it
# (assumes the notebook is run from a subdirectory of the project — TODO confirm).
import sys
sys.path.insert(0, '..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import time
import warnings
# NOTE: this suppresses every Python warning for the rest of the session,
# not only the ones raised by the libraries imported here.
warnings.filterwarnings('ignore')

from resume_screening.embeddings import TFIDFEmbedder, Word2VecEmbedder, BERTEmbedder
from resume_screening.data_loader import SyntheticDataGenerator
from resume_screening.utils import PerformanceMonitor

print("Libraries imported!")

In [None]:
# Generate dataset
# Request 100 synthetic pairs. The call returns three values; only `resumes`
# and `jobs` are used by the cells shown in this section (`labels` is not).
resumes, jobs, labels = SyntheticDataGenerator.generate_matched_pairs(n_pairs=100)

# Sanity check: sample count plus a preview of the first resume and job,
# truncated to 100 characters (assumes each entry is a string — TODO confirm).
print(f"Dataset created: {len(resumes)} samples")
print(f"Sample resume: {resumes[0][:100]}...")
print(f"Sample job: {jobs[0][:100]}...")

## 1. TF-IDF Embeddings

In [None]:
# Train TF-IDF
# This monitor instance is reused by the Word2Vec and BERT cells below.
monitor = PerformanceMonitor()

# Training is timed separately from embedding so both costs can be reported.
monitor.start('tfidf_train')
tfidf = TFIDFEmbedder(max_features=1000)
# `resumes + jobs` assumes both are lists (concatenation) — TODO confirm.
tfidf.train(resumes + jobs)  # Train on both resumes and jobs
monitor.end('tfidf_train')

# Get embeddings
# Only the first 10 resumes are embedded; the comparison cell works on this subset.
monitor.start('tfidf_embed')
resume_emb_tfidf = tfidf.embed(resumes[:10])
monitor.end('tfidf_embed')

print(f"TF-IDF Training Time: {monitor.times['tfidf_train']['elapsed']:.4f}s")
print(f"TF-IDF Embedding Time (10 samples): {monitor.times['tfidf_embed']['elapsed']:.4f}s")
print(f"TF-IDF Embedding Shape: {resume_emb_tfidf.shape}")
# Sparsity = fraction of entries that are exactly zero.
# NOTE(review): this expression assumes embed() returns a dense NumPy array — confirm.
print(f"Sparsity: {(resume_emb_tfidf == 0).sum() / resume_emb_tfidf.size:.4f}")

## 2. Word2Vec Embeddings

In [None]:
# Train Word2Vec
# Uses the `monitor` created in the TF-IDF cell; run that cell first.
# Training is timed separately from embedding so both costs can be reported.
monitor.start('word2vec_train')
w2v = Word2VecEmbedder(vector_size=300, window=5)
# Trained on the same combined resume + job corpus as the TF-IDF embedder.
w2v.train(resumes + jobs, epochs=10)
monitor.end('word2vec_train')

# Get embeddings
# Same 10-resume subset as the TF-IDF cell, so the results can be compared.
monitor.start('word2vec_embed')
resume_emb_w2v = w2v.embed(resumes[:10])
monitor.end('word2vec_embed')

print(f"Word2Vec Training Time: {monitor.times['word2vec_train']['elapsed']:.4f}s")
print(f"Word2Vec Embedding Time (10 samples): {monitor.times['word2vec_embed']['elapsed']:.4f}s")
print(f"Word2Vec Embedding Shape: {resume_emb_w2v.shape}")
# Reports the number of entries in the trained model's `wv` container
# (presumably the learned vocabulary size, as the label says — TODO confirm).
print(f"Word2Vec Model Vocabulary: {len(w2v.model.wv)}")

## 3. BERT Embeddings

In [None]:
# Load BERT
# Uses the `monitor` created in the TF-IDF cell; run that cell first.
# Unlike the two cells above there is no train() call here: the embedder is
# only constructed from a model name, and that construction is what is timed.
monitor.start('bert_load')
bert = BERTEmbedder(model_name='all-MiniLM-L6-v2')
monitor.end('bert_load')

# Get embeddings
# Same 10-resume subset as the TF-IDF and Word2Vec cells.
monitor.start('bert_embed')
resume_emb_bert = bert.embed(resumes[:10])
monitor.end('bert_embed')

print(f"BERT Load Time: {monitor.times['bert_load']['elapsed']:.4f}s")
print(f"BERT Embedding Time (10 samples): {monitor.times['bert_embed']['elapsed']:.4f}s")
print(f"BERT Embedding Shape: {resume_emb_bert.shape}")

## 4. Embedding Comparison

In [None]:
# Comparison visualization
# Four panels over the embeddings produced by the three cells above:
# dimensionality, L2-norm distribution, 2-D PCA projection, and sparsity,
# followed by a printed summary of the same statistics.
# NOTE(review): assumes every embedding is a dense 2-D NumPy array
# (rows = samples) — confirm against the embedders' embed() return types.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Embedding statistics
embeddings = {
    'TF-IDF': resume_emb_tfidf,
    'Word2Vec': resume_emb_w2v,
    'BERT': resume_emb_bert
}

# Explicit lists (rather than dict views) for the categorical bar positions,
# and one shared palette so both bar charts colour each method the same way.
emb_names = list(embeddings)
bar_colors = ['skyblue', 'lightcoral', 'lightgreen']

# Compute the per-embedding statistics once; they feed both the plots and the
# printed summary, so the two can never drift apart.
emb_norms = {name: np.linalg.norm(emb, axis=1) for name, emb in embeddings.items()}
emb_sparsity = {name: (emb == 0).sum() / emb.size for name, emb in embeddings.items()}

# Shape comparison
shapes = {name: emb.shape for name, emb in embeddings.items()}
axes[0, 0].bar(emb_names, [shapes[name][1] for name in emb_names], color=bar_colors)
axes[0, 0].set_ylabel('Embedding Dimension')
axes[0, 0].set_title('Embedding Dimensions')
# Log scale keeps the smaller dimensions readable next to the largest one.
axes[0, 0].set_yscale('log')

# Norm distribution
for emb_name in emb_names:
    axes[0, 1].hist(emb_norms[emb_name], alpha=0.5, label=emb_name, bins=20)
axes[0, 1].set_xlabel('L2 Norm')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_title('Embedding Norm Distribution')
axes[0, 1].legend()

# PCA visualization (2D)
# Each embedding gets its own PCA fit, so the three point clouds share the
# axes only for display: positions are not comparable across methods.
for emb_name, emb in embeddings.items():
    emb_2d = PCA(n_components=2).fit_transform(emb)
    axes[1, 0].scatter(emb_2d[:, 0], emb_2d[:, 1], alpha=0.6, label=emb_name, s=50)
axes[1, 0].set_xlabel('PC1')
axes[1, 0].set_ylabel('PC2')
axes[1, 0].set_title('PCA 2D Projection')
axes[1, 0].legend()

# Sparsity (fraction of entries that are exactly zero)
sparsity_data = [emb_sparsity[name] for name in emb_names]
axes[1, 1].bar(emb_names, sparsity_data, color=bar_colors)
axes[1, 1].set_ylabel('Sparsity')
axes[1, 1].set_title('Embedding Sparsity')
axes[1, 1].set_ylim([0, 1])

plt.tight_layout()
plt.show()

print("Embedding Statistics:")
for emb_name in emb_names:
    print(f"\n{emb_name}:")
    print(f"  Shape: {shapes[emb_name]}")
    print(f"  Mean norm: {emb_norms[emb_name].mean():.4f}")
    print(f"  Sparsity: {emb_sparsity[emb_name]:.4f}")

## 5. Save Models

In [None]:
# Save models
# Persist the three embedders built in the cells above under '../models/'.
import os

# Create the target directory up front. The embedders' save() implementations
# are not visible here, so do not rely on them creating it themselves;
# exist_ok=True makes re-running this cell harmless.
os.makedirs('../models', exist_ok=True)

tfidf.save('../models/tfidf_model')
w2v.save('../models/word2vec_model')
bert.save('../models/bert_model')

print("All models saved!")