# Resume Screening - Classification & Ranking Models

Train and evaluate ranking models for resume-job matching.

import sys
sys.path.insert(0, '..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import warnings
warnings.filterwarnings('ignore')

from resume_screening.ranker import RankingModel, ResumeRanker
from resume_screening.data_loader import SyntheticDataGenerator
from resume_screening.embeddings import BERTEmbedder

# Sanity message: reaching this line means every import above resolved.
print("Libraries imported!")

## Dataset Generation
# Generate a larger dataset (200 resume/job pairs) for training.
resumes, jobs, labels = SyntheticDataGenerator.generate_matched_pairs(n_pairs=200)

# Summarise what came back: sample count, per-label counts, and the mean of
# the labels (reported as the positive share; assumes labels are 0/1).
n_samples = len(resumes)
label_counts = np.bincount(labels)
positive_share = sum(labels) / len(labels)

print(f"Dataset: {n_samples} samples")
print(f"Label distribution: {label_counts}")
print(f"Class balance: {positive_share:.2%} positive")

## 1. Feature Extraction

# Build the ranker; in this cell it is used only for extract_features().
print("Initializing BERT embedder and extracting features...")
ranker = ResumeRanker(embedder_type='bert', model_type='gradient_boosting')

# One extract_features(resume, job) call per pair, stacked into X;
# y holds the labels in the same order as the pairs.
X = np.array(list(map(ranker.extract_features, resumes, jobs)))
y = np.array(labels)

# Statistics taken along axis 0, i.e. across samples for each feature column.
feature_means = X.mean(axis=0)
feature_stds = X.std(axis=0)

print(f"Feature matrix shape: {X.shape}")
print("Feature statistics:")
print(f"  Mean: {feature_means[:5]}... (first 5)")
print(f"  Std: {feature_stds[:5]}... (first 5)")

## 2. Train-Test Split

# Hold out 20% of the pairs for testing. The split is stratified on y and
# uses a fixed seed so it is repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"Training set: {n_train} samples")
print(f"Test set: {n_test} samples")
print(f"\nTraining label distribution: {np.bincount(y_train)}")
print(f"Test label distribution: {np.bincount(y_test)}")

## 3. Train Multiple Models

# Fit one RankingModel per algorithm and keep both the fitted model and its
# test-set metrics, each keyed by the model type string.
models = {}
results = {}

# (display name, key in the metrics mapping) for the per-model summary lines.
reported_metrics = (
    ('Accuracy', 'accuracy'),
    ('Precision', 'precision'),
    ('Recall', 'recall'),
    ('F1', 'f1'),
)

for model_type in ('logistic_regression', 'gradient_boosting', 'random_forest'):
    print(f"\nTraining {model_type}...")
    model = RankingModel(model_type=model_type)
    model.train(X_train, y_train)

    # Evaluate on the held-out split
    metrics = model.evaluate(X_test, y_test)
    models[model_type], results[model_type] = model, metrics

    for display_name, metric_key in reported_metrics:
        print(f"  {display_name}: {metrics[metric_key]:.4f}")

# Tabulate the collected metrics: after transposing, each row is a model type
# and each column is one metric key.
results_df = pd.DataFrame(results).transpose()
print("\nModel Comparison:", results_df, sep="\n")

# Grouped bar chart: one group per model, one bar per plotted metric.
plotted_metrics = ['accuracy', 'precision', 'recall', 'f1']

fig, ax = plt.subplots(figsize=(10, 6))
results_df[plotted_metrics].plot.bar(ax=ax)
ax.set(ylabel='Score', title='Model Performance Comparison')
# Re-apply the current tick labels, rotated 45 degrees.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
ax.legend(loc='lower right')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 4. Detailed Analysis of Best Model

# The best model is the row of results_df with the highest F1 score.
f1_scores = results_df['f1']
best_model_name = f1_scores.idxmax()
best_model = models[best_model_name]

print(f"Best Model: {best_model_name}")

# Confusion matrix of the best model's predictions on the test split.
# NOTE(review): assumes labels are encoded 0 = unmatched, 1 = matched, which is
# what the tick-label order below implies - confirm against the data generator.
class_names = ['Unmatched', 'Matched']
y_pred = best_model.predict(X_test)

# Pin the label order so the matrix is always 2x2 and lines up with
# class_names. Without `labels`, the shape is inferred from the values present
# and would no longer match the two hard-coded tick labels if a class is absent.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax)
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
ax.set_title(f'Confusion Matrix - {best_model_name}')
plt.tight_layout()
plt.show()

# Classification report (per-class precision / recall / F1) for y_pred.
# `labels` is given explicitly so the two target_names always map onto the
# same two classes; when the label set is inferred and only one class is
# present, the two names no longer match it.
# NOTE(review): assumes 0 = unmatched, 1 = matched - confirm against the data.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=[0, 1],
                            target_names=['Unmatched', 'Matched']))

# ROC curve of the best model, scored with column 1 of predict_proba
# (presumably the probability of the positive class - verify in RankingModel).
y_pred_proba = best_model.predict_proba(X_test)
positive_scores = y_pred_proba[:, 1]
fpr, tpr, _ = roc_curve(y_test, positive_scores)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, color='darkorange', lw=2,
            label=f'ROC curve (AUC = {roc_auc:.3f})')
# Diagonal reference line, labelled as the random classifier.
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--',
            label='Random Classifier')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend(loc='lower right')
roc_ax.grid(True, alpha=0.3)
plt.show()

# Feature importance: plotted only when the wrapped estimator exposes
# `feature_importances_`; otherwise this cell does nothing.
if hasattr(best_model.model, 'feature_importances_'):
    importances = best_model.model.feature_importances_
    top_n = 10
    # argsort is ascending, so the last `top_n` positions hold the largest
    # importances; the slice simply returns everything when there are fewer
    # than `top_n` features.
    indices = np.argsort(importances)[-top_n:]

    plt.figure(figsize=(10, 6))
    plt.barh(range(len(indices)), importances[indices])
    plt.yticks(range(len(indices)), [f'Feature {i}' for i in indices])
    plt.xlabel('Importance')
    # Title reports the number of bars actually drawn, so it stays accurate
    # when the model has fewer than `top_n` features.
    plt.title(f'Top {len(indices)} Feature Importances')
    plt.tight_layout()
    plt.show()

# Save best model
import os

model_path = '../models/ranking_model'
# Create the target directory first so saving does not depend on it already
# existing. NOTE(review): RankingModel.save may already do this - confirm.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
best_model.save(model_path)
print(f"\nBest model ({best_model_name}) saved!")