In [None]:
!pip install -U sentence-transformers


In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import joblib


In [None]:
# Load the cleaned resume dataset from the interim data directory.
df = pd.read_csv("../data/interim/cleaned_data.csv")

# read_csv parses empty fields as NaN (a float). Coerce missing texts to ""
# and cast to str so every element handed to the sentence encoder is a string.
X = df['processed_resume'].fillna("").astype(str).tolist()
y = df['label']

# Load the previously saved label encoder; its `classes_` supply the class
# names shown in the evaluation cell.
# NOTE(review): joblib.load unpickles arbitrary objects - only load trusted files.
le = joblib.load("../models/label_encoder.pkl")


In [None]:
# Load the pretrained all-MiniLM-L6-v2 sentence-embedding model.
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed every resume text (384-dim vectors), showing a progress bar while encoding.
X_bert = bert_model.encode(X, show_progress_bar=True)

# Persist the encoder object next to the other model artifacts (optional).
# NOTE(review): this pickles the whole SentenceTransformer through joblib; the
# library's own save/load mechanism may be more portable across versions -
# confirm what downstream loaders expect before changing the artifact format.
joblib.dump(value=bert_model, filename='../models/bert_encoder.pkl')


In [None]:
# Hold out 20% of the embeddings for evaluation, with a fixed seed so the
# split is reproducible.
# Stratify on the label so each class keeps its share in both partitions; an
# unstratified split can leave a rare class out of the test set entirely.
# Stratification needs at least two samples per class, so fall back to a plain
# random split when some class has fewer.
stratify_on = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X_bert, y, test_size=0.2, random_state=42, stratify=stratify_on
)


In [None]:
# Train a logistic-regression classifier on the training embeddings. fit()
# returns the estimator itself, so construction and training are chained.
lr_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict labels for the held-out embeddings.
y_pred = lr_model.predict(X_test)


In [None]:
# Pin the full set of classes explicitly. Without `labels`, the report and the
# confusion matrix only cover classes that happen to occur in y_test/y_pred, so
# a class missing from the test split makes `target_names` mismatch (ValueError)
# and misaligns the heatmap tick labels.
# NOTE(review): assumes `label` holds LabelEncoder codes 0..n_classes-1 in the
# same order as le.classes_ - confirm against the preprocessing notebook.
class_names = list(le.classes_)
class_ids = np.arange(len(class_names))

print("Accuracy:", accuracy_score(y_test, y_pred))
print(
    "Classification Report:\n",
    classification_report(
        y_test,
        y_pred,
        labels=class_ids,
        target_names=class_names,
        zero_division=0,  # classes with no test samples report 0 without warnings
    ),
)

# Confusion Matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    xticklabels=class_names,
    yticklabels=class_names,
    cmap='Blues',
    ax=ax,
)
ax.set_title("BERT + Logistic Regression - Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()


In [None]:
# Persist the trained logistic-regression classifier to the models directory.
joblib.dump(value=lr_model, filename='../models/bert_logistic_model.pkl')
