In [10]:
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"

from transformers import BertModel, BertTokenizer
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import ast
from sklearn.model_selection import train_test_split
from torch.optim import AdamW

from tqdm import tqdm

In [2]:
# Read the resume / job-posting table from disk and preview its first five rows
df = pd.read_csv(filepath_or_buffer="resume_dataset.csv")
print(df.head(n=5))

                                         job_bullets            job_role  \
0  ['Conducted surveys and focus groups to gather...     User Researcher   
1  ['Developed and conducted surveys to assess us...     User Researcher   
2  ['Assisted in the design and execution of in-d...     User Researcher   
3  ['Managed and maintained office facilities, in...  Facilities Manager   
4  ['Developed and executed space planning strate...  Facilities Manager   

                                     job_description  \
0  Conduct user research to understand customer n...   
1  Conduct user research to understand customer n...   
2  Conduct user research to understand customer n...   
3  Facilities Managers manage facilities and buil...   
4  Facilities Managers manage facilities and buil...   

                                              skills  \
0  User research methods Usability testing Data a...   
1  User research methods Usability testing Data a...   
2  User research methods Usability tes

In [3]:
# Load the pretrained BERT tokenizer for the "bert-base-uncased" checkpoint.
# This is the same checkpoint name JobFitModel passes to BertModel.from_pretrained.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [4]:
# Create dataset
class ResumeFitDataset(Dataset):
    """Render each row as a (job description, resume) text pair and tokenize it.

    The job description is passed to the tokenizer as the first segment and the
    resume as the second, so both are encoded together as one sentence pair.

    Args:
        df: DataFrame providing the columns ``job_role``, ``job_description``,
            ``skills``, ``responsibilities``, ``job_bullets``,
            ``applicant_job_role`` and ``applicant_skills``.
        tokenizer: Callable invoked as ``tokenizer(text_a, text_b, **kwargs)``.
            It must return a mapping with ``input_ids`` and ``attention_mask``
            tensors whose leading dimension can be squeezed away.
        max_len: Length every encoded pair is padded / truncated to.
        label_col: Optional name of a numeric column holding the target score.
            With the default ``None`` every item carries the placeholder label
            ``0.0`` (the original behaviour); pass e.g. ``"suitability_score"``
            to train on real targets.
    """

    # Everything ast.literal_eval is documented to raise on malformed input.
    # Listing them explicitly (instead of a bare ``except``) lets
    # KeyboardInterrupt / SystemExit propagate.
    _PARSE_ERRORS = (ValueError, TypeError, SyntaxError, MemoryError, RecursionError)

    def __init__(self, df, tokenizer, max_len=512, label_col=None):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_col = label_col

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _parse_bullets(raw):
        """Normalise a ``job_bullets`` cell into a list of strings.

        Strings are parsed as Python literals; unparsable strings and missing
        values (NaN / None) give an empty list so the row is still usable.
        """
        if isinstance(raw, str):
            try:
                raw = ast.literal_eval(raw)
            except ResumeFitDataset._PARSE_ERRORS:
                return []
        if isinstance(raw, str):
            # The literal was a single quoted string: keep it as one bullet
            # rather than letting str.join split it into characters.
            return [raw]
        if isinstance(raw, (list, tuple)):
            return [str(item) for item in raw]
        # NaN, None, numbers, ...: nothing that can be rendered as bullets.
        return []

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``label`` for row ``idx``."""
        row = self.df.iloc[idx]

        # Build Job Description (Query)
        jd_text = (
            f"{row['job_role']}. {row['job_description']} "
            f"\n\nSkills: {row['skills']}."
            f"\nResponsibilities: {row['responsibilities']}"
        )

        # Parse and build Resume (Key) with realistic formatting
        bullets = self._parse_bullets(row["job_bullets"])
        bullet_block = "\n- " + "\n- ".join(bullets) if bullets else ""
        resume_text = (
            f"{row['applicant_job_role']}\n"
            f"{bullet_block}"
            f"\n\nSkills: {row['applicant_skills']}"
        )

        # Tokenize input pair (query, key)
        encoding = self.tokenizer(
            jd_text,
            resume_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        if self.label_col is None:
            label_value = 0.0  # placeholder until a label column is supplied
        else:
            label_value = float(row[self.label_col])

        return {
            # squeeze(0) drops the batch dimension added by return_tensors="pt"
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label_value, dtype=torch.float)
        }


In [5]:
# Example usage of the dataset: build it over the full frame and inspect item 0
dataset = ResumeFitDataset(df, tokenizer)
sample = dataset[0]
# NOTE: the dataset currently returns a hard-coded placeholder label, so this
# prints tensor(0.) rather than a real suitability score such as 0.78.
print(sample["label"])

tensor(0.)


In [6]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# train/test split identical across runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = ResumeFitDataset(train_df, tokenizer)
test_dataset = ResumeFitDataset(test_df, tokenizer)

# Give the training loader its own seeded generator so the shuffle order is
# reproducible too, without touching torch's global RNG state.
shuffle_rng = torch.Generator().manual_seed(42)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, generator=shuffle_rng)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [16]:
# Make the model
class JobFitModel(nn.Module):
    """BERT encoder followed by a small MLP head that outputs one score per input.

    The head ends in a sigmoid, so every score lies between 0 and 1.

    Args:
        pretrained_name: Checkpoint identifier handed to
            ``BertModel.from_pretrained``. Defaults to ``'bert-base-uncased'``.
    """

    def __init__(self, pretrained_name='bert-base-uncased'):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_name)

        # Size the head from the encoder's own config instead of hard-coding
        # 768, so checkpoints with a different hidden width also work.
        hidden_size = self.bert.config.hidden_size

        # Layer order (and therefore state_dict keys ffn.0 ... ffn.7) is kept
        # as-is so previously saved weights still load.
        self.ffn = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, input_ids, attention_mask):
        """Score a batch of encoded pairs.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            Tensor of shape ``(batch_size,)`` with one score per input.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # Grab [CLS] token
        score = self.ffn(cls_embedding)
        return score.squeeze(1)  # Return shape: (batch_size,)

In [14]:
def train_model(model, train_loader, optimizer, criterion, device, epochs=3):
    """Train ``model`` in place and print the mean batch loss after every epoch.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        train_loader: Iterable of batches; each batch is a mapping with the
            keys ``"input_ids"``, ``"attention_mask"`` and ``"label"``.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        device: Device the model and every batch are moved to.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None.
    """
    model.to(device)

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        # Count batches while iterating instead of calling len(train_loader):
        # this avoids a ZeroDivisionError on an empty loader and also works
        # for iterables that do not define __len__.
        batches_seen = 0

        progress = tqdm(train_loader, desc=f"Epoch {epoch+1}")
        for batch in progress:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            optimizer.zero_grad()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            batch_loss = loss.item()  # read once; reused for the sum and the bar
            running_loss += batch_loss
            batches_seen += 1
            progress.set_postfix(loss=batch_loss)

        # Mean of the per-batch losses; reported as nan when no batch was seen.
        avg_loss = running_loss / batches_seen if batches_seen else float("nan")
        print(f"Epoch {epoch+1} training loss: {avg_loss:.4f}")


In [17]:
def evaluate_model(model, test_loader, criterion, device):
    """Run ``model`` over ``test_loader`` without gradients and collect its outputs.

    Prints the mean of the per-batch losses (labelled "Test MSE Loss").

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        test_loader: Iterable of batches; each batch is a mapping with the
            keys ``"input_ids"``, ``"attention_mask"`` and ``"label"``.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        device: Device the model and every batch are moved to.

    Returns:
        Tuple ``(all_preds, all_labels)`` of flat lists holding the per-sample
        predictions and labels in loader order.
    """
    # Mirror train_model: make sure the model lives where the batches are sent.
    model.to(device)
    model.eval()

    running_loss = 0.0
    batches_seen = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            running_loss += criterion(outputs, labels).item()
            batches_seen += 1

            # Move to CPU before converting: numpy cannot read device tensors.
            all_preds.extend(outputs.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Counting batches (rather than len(test_loader)) avoids a
    # ZeroDivisionError on an empty loader; the loss is then reported as nan.
    avg_loss = running_loss / batches_seen if batches_seen else float("nan")
    print(f"Test MSE Loss: {avg_loss:.4f}")
    return all_preds, all_labels


In [18]:
def plot_predictions(preds, labels):
    """Scatter predicted scores against true scores with a y = x reference line.

    Args:
        preds: Sequence of predicted scores (drawn on the y-axis).
        labels: Sequence of true scores (drawn on the x-axis).

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    # The notebook's import cell does not bring pyplot into scope, so every
    # ``plt`` reference here used to raise NameError. Importing locally keeps
    # this function usable on its own.
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(labels, preds, alpha=0.5, color='blue')
    ax.plot([0, 1], [0, 1], linestyle='--', color='red')  # ideal diagonal
    ax.set_xlabel("True Suitability Score")
    ax.set_ylabel("Predicted Score")
    ax.set_title("Predicted vs. True Suitability")
    ax.grid(True)
    fig.tight_layout()
    plt.show()

In [19]:
def predict_suitability(model, tokenizer, jd_text, resume_text, device, max_len=512):
    """Score a single (job description, resume) pair with a trained model.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that yields exactly one value for a single encoded pair.
        tokenizer: Callable invoked as ``tokenizer(text_a, text_b, **kwargs)``
            returning a mapping with ``input_ids`` and ``attention_mask``.
        jd_text: Job description text (first segment).
        resume_text: Resume text (second segment).
        device: Device the model and the encoded inputs are moved to.
        max_len: Length the pair is padded / truncated to. Defaults to 512;
            set it to the ``max_len`` the training dataset was built with.

    Returns:
        The model output as a Python float.
    """
    # Inputs are sent to ``device`` below, so the model has to be there too.
    model.to(device)
    model.eval()

    encoding = tokenizer(
        jd_text, resume_text,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_len
    )

    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    with torch.no_grad():
        output = model(input_ids=input_ids, attention_mask=attention_mask)

    return output.item()


In [15]:
# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the model and place it on the chosen device (Module.to returns the module itself)
model = JobFitModel().to(device)

# Regression loss and optimizer over all model parameters
criterion = nn.MSELoss()
optimizer = AdamW(params=model.parameters(), lr=2e-5)

In [None]:
# Fine-tune the model for three passes over the training loader
train_model(
    model=model,
    train_loader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    epochs=3,
)

In [None]:
# Score the held-out loader, then plot predictions against the true labels
preds, labels = evaluate_model(
    model=model,
    test_loader=test_loader,
    criterion=criterion,
    device=device,
)
plot_predictions(preds=preds, labels=labels)

In [None]:
# Score one hand-written job description / resume pair with the model
jd_text = "UX Researcher. Conduct user research to inform product design. Skills: Interviewing, Data Analysis. Responsibilities: Design studies, synthesize insights."
resume_text = """UX Specialist
- Conducted 30+ user interviews
- Analyzed feedback to drive design decisions
- Collaborated with product and design teams

Skills: User Research, Journey Mapping, Usability Testing"""

score = predict_suitability(
    model=model,
    tokenizer=tokenizer,
    jd_text=jd_text,
    resume_text=resume_text,
    device=device,
)
print(f"Predicted Suitability: {score:.4f}")
