In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error



In [2]:
# Read the resume / job-description pairs and their ATS scores into a DataFrame
csv_path = '/content/db1.csv'
data = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows (head() defaults to n=5)
data.head()

Unnamed: 0,ID,Resume_str,Category,job_id,title,description,ATS_score
0,18176523,SENIOR INFORMATION TECHNOLOGY MANAGER...,INFORMATION-TECHNOLOGY,3905367422,Trademark Attorney,Junior Trademark Associate\nOur client is a to...,21
1,18176523,SENIOR INFORMATION TECHNOLOGY MANAGER...,INFORMATION-TECHNOLOGY,3887888322,Delivery Driver / CDL A required / Seasonal,PBNA $25.75 / hour\n\nCLICK HERE to view our D...,24
2,18176523,SENIOR INFORMATION TECHNOLOGY MANAGER...,INFORMATION-TECHNOLOGY,3905243094,Senior Recruiter,Hit a glass ceiling in your earning potential ...,47
3,18176523,SENIOR INFORMATION TECHNOLOGY MANAGER...,INFORMATION-TECHNOLOGY,3905323971,Order Fulfillment Coordinator,Are you ready to be a crucial part of our dyna...,51
4,18176523,SENIOR INFORMATION TECHNOLOGY MANAGER...,INFORMATION-TECHNOLOGY,3903830212,Medical Assistant Urgent Care Per Diem,"As a physician-founded and led organization, e...",38


In [4]:
# Data preprocessing
class ResumeDataset(Dataset):
    """Dataset of (resume, job description) text pairs with a numeric ATS score.

    Each item is tokenized on access as a two-segment sequence that is padded
    or truncated to ``max_len`` tokens.
    """

    def __init__(self, resumes, descriptions, scores, tokenizer, max_len):
        # Three index-aligned collections: position i of each describes sample i.
        self.resumes, self.descriptions, self.scores = resumes, descriptions, scores
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of samples, taken from the resume collection."""
        return len(self.resumes)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' and 'score' for sample ``idx``."""
        # str() keeps non-string cells (e.g. missing values) from breaking the tokenizer.
        resume_text = str(self.resumes[idx])
        job_text = str(self.descriptions[idx])
        target = self.scores[idx]

        encoding = self.tokenizer.encode_plus(
            resume_text,
            job_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Flatten each encoded tensor to 1-D so samples can be stacked into a batch.
        sample = {
            field: encoding[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['score'] = torch.tensor(target, dtype=torch.float)
        return sample


In [5]:
# Tokenizer setup: pretrained 'bert-base-uncased' vocabulary plus the per-sample token budget
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
MAX_LEN = 512  # each encoded resume/description pair is padded or truncated to this many tokens

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
# Prepare dataset
# Split data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.1, random_state=42)
train_dataset = ResumeDataset(
    resumes=train_data['Resume_str'].values,
    descriptions=train_data['description'].values,
    scores=train_data['ATS_score'].values,
    tokenizer=tokenizer,
    max_len=MAX_LEN
)
test_dataset = ResumeDataset(
    resumes=test_data['Resume_str'].values,
    descriptions=test_data['description'].values,
    scores=test_data['ATS_score'].values,
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

In [7]:
# Serve both splits in batches of 8; only the training loader shuffles its samples
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=8, shuffle=False)

In [8]:
# Model building
class ATSBertRegressor(torch.nn.Module):
    """Pretrained BERT encoder followed by dropout and a one-unit linear regression head."""

    def __init__(self):
        super().__init__()
        # Encoder weights come from the pretrained 'bert-base-uncased' checkpoint.
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Dropout regularizes the representation fed to the head.
        self.dropout = torch.nn.Dropout(p=0.3)
        # The head maps the encoder's hidden size to a single predicted score.
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return one predicted score per input sequence."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Element 1 of the encoder output is the pooled sequence representation.
        pooled = encoded[1]
        return self.linear(self.dropout(pooled))


In [9]:
# Training the model
def train_model(model, data_loader, loss_fn, optimizer, device, scheduler, num_epochs):
    """Train ``model`` for ``num_epochs`` passes over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        data_loader: Iterable of dict batches with the keys 'input_ids',
            'attention_mask' and 'score'.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar tensor.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the model and every batch are moved to.
        scheduler: Learning-rate scheduler stepped once per batch, right after
            the optimizer step. ``None`` disables scheduling.
        num_epochs: Number of passes over ``data_loader``.

    Returns:
        None. After each epoch, prints the mean of that epoch's per-batch
        losses (``nan`` if the loader produced no batches).
    """
    model = model.to(device)  # Move model to device (GPU or CPU)
    for epoch in range(num_epochs):  # Loop over epochs
        model.train()  # Set model to training mode
        loss_sum = 0.0
        batch_count = 0
        for batch in data_loader:  # Loop over batches
            input_ids = batch['input_ids'].to(device)  # Get input IDs
            attention_mask = batch['attention_mask'].to(device)  # Get attention mask
            scores = batch['score'].to(device)  # Get scores

            optimizer.zero_grad()  # Zero the gradients
            outputs = model(input_ids, attention_mask)  # Forward pass
            # unsqueeze(1) gives the targets a trailing axis so they line up
            # with the model's one-column output.
            loss = loss_fn(outputs, scores.unsqueeze(1))  # Compute loss
            loss.backward()  # Backward pass
            optimizer.step()  # Update weights
            if scheduler is not None:
                scheduler.step()  # Update learning rate

            loss_sum += loss.item()
            batch_count += 1

        # Report the average over the whole epoch; the last batch alone is a
        # noisy (and, for an empty loader, undefined) summary of training.
        epoch_loss = loss_sum / batch_count if batch_count else float('nan')
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss}")  # Print loss for the epoch

In [10]:
# Evaluation
def evaluate_model(model, data_loader, device):
    """Compute the mean squared error of ``model`` over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``, returning
            one prediction per sample.
        data_loader: Iterable of dict batches with the keys 'input_ids',
            'attention_mask' and 'score'.
        device: Device the model and every batch are moved to.

    Returns:
        float: Mean of the squared differences between predictions and scores.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model = model.to(device)  # Move model to device
    model.eval()  # Set model to evaluation mode
    predictions, actuals = [], []

    with torch.no_grad():  # Disable gradient computation
        for batch in data_loader:  # Loop over batches
            input_ids = batch['input_ids'].to(device)  # Get input IDs
            attention_mask = batch['attention_mask'].to(device)  # Get attention mask
            scores = batch['score'].to(device)  # Get scores

            outputs = model(input_ids, attention_mask)  # Forward pass
            # reshape(-1) always yields a 1-D tensor. A bare squeeze() turns a
            # single-sample batch into a 0-d tensor whose tolist() is a plain
            # float, which list.extend() cannot consume.
            predictions.extend(outputs.reshape(-1).tolist())  # Collect predictions
            actuals.extend(scores.reshape(-1).tolist())  # Collect actual scores

    if not actuals:
        raise ValueError("evaluate_model received no samples; data_loader is empty")

    # MSE computed directly with NumPy: mean((prediction - actual)^2).
    errors = np.asarray(predictions, dtype=np.float64) - np.asarray(actuals, dtype=np.float64)
    return float(np.mean(errors ** 2))

In [11]:
# Main script
SEED = 42  # same value as the random_state used for the train/test split
torch.manual_seed(SEED)  # make head initialization, dropout and shuffling repeatable
NUM_EPOCHS = 5  # must match the num_epochs value passed to train_model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Check if GPU is available
model = ATSBertRegressor()  # Initialize model
# NOTE(review): transformers' AdamW is deprecated upstream in favour of
# torch.optim.AdamW; it is kept here so correct_bias=False keeps its meaning.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)  # Initialize optimizer
total_steps = len(train_loader) * NUM_EPOCHS  # Total optimizer steps over the whole run

# train_model steps the scheduler once per batch. The previous
# StepLR(step_size=total_steps // 10, gamma=0.1) therefore divided the learning
# rate by 10 every half epoch (about 1e-10 of its start by the end of training)
# and failed outright when total_steps < 10 made step_size zero. Decay linearly
# from the base rate to zero across all training steps instead.
scheduler = torch.optim.lr_scheduler.LinearLR(
    optimizer,
    start_factor=1.0,
    end_factor=0.0,
    total_iters=max(1, total_steps),
)  # Learning rate scheduler
loss_fn = torch.nn.MSELoss().to(device)  # Loss function



In [None]:
# Fine-tune the regressor for five passes over the training batches
train_model(
    model=model,
    data_loader=train_loader,
    loss_fn=loss_fn,
    optimizer=optimizer,
    device=device,
    scheduler=scheduler,
    num_epochs=5,
)



Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [None]:
# Score the trained model on the held-out split and report the result
mse = evaluate_model(model=model, data_loader=test_loader, device=device)
print(f"Mean Squared Error: {mse}")

In [None]:
# Persist only the learned parameters (the state dict), not the whole module object
torch.save(obj=model.state_dict(), f='ats_bert_model.pth')