<a href="https://colab.research.google.com/github/itsJonnie/AI-Resume-Screener/blob/main/ai_resume_screener.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# make sure libraries are installed
!pip install transformers
!pip install torch
!pip install spacy
!python -m spacy download en_core_web_lg
# If you want to use the transformer-based spaCy model:
!python -m spacy download en_core_web_trf
# For Kaggle datasets (requires Kaggle API setup)
!pip install kaggle

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import pandas as pd
import os
import numpy as np
import pandas as pd
import re
import string
import os
import warnings
import torch
import kagglehub
warnings.filterwarnings('ignore')



In [None]:
# Fetch the newest published version of the resume dataset through kagglehub
# and keep the directory it was placed in for the following cells.
path = kagglehub.dataset_download(
    "gauravduttakiit/resume-dataset"
)

print(f"Path to dataset files: {path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/gauravduttakiit/resume-dataset?dataset_version_number=1...


100%|██████████| 383k/383k [00:00<00:00, 1.03MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/gauravduttakiit/resume-dataset/versions/1





In [None]:
# List the files in the downloaded dataset
print("Files in the dataset directory:")
print(os.listdir(path))

# Construct the full file path by joining the download path with the filename
file_path = os.path.join(path, "UpdatedResumeDataSet.csv")
print("Full file path:", file_path)

# Fail fast when the CSV is missing. Silently skipping the read would leave
# `df` undefined and surface later as an unrelated-looking NameError.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Expected dataset file not found: {file_path}")

print("File exists!")
# Now read the CSV file with the complete path
df = pd.read_csv(file_path)

Files in the dataset directory:
['UpdatedResumeDataSet.csv']
Full file path: /root/.cache/kagglehub/datasets/gauravduttakiit/resume-dataset/versions/1/UpdatedResumeDataSet.csv
File exists!


In [None]:
# Preview the first five rows of the loaded resume table
df.head(n=5)

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [None]:
# For text processing
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

# For BERT model
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
try:
    # transformers releases that still export their own AdamW keep using it
    from transformers import AdamW
except ImportError:
    # transformers releases without that export: fall back to the PyTorch
    # optimizer of the same name so this cell still runs on a fresh,
    # unpinned install.
    from torch.optim import AdamW
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset

# For evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Set random seed
np.random.seed(42)
# Trailing semicolon keeps the returned Generator repr out of the cell output
torch.manual_seed(42);

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


<torch._C.Generator at 0x7ef9af4cedf0>

In [None]:
class ResumeScreener:
    """Classify resumes into job categories with BERT or spaCy document vectors.

    With ``model_type="bert"`` a ``bert-base-uncased`` sequence classifier is
    fine-tuned. Any other value loads a spaCy pipeline (``en_core_web_trf``
    for ``"spacy-transformer"``, ``en_core_web_lg`` otherwise) and fits a
    logistic regression on the pipeline's document vectors.

    Attributes are documented inline in ``__init__``.
    """

    # Longest token sequence passed to BERT (BERT's maximum sequence length).
    MAX_LENGTH = 512
    # File names used inside the directory given to save_model()/load_model().
    LABEL_MAPPING_FILE = "label_mapping.json"
    SPACY_MODEL_FILE = "model.joblib"

    def __init__(self, model_type="bert"):
        """Load the tokenizer (BERT) or the spaCy pipeline for ``model_type``.

        Args:
            model_type: ``"bert"``, ``"spacy-transformer"``, or any other
                string, which selects the ``en_core_web_lg`` spaCy pipeline.
        """
        self.model_type = model_type
        if model_type == "bert":
            # Initialize BERT tokenizer and model
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            self.model = None  # Will be initialized during training
        else:
            # Initialize spaCy model
            import spacy
            if model_type == "spacy-transformer":
                self.nlp = spacy.load("en_core_web_trf")
            else:
                self.nlp = spacy.load("en_core_web_lg")
            self.model = None  # Will be initialized during training
        self.label_mapping = None  # Will store {class index: category name}

    @staticmethod
    def _default_device():
        """Return the CUDA device when one is available, else the CPU."""
        return torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def preprocess_resume(self, text):
        """Clean and normalise raw resume text.

        Lower-cases the text, strips URLs, e-mail addresses, phone numbers and
        punctuation, and collapses runs of whitespace.

        Args:
            text: Raw resume text; NaN/None yields an empty string.

        Returns:
            The cleaned text as a ``str``.
        """
        # Handle NaN values
        if pd.isna(text):
            return ""

        # Convert to string if not already, then lowercase
        text = str(text).lower()

        # Remove URLs
        text = re.sub(r'http\S+', '', text)

        # Remove email addresses
        text = re.sub(r'\S+@\S+', '', text)

        # Remove phone numbers
        text = re.sub(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', '', text)

        # Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    def _encode(self, texts):
        """Tokenize ``texts`` into padded ``(input_ids, attention_mask)`` tensors.

        Shared by training and prediction so both use identical tokenizer
        settings (padding/truncation to ``MAX_LENGTH``).
        """
        encoded = self.tokenizer(
            list(texts),
            add_special_tokens=True,
            max_length=self.MAX_LENGTH,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return encoded['input_ids'], encoded['attention_mask']

    def prepare_data_bert(self, resumes, labels):
        """Tokenize resumes and convert labels to tensors for BERT.

        Args:
            resumes: Iterable of (already preprocessed) resume strings.
            labels: Sequence of integer class indices, one per resume.

        Returns:
            Tuple ``(input_ids, attention_masks, labels)`` of tensors.
        """
        input_ids, attention_masks = self._encode(resumes)
        labels = torch.tensor(labels)

        return input_ids, attention_masks, labels

    def prepare_data_spacy(self, resumes, labels):
        """Turn resumes into spaCy document vectors.

        Args:
            resumes: Iterable of (already preprocessed) resume strings.
            labels: Sequence of integer class indices, one per resume.

        Returns:
            Tuple ``(X, y)`` of NumPy arrays: document vectors and labels.

        Raises:
            ValueError: If the pipeline produced no usable document vectors.
        """
        # Process resumes with spaCy
        docs = list(self.nlp.pipe(resumes, disable=["tagger", "parser"]))

        # Create document vectors
        X = np.array([doc.vector for doc in docs])
        y = np.array(labels)

        # A pipeline without static word vectors yields zero-length doc.vector
        # (en_core_web_trf appears to be such a pipeline - TODO confirm).
        # Report that here instead of failing obscurely inside the classifier.
        if X.ndim != 2 or X.shape[1] == 0:
            raise ValueError(
                "spaCy produced no document vectors; use a pipeline that "
                "ships word vectors (e.g. en_core_web_lg)."
            )

        return X, y

    def _print_classification_report(self, y_true, y_pred):
        """Print a per-category report aligned with ``self.label_mapping``.

        Passing ``labels`` explicitly keeps ``target_names`` matched to the
        right class index even when a class is absent from ``y_true`` and
        ``y_pred``.
        """
        label_ids = sorted(self.label_mapping)
        print("\nClassification Report:")
        print(classification_report(
            y_true, y_pred,
            labels=label_ids,
            target_names=[self.label_mapping[idx] for idx in label_ids],
        ))

    def train_bert(self, train_dataloader, validation_dataloader, num_labels,
                   epochs=4, learning_rate=2e-5):
        """Fine-tune ``bert-base-uncased`` and report validation metrics per epoch.

        Args:
            train_dataloader: Batches of ``(input_ids, attention_mask, labels)``.
            validation_dataloader: Same layout, used for evaluation only.
            num_labels: Number of output classes.
            epochs: Number of passes over the training data.
            learning_rate: Initial AdamW learning rate (decayed linearly).

        Returns:
            The trained model (also stored on ``self.model``).
        """
        # Initialize model
        self.model = BertForSequenceClassification.from_pretrained(
            "bert-base-uncased",
            num_labels=num_labels,
            output_attentions=False,
            output_hidden_states=False,
        )

        # Use GPU if available
        device = self._default_device()
        self.model.to(device)
        print(f"Using device: {device}")

        # PyTorch's AdamW; weight_decay=0.0 is set explicitly so no weight
        # decay is applied regardless of the optimizer's own default.
        optimizer = torch.optim.AdamW(self.model.parameters(),
                                      lr=learning_rate,
                                      eps=1e-8,
                                      weight_decay=0.0)

        # Total number of training steps
        total_steps = len(train_dataloader) * epochs

        # Set up learning rate scheduler
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=0,
                                                    num_training_steps=total_steps)

        # Training loop
        for epoch in range(epochs):
            print(f"Epoch {epoch+1}/{epochs}")

            # Training phase
            self.model.train()
            total_train_loss = 0

            for batch in train_dataloader:
                b_input_ids = batch[0].to(device)
                b_input_mask = batch[1].to(device)
                b_labels = batch[2].to(device)

                # Clear previously calculated gradients
                self.model.zero_grad()

                # Forward pass
                outputs = self.model(b_input_ids,
                                     token_type_ids=None,
                                     attention_mask=b_input_mask,
                                     labels=b_labels)

                loss = outputs.loss
                total_train_loss += loss.item()

                # Backward pass
                loss.backward()

                # Update parameters, then advance the learning-rate schedule
                optimizer.step()
                scheduler.step()

            avg_train_loss = total_train_loss / len(train_dataloader)
            print(f"Average training loss: {avg_train_loss}")

            # Validation phase
            self.model.eval()
            total_eval_loss = 0
            predictions = []
            true_labels = []

            for batch in validation_dataloader:
                b_input_ids = batch[0].to(device)
                b_input_mask = batch[1].to(device)
                b_labels = batch[2].to(device)

                with torch.no_grad():
                    outputs = self.model(b_input_ids,
                                         token_type_ids=None,
                                         attention_mask=b_input_mask,
                                         labels=b_labels)

                total_eval_loss += outputs.loss.item()

                # Get predictions
                logits = outputs.logits.detach().cpu().numpy()
                label_ids = b_labels.to('cpu').numpy()

                predictions.extend(np.argmax(logits, axis=1).flatten())
                true_labels.extend(label_ids.flatten())

            avg_val_loss = total_eval_loss / len(validation_dataloader)
            print(f"Validation loss: {avg_val_loss}")

            # Print classification report
            self._print_classification_report(true_labels, predictions)

        print("Training complete!")
        return self.model

    def train_spacy(self, X_train, y_train, X_test, y_test):
        """Fit a logistic regression on spaCy vectors and print test metrics.

        Args:
            X_train, y_train: Training vectors and labels.
            X_test, y_test: Held-out vectors and labels used for the report.

        Returns:
            The fitted classifier (also stored on ``self.model``).
        """
        from sklearn.linear_model import LogisticRegression

        # Initialize and train classifier
        self.model = LogisticRegression(max_iter=1000, random_state=42)
        self.model.fit(X_train, y_train)

        # Predict on test set and evaluate
        y_pred = self.model.predict(X_test)
        self._print_classification_report(y_test, y_pred)

        return self.model

    def train(self, resumes, labels, label_mapping, test_size=0.2,
              batch_size=16, epochs=4):
        """Preprocess, split and train with the backend chosen at construction.

        Args:
            resumes: Sequence of raw resume texts.
            labels: Sequence of integer class indices, one per resume.
            label_mapping: Dict mapping class index to category name.
            test_size: Fraction of the data held out for evaluation.
            batch_size: Mini-batch size (BERT backend only).
            epochs: Number of training epochs (BERT backend only).
        """
        # Store label mapping
        self.label_mapping = label_mapping

        # Preprocess resumes
        processed_resumes = [self.preprocess_resume(resume) for resume in resumes]

        if self.model_type == "bert":
            # Prepare data for BERT
            input_ids, attention_masks, label_tensor = self.prepare_data_bert(processed_resumes, labels)

            # Split row indices (stratified) so the three tensors stay aligned
            train_idx, val_idx = train_test_split(
                np.arange(len(labels)),
                test_size=test_size,
                random_state=42,
                stratify=labels
            )

            # Create train and validation datasets
            train_dataset = TensorDataset(
                input_ids[train_idx],
                attention_masks[train_idx],
                label_tensor[train_idx]
            )

            val_dataset = TensorDataset(
                input_ids[val_idx],
                attention_masks[val_idx],
                label_tensor[val_idx]
            )

            # Create DataLoaders
            train_dataloader = DataLoader(
                train_dataset,
                sampler=RandomSampler(train_dataset),
                batch_size=batch_size
            )

            validation_dataloader = DataLoader(
                val_dataset,
                sampler=SequentialSampler(val_dataset),
                batch_size=batch_size
            )

            # Train BERT model
            num_labels = len(np.unique(labels))
            self.train_bert(train_dataloader, validation_dataloader, num_labels,
                            epochs=epochs)

        else:
            # Prepare data for spaCy
            X, y = self.prepare_data_spacy(processed_resumes, labels)

            # Split data
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, random_state=42, stratify=y
            )

            # Train spaCy model
            self.train_spacy(X_train, y_train, X_test, y_test)

    def predict(self, resume):
        """Predict the job category of a single resume.

        Args:
            resume: Raw resume text.

        Returns:
            The category name looked up in ``self.label_mapping``.

        Raises:
            ValueError: If no model has been trained or loaded yet.
        """
        if self.model is None:
            raise ValueError("Model hasn't been trained yet. Call train() first.")

        # Preprocess the resume
        processed_resume = self.preprocess_resume(resume)

        if self.model_type == "bert":
            # Tokenize resume
            input_ids, attention_mask = self._encode([processed_resume])

            # Send inputs to whatever device the model lives on. The model may
            # be on CPU even when CUDA is available, so picking the device
            # independently of the model can cause a device mismatch.
            device = next(self.model.parameters()).device
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            # Set model to evaluation mode
            self.model.eval()

            # Get prediction
            with torch.no_grad():
                outputs = self.model(input_ids, token_type_ids=None, attention_mask=attention_mask)

            logits = outputs.logits
            prediction_idx = np.argmax(logits.cpu().numpy(), axis=1).flatten()[0]
        else:
            # Process with spaCy and get the document vector
            doc = self.nlp(processed_resume)
            doc_vector = doc.vector.reshape(1, -1)

            # Get prediction
            prediction_idx = self.model.predict(doc_vector)[0]

        # Convert numerical prediction (a NumPy integer) to category name
        return self.label_mapping[int(prediction_idx)]

    def save_model(self, filepath):
        """Save the trained model and its label mapping into ``filepath``.

        Args:
            filepath: Target directory; created if it does not exist.

        Raises:
            ValueError: If no model has been trained or loaded yet.
        """
        if self.model is None:
            raise ValueError("Model hasn't been trained yet. Call train() first.")

        import json

        os.makedirs(filepath, exist_ok=True)

        if self.model_type == "bert":
            # Save BERT model
            self.model.save_pretrained(filepath)
            self.tokenizer.save_pretrained(filepath)
        else:
            # Save spaCy classifier
            import joblib
            joblib.dump(self.model, os.path.join(filepath, self.SPACY_MODEL_FILE))

        # Save label mapping. Keys are coerced to plain ints so NumPy integer
        # keys do not make json.dump fail.
        mapping = self.label_mapping
        if mapping is not None:
            mapping = {int(idx): name for idx, name in mapping.items()}
        with open(os.path.join(filepath, self.LABEL_MAPPING_FILE), 'w') as f:
            json.dump(mapping, f)

    def load_model(self, filepath):
        """Load a model and label mapping previously written by ``save_model``.

        Args:
            filepath: Directory containing the saved artefacts.
        """
        # Load label mapping
        import json
        with open(os.path.join(filepath, self.LABEL_MAPPING_FILE), 'r') as f:
            raw_mapping = json.load(f)

        # JSON object keys are always strings; restore the integer class
        # indices that predict() uses for lookup.
        if raw_mapping is None:
            self.label_mapping = None
        else:
            self.label_mapping = {int(idx): name for idx, name in raw_mapping.items()}

        if self.model_type == "bert":
            # Load BERT model
            self.model = BertForSequenceClassification.from_pretrained(filepath)
            self.tokenizer = BertTokenizer.from_pretrained(filepath)
            self.model.to(self._default_device())
            self.model.eval()
        else:
            # Load the spaCy-vector classifier written by save_model().
            # NOTE: joblib.load unpickles the file - only load trusted models.
            import joblib
            self.model = joblib.load(os.path.join(filepath, self.SPACY_MODEL_FILE))

In [None]:
# Extract the features (resume text) and labels (job categories)
resumes = df['Resume'].tolist()
categories = df['Category'].tolist()

# Create a mapping from category names to numerical labels.
# The unique names are sorted because iterating a bare set of strings can give
# a different order in each Python process (string hashing is randomized),
# which would change the label <-> index assignment between kernel runs.
unique_categories = sorted(set(categories))
category_mapping = {category: idx for idx, category in enumerate(unique_categories)}
label_mapping = {idx: category for idx, category in enumerate(unique_categories)}

# Convert categories to numerical labels
labels = [category_mapping[category] for category in categories]

# Initialize the ResumeScreener
# Choose between "bert", "spacy-lg", or "spacy-transformer"
screener = ResumeScreener(model_type="bert")

# Train the model
screener.train(resumes, labels, label_mapping, test_size=0.2)

# Save the trained model
os.makedirs('resume_classifier_model', exist_ok=True)
screener.save_model('resume_classifier_model')

# Test the model with a sample resume
# (this row was part of the data passed to train(), so it is a smoke test,
# not an evaluation)
sample_resume = df['Resume'].iloc[0]
predicted_category = screener.predict(sample_resume)
print(f"Predicted category: {predicted_category}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
Epoch 1/4
Average training loss: 3.118417355479026
Validation loss: 2.843572946695181

Classification Report:
                           precision    recall  f1-score   support

   Electrical Engineering       0.00      0.00      0.00         6
         Business Analyst       0.00      0.00      0.00         6
Network Security Engineer       0.00      0.00      0.00         5
               Blockchain       0.00      0.00      0.00         8
       Operations Manager       0.00      0.00      0.00         8
       Health and fitness       0.16      0.83      0.27         6
       Automation Testing       0.00      0.00      0.00         5
                   Hadoop       0.38      0.75      0.50         8
           Civil Engineer       0.00      0.00      0.00         5
           Java Developer       0.32      0.65      0.43        17
                 Database       0.00      0.00      0.00         7
                       HR       0.00      0.00      0.00         9

In [None]:
import torch

# Report whether PyTorch can see a CUDA GPU and, if so, which one
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Current device: {torch.device('cuda' if torch.cuda.is_available() else 'cpu')}")
if torch.cuda.is_available():
    print(f"GPU name: {torch.cuda.get_device_name(0)}")

CUDA available: True
Current device: cuda
GPU name: Tesla T4
