In [1]:
import re
import nltk
from nltk.corpus import stopwords

# Ensure you have the NLTK stopwords
nltk.download('stopwords')
nltk.download('punkt')

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'http\S+|www\S+|@\S+', '', text)  # Remove URLs, usernames, and mentions
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespaces
    tokens = nltk.word_tokenize(text)  # Tokenize the text
    tokens = [token for token in tokens if token not in stopwords.words('english')]  # Remove stopwords
    return ' '.join(tokens)  # Join the tokens back into a single string


[nltk_data] Downloading package stopwords to C:\Users\Rohit
[nltk_data]     Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Rohit
[nltk_data]     Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
import pandas as pd

# Load your dataset
df = pd.read_csv(r'R:\Big Data Analytics  Lambton\Sem 3\AML 2034 - Bhavik Gandhi\Project\bert_model\final_ats_scores9.csv')


In [3]:
# Preprocess the text data
df['cleaned_resume'] = df['Resume_str'].apply(preprocess_text)
df['cleaned_description'] = df['description'].apply(preprocess_text)


In [4]:
from sklearn.model_selection import train_test_split

# Split the data
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Extract features and labels
train_resumes = train_df['cleaned_resume'].tolist()
train_descriptions = train_df['cleaned_description'].tolist()
train_labels = train_df['ATS_Score'].tolist()

val_resumes = val_df['cleaned_resume'].tolist()
val_descriptions = val_df['cleaned_description'].tolist()
val_labels = val_df['ATS_Score'].tolist()


In [5]:
import tensorflow as tf
from transformers import TFDistilBertModel, DistilBertTokenizer

# Load DistilBERT models and tokenizers
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

class ATSScorePredictionModel(tf.keras.Model):
    def __init__(self):
        super(ATSScorePredictionModel, self).__init__()
        self.distilbert_resume = distilbert_model
        self.distilbert_jd = distilbert_model
        self.dense_resume = tf.keras.layers.Dense(128, activation='relu')
        self.dense_jd = tf.keras.layers.Dense(128, activation='relu')
        self.concat_layer = tf.keras.layers.Concatenate()
        self.final_dense = tf.keras.layers.Dense(1, activation='linear')  # Regression output

    def call(self, inputs):
        resume_input, jd_input = inputs

        # Ensure inputs have correct shape
        resume_input = resume_input[:, 0, :]  # Remove unnecessary dimensions
        jd_input = jd_input[:, 0, :]  # Remove unnecessary dimensions
        
        # Process resume
        resume_embeddings = self.distilbert_resume(resume_input)[0]
        resume_pooled = tf.reduce_mean(resume_embeddings, axis=1)
        resume_output = self.dense_resume(resume_pooled)
        
        # Process JD
        jd_embeddings = self.distilbert_jd(jd_input)[0]
        jd_pooled = tf.reduce_mean(jd_embeddings, axis=1)
        jd_output = self.dense_jd(jd_pooled)
        
        # Concatenate and final prediction
        combined_output = self.concat_layer([resume_output, jd_output])
        final_output = self.final_dense(combined_output)
        
        return final_output

# Instantiate the model
model = ATSScorePredictionModel()

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
              loss='mean_squared_error',
              metrics=['mae'])






Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [6]:
def tokenize_inputs(resume_text, jd_text):
    resume_tokens = tokenizer(resume_text, return_tensors='tf', max_length=512, truncation=True, padding='max_length')
    jd_tokens = tokenizer(jd_text, return_tensors='tf', max_length=512, truncation=True, padding='max_length')
    return resume_tokens, jd_tokens


In [7]:
def create_dataset(resumes, descriptions, labels, batch_size=8):
    def gen():
        for resume, description, label in zip(resumes, descriptions, labels):
            resume_tokens = tokenizer(resume, return_tensors='tf', max_length=512, truncation=True, padding='max_length')
            description_tokens = tokenizer(description, return_tensors='tf', max_length=512, truncation=True, padding='max_length')
            yield (resume_tokens['input_ids'], description_tokens['input_ids']), label
    
    dataset = tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            (
                tf.TensorSpec(shape=(None, 512), dtype=tf.int32),
                tf.TensorSpec(shape=(None, 512), dtype=tf.int32)
            ),
            tf.TensorSpec(shape=(), dtype=tf.float32)
        )
    )
    dataset = dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset

# Create training and validation datasets
train_dataset = create_dataset(train_resumes, train_descriptions, train_labels)
val_dataset = create_dataset(val_resumes, val_descriptions, val_labels)


In [None]:
# Train the model
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=2  # Adjust the number of epochs as needed
)


In [8]:
# Train the model
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=2  # Adjust the number of epochs as needed
)


Epoch 1/2


Epoch 2/2


In [14]:
# Continue training for more epochs
history_continued = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=3
)


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [15]:
# Save the trained model
model.save('ats_score_prediction_model1.keras')



In [19]:
from lime.lime_text import LimeTextExplainer
import numpy as np

# Define a LIME explainer
explainer = LimeTextExplainer()

def explain_prediction(resume_text, jd_text):
    resume_input = preprocess_text(resume_text)
    jd_input = preprocess_text(jd_text)
    resume_tokens, jd_tokens = tokenize_inputs(resume_input, jd_input)
    
    # Convert tokens to the format expected by LIME
    combined_input = np.concatenate([resume_tokens['input_ids'].numpy(), jd_tokens['input_ids'].numpy()], axis=1)
    
    explanation = explainer.explain_instance(combined_input, model.predict)
    return explanation




if resume_input and jd_input:
    explanation = explain_prediction(resume_input, jd_input)
    print("Explanation:", explanation)
else:
    print("Please provide both resume and job description texts.")


TypeError: cannot use a string pattern on a bytes-like object

In [26]:
def explain_regression_prediction(resume, job_description, model, tokenizer):
    from lime.lime_text import LimeTextExplainer
    import numpy as np
    
    # Preprocess the inputs using the existing preprocess_text function
    preprocessed_resume = preprocess_text(resume)
    preprocessed_description = preprocess_text(job_description)
    
    # Combine the preprocessed texts
    combined_text = preprocessed_resume + " " + preprocessed_description
    
    # Initialize LIME Text Explainer (no mode parameter)
    explainer = LimeTextExplainer()
    
    # Define the prediction function for LIME
    def predict_fn(texts):
        tokenized_inputs = []
        for text in texts:
            tokens = tokenize_inputs(text, tokenizer)
            tokenized_inputs.append(tokens)
        
        # Stack the inputs into a single tensor for prediction
        tokenized_inputs = np.vstack(tokenized_inputs)
        predictions = model.predict(tokenized_inputs)
        return predictions.flatten()  # Flatten to return a 1D array of continuous predictions
    
    # Generate the explanation using LIME
    exp = explainer.explain_instance(combined_text, predict_fn)
    return exp


In [29]:
# Select a sample resume and job description from the dataset
sample_resume = str(df['Resume_str'].iloc[0])
sample_description = str(df['description'].iloc[0])

# Generate LIME explanation for regression
explanation = explain_regression_prediction(sample_resume, sample_description, model, tokenizer)

# Display the explanation
explanation.show_in_notebook()


ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).