In [None]:
import pandas as pd

# Load the dataset into `df`. Later cells read its 'Resume_str', 'description'
# and 'ATS_Score' columns.
# NOTE(review): '/content/...' looks like a Google Colab working-directory path;
# adjust it when running elsewhere.
df = pd.read_csv('/content/final_ats_scores1.csv')
print(df.head())  # preview the first rows to confirm the file loaded


In [None]:
# Remove Non-Grammatical Symbols
import re



# def remove_non_grammatical(text):
#     text = re.sub(r'http\S+|www\S+|@\S+|\S+.com', '', text)
#     return text

# # Apply to the relevant columns
# df['Resume_str'] = df['Resume_str'].apply(remove_non_grammatical)
# df['description'] = df['description'].apply(remove_non_grammatical)


In [None]:
# Download the 'punkt' resource
# nltk is imported here because this cell runs before the cell that imports it;
# on a fresh kernel the download call would otherwise raise NameError.
import nltk

nltk.download('punkt')


In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')  # Fetch the corpus that stopwords.words('english') reads from


In [None]:
# Example text preprocessing function
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading the corpus only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(text):
    """Normalize one document into a cleaned, space-separated token string.

    Steps, in order: lowercase, strip digits, strip URLs / ``www`` links /
    ``@`` mentions, strip punctuation, collapse whitespace, tokenize with
    NLTK, and drop English stop words.

    Args:
        text: The raw document. Non-string values (for example the NaN that
            pandas uses for a missing cell) are treated as empty text.

    Returns:
        str: The remaining tokens joined by single spaces; ``''`` when the
        input is not a string.
    """
    if not isinstance(text, str):
        # A missing cell would otherwise crash on .lower().
        return ''

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'http\S+|www\S+|@\S+', '', text)  # Remove URLs, usernames, and mentions
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespaces

    # Look the stop words up once as a set instead of re-reading the corpus
    # list for every single token.
    english_stopwords = _english_stopwords()
    tokens = [token for token in nltk.word_tokenize(text) if token not in english_stopwords]
    return ' '.join(tokens)  # Join the tokens back into a single string

# Clean both text columns and store the results in new `cleaned_*` columns.
# NOTE(review): the cleaned_* columns are not read again in the visible cells.
for raw_column, cleaned_column in (
    ('Resume_str', 'cleaned_resume'),
    ('description', 'cleaned_description'),
):
    df[cleaned_column] = df[raw_column].apply(preprocess_text)


In [None]:
# TODO: Handle slang, abbreviations, and emojis (not implemented yet; this cell is an empty placeholder)



In [None]:
# TODO: Spell correction (not implemented yet; this cell is an empty placeholder)





In [None]:
# Split the text into individual tokens
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')


def tokenize_text(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize`` and return the result."""
    tokens = word_tokenize(text)
    return tokens

# Tokenize both text columns, overwriting each one in place.
# NOTE(review): after this cell the columns no longer hold raw strings, so
# re-running it would pass the tokenized values back into word_tokenize.
# Restart and run all cells to redo this step.
for column_name in ('Resume_str', 'description'):
    df[column_name] = df[column_name].apply(tokenize_text)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Convert all text to lowercase to ensure uniformity
def to_lowercase(text):
    """Return a new list holding the lowercase form of every item in ``text``.

    Args:
        text: Iterable of strings (the token list of one document).

    Returns:
        list: ``item.lower()`` for each item, in the original order.
    """
    lowered = []
    for token in text:
        lowered.append(token.lower())
    return lowered

# Lowercase the tokens of both text columns, overwriting each column in place.
for column_name in ('Resume_str', 'description'):
    df[column_name] = df[column_name].apply(to_lowercase)


In [None]:
# Remove common stop words that do not add significant meaning
from nltk.corpus import stopwords

nltk.download('stopwords')
# Built once as a set so each membership test inside remove_stopwords is a hash lookup.
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return the items of ``text`` that are not in the module-level ``stop_words`` set.

    Args:
        text: Iterable of tokens for one document.

    Returns:
        list: The tokens that are not stop words, in their original order.
    """
    kept = []
    for token in text:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Drop stop words from both token columns, overwriting each column in place.
for column_name in ('Resume_str', 'description'):
    df[column_name] = df[column_name].apply(remove_stopwords)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Reduce words to their base form

from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    """Lemmatize every item of ``text`` with the module-level ``lemmatizer``.

    Each token is passed to ``lemmatizer.lemmatize`` without a part-of-speech
    argument.

    Args:
        text: Iterable of tokens for one document.

    Returns:
        list: The lemmatized tokens, in their original order.
    """
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Lemmatize the tokens of both text columns, overwriting each column in place.
for column_name in ('Resume_str', 'description'):
    df[column_name] = df[column_name].apply(lemmatize_text)


[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# After preprocessing, join the tokens back into strings for embedding
def tokens_to_string(tokens):
    """Join ``tokens`` into one string, separating the items with a single space.

    Args:
        tokens: Iterable of strings.

    Returns:
        str: The space-joined text (``''`` for an empty iterable).
    """
    separator = ' '
    joined = separator.join(tokens)
    return joined

# Turn both token columns back into plain strings, overwriting each column in place.
# NOTE: do not re-run this cell on its own. Joining an already-joined string
# iterates it character by character and inserts a space between every character.
for column_name in ('Resume_str', 'description'):
    df[column_name] = df[column_name].apply(tokens_to_string)


In [None]:
# Use pre-trained BERT to encode the texts
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf

# Load the pre-trained 'bert-base-uncased' tokenizer and its TensorFlow model.
# The captured output below shows the checkpoint files being downloaded.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

def encode_texts(texts, tokenizer, max_len=128):
    """Tokenize a batch of texts into fixed-length BERT inputs.

    Every sequence is truncated and padded to exactly ``max_len`` tokens, so
    the result always matches a model whose inputs have a fixed sequence
    length, even when every text in the batch is shorter than ``max_len``.

    Args:
        texts: Batch of strings: a pandas Series, NumPy array, or any other
            iterable of strings. A single ``str`` is treated as a batch of one.
        tokenizer: Callable tokenizer accepting the ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors`` keyword arguments and
            returning a mapping with 'input_ids' and 'attention_mask'.
        max_len: Sequence length to truncate and pad to.

    Returns:
        tuple: ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    if isinstance(texts, str):
        batch = [texts]
    elif hasattr(texts, 'tolist'):
        batch = texts.tolist()
    else:
        batch = list(texts)

    encoded_inputs = tokenizer(
        batch,
        truncation=True,
        # 'max_length' pads to max_len itself; True would pad only to the
        # longest text in this particular batch.
        padding='max_length',
        max_length=max_len,
        return_tensors='tf',
    )
    return encoded_inputs['input_ids'], encoded_inputs['attention_mask']

# Encode the text columns of the whole DataFrame.
# NOTE(review): these four results are not referenced again in the visible
# cells; the train/validation splits are encoded separately further down.
resume_ids, resume_masks = encode_texts(df['Resume_str'], tokenizer)
desc_ids, desc_masks = encode_texts(df['description'], tokenizer)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

### Model Building

In [None]:
from sklearn.model_selection import train_test_split

# Split the data: 80% training / 20% validation, with a fixed random_state so
# the split is the same on every run.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Encode the training and validation texts
train_resume_ids, train_resume_masks = encode_texts(train_df['Resume_str'], tokenizer)
# NOTE(review): the description encodings below are never passed to model.fit /
# model.evaluate in the visible cells, which use only the resume inputs.
train_desc_ids, train_desc_masks = encode_texts(train_df['description'], tokenizer)

val_resume_ids, val_resume_masks = encode_texts(val_df['Resume_str'], tokenizer)
val_desc_ids, val_desc_masks = encode_texts(val_df['description'], tokenizer)

# Extract the ATS scores (the regression targets) as NumPy arrays
train_ats_scores = train_df['ATS_Score'].values
val_ats_scores = val_df['ATS_Score'].values


In [None]:
# New
from transformers import TFBertModel, BertTokenizer
import tensorflow as tf

# Load BERT tokenizer and model
# NOTE(review): the same 'bert-base-uncased' checkpoint was already loaded into
# these two names in an earlier cell; this loads it again and rebinds both names.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Define the model function with additional layers and regularization
def create_model(bert_model):
    """Build and compile a regression model on top of ``bert_model``.

    The network takes 'input_ids' and 'attention_mask' tensors of length 128,
    runs them through ``bert_model``, keeps the hidden state at sequence
    position 0, and passes it through two Dense(ReLU) + Dropout(0.3) stages
    (256 units, then 128 units) followed by a single linear output unit.

    Args:
        bert_model: Model called as ``bert_model(ids, attention_mask=mask)``
            whose result exposes ``last_hidden_state``.

    Returns:
        tf.keras.Model: Compiled with Adam (learning rate 1e-5), MSE loss and
        MAE as the reported metric.
    """
    token_ids = tf.keras.Input(shape=(128,), dtype=tf.int32, name='input_ids')
    mask = tf.keras.Input(shape=(128,), dtype=tf.int32, name='attention_mask')

    # Hidden state at position 0 of the sequence serves as the document summary.
    features = bert_model(token_ids, attention_mask=mask).last_hidden_state[:, 0, :]

    # Regression head: each stage is a ReLU Dense layer followed by dropout.
    for units in (256, 128):
        features = tf.keras.layers.Dense(units, activation='relu')(features)
        features = tf.keras.layers.Dropout(0.3)(features)

    score = tf.keras.layers.Dense(1, activation='linear')(features)

    regressor = tf.keras.Model(inputs=[token_ids, mask], outputs=score)
    regressor.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='mse',
        metrics=['mae'],
    )
    return regressor

# Create the model from the BERT model loaded above and print its layer summary
model = create_model(bert_model)
model.summary()





Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 128)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 128)]                0         []                            
 )                                                                                                
                                                                                                  
 tf_bert_model_1 (TFBertMod  TFBaseModelOutputWithPooli   1094822   ['input_ids[0][0]',           
 el)                         ngAndCrossAttentions(last_   40         'attention_mask[0][0]']      
                             hidden_state=(None, 128, 7                                       

In [None]:
import tensorflow as tf

from tensorflow.keras.callbacks import EarlyStopping


In [None]:
# EarlyStopping is imported in the previous cell.

# Stop training once validation loss has not improved for 3 consecutive epochs
# and roll the model back to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',  # Metric to monitor
    patience=3,          # Number of epochs to wait after the last improvement
    restore_best_weights=True  # Restore the best weights after stopping
)

# Train the model with early stopping.
# Only the resume encodings are used as inputs here; the description encodings
# computed earlier are not fed to the model.
history = model.fit(
    [train_resume_ids, train_resume_masks], train_ats_scores,
    validation_data=([val_resume_ids, val_resume_masks], val_ats_scores),
    epochs=15,  # Upper bound; early stopping may end training sooner
    batch_size=32,  # Adjust based on your hardware capabilities
    callbacks=[early_stopping]
)




# # # Define the EarlyStopping callback
# early_stopping = EarlyStopping(
#     monitor='val_loss',  # Metric to monitor
#     patience=3,          # Number of epochs to wait after the last improvement
#     restore_best_weights=True  # Restore the best weights after stopping
# )

# # Train the model
# history = model.fit(
#     [train_resume_ids, train_resume_masks], train_ats_scores,
#     validation_data=([val_resume_ids, val_resume_masks], val_ats_scores),
#     epochs=7,  # Adjust based on your needs
#     batch_size=32,      # Adjust based on your hardware capabilities
#     callbacks=[early_stopping]  # Add early stopping callback
# )


Epoch 1/15




Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15


In [None]:
# def create_model(bert_model):
#     input_ids = tf.keras.Input(shape=(128,), dtype=tf.int32, name='input_ids')
#     attention_mask = tf.keras.Input(shape=(128,), dtype=tf.int32, name='attention_mask')

#     bert_output = bert_model(input_ids, attention_mask=attention_mask)
#     cls_token = bert_output.last_hidden_state[:, 0, :]

#     dense = tf.keras.layers.Dense(64, activation='relu')(cls_token)
#     dropout = tf.keras.layers.Dropout(0.3)(dense)
#     output = tf.keras.layers.Dense(1, activation='linear')(dropout)

#     model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)
#     model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5), loss='mse', metrics=['mae'])

#     return model

# # Create the model
# model = create_model(bert_model)
# model.summary()


In [None]:
# from tensorflow.keras.callbacks import EarlyStopping

# # Define the EarlyStopping callback
# early_stopping = EarlyStopping(
#     monitor='val_loss',  # Metric to monitor
#     patience=3,          # Number of epochs to wait after the last improvement
#     restore_best_weights=True  # Restore the best weights after stopping
# )

# # Train the model
# history = model.fit(
#     [train_resume_ids, train_resume_masks], train_ats_scores,
#     validation_data=([val_resume_ids, val_resume_masks], val_ats_scores),
#     epochs=7,  # Adjust based on your needs
#     batch_size=32,      # Adjust based on your hardware capabilities
#     callbacks=[early_stopping]  # Add early stopping callback
# )


In [None]:
# Evaluate on the validation resumes; the model was compiled with MSE loss and
# an MAE metric, so evaluate() returns those two values in that order.
val_loss, val_mae = model.evaluate([val_resume_ids, val_resume_masks], val_ats_scores)
print(f'Validation Loss: {val_loss}, Validation MAE: {val_mae}')


In [None]:
pip install lime


In [None]:
import numpy as np

# Function to predict ATS scores from raw text
def predict_ats_scores(texts):
    """Predict ATS scores for a batch of texts with the trained ``model``.

    Args:
        texts: Batch of strings: a pandas Series, NumPy array, or a plain
            list/tuple (LIME calls its prediction function with a list of
            strings). A single ``str`` is treated as a batch of one.

    Returns:
        numpy.ndarray: 1-D array with one predicted score per input text.
    """
    # Accept any batch type, not only objects that provide .tolist().
    if isinstance(texts, str):
        batch = [texts]
    elif hasattr(texts, 'tolist'):
        batch = texts.tolist()
    else:
        batch = list(texts)

    # Pad to the full 128 tokens so the tensors always match the model's
    # fixed-length inputs, even when every text in the batch is short.
    encoded_inputs = tokenizer(batch, truncation=True, padding='max_length', max_length=128, return_tensors='tf')
    input_ids = encoded_inputs['input_ids']
    attention_mask = encoded_inputs['attention_mask']

    # Make predictions using the trained model
    predictions = model.predict([input_ids, attention_mask])

    return predictions.flatten()  # Flatten the output to one score per text


In [None]:
from lime.lime_text import LimeTextExplainer

# Create a LimeTextExplainer
explainer = LimeTextExplainer(class_names=['ATS_score'])

# Choose a sample from your validation set
sample_text = val_df['Resume_str'].iloc[0]
sample_text = sample_text.strip()


def _ats_score_column(texts):
    """Adapt predict_ats_scores to the 2-D (n_samples, n_outputs) array LIME expects.

    LIME passes a plain list of perturbed strings; it is wrapped in an object
    array before being handed to predict_ats_scores.
    """
    scores = predict_ats_scores(np.asarray(texts, dtype=object))
    return np.asarray(scores).reshape(-1, 1)


# Generate explanation for the sample text.
# labels=(0,) selects the single output column; LIME's default of (1,) would
# index a second column that a one-output regressor does not have.
exp = explainer.explain_instance(sample_text, _ats_score_column, num_features=10, labels=(0,))


In [None]:
from lime.lime_text import LimeTextExplainer

# Create a LimeTextExplainer
# NOTE(review): this cell repeats the previous cell's explanation step.
explainer = LimeTextExplainer(class_names=['ATS_score'])

# Choose a sample from your validation set
sample_text = val_df['Resume_str'].iloc[0]
sample_text = sample_text.strip()

# Generate explanation for the sample text.
# LIME calls the prediction function with a list of strings and expects a 2-D
# (n_samples, n_outputs) array back, so the 1-D scores are reshaped into one
# column. labels=(0,) selects that column; the default (1,) would index a
# second column that does not exist.
exp = explainer.explain_instance(
    sample_text,
    lambda texts: np.asarray(
        predict_ats_scores(np.asarray(texts, dtype=object))
    ).reshape(-1, 1),
    num_features=10,
    labels=(0,),
)
