In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import os
from lime.lime_text import LimeTextExplainer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Disable GPU to avoid potential issues with LIME
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Load the preprocessed data
data = pd.read_csv('dbhealthpreprocess.csv')

# Load the tokenizer
with open('tokenizershealth.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Load the trained model
model = load_model('bighealth2.keras')

# Tokenize and pad the combined text (resume + job description)
sequences = tokenizer.texts_to_sequences(data['combined_text'].values)
max_sequence_length = 1500  # Use the same max length as before
data_padded = pad_sequences(sequences, maxlen=max_sequence_length)

# Tokenize and pad the nouns
nouns_sequences = tokenizer.texts_to_sequences(data['nouns_str'].values)
max_nouns_length = 10  # Use the same max length as before
nouns_data = pad_sequences(nouns_sequences, maxlen=max_nouns_length)

# Your target variable
y = data['ATS_Score'].values

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(data_padded, y, test_size=0.2, random_state=42)

# Predict on the test set
y_pred = model.predict([X_test, nouns_data[:len(X_test)]])

# Evaluate the model performance with regression metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R² Score: {r2:.4f}")

# Custom prediction function for LIME that returns words instead of token indices
def predict_proba(texts):
    # Convert text into sequences
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=max_sequence_length)
    
    # Use the first example's nouns as input since LIME generally alters the text part, not the nouns.
    nouns_input = np.tile(nouns_data[0], (len(texts), 1))  # Repeat the same nouns for all samples
    
    # Use the model to predict the ATS score
    predictions = model.predict([padded, nouns_input])
    
    # Create a dummy probability for a second class (1 - prediction)
    dummy_proba = 1 - predictions
    
    # Map sequences back to words for LIME interpretation
    inverse_word_index = {v: k for k, v in tokenizer.word_index.items()}
    words = [[inverse_word_index.get(idx, '') for idx in seq if idx != 0] for seq in sequences]
    words_joined = [' '.join(word_list) for word_list in words]
    
    # Return a 2D array with probabilities for both classes
    return np.column_stack((dummy_proba, predictions))

# Instantiate the explainer with class names for better readability
explainer = LimeTextExplainer(class_names=['negative', 'positive'])

# Choose a single example from the test set to explain, limit the text size
idx = 0  # Example index, you can change this to explain different instances
text_instance = data['combined_text'].iloc[idx]

# Generate the explanation with a reduced number of samples using the wrapper function
exp = explainer.explain_instance(text_instance, predict_proba, num_features=10, num_samples=500)

# Display the explanation
exp.show_in_notebook(text=True)

AttributeError: 'dict' object has no attribute 'texts_to_sequences'

In [2]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import os
from lime.lime_text import LimeTextExplainer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Disable GPU to avoid potential issues with LIME
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Load the preprocessed data
data = pd.read_csv('dbhealthpreprocess.csv')

# Load the dictionary of tokenizers
with open(r'tokenizershealth.pkl', 'rb') as handle:
    tokenizers = pickle.load(handle)

# Extract tokenizers from the dictionary
resume_tokenizer = tokenizers['resume_tokenizer']
description_tokenizer = tokenizers['description_tokenizer']
common_nouns_tokenizer = tokenizers['common_nouns_tokenizer']

# Load the trained model
model = load_model('bighealth2.keras')


max_sequence_length = 1500  # Use the same max length as before

# Tokenize and pad the combined text (resume + job description)
description_sequences = description_tokenizer.texts_to_sequences(data['processed_description'])
description_data_padded = pad_sequences(description_sequences, maxlen=max_sequence_length)

# sequences_desc = description_tokenizer.texts_to_sequences(data['description'].values)
# data_padded = pad_sequences(sequences, maxlen=max_sequence_length)


resume_sequences = resume_tokenizer.texts_to_sequences(data['processed_resume'])
resume_data_padded = pad_sequences(resume_sequences, maxlen=max_sequence_length)
# Tokenize and pad the combined text (resume + job description)
# sequences_res = resume_tokenizer.texts_to_sequences(data['Resume_str'].values)
# max_sequence_length = 1500  # Use the same max length as before
# data_padded = pad_sequences(sequences, maxlen=max_sequence_length)


# Tokenize and pad the nouns
nouns_sequences = common_nouns_tokenizer.texts_to_sequences(data['common_nouns'].values)
max_nouns_length = 10  # Use the same max length as before
nouns_data = pad_sequences(nouns_sequences, maxlen=max_nouns_length)

# Your target variable
y = data['ATS_Score'].values

# Split the data into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(resume_data_padded, description_data_padded, nouns_data, y, test_size=0.2, random_state=42)

# Predict on the test set
y_pred = model.predict([resume_data_padded, description_data_padded, nouns_data])

y_test = data['ATS_Score']

# Evaluate the model performance with regression metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R² Score: {r2:.4f}")



[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 774ms/step
Mean Absolute Error (MAE): 12.7844
Root Mean Squared Error (RMSE): 15.4666
R² Score: 0.1343


In [4]:
from lime.lime_text import LimeTextExplainer

# ... your previous code ...

# Reconstruct text from padded sequences
def reconstruct_text(resume_seq, description_seq, nouns_seq):
    resume_text = ' '.join(resume_tokenizer.sequences_to_texts([resume_seq])[0])
    description_text = ' '.join(description_tokenizer.sequences_to_texts([description_seq])[0])
    nouns_text = ' '.join(common_nouns_tokenizer.sequences_to_texts([nouns_seq])[0])
    combined_text = resume_text + ' ' + description_text + ' ' + nouns_text
    return combined_text


In [5]:

# Create LIME explainer
explainer = LimeTextExplainer(class_names=['ATS_Score'])


In [6]:

# Explain a prediction
index = 10  # Replace with the index of the instance you want to explain
instance = reconstruct_text(resume_data_padded[index], description_data_padded[index], nouns_data[index])


In [None]:
exp = explainer.explain_instance(instance, model.predict, num_features=10)
exp.show_in_notebook()


In [21]:

class ATSScorePredictor:
    def __init__(self, model, resume_tokenizer, description_tokenizer, common_nouns_tokenizer, max_sequence_length, max_nouns_length):
        self.model = model
        self.resume_tokenizer = resume_tokenizer
        self.description_tokenizer = description_tokenizer
        self.common_nouns_tokenizer = common_nouns_tokenizer
        self.max_sequence_length = max_sequence_length
        self.max_nouns_length = max_nouns_length

    def predict(self, texts):
        # Split the input into resume, description, and nouns parts
        resume_texts = [text[0] for text in texts]
        description_texts = [text[1] for text in texts]
        noun_texts = [text[2] for text in texts]

        # Tokenize and pad each part
        resume_sequences = self.resume_tokenizer.texts_to_sequences(resume_texts)
        resume_data_padded = pad_sequences(resume_sequences, maxlen=self.max_sequence_length)

        description_sequences = self.description_tokenizer.texts_to_sequences(description_texts)
        description_data_padded = pad_sequences(description_sequences, maxlen=self.max_sequence_length)

        nouns_sequences = self.common_nouns_tokenizer.texts_to_sequences(noun_texts)
        nouns_data = pad_sequences(nouns_sequences, maxlen=self.max_nouns_length)

        # Return the prediction as a flattened array
        return self.model.predict([resume_data_padded, description_data_padded, nouns_data]).flatten()

# Create an instance of the predictor
predictor = ATSScorePredictor(model, resume_tokenizer, description_tokenizer, common_nouns_tokenizer, max_sequence_length, max_nouns_length)

# Create an instance of LIME Text Explainer
explainer = LimeTextExplainer(class_names=['ATS Score'])

# Sample data point to explain (you can loop over multiple samples)
index_to_explain = 0  # Choose an index to explain

# Combine the resume, description, and nouns into a single list
combined_text_instance = f"{data['processed_resume'].iloc[index_to_explain]} " \
                         f"{data['processed_description'].iloc[index_to_explain]} " \
                         f"{data['common_nouns'].iloc[index_to_explain]}"


# Generate explanation
exp = explainer.explain_instance(combined_text_instance, predictor.predict, num_features=10)

# Display the explanation in a readable format
exp.show_in_notebook(text=True)

# Or to print the explanation
print(exp.as_list())

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 582ms/step


IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [22]:
import numpy as np

class ATSScorePredictor:
    def __init__(self, model, resume_tokenizer, description_tokenizer, common_nouns_tokenizer, max_sequence_length, max_nouns_length):
        self.model = model
        self.resume_tokenizer = resume_tokenizer
        self.description_tokenizer = description_tokenizer
        self.common_nouns_tokenizer = common_nouns_tokenizer
        self.max_sequence_length = max_sequence_length
        self.max_nouns_length = max_nouns_length

    def predict(self, texts):
        # Split the input into resume, description, and nouns parts
        resume_texts = [text[0] for text in texts]
        description_texts = [text[1] for text in texts]
        noun_texts = [text[2] for text in texts]

        # Tokenize and pad each part
        resume_sequences = self.resume_tokenizer.texts_to_sequences(resume_texts)
        resume_data_padded = pad_sequences(resume_sequences, maxlen=self.max_sequence_length)

        description_sequences = self.description_tokenizer.texts_to_sequences(description_texts)
        description_data_padded = pad_sequences(description_sequences, maxlen=self.max_sequence_length)

        nouns_sequences = self.common_nouns_tokenizer.texts_to_sequences(noun_texts)
        nouns_data = pad_sequences(nouns_sequences, maxlen=self.max_nouns_length)

        # Get the predictions and return them as a 2D array
        predictions = self.model.predict([resume_data_padded, description_data_padded, nouns_data])
        return np.array(predictions).reshape(-1, 1)

# Create an instance of the predictor
predictor = ATSScorePredictor(model, resume_tokenizer, description_tokenizer, common_nouns_tokenizer, max_sequence_length, max_nouns_length)

# Create an instance of LIME Text Explainer
explainer = LimeTextExplainer(class_names=['ATS Score'])

# Sample data point to explain (you can loop over multiple samples)
index_to_explain = 0  # Choose an index to explain

# Combine the resume, description, and nouns into a single list
combined_text_instance = [
    data['processed_resume'].iloc[index_to_explain],
    data['processed_description'].iloc[index_to_explain],
    data['common_nouns'].iloc[index_to_explain]
]

# Generate explanation
exp = explainer.explain_instance(combined_text_instance, predictor.predict, num_features=10)

# Display the explanation in a readable format
exp.show_in_notebook(text=True)

# Or to print the explanation
print(exp.as_list())


TypeError: expected string or bytes-like object, got 'list'

In [None]:

# Create an instance of LIME Text Explainer
explainer = LimeTextExplainer(class_names=['ATS Score'])

# Sample data point to explain (you can loop over multiple samples)
index_to_explain = 0  # Choose an index to explain

# Combine the resume, description, and nouns into a single string
combined_text_instance = f"{data['processed_resume'].iloc[index_to_explain]} " \
                         f"{data['processed_description'].iloc[index_to_explain]} " \
                         f"{data['common_nouns'].iloc[index_to_explain]}"

# Generate explanation
exp = explainer.explain_instance(combined_text_instance, predictor.predict, num_features=10)

# Display the explanation in a readable format
exp.show_in_notebook(text=True)

# Or to print the explanation
print(exp.as_list())

In [24]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import os
from lime.lime_text import LimeTextExplainer

# Disable GPU to avoid potential issues with LIME
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Load the preprocessed data
data = pd.read_csv('dbhealthpreprocess.csv')

# Load the dictionary of tokenizers
with open('tokenizershealth.pkl', 'rb') as handle:
    tokenizers = pickle.load(handle)

# Extract tokenizers from the dictionary
resume_tokenizer = tokenizers['resume_tokenizer']
description_tokenizer = tokenizers['description_tokenizer']
common_nouns_tokenizer = tokenizers['common_nouns_tokenizer']

# Load the trained model
model = load_model('bighealth2.keras')

# Set the maximum sequence lengths
max_sequence_length = 1500  # For resume and description
max_nouns_length = 10       # For common nouns

# Define a class for predicting ATS scores
class ATSScorePredictor:
    def __init__(self, model, resume_tokenizer, description_tokenizer, common_nouns_tokenizer, max_sequence_length, max_nouns_length):
        self.model = model
        self.resume_tokenizer = resume_tokenizer
        self.description_tokenizer = description_tokenizer
        self.common_nouns_tokenizer = common_nouns_tokenizer
        self.max_sequence_length = max_sequence_length
        self.max_nouns_length = max_nouns_length

    def predict(self, texts):
        # Texts is a list where each element is a tuple of (resume_text, description_text, nouns_text)
        resume_texts = [text[0] for text in texts]
        description_texts = [text[1] for text in texts]
        noun_texts = [text[2] for text in texts]

        # Tokenize and pad each part
        resume_sequences = self.resume_tokenizer.texts_to_sequences(resume_texts)
        resume_data_padded = pad_sequences(resume_sequences, maxlen=self.max_sequence_length)

        description_sequences = self.description_tokenizer.texts_to_sequences(description_texts)
        description_data_padded = pad_sequences(description_sequences, maxlen=self.max_sequence_length)

        nouns_sequences = self.common_nouns_tokenizer.texts_to_sequences(noun_texts)
        nouns_data = pad_sequences(nouns_sequences, maxlen=self.max_nouns_length)

        # Get the predictions and return them as a 2D array
        predictions = self.model.predict([resume_data_padded, description_data_padded, nouns_data])
        return np.array(predictions).reshape(-1, 1)

# Create an instance of the predictor
predictor = ATSScorePredictor(model, resume_tokenizer, description_tokenizer, common_nouns_tokenizer, max_sequence_length, max_nouns_length)

# Create an instance of LIME Text Explainer
explainer = LimeTextExplainer()

# Sample data point to explain
index_to_explain = 0  # Choose an index to explain

# Combine the resume, description, and nouns into a single string
combined_text_instance = f"{data['processed_resume'].iloc[index_to_explain]} " \
                         f"{data['processed_description'].iloc[index_to_explain]} " \
                         f"{data['common_nouns'].iloc[index_to_explain]}"

# Define a function to convert combined text into a format that LIME expects
def preprocess_instance(instance):
    return [instance]

# Define a function for LIME to predict on combined text instances
def lime_predict(texts):
    preprocessed_texts = preprocess_instance(texts[0])
    return predictor.predict(preprocessed_texts)

# Generate explanation
exp = explainer.explain_instance(combined_text_instance, lime_predict, num_features=10)

# Display the explanation in a readable format
exp.show_in_notebook(text=True)

# Or to print the explanation in a list format
print(exp.as_list())

# Evaluate the model performance with regression metrics
y_test = np.array([data['ATS_Score'].iloc[index_to_explain]])
y_pred = predictor.predict([combined_text_instance])[0]

mae = np.mean(np.abs(y_test - y_pred))
rmse = np.sqrt(np.mean((y_test - y_pred) ** 2))
r2 = 1 - (np.sum((y_test - y_pred) ** 2) / np.sum((y_test - np.mean(y_test)) ** 2))

print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R² Score: {r2:.4f}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step


IndexError: index 1 is out of bounds for axis 1 with size 1

In [25]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import os
from lime.lime_text import LimeTextExplainer

# Disable GPU to avoid potential issues with LIME
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Load the preprocessed data
data = pd.read_csv('dbhealthpreprocess.csv')

# Load the dictionary of tokenizers
with open('tokenizershealth.pkl', 'rb') as handle:
    tokenizers = pickle.load(handle)

# Extract tokenizers from the dictionary
resume_tokenizer = tokenizers['resume_tokenizer']
description_tokenizer = tokenizers['description_tokenizer']
common_nouns_tokenizer = tokenizers['common_nouns_tokenizer']

# Load the trained model
model = load_model('bighealth2.keras')

# Set the maximum sequence lengths
max_sequence_length = 1500  # For resume and description
max_nouns_length = 10       # For common nouns

# Define a class for predicting ATS scores
class ATSScorePredictor:
    def __init__(self, model, resume_tokenizer, description_tokenizer, common_nouns_tokenizer, max_sequence_length, max_nouns_length):
        self.model = model
        self.resume_tokenizer = resume_tokenizer
        self.description_tokenizer = description_tokenizer
        self.common_nouns_tokenizer = common_nouns_tokenizer
        self.max_sequence_length = max_sequence_length
        self.max_nouns_length = max_nouns_length

    def predict(self, texts):
        # Texts is a list where each element is a tuple of (resume_text, description_text, nouns_text)
        resume_texts = [text[0] for text in texts]
        description_texts = [text[1] for text in texts]
        noun_texts = [text[2] for text in texts]

        # Tokenize and pad each part
        resume_sequences = self.resume_tokenizer.texts_to_sequences(resume_texts)
        resume_data_padded = pad_sequences(resume_sequences, maxlen=self.max_sequence_length)

        description_sequences = self.description_tokenizer.texts_to_sequences(description_texts)
        description_data_padded = pad_sequences(description_sequences, maxlen=self.max_sequence_length)

        nouns_sequences = self.common_nouns_tokenizer.texts_to_sequences(noun_texts)
        nouns_data = pad_sequences(nouns_sequences, maxlen=self.max_nouns_length)

        # Get the predictions and return them as a 2D array with shape (n_samples, 1)
        predictions = self.model.predict([resume_data_padded, description_data_padded, nouns_data])
        return predictions.reshape(-1, 1)  # Ensure predictions are shaped correctly

# Create an instance of the predictor
predictor = ATSScorePredictor(model, resume_tokenizer, description_tokenizer, common_nouns_tokenizer, max_sequence_length, max_nouns_length)

# Create an instance of LIME Text Explainer
explainer = LimeTextExplainer()

# Sample data point to explain
index_to_explain = 0  # Choose an index to explain

# Combine the resume, description, and nouns into a single string
combined_text_instance = f"{data['processed_resume'].iloc[index_to_explain]} " \
                         f"{data['processed_description'].iloc[index_to_explain]} " \
                         f"{data['common_nouns'].iloc[index_to_explain]}"

# Define a function to convert combined text into a format that LIME expects
def preprocess_instance(instance):
    return [instance]

# Define a function for LIME to predict on combined text instances
def lime_predict(texts):
    preprocessed_texts = preprocess_instance(texts[0])
    return predictor.predict(preprocessed_texts)

# Generate explanation
exp = explainer.explain_instance(combined_text_instance, lime_predict, num_features=10)

# Display the explanation in a readable format
exp.show_in_notebook(text=True)

# Or to print the explanation in a list format
print(exp.as_list())

# Evaluate the model performance with regression metrics
y_test = np.array([data['ATS_Score'].iloc[index_to_explain]])
y_pred = predictor.predict([combined_text_instance])[0]

mae = np.mean(np.abs(y_test - y_pred))
rmse = np.sqrt(np.mean((y_test - y_pred) ** 2))
r2 = 1 - (np.sum((y_test - y_pred) ** 2) / np.sum((y_test - np.mean(y_test)) ** 2))

print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R² Score: {r2:.4f}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step


IndexError: index 1 is out of bounds for axis 1 with size 1