1. Step 1: Load data in dataset

In [1]:
import pandas as pd

In [2]:
import numpy as np
df = pd.read_excel('./db/data.xlsx', index_col=0)
df.columns

#Strip the whitespaces from the data
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

#Strip the whitespaces from the column names
df.columns = df.columns.str.strip()

#Replace unwanted values with nan (empty)
df = df.replace(["N/A"], np.nan)

#Adjust the column names to lowercase
df.columns = df.columns.str.lower()

#Drop rows with missing values
df = df[['job title', 'headline', 'summary', 'keywords', 'educations','experiences', 'skills', 'disqualified']]

#Map the values of the Disqualified column to 1 and 0
order = {"Yes": 1, "No": 0}
df["disqualified"] = df["disqualified"].map(order)

#Remove noise from columns with object data type
for col in df.columns:
    if df[col].dtype == 'object':
        df[col]= df[col].str.replace(r"_x[0-9a-fA-F]{4}_", "", regex=True)

#Fill missing values with empty string
df['summary'] = df['summary'].str.lower().fillna('')

str_cols = ['headline', 'summary','educations', 'experiences', 'skills']
for col in df.columns:
    if col in str_cols:
        df[col]= df[col].str.replace(r"_x[0-9a-fA-F]{4}_", "", regex=True).astype(str)

#Combine the text columns into one column
df['candidate_profile'] = df[['headline', 'summary','educations', 'experiences', 'skills']].apply(lambda x: ' '.join(x), axis=1)


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=200, stop_words='english')

# Combine job_title and candidate_profile into one list of documents for vectorization
all_text = df['job title'].tolist() + df['candidate_profile'].tolist()

#Concatenar job title y candidate profile en una lista de documentos para vectorización

# Step 3: Fit and transform text data
tfidf_matrix = vectorizer.fit_transform(all_text)

# Separate vectors for job titles and candidate profiles
job_title_vectors = tfidf_matrix[:len(df)]
profile_vectors = tfidf_matrix[len(df):]

In [4]:
# df_clean = df[['job title', 'headline', 'summary', 'keywords', 'educations', 'experiences', 'skills', 'candidate_profile', 'disqualified']]


# X= profile_vectors.toarray() + job_title_vectors.toarray()
# y= 1 - df_clean['disqualified']


# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
import spacy

In [6]:
nlp = spacy.load('en_core_web_md')

In [7]:
# from sklearn.metrics.pairwise import cosine_similarity

# df['candidate_profile'] = df[['headline', 'summary','educations', 'experiences', 'skills']].apply(lambda x: ' '.join(x), axis=1)

# vectorizer = TfidfVectorizer(max_features=200)

# # Combine job_title and candidate_profile into one list of documents for vectorization
# all_text = df['job title'].tolist() + df['candidate_profile'].tolist()

# #Concatenar job title y candidate profile en una lista de documentos para vectorización

# # Step 3: Fit and transform text data
# tfidf_matrix = vectorizer.fit_transform(all_text)

# # Separate vectors for job titles and candidate profiles
# job_title_vectors = tfidf_matrix[:len(df)]
# profile_vectors = tfidf_matrix[len(df):]

# # Step 4: Compute cosine similarity
# similarity_scores = cosine_similarity(job_title_vectors, profile_vectors)

# # Step 5: Extract the diagonal (similarity of each job title with its corresponding candidate profile)
# df['score'] = np.diag(similarity_scores)


In [8]:
import pickle

# Save the TF-IDF vectorizer to a pickle file
with open('./db/tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)

In [9]:
df_clean = df[['job title', 'headline', 'summary', 'keywords', 'educations', 'experiences', 'skills', 'candidate_profile', 'disqualified']]


X= np.concatenate([profile_vectors.toarray(), job_title_vectors.toarray()], axis=1)
y= 1 - df_clean['disqualified']


from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [11]:
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

In [12]:

nn_model = MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=300, random_state=42)


# Define the parameter grid for Neural Network
param_grid_nn = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],  # Different layer configurations
    'activation': ['relu', 'tanh'],                             # Activation functions
    'learning_rate_init': [0.001, 0.01, 0.1],                   # Learning rates
    'alpha': [0.0001, 0.001, 0.01],                             # L2 regularization (alpha)
    'max_iter': [200, 300]                                      # Maximum number of iterations
}

# Neural Network (MLP Regressor)
nn_model = MLPRegressor(random_state=42)

# Instantiate GridSearchCV for tunning
grid_search_nn = GridSearchCV(nn_model, param_grid_nn, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit GridSearchCV
grid_search_nn.fit(X_train, y_train)

# Output best parameters and best MSE
print("Best Neural Network Parameters:", grid_search_nn.best_params_)
print("Best Neural Network MSE:", -grid_search_nn.best_score_)

# Output best parameters
best_nn_params = grid_search_nn.best_params_
print("Best Neural Network Parameters:", best_nn_params)

# Instantiate the model with the best parameters
best_nn_model = MLPRegressor(**best_nn_params, random_state=42)

# Train the model
best_nn_model.fit(X_train, y_train)

# Predictions and Evaluation
y_pred_nn = best_nn_model.predict(X_test)
mse_nn = mean_squared_error(y_test, y_pred_nn)
print(f"Neural Network MSE on Test Set: {mse_nn}")

Best Neural Network Parameters: {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (100, 50), 'learning_rate_init': 0.001, 'max_iter': 200}
Best Neural Network MSE: 0.12730724100884352
Best Neural Network Parameters: {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (100, 50), 'learning_rate_init': 0.001, 'max_iter': 200}
Neural Network MSE on Test Set: 0.15368727774505236


In [13]:
# Define the parameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [50, 100, 200],           # Number of trees
    'max_depth': [None, 10, 20, 30],          # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],          # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],            # Minimum samples required at each leaf node
    'max_features': [None, 'sqrt', 'log2']  # Number of features to consider for each split
}

# Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42)

# Instantiate GridSearchCV for tunning model
grid_search_rf = GridSearchCV(rf_model, param_grid_rf, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit GridSearchCV
grid_search_rf.fit(X_train, y_train)

# Output best parameters and best MSE
print("Best Random Forest Parameters:", grid_search_rf.best_params_)
print("Best Random Forest MSE:", -grid_search_rf.best_score_)

# Output best parameters
best_rf_params = grid_search_rf.best_params_
print("Best Random Forest Parameters:", best_rf_params)

# Instantiate the model with the best parameters
best_rf_model = RandomForestRegressor(**best_rf_params, random_state=42)

# Train the model
best_rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = best_rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f"Random Forest MSE on Test Set: {mse_rf}")

Best Random Forest Parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Random Forest MSE: 0.1267811320754717
Best Random Forest Parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Random Forest MSE on Test Set: 0.1105151171875


In [14]:
# Define the parameter grid for K-Nearest Neighbors
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 10],                 # Number of neighbors
    'weights': ['uniform', 'distance'],           # Weighting function
    'p': [1, 2]                                   # Distance metric (1 for Manhattan, 2 for Euclidean)
}

# K-Nearest Neighbors Regressor
knn_model = KNeighborsRegressor()

# Instantiate GridSearchCV for tunning model
grid_search_knn = GridSearchCV(knn_model, param_grid_knn, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit GridSearchCV
grid_search_knn.fit(X_train, y_train)

# Output best parameters and best MSE
print("Best KNN Parameters:", grid_search_knn.best_params_)
print("Best KNN MSE:", -grid_search_knn.best_score_)

# Output best parameters
best_knn_params = grid_search_knn.best_params_
print("Best KNN Parameters:", best_knn_params)

# Instantiate the model with the best parameters
best_knn_model = KNeighborsRegressor(**best_knn_params)

# Train the model
best_knn_model.fit(X_train, y_train)

# Predictions and Evaluation
y_pred_knn = best_knn_model.predict(X_test)
mse_knn = mean_squared_error(y_test, y_pred_knn)
print(f"KNN MSE on Test Set: {mse_knn}")

Best KNN Parameters: {'n_neighbors': 10, 'p': 1, 'weights': 'distance'}
Best KNN MSE: 0.12239158213404078
Best KNN Parameters: {'n_neighbors': 10, 'p': 1, 'weights': 'distance'}
KNN MSE on Test Set: 0.14208962900119312


In [15]:
import pickle

with open('./db/neural_network_model.pkl', 'wb') as file:
    pickle.dump(best_nn_model, file)

with open('./db/random_forest_model.pkl', 'wb') as file:
    pickle.dump(best_rf_model, file)

with open('./db/knn_model.pkl', 'wb') as file:
    pickle.dump(best_knn_model, file)

In [16]:
df_clean.to_pickle('./db/data_cleaned.pkl')

In [17]:
# def predict_best_model(input_text):

#     # # Load the trained models and vectorizer
#     # with open('./db/neural_network_model.pkl', 'rb') as file:
#     #     loaded_nn_model = pickle.load(file)

#     # with open('./db/random_forest_model.pkl', 'rb') as file:
#     #     loaded_rf_model = pickle.load(file)

#     # with open('./db/knn_model.pkl', 'rb') as file:
#     #     loaded_knn_model = pickle.load(file)

#     # # Load the TF-IDF vectorizer (assuming it was saved)
#     # with open('./db/tfidf_vectorizer.pkl', 'rb') as file:
#     #     tfidf_vectorizer = pickle.load(file)

#     # Transform the input text using the TF-IDF vectorizer
#     # text_vector = tfidf_vectorizer.transform([input_text])

#     # Make predictions with each model
#     # nn_prediction = loaded_nn_model.predict(text_vector)
#     # rf_prediction = loaded_rf_model.predict(text_vector)
#     # knn_prediction = loaded_knn_model.predict(text_vector)

#     # Store predictions in a dictionary
#     # predictions = {
#     #     "Neural Network": nn_prediction[0],
#     #     "Random Forest": rf_prediction[0],
#     #     "KNN": knn_prediction[0]
#     # }

#     # model_mapping = {
#     #     "Neural Network": loaded_nn_model,
#     #     "Random Forest": loaded_rf_model,
#     #     "KNN": loaded_knn_model
#     # }

#     # # Find the model with the best prediction
#     # best_model = max(predictions, key=predictions.get)  # Get the model with the highest score
#     # best_prediction = predictions[best_model]

#     # print(f"Best Model: {best_model}")
#     # print(f"Prediction Score: {best_prediction}")

#     return loaded_rf_model

def get_related_rows(input_text, cleaned_df, vectorizer, model):
    # Transform the input text using the TF-IDF vectorizer
    input_vector = vectorizer.transform([input_text])

    # Calculate similarity scores for each row in the DataFrame
    candidate_profile_vectors = vectorizer.transform(cleaned_df['candidate_profile'])

    all_vectors = np.array(
        [np.concatenate([candidate_profile_vectors.toarray()[i] , input_vector.toarray()[0]]) for i in range(len(candidate_profile_vectors.toarray()))]
    )

    print(all_vectors.shape)
    print(input_vector.toarray().shape)

    predictions = model.predict(all_vectors)

    # similarity_scores = np.dot(candidate_profile_vectors, input_vector.T).toarray()

    # similarity_scores_flattened = similarity_scores.flatten()

    # Normalize similarity scores to a scale of 0 to 100
    min_score = 0
    max_score = predictions.max()
    
    # Avoid division by zero if max_score is equal to min_score
    if max_score != min_score:
        normalized_scores = (predictions - min_score) / (max_score - min_score) * 100
    else:
        normalized_scores = np.zeros_like(predictions)  # All scores are the same, set to 0%


    # Add similarity scores to the DataFrame
    # cleaned_df['similarity_score'] = similarity_scores

    # Store normalized scores as a new column in the DataFrame
    cleaned_df['normalized_prediction_score'] = normalized_scores

    # Get predictions for the entire DataFrame
    # predictions = model.predict(input_vector)

    # Combine predictions and similarity scores in the DataFrame
    cleaned_df['prediction'] = predictions

    # Sort by similarity score and return the top 30 rows
    top_related_rows = cleaned_df.sort_values(by='normalized_prediction_score', ascending=False).head(30)

    return top_related_rows

# Example usage
input_text = "Senior software Ruby engineer with PostgreSQL experience"
# best_model, best_prediction = predict_best_model(input_text)
with open('./db/random_forest_model.pkl', 'rb') as file:
    loaded_rf_model = pickle.load(file)

# Load the TF-IDF vectorizer (assuming it was saved)
with open('./db/tfidf_vectorizer.pkl', 'rb') as file:
    tfidf_vectorizer = pickle.load(file)

# Load the cleaned DataFrame (assuming it was saved)
cleaned_df = pd.read_pickle('./db/data_cleaned.pkl')

# Get related rows
related_rows = get_related_rows(input_text, cleaned_df, tfidf_vectorizer, loaded_rf_model)

print(related_rows)

related_rows.to_csv('./db/related_rows.csv', index=True)

(199, 400)
(1, 200)
                                            job title  \
Name                                                    
Candidate 4             Ruby + (Golang/Rust) Engineer   
Candidate 2             Ruby + (Golang/Rust) Engineer   
Candidate 33             Senior DevOps Engineer - AWS   
Candidate 1             Ruby + (Golang/Rust) Engineer   
Candidate 35             Senior DevOps Engineer - AWS   
Candidate 34             Senior DevOps Engineer - AWS   
Candidate 12            Ruby + (Golang/Rust) Engineer   
Candidate 186          Senior Ruby on Rails Developer   
Candidate 3             Ruby + (Golang/Rust) Engineer   
Candidate 20                           Lead Generator   
Candidate 5             Ruby + (Golang/Rust) Engineer   
Candidate 38             Senior DevOps Engineer - AWS   
Candidate 22                           Lead Generator   
Candidate 18                           Lead Generator   
Candidate 26                           Lead Generator   
Candidate 3