In [None]:
# GAE

In [None]:
# # Set random seed for reproducibility
# random_seed = 42
# np.random.seed(random_seed)
# 
# # Load datasets
# job_descriptions = pd.read_csv('./data/processed/job_descriptions_processed.csv')
# resumes = pd.read_csv('./data/processed/resume-dataset-processed.csv', converters={'skills': literal_eval})
# 
# # Shuffle job_descriptions and select the first 100 rows
# job_descriptions = job_descriptions.sample(frac=1, random_state=random_seed).head(100)
# 
# # Convert 'skills' column to list
# job_descriptions['skills'] = job_descriptions['skills'].apply(literal_eval)

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, VGAE
from torch_geometric.data import Data
from torch_geometric.utils import train_test_split_edges, negative_sampling
import pandas as pd
import uuid
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, average_precision_score

from ast import literal_eval
import numpy as np

from torch_geometric.nn import Node2Vec
from torch_geometric.utils import add_remaining_self_loops

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Set random seed for reproducibility
random_seed = 42
np.random.seed(random_seed)

# Load datasets
job_descriptions = pd.read_csv('./data/processed/job_descriptions_processed.csv')
resumes = pd.read_csv('./data/processed/general-resume-dataset-processed-v1.csv', converters={'skills': literal_eval})

# Shuffle job_descriptions and select the first N rows
job_descriptions = job_descriptions.sample(frac=1, random_state=random_seed).head(10000)

# Convert 'skills' column to list
job_descriptions['skills'] = job_descriptions['skills'].apply(literal_eval)

In [None]:
# Replace None values in job_title and category with a default value before encoding
job_descriptions['job_title'].fillna('unknown', inplace=True)
resumes['job_title'].fillna('unknown', inplace=True)
resumes['category'].fillna('unknown', inplace=True)

# Ensure job_ids and candidate_ids are correctly assigned
job_descriptions['job_id'] = range(1, len(job_descriptions) + 1)
resumes['candidate_id'] = range(1, len(resumes) + 1)

# Add 'unknown' to the list of all titles and categories to handle unseen labels
all_titles = job_descriptions['job_title'].tolist() + resumes['job_title'].tolist()
all_titles.append('unknown')
all_categories = resumes['category'].tolist()
all_categories.append('unknown')

In [None]:
# Fit the label encoders
le_job_title = LabelEncoder()
le_category = LabelEncoder()
le_job_title.fit(all_titles)
le_category.fit(all_categories)

# Transform the columns
job_descriptions['job_title'] = le_job_title.transform(job_descriptions['job_title'])
resumes['job_title'] = le_job_title.transform(resumes['job_title'])
resumes['category'] = le_category.transform(resumes['category'])

# Encode skills
all_skills = set(skill for skills in job_descriptions['skills'].tolist() + resumes['skills'].tolist() for skill in skills)
le_skills = {skill: i for i, skill in enumerate(all_skills)}

In [None]:
# Create nodes and edges for the graph
nodes = []
edges = []
weights = []
node_features = []

jobs_from_edges = []
candidates_from_edges = []
jobs_and_candidates_from_edges = []

skill_weight_multiplier = 1  # Weight for skill overlap
title_weight = 5  # Weight for job title match

# Add job nodes
for i, row in job_descriptions.iterrows():
    nodes.append(row['job_id'])
    skills_vector = [0] * len(le_skills)
    if row['skills']:  # Check if skills are not empty
        for skill in row['skills']:
            skills_vector[le_skills[skill]] = 1
    node_features.append([row['job_title']] + skills_vector)

# Add resume nodes, using 'category' instead of 'job_title'
for i, row in resumes.iterrows():
    nodes.append(row['candidate_id'] + len(job_descriptions))
    skills_vector = [0] * len(le_skills)
    if row['skills']:  # Check if skills are not empty
        for skill in row['skills']:
            skills_vector[le_skills[skill]] = 1
    node_features.append([row['job_title']] + skills_vector)

# Add edges based on job_title overlap
# for i, job in job_descriptions.iterrows():
#     for j, resume in resumes.iterrows():
#         if job['job_title'] == resume['job_title']:  # Changed from resume['job_title'] to resume['category']
#             edges.append((job['job_id'], resume['candidate_id'] + len(job_descriptions)))

def skills_overlap(job_skills, resume_skills):
    return len(set(job_skills).intersection(set(resume_skills)))

threshold = 2  # Set a threshold for the minimum number of overlapping skills

# for i, job in job_descriptions.iterrows():
#     for j, resume in resumes.iterrows():
#         overlap = skills_overlap(job['skills'], resume['skills'])
#         if overlap >= threshold:
#             edges.append((job['job_id'], resume['candidate_id'] + len(job_descriptions)))
# 
# for i, job in job_descriptions.iterrows():
#     for j, resume in resumes.iterrows():
#         if job['job_title'] == resume['job_title'] or skills_overlap(job['skills'], resume['skills']) >= threshold:
#             edges.append((job['job_id'], resume['candidate_id'] + len(job_descriptions)))

for i, job in job_descriptions.iterrows():
    for j, resume in resumes.iterrows():
        overlap = skills_overlap(job['skills'], resume['skills'])
        combined_weight = overlap * skill_weight_multiplier

        if job['job_title'] == resume['job_title']:
            combined_weight += title_weight  # Add weight for job title match

        if combined_weight > 0:
            edges.append((job['job_id'], resume['candidate_id'] + len(job_descriptions)))
            weights.append(combined_weight)
            jobs_and_candidates_from_edges.append((job['job_id'], resume['candidate_id']))
            jobs_from_edges.append(job['job_id'])
            candidates_from_edges.append(resume['candidate_id'])

In [350]:
edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
edge_weight = torch.tensor(weights, dtype=torch.float)

In [351]:
jobs_from_edges = set(jobs_from_edges)
candidates_from_edges = set(candidates_from_edges)
jobs_and_candidates_from_edges = set(jobs_and_candidates_from_edges)

In [352]:
def remove_duplicate_edges(edge_index, edge_weight=None):
    edge_index_np = edge_index.cpu().numpy()
    unique_edges, indices = np.unique(edge_index_np, axis=1, return_index=True)
    unique_edge_index = torch.tensor(unique_edges, dtype=torch.long).to(edge_index.device)
    if edge_weight is not None:
        unique_edge_weight = edge_weight[indices]
        return unique_edge_index, unique_edge_weight
    return unique_edge_index

# Correct node indices
num_job_nodes = len(job_descriptions)
num_resume_nodes = len(resumes)
total_nodes = num_job_nodes + num_resume_nodes

# Check if self-loops are present
def has_self_loops(edge_index):
    row, col = edge_index
    return torch.any(row == col)

# Add self-loops only if they are missing
if not has_self_loops(edge_index):
    edge_index, edge_weight = add_remaining_self_loops(edge_index, edge_weight, fill_value=1, num_nodes=len(nodes))

# Clamp edge indices to ensure they are within range
edge_index = edge_index.clamp(0, len(nodes) - 1)

# Remove duplicate edges
edge_index, edge_weight = remove_duplicate_edges(edge_index, edge_weight)

# Convert node features to tensor
x = torch.tensor(node_features, dtype=torch.float)

# Create PyTorch Geometric data object
data = Data(x=x, edge_index=edge_index, edge_weight=edge_weight)

# Ensure edge indices are within range
# data.num_nodes = len(nodes)

# original_edge_index = data.edge_index.clone()
# original_edge_weight = data.edge_weight.clone()

# Splitting edges for training/validation
# data = train_test_split_edges(data)
# 
# # Manually create negative edges for training
# neg_edge_index_train = negative_sampling(
#     edge_index=data.train_pos_edge_index,
#     num_nodes=data.num_nodes,
#     num_neg_samples=data.train_pos_edge_index.size(1)
# )
# data.train_neg_edge_index = neg_edge_index_train
# 
# # Manually create negative edges for testing
# neg_edge_index_test = negative_sampling(
#     edge_index=data.test_pos_edge_index,
#     num_nodes=data.num_nodes,
#     num_neg_samples=data.test_pos_edge_index.size(1)
# )
# data.test_neg_edge_index = neg_edge_index_test

In [353]:
# Define number of nodes
num_nodes = data.num_nodes

# Node2Vec parameters
embedding_dim = 64
walk_length = 20
context_size = 10
walks_per_node = 10
batch_size = 64
lr = 0.1
num_epochs = 100

In [354]:
# Initialize Node2Vec
node2vec = Node2Vec(
    edge_index=data.edge_index,
    embedding_dim=embedding_dim,
    walk_length=walk_length,
    context_size=context_size,
    walks_per_node=walks_per_node,
    num_negative_samples=1,
    p=1,
    q=1,
    sparse=True,
    num_nodes=num_nodes  # Specify number of nodes
)

# Move to the appropriate device (CPU/GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
node2vec = node2vec.to(device)

In [355]:
# Optimizer for Node2Vec
optimizer = torch.optim.SparseAdam(node2vec.parameters(), lr=lr)

In [356]:
# Training loop for Node2Vec
def train_node2vec(num_epochs):
    node2vec.train()
    for epoch in range(num_epochs):
        total_loss = 0
        loader = node2vec.loader(batch_size=batch_size, shuffle=True, num_workers=0)  # Set num_workers=0
        for i, (pos_rw, neg_rw) in enumerate(loader):
            optimizer.zero_grad()
            loss = node2vec.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            if epoch % 10 == 0:
                print(f'Epoch {epoch + 1}, Iteration {i}, Loss: {total_loss / 10}')
                total_loss = 0

In [357]:
# Train Node2Vec model
train_node2vec(20)

Epoch 1, Iteration 0, Loss: 0.3730912446975708
Epoch 1, Iteration 1, Loss: 0.35307979583740234
Epoch 1, Iteration 2, Loss: 0.351181697845459
Epoch 1, Iteration 3, Loss: 0.3488358974456787
Epoch 1, Iteration 4, Loss: 0.3393368721008301
Epoch 1, Iteration 5, Loss: 0.3335639715194702
Epoch 1, Iteration 6, Loss: 0.3324225902557373
Epoch 1, Iteration 7, Loss: 0.3187448024749756
Epoch 1, Iteration 8, Loss: 0.3148561954498291
Epoch 1, Iteration 9, Loss: 0.30550217628479004
Epoch 1, Iteration 10, Loss: 0.30208611488342285
Epoch 1, Iteration 11, Loss: 0.28481011390686034
Epoch 1, Iteration 12, Loss: 0.29403364658355713
Epoch 1, Iteration 13, Loss: 0.27736549377441405
Epoch 1, Iteration 14, Loss: 0.2723710536956787
Epoch 1, Iteration 15, Loss: 0.2644244909286499
Epoch 1, Iteration 16, Loss: 0.26748497486114503
Epoch 1, Iteration 17, Loss: 0.25979316234588623
Epoch 1, Iteration 18, Loss: 0.24591507911682128
Epoch 1, Iteration 19, Loss: 0.24732093811035155
Epoch 1, Iteration 20, Loss: 0.2349501609

In [358]:
# Extract embeddings
node_embeddings = node2vec.embedding.weight.data.cpu().numpy()

# Update node features with embeddings
data.x = torch.tensor(node_embeddings, dtype=torch.float)

In [363]:
class GAE(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(GAE, self).__init__()
        self.conv1 = GCNConv(in_channels, 2 * out_channels)
        self.conv2 = GCNConv(2 * out_channels, out_channels)

    def encode(self, x, edge_index, edge_weight):
        x = F.relu(self.conv1(x, edge_index, edge_weight))
        return self.conv2(x, edge_index, edge_weight)

    def decode(self, z, pos_edge_index, neg_edge_index):
        pos_pred = (z[pos_edge_index[0].long()] * z[pos_edge_index[1].long()]).sum(dim=1)
        neg_pred = (z[neg_edge_index[0].long()] * z[neg_edge_index[1].long()]).sum(dim=1)
        return pos_pred, neg_pred

    def forward(self, data):
        z = self.encode(data.x, data.edge_index, data.edge_weight)
        return z

In [364]:
# Initialize and train GAE model as before
model = GAE(data.num_node_features, 32)  # Adjust dimensions as needed
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
loss_fn = torch.nn.BCEWithLogitsLoss()

In [365]:
def train(data):
    model.train()
    optimizer.zero_grad()
    z = model.encode(data.x, data.edge_index, data.edge_weight)  # Pass edge_weight
    pos_pred, neg_pred = model.decode(z, data.edge_index, data.edge_index)
    pos_loss = loss_fn(pos_pred, torch.ones_like(pos_pred))
    neg_loss = loss_fn(neg_pred, torch.zeros_like(neg_pred))
    loss = pos_loss + neg_loss
    if torch.isnan(loss) or torch.isinf(loss):
        print("Warning: NaN or Inf loss detected")
        return float('inf')
    loss.backward()
    optimizer.step()
    return loss.item()

In [366]:
for epoch in range(1000):
    loss = train(data)
    if loss == float('inf'):
        break
    if epoch % 100 == 0:
        print(f'Epoch {epoch}, Loss: {loss}')

Epoch 0, Loss: 17.01972007751465
Epoch 100, Loss: 1.475785255432129
Epoch 200, Loss: 1.4283578395843506
Epoch 300, Loss: 1.4121308326721191
Epoch 400, Loss: 1.4041836261749268
Epoch 500, Loss: 1.399585485458374
Epoch 600, Loss: 1.3966388702392578
Epoch 700, Loss: 1.3946300745010376
Epoch 800, Loss: 1.3931820392608643
Epoch 900, Loss: 1.3920917510986328


In [342]:
def evaluate_model(data, model):
    model.eval()
    with torch.no_grad():
        z = model.encode(data.x, data.edge_index, data.edge_weight)
        pos_pred = torch.sigmoid((z[data.edge_index[0].long()] * z[data.edge_index[1].long()]).sum(dim=1)).cpu().numpy()
        neg_pred = torch.sigmoid((z[data.edge_index[0].long()] * z[data.edge_index[1].long()]).sum(dim=1)).cpu().numpy()

    y_true = np.concatenate([np.ones(pos_pred.shape[0]), np.zeros(neg_pred.shape[0])])
    y_pred = np.concatenate([pos_pred, neg_pred])

    auc_roc = roc_auc_score(y_true, y_pred)
    ap = average_precision_score(y_true, y_pred)

    return auc_roc, ap

In [343]:
# Example usage
auc_roc, ap = evaluate_model(data, model)
print(f"AUC-ROC: {auc_roc:.4f}, AP: {ap:.4f}")

AUC-ROC: 0.5000, AP: 0.5000


In [344]:
def predict_best_candidates(job_descriptions, resumes, z):
    job_ids = job_descriptions['job_id'].values
    candidate_ids = resumes['candidate_id'].values + len(job_descriptions)

    predictions = []
    for job_id, job_title, job_skills in zip(job_ids, job_descriptions['job_title'], job_descriptions['skills']):
        best_match = None
        best_score = float('-inf')
        for candidate_id, category, candidate_job_title, skills in zip(
                resumes['candidate_id'], resumes['category'], resumes['job_title'], resumes['skills']):
            candidate_index = candidate_id + len(job_descriptions) - 1
            job_index = job_id - 1

            # Ensure indices are within range
            if candidate_index >= z.size(0) or job_index >= z.size(0):
                continue

            score = torch.sigmoid(torch.dot(z[job_index], z[candidate_index])).item()
            if score > best_score:
                best_score = score
                best_match = (job_id, job_title, candidate_id, candidate_job_title, category, skills, score)

        if best_match:
            job_id, job_title, candidate_id, candidate_job_title, category, skills, score = best_match
            match_percentage = score * 100  # Assuming the score is between 0 and 1
            predictions.append({
                "Job ID": job_id,
                "Job Title": le_job_title.inverse_transform([job_title])[0],
                "Candidate ID": candidate_id,
                "Candidate Job Title": le_job_title.inverse_transform([candidate_job_title])[0],
                "Candidate Category": le_category.inverse_transform([category])[0],
                "Match Percentage": match_percentage,
                "Mutual Skills": set(job_skills).intersection(set(skills)),
                "Job Skills": job_skills,
                "Candidate Skills": skills
            })

    predictions_df = pd.DataFrame(predictions)
    return predictions_df

In [347]:
# Example usage
with torch.no_grad():
    z = model.encode(data.x, data.edge_index, data.edge_weight)

In [348]:
predictions_df = predict_best_candidates(job_descriptions.sample(frac=1, random_state=random_seed)[:100], resumes, z)

torch.save(model.state_dict(), f"./models/gcn-recommendation-system-{auc_roc:.2f}-acc-{uuid.uuid4()}-{time.strftime('%Y%m%d-%H%M%S')}-v1.pth")

In [349]:
predictions_df.head(1000)

Unnamed: 0,Job ID,Job Title,Candidate ID,Candidate Job Title,Candidate Category,Match Percentage,Mutual Skills,Job Skills,Candidate Skills
0,6253,network administrator,2339,material and tooling control lead,arts,61.854172,{},"[system administration, server maintenance, ac...","[chef, security, support, material, diagram, s..."
1,4685,customer service representative,989,business account lead,bpo,70.003867,{},"[live chat support, online customer communicat...","[tower, business, monitoring, software, dash, ..."
2,1732,procurement manager,987,noc engineer,bpo,68.048358,{},"[procurement processes, vendor assessment, con...","[interaction, operating system, monitoring, so..."
3,4743,account executive,1143,consultant,consultant,75.481802,{},"[key account management, sales strategy develo...","[visual basic, support, chrome, data warehouse..."
4,4522,legal counsel,2129,marketing and special events coordinator,public-relations,70.496887,{},"[intellectual property law, trademark and pate...","[software, support, material, medium, marketin..."
...,...,...,...,...,...,...,...,...,...
95,3788,supply chain manager,896,receptionist and veterinary technician,fitness,67.372411,{},"[demand forecasting, inventory management, dat...","[business, software, support, documentation, s..."
96,9190,ux/ui designer,1364,chef,chef,83.734983,{},"[u, i design principles and best practices, gr...","[chef, support, material, specification, compu..."
97,7826,graphic designer,1795,engineering lab technician,engineering,59.255594,{},"[user interface, u, i, user experience, u, x, ...","[component, java, linux, release, software, su..."
98,7540,procurement specialist,1744,self,engineering,73.878193,{},"[procurement strategies, supplier management, ...","[electronic engineering, support, schedule, en..."
