In [None]:
# GAE

In [1]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from torch_geometric.utils import train_test_split_edges, negative_sampling
import pandas as pd
import uuid
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, average_precision_score

from ast import literal_eval
import numpy as np

from torch_geometric.nn import Node2Vec

import warnings
warnings.filterwarnings('ignore')

In [48]:
# Set random seed for reproducibility
random_seed = 42
np.random.seed(random_seed)

# Load datasets
job_descriptions = pd.read_csv('./data/processed/job_descriptions_processed-v4.csv')
resumes = pd.read_csv('./data/processed/general-resume-dataset-processed-v4.csv', converters={'skills': literal_eval})

# Shuffle job_descriptions and select the first N rows
job_descriptions = job_descriptions.sample(frac=1, random_state=random_seed).head(10000)

# Convert 'skills' column to list
job_descriptions['skills'] = job_descriptions['skills'].apply(literal_eval)

In [49]:
# Replace None values in job_title and category with a default value before encoding
job_descriptions['job_title'].fillna('unknown', inplace=True)
resumes['job_title'].fillna('unknown', inplace=True)
resumes['category'].fillna('unknown', inplace=True)

# Ensure job_ids and candidate_ids are correctly assigned
job_descriptions['job_id'] = range(1, len(job_descriptions) + 1)
resumes['candidate_id'] = range(1, len(resumes) + 1)

# Add 'unknown' to the list of all titles and categories to handle unseen labels
all_titles = job_descriptions['job_title'].tolist() + resumes['job_title'].tolist()
all_titles.append('unknown')
all_categories = resumes['category'].tolist()
all_categories.append('unknown')

In [50]:
# Fit the label encoders
le_job_title = LabelEncoder()
le_category = LabelEncoder()
le_job_title.fit(all_titles)
le_category.fit(all_categories)

# Transform the columns
job_descriptions['job_title'] = le_job_title.transform(job_descriptions['job_title'])
resumes['job_title'] = le_job_title.transform(resumes['job_title'])
resumes['category'] = le_category.transform(resumes['category'])

# Encode skills
all_skills = set(skill for skills in job_descriptions['skills'].tolist() + resumes['skills'].tolist() for skill in skills)
le_skills = {skill: i for i, skill in enumerate(all_skills)}

In [51]:
# Create nodes and edges for the graph
nodes = []
edges = []
weights = []
node_features = []

jobs_from_edges = []
candidates_from_edges = []
jobs_and_candidates_from_edges = []

skill_weight_multiplier = 3  # Weight for skill overlap
title_weight = 15  # Weight for job title match

# Add job nodes
for i, row in job_descriptions.iterrows():
    nodes.append(row['job_id'])
    skills_vector = [0] * len(le_skills)
    if row['skills']:  # Check if skills are not empty
        for skill in row['skills']:
            skills_vector[le_skills[skill]] = 1
    node_features.append([row['job_title']] + skills_vector)

# Add resume nodes, using 'category' instead of 'job_title'
for i, row in resumes.iterrows():
    nodes.append(row['candidate_id'] + len(job_descriptions))
    skills_vector = [0] * len(le_skills)
    if row['skills']:  # Check if skills are not empty
        for skill in row['skills']:
            skills_vector[le_skills[skill]] = 1
    node_features.append([row['job_title']] + skills_vector)

In [52]:
# Convert job_skills and resume_skills to sets once
job_descriptions['skills'] = job_descriptions['skills'].apply(set)
resumes['skills'] = resumes['skills'].apply(set)

# Create dictionaries for quick lookup
job_skills_dict = job_descriptions.set_index('job_id')['skills'].to_dict()
resume_skills_dict = resumes.set_index('candidate_id')['skills'].to_dict()
job_titles_dict = job_descriptions.set_index('job_id')['job_title'].to_dict()
resume_titles_dict = resumes.set_index('candidate_id')['job_title'].to_dict()

edges = []
weights = []
jobs_and_candidates_from_edges = []
jobs_from_edges = []
candidates_from_edges = []

for job_id, job_skills in job_skills_dict.items():
    for candidate_id, resume_skills in resume_skills_dict.items():
        overlap = len(job_skills.intersection(resume_skills))
        combined_weight = overlap * skill_weight_multiplier

        if job_titles_dict[job_id] == resume_titles_dict[candidate_id]:
            combined_weight += title_weight  # Add weight for job title match

        if combined_weight > 0:
            edges.append((job_id, candidate_id + len(job_descriptions)))
            weights.append(combined_weight)
            jobs_and_candidates_from_edges.append((job_id, candidate_id))
            jobs_from_edges.append(job_id)
            candidates_from_edges.append(candidate_id)

nodes_length = len(nodes)

In [53]:
edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
edge_weight = torch.tensor(weights, dtype=torch.float)
edge_index = edge_index.clamp(0, nodes_length - 1)

In [54]:
torch.save(edge_index, './data/features/edge_index-10k-a03.pt')
torch.save(edge_weight, './data/features/edge_weight-10k-a03.pt')
with open('./data/features/nodes_len-10k-a03.txt', 'w') as file:
    file.write(str(len(nodes)))

In [55]:
# edge_index = torch.load('./data/features/edge_index-10k-a03.pt')
# edge_weight = torch.load('./data/features/edge_weight-10k-a03.pt')
# nodes_length = 0
# with open('./data/features/nodes_len-10k-a03.txt', 'r') as file:
#     nodes_length = np.int64(file.read())

In [56]:
# Convert node features to tensor
x = torch.tensor(node_features, dtype=torch.float)

In [57]:
torch.save(x, './data/features/x-10k-a03.pt')

In [58]:
# x = torch.load('./data/features/x-10k-a03.pt')

In [59]:
# Create PyTorch Geometric data object
data = Data(x=x, edge_index=edge_index, edge_weight=edge_weight)

# Ensure edge indices are within range
# data.num_nodes = nodes_length

original_edge_index = data.edge_index.clone()
# original_edge_weight = data.edge_weight.clone()

# Splitting edges for training/validation
data = train_test_split_edges(data)

In [60]:
# Create a dictionary to map edge indices to their weights
edge_weight_dict = {tuple(edge_index[:, i].tolist()): edge_weight[i].item() for i in range(edge_index.size(1))}

def get_edge_weights(edge_index, edge_weight_dict):
    weights = []
    for i in range(edge_index.size(1)):
        edge = tuple(edge_index[:, i].tolist())
        weight = edge_weight_dict.get(edge, 0)  # Default to 0 if edge not found
        weights.append(weight)
    return torch.tensor(weights, dtype=torch.float)

In [61]:
train_edge_weights = get_edge_weights(data.train_pos_edge_index, edge_weight_dict)
test_edge_weights = get_edge_weights(data.test_pos_edge_index, edge_weight_dict)
val_edge_weights = get_edge_weights(data.val_pos_edge_index, edge_weight_dict)

In [62]:
data.train_pos_edge_weight = train_edge_weights
data.test_pos_edge_weight = test_edge_weights
data.val_pos_edge_weight = val_edge_weights

# Manually create negative edges for training
neg_edge_index_train = negative_sampling(
    edge_index=data.train_pos_edge_index,
    num_nodes=data.num_nodes,
    num_neg_samples=data.train_pos_edge_index.size(1),
)
data.train_neg_edge_index = neg_edge_index_train

# Assign zero weights to negative edges for training
neg_train_edge_weights = torch.zeros(neg_edge_index_train.size(1), dtype=torch.float)

# Manually create negative edges for testing
neg_edge_index_test = negative_sampling(
    edge_index=data.test_pos_edge_index,
    num_nodes=data.num_nodes,
    num_neg_samples=data.test_pos_edge_index.size(1),
)
data.test_neg_edge_index = neg_edge_index_test

# Assign zero weights to negative edges for testing
neg_test_edge_weights = torch.zeros(neg_edge_index_test.size(1), dtype=torch.float)

# Combine positive and negative edge weights for training
data.train_neg_edge_weight = neg_train_edge_weights

# Combine positive and negative edge weights for testing
data.test_neg_edge_weight = neg_test_edge_weights

# Ensure edge_index tensors are of integer type
data.train_pos_edge_index = data.train_pos_edge_index.long()
data.test_pos_edge_index = data.test_pos_edge_index.long()
data.train_neg_edge_index = data.train_neg_edge_index.long()
data.test_neg_edge_index = data.test_neg_edge_index.long()

In [63]:
# Define number of nodes
num_nodes = data.num_nodes

# Node2Vec parameters
embedding_dim = 64
walk_length = 20
context_size = 10
walks_per_node = 10
batch_size = 128
lr = 0.01
num_epochs = 21

In [64]:
# Initialize Node2Vec
node2vec = Node2Vec(
    edge_index=original_edge_index,
    embedding_dim=embedding_dim,
    walk_length=walk_length,
    context_size=context_size,
    walks_per_node=walks_per_node,
    num_negative_samples=1,
    p=1,
    q=1,
    sparse=True,
    num_nodes=num_nodes  # Specify number of nodes
)

# Move to the appropriate device (CPU/GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
node2vec = node2vec.to(device)

In [65]:
# Optimizer for Node2Vec
optimizer = torch.optim.SparseAdam(node2vec.parameters(), lr=lr)

In [66]:
# Training loop for Node2Vec
def train_node2vec(num_epochs):
    node2vec.train()
    for epoch in range(num_epochs):
        total_loss = 0
        loader = node2vec.loader(batch_size=batch_size, shuffle=True, num_workers=0)  # Set num_workers=0
        for i, (pos_rw, neg_rw) in enumerate(loader):
            optimizer.zero_grad()
            loss = node2vec.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            if epoch % 10 == 0:
                print(f'Epoch {epoch + 1}, Iteration {i}, Loss: {total_loss / 10}')
                total_loss = 0

In [67]:
# Train Node2Vec model
train_node2vec(num_epochs)

Epoch 1, Iteration 0, Loss: 0.3822084665298462
Epoch 1, Iteration 1, Loss: 0.3814095497131348
Epoch 1, Iteration 2, Loss: 0.37668566703796386
Epoch 1, Iteration 3, Loss: 0.37039647102355955
Epoch 1, Iteration 4, Loss: 0.37499632835388186
Epoch 1, Iteration 5, Loss: 0.36944968700408937
Epoch 1, Iteration 6, Loss: 0.37199788093566893
Epoch 1, Iteration 7, Loss: 0.364693808555603
Epoch 1, Iteration 8, Loss: 0.365826416015625
Epoch 1, Iteration 9, Loss: 0.36258511543273925
Epoch 1, Iteration 10, Loss: 0.3623929738998413
Epoch 1, Iteration 11, Loss: 0.357783842086792
Epoch 1, Iteration 12, Loss: 0.3590676307678223
Epoch 1, Iteration 13, Loss: 0.35092010498046877
Epoch 1, Iteration 14, Loss: 0.34906649589538574
Epoch 1, Iteration 15, Loss: 0.3533889055252075
Epoch 1, Iteration 16, Loss: 0.34712560176849366
Epoch 1, Iteration 17, Loss: 0.34795753955841063
Epoch 1, Iteration 18, Loss: 0.34754533767700196
Epoch 1, Iteration 19, Loss: 0.34339475631713867
Epoch 1, Iteration 20, Loss: 0.3390869379

In [68]:
# Extract embeddings
node_embeddings = node2vec.embedding.weight.data.cpu().numpy()

# Update node features with embeddings
data.x = torch.tensor(node_embeddings, dtype=torch.float)

In [69]:
class GAE(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(GAE, self).__init__()
        self.conv1 = GCNConv(in_channels, 2 * out_channels)
        self.conv2 = GCNConv(2 * out_channels, out_channels)

    def encode(self, x, edge_index, edge_weight):
        x = F.relu(self.conv1(x, edge_index, edge_weight))
        return self.conv2(x, edge_index, edge_weight)

    def decode(self, z, pos_edge_index, neg_edge_index):
        pos_pred = (z[pos_edge_index[0].long()] * z[pos_edge_index[1].long()]).sum(dim=1)
        neg_pred = (z[neg_edge_index[0].long()] * z[neg_edge_index[1].long()]).sum(dim=1)
        return pos_pred, neg_pred

    def forward(self, data):
        z = self.encode(data.x, data.train_pos_edge_index, data.train_pos_edge_weight)
        return z

In [70]:
# Initialize and train GAE model as before
model = GAE(data.num_node_features, 16)  # Adjust dimensions as needed
gae_optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = torch.nn.BCEWithLogitsLoss()

In [71]:
def train(data):
    model.train()
    gae_optimizer.zero_grad()
    z = model.encode(data.x, data.train_pos_edge_index, data.train_pos_edge_weight)  # Pass train_pos_edge_weight
    pos_pred, neg_pred = model.decode(z, data.train_pos_edge_index, data.train_neg_edge_index)
    pos_loss = loss_fn(pos_pred, torch.ones_like(pos_pred))
    neg_loss = loss_fn(neg_pred, torch.zeros_like(neg_pred))
    loss = pos_loss + neg_loss
    if torch.isnan(loss) or torch.isinf(loss):
        print("Warning: NaN or Inf loss detected")
        return float('inf')
    loss.backward()
    gae_optimizer.step()
    return loss.item()

In [72]:
for epoch in range(100):
    loss = train(data)
    if loss == float('inf'):
        break
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss}')

Epoch 0, Loss: 3.341568946838379
Epoch 10, Loss: 1.7521438598632812
Epoch 20, Loss: 1.3434333801269531
Epoch 30, Loss: 1.2498263120651245
Epoch 40, Loss: 1.2117993831634521
Epoch 50, Loss: 1.1875545978546143
Epoch 60, Loss: 1.1706467866897583
Epoch 70, Loss: 1.1580443382263184
Epoch 80, Loss: 1.1483867168426514
Epoch 90, Loss: 1.1406351327896118


In [73]:
def precision_at_k(y_true, y_pred, k):
    idx = np.argsort(y_pred)[::-1][:k]
    y_pred_binary = np.zeros_like(y_pred)
    y_pred_binary[idx] = 1
    tp = np.sum(y_true * y_pred_binary)
    precision = tp / k
    return precision

def recall_at_k(y_true, y_pred, k):
    idx = np.argsort(y_pred)[::-1][:k]
    y_pred_binary = np.zeros_like(y_pred)
    y_pred_binary[idx] = 1
    tp = np.sum(y_true * y_pred_binary)
    recall = tp / np.sum(y_true)
    return recall

def average_precision(y_true, y_pred):
    idx = np.argsort(y_pred)[::-1]
    y_true_sorted = y_true[idx]
    tp = np.cumsum(y_true_sorted)
    precision = tp / (np.arange(len(y_true_sorted)) + 1)
    avg_precision = np.sum(precision * y_true_sorted) / np.sum(y_true_sorted)
    return avg_precision

def mean_average_precision(y_true, y_pred):
    return np.mean([average_precision(y_t, y_p) for y_t, y_p in zip(y_true, y_pred)])

def dcg_score(y_true, y_pred, k):
    order = np.argsort(y_pred)[::-1]
    y_true_sorted = np.take(y_true, order[:k])
    gain = 2 ** y_true_sorted - 1
    discounts = np.log2(np.arange(len(y_true_sorted)) + 2)
    return np.sum(gain / discounts)

def ndcg_score(y_true, y_pred, k):
    best = dcg_score(y_true, y_true, k)
    actual = dcg_score(y_true, y_pred, k)
    return actual / best

In [74]:
def evaluate_model(data, model, k):
    model.eval()
    with torch.no_grad():
        z = model.encode(data.x, data.val_pos_edge_index, data.val_pos_edge_weight)
        pos_pred = torch.sigmoid((z[data.val_pos_edge_index[0].long()] * z[data.val_pos_edge_index[1].long()]).sum(dim=1)).cpu().numpy()
        neg_pred = torch.sigmoid((z[data.val_neg_edge_index[0].long()] * z[data.val_neg_edge_index[1].long()]).sum(dim=1)).cpu().numpy()

    y_true = np.concatenate([np.ones(pos_pred.shape[0]), np.zeros(neg_pred.shape[0])])
    y_pred = np.concatenate([pos_pred, neg_pred])

    auc_roc = roc_auc_score(y_true, y_pred)
    ap = average_precision_score(y_true, y_pred)

    precision = precision_at_k(y_true, y_pred, k)
    recall = recall_at_k(y_true, y_pred, k)
    map_score = mean_average_precision([y_true], [y_pred])
    ndcg = ndcg_score(y_true, y_pred, k)

    return auc_roc, ap, precision, recall, map_score, ndcg

In [75]:
k=10

In [76]:
# Example usage
auc_roc, ap, k_prec, recall, map_score, ndcg = evaluate_model(data, model, k)
print(f"AUC-ROC: {auc_roc:.4f}, AP: {ap:.4f}")
print(f"Precision@{k}: {k_prec:.4f}")
print(f"Recall@{k}: {recall:.4f}")
print(f"MAP: {map_score:.4f}")
print(f"NDCG@{k}: {ndcg:.4f}")

AUC-ROC: 0.9068, AP: 0.9071
Precision@10: 0.0000
Recall@10: 0.0000
MAP: 0.9071
NDCG@10: 0.0000


In [77]:
def predict_best_candidates(job_descriptions, resumes, z, k=1):
    job_ids = job_descriptions['job_id'].values
    candidate_ids = resumes['candidate_id'].values + len(job_descriptions)

    job_indices = job_ids - 1
    candidate_indices = candidate_ids - 1

    job_embeddings = z[job_indices]
    candidate_embeddings = z[candidate_indices]

    # Calculate scores using matrix multiplication
    scores = torch.sigmoid(torch.matmul(job_embeddings, candidate_embeddings.T)).cpu().numpy()

    predictions = []
    for i, job_id in enumerate(job_ids):
        best_match_indices = scores[i].argsort()[::-1][:k]
        for idx in best_match_indices:
            candidate_id = resumes.iloc[idx]['candidate_id']
            candidate_job_title = resumes.iloc[idx]['job_title']
            category = resumes.iloc[idx]['category']
            skills = resumes.iloc[idx]['skills']
            job_title = job_descriptions.iloc[i]['job_title']
            job_skills = job_descriptions.iloc[i]['skills']
            score = scores[i][idx]

            match_percentage = score * 100  # Assuming the score is between 0 and 1
            predictions.append({
                "Job ID": job_id,
                "Job Title": le_job_title.inverse_transform([job_title])[0],
                "Candidate ID": candidate_id,
                "Candidate Job Title": le_job_title.inverse_transform([candidate_job_title])[0],
                "Candidate Category": le_category.inverse_transform([category])[0],
                "Match Percentage": match_percentage,
                "Mutual Skills": set(job_skills).intersection(set(skills)),
                "Job Skills": job_skills,
                "Candidate Skills": skills
            })

    predictions_df = pd.DataFrame(predictions)
    return predictions_df

In [78]:
# Example usage
with torch.no_grad():
    z = model.encode(data.x, data.test_pos_edge_index, data.test_pos_edge_weight)

In [79]:
jobs_to_predict = job_descriptions[job_descriptions['job_title'].isin(resumes['job_title'])].sample(frac=1, random_state=random_seed)

In [80]:
predictions_df = predict_best_candidates(job_descriptions, resumes, z)
predictions_df = predictions_df[predictions_df['Mutual Skills'].map(len) != 0]
predictions_df['Mutual Skills Count'] = predictions_df['Mutual Skills'].map(len)
ind = predictions_df['Mutual Skills'].map(len).sort_values(ascending=False).index
predictions_df = predictions_df.reindex(ind)

In [81]:
predictions_df.head(10000)

Unnamed: 0,Job ID,Job Title,Candidate ID,Candidate Job Title,Candidate Category,Match Percentage,Mutual Skills,Job Skills,Candidate Skills,Mutual Skills Count
1697,1698,marketing coordinator,545,director of community,arts,81.278962,"{marketing strategy, social medium, email mark...","{management system, SEO, social medium, email ...","{graphic design, community education, marketin...",3
4997,4998,marketing director,176,vp,public-relations,86.839312,"{crisis management, medium relation, public re...","{crisis management, content strategy, medium r...","{crisis communication, microsoft outlook, arti...",3
1628,1629,purchasing agent,431,finance controller,finance,78.100234,"{supply chain, problem solve}","{inventory management, supply chain, problem s...","{system integration, cash flow, complex proble...",2
2949,2950,nurse practitioner,417,occupational health nurse coordinator,healthcare,70.763117,"{patient assessment, nursing care}","{patient assessment, nursing care, health prom...","{office management, incident report, quality a...",2
2947,2948,it support specialist,907,staff accountant,accountant,74.595946,"{customer service, problem solve}","{customer service, critical thinking, problem ...","{account payable, quality assurance, cash flow...",2
...,...,...,...,...,...,...,...,...,...,...
3053,3054,marketing director,637,business development manager,business-development,70.636231,{brand management},"{brand management, market research, brand stra...","{product line, market share, corporate finance...",1
3074,3075,social media manager,837,vp finance,finance,74.925274,{social medium},"{social medium, social medium advertising, soc...","{strategic alliance, white paper, risk managem...",1
3078,3079,project coordinator,471,business development rep,business-development,63.139862,{time management},"{time management, organizational skill, micros...","{twitter ad, value proposition, e commerce, in...",1
3121,3122,marketing director,1149,creative assistant,arts,84.692597,{public relation},"{crisis management, content strategy, medium r...","{storage system, scientific reasoning, contrac...",1


In [None]:
# torch.save(model.state_dict(), f"./models/gea-recommendation-system-25k-{auc_roc:.2f}-acc-{uuid.uuid4()}-{time.strftime('%Y%m%d-%H%M%S')}-v4.pth")

In [83]:
torch.save(model.state_dict(), f"./models/gea-recommendation-system-v4-a03.pth")