In [None]:
import pandas as pd
from ast import literal_eval
import numpy as np

In [None]:
# Fix the NumPy seed so the job sample drawn below is reproducible.
random_seed = 42
np.random.seed(random_seed)

# Load the pre-processed datasets; the resumes' 'skills' column is parsed
# with literal_eval while the file is read.
job_descriptions = pd.read_csv('./data/processed/job_descriptions_processed.csv')
resumes = pd.read_csv('./data/processed/resume-dataset-processed.csv', converters={'skills': literal_eval})

# Shuffle the job descriptions and keep the first 100 rows.
# reset_index(drop=True) replaces the shuffled original labels with 0..n-1,
# so code that treats the iterrows() label as a row position sees the real
# position, and the column assignment below works on a fresh frame.
job_descriptions = (
    job_descriptions
    .sample(frac=1, random_state=random_seed)
    .head(100)
    .reset_index(drop=True)
)

# Parse the 'skills' column of the sampled rows only.
job_descriptions['skills'] = job_descriptions['skills'].apply(literal_eval)

In [None]:
import pandas as pd
import nltk
import warnings
import uuid
import time
warnings.filterwarnings('ignore')

import torch
import torch.nn.functional as F
import torch.nn as nn
from sklearn.preprocessing import MultiLabelBinarizer
from torch_geometric.data import Data
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch_geometric.nn import GCNConv
from ast import literal_eval
from sklearn.model_selection import train_test_split

import numpy as np

datetime_pattern = "%Y%m%d-%H%M%S"

nltk.download(['stopwords', 'wordnet'])

# Seed NumPy *and* PyTorch: the embedding table and the model weights are
# initialised from torch's RNG, so seeding NumPy alone does not make a run
# reproducible.
random_seed = 42
np.random.seed(random_seed)
torch.manual_seed(random_seed)

# Load the pre-processed datasets; the resumes' 'skills' column is parsed
# with literal_eval while the file is read.
job_descriptions = pd.read_csv('./data/processed/job_descriptions_processed.csv')
resumes = pd.read_csv('./data/processed/resume-dataset-processed.csv', converters={'skills': literal_eval})

# Shuffle the job descriptions and keep the first 100 rows.
# reset_index(drop=True) replaces the shuffled original labels with 0..n-1,
# so code that treats the iterrows() label as a node position sees the real
# position.
job_descriptions = (
    job_descriptions
    .sample(frac=1, random_state=random_seed)
    .head(100)
    .reset_index(drop=True)
)

# Parse the 'skills' column of the sampled rows only.
job_descriptions['skills'] = job_descriptions['skills'].apply(literal_eval)

# Skill vocabulary shared by jobs and resumes. It is sorted because set
# iteration order for strings changes between interpreter sessions, which
# would silently change the column / embedding index assigned to each skill.
all_skills = sorted(
    {skill for skills in job_descriptions['skills'] for skill in skills}
    | {skill for skills in resumes['skills'] for skill in skills},
    key=str,
)
mlb = MultiLabelBinarizer(classes=all_skills)

# Fit once, then reuse the fitted binarizer so both matrices share columns.
job_skills_encoded = mlb.fit_transform(job_descriptions['skills'])
resume_skills_encoded = mlb.transform(resumes['skills'])

job_skills_tensor = torch.tensor(job_skills_encoded, dtype=torch.float)
resume_skills_tensor = torch.tensor(resume_skills_encoded, dtype=torch.float)

# Function to create feature matrix from selected columns
def create_feature_matrix(df, feature_columns):
    """Return the selected columns of ``df`` as a float tensor.

    Args:
        df: DataFrame holding the feature columns.
        feature_columns: iterable of column names, in the desired output order.

    Returns:
        A ``torch.float`` tensor of shape ``(len(df), len(feature_columns))``
        whose rows follow the row order of ``df``. An empty frame yields a
        ``(0, len(feature_columns))`` tensor instead of raising.
    """
    # Convert the whole selection at once instead of building one tensor per
    # row with iterrows(); list() keeps a tuple of names from being read as a
    # single column key.
    values = df[list(feature_columns)].to_numpy(dtype=float)
    return torch.tensor(values, dtype=torch.float)

# Experience columns, one per node type, as float tensors.
job_exp_features = create_feature_matrix(job_descriptions, ['min_experience'])
resume_exp_features = create_feature_matrix(resumes, ['experience'])

# Append the experience column to the one-hot skill columns of each node type.
job_features = torch.cat((job_skills_tensor, job_exp_features), dim=1)
resume_features = torch.cat((resume_skills_tensor, resume_exp_features), dim=1)

# Stack jobs first, then resumes, into a single node-feature matrix.
x_one_hot = torch.cat((job_features, resume_features), dim=0)

# Lookup table with one 50-dimensional vector per known skill.
embedding_dim = 50
embedding = nn.Embedding(num_embeddings=len(all_skills), embedding_dim=embedding_dim)

# Skill name -> row of the embedding table, following the order of all_skills.
skill_to_index = dict(zip(all_skills, range(len(all_skills))))

# Function to get average embedding for a list of skills
def get_skill_embedding(skills, embedding, skill_to_index):
    """Average the embedding vectors of the known skills in ``skills``.

    Args:
        skills: collection of skill names for one job or resume.
        embedding: ``nn.Embedding`` holding one row per known skill.
        skill_to_index: mapping from skill name to its row in ``embedding``.

    Returns:
        A 1-D tensor of length ``embedding.embedding_dim``: the mean of the
        looked-up rows, or all zeros when ``skills`` is empty or none of its
        entries appear in ``skill_to_index``.
    """
    # Size the fallback from the embedding that was passed in, not from a
    # module-level constant, so the function works with any embedding table.
    zero_vector = embedding.weight.new_zeros(embedding.embedding_dim)

    if not skills:
        return zero_vector

    # Skills missing from the vocabulary are skipped.
    known_rows = [skill_to_index[skill] for skill in skills if skill in skill_to_index]
    if not known_rows:
        return zero_vector

    row_tensor = torch.tensor(known_rows, dtype=torch.long)
    return embedding(row_tensor).mean(dim=0)

# Encode skills as embeddings.
# The embedding table is never handed to an optimizer in this notebook, so the
# lookups are done under no_grad: the node features become plain constants
# rather than tensors that drag an autograd graph through every training step.
with torch.no_grad():
    job_skills_embedded = torch.stack([get_skill_embedding(skills, embedding, skill_to_index) for skills in job_descriptions['skills']])
    resume_skills_embedded = torch.stack([get_skill_embedding(skills, embedding, skill_to_index) for skills in resumes['skills']])

# Append the experience column to the averaged skill embedding of each node.
job_features_embedded = torch.cat([job_skills_embedded, job_exp_features], dim=1)
resume_features_embedded = torch.cat([resume_skills_embedded, resume_exp_features], dim=1)

# Node-feature matrix: job rows first, resume rows after.
x_embeddings = torch.cat([job_features_embedded, resume_features_embedded], dim=0)

# Function to create edge index for bipartite graph
def create_edge_index(job_descriptions, resumes):
    """Build job -> resume edges for every pair sharing at least one skill.

    Node ids are row *positions*: job ``p`` is node ``p`` and resume ``q`` is
    node ``len(job_descriptions) + q``. Positions are used instead of the
    DataFrame index labels because a shuffled or filtered frame keeps its
    original labels, which do not correspond to rows of the feature matrix.

    Args:
        job_descriptions: DataFrame with a 'skills' column of skill collections.
        resumes: DataFrame with a 'skills' column of skill collections.

    Returns:
        A ``torch.long`` tensor of shape ``(2, num_edges)`` with job nodes in
        row 0 and resume nodes in row 1; shape ``(2, 0)`` when no pair overlaps.
        Edges are emitted in one direction only (job -> resume).
    """
    num_jobs = len(job_descriptions)

    # Build each resume's skill set once rather than once per job.
    resume_skill_sets = [set(skills) for skills in resumes['skills']]

    edges = [
        [job_pos, num_jobs + resume_pos]  # Offset for bipartite graph
        for job_pos, job_skills in enumerate(job_descriptions['skills'])
        for resume_pos, resume_skills in enumerate(resume_skill_sets)
        if not resume_skills.isdisjoint(job_skills)
    ]

    if not edges:  # Keep the (2, 0) shape expected for an edge index
        return torch.empty((2, 0), dtype=torch.long)

    return torch.tensor(edges, dtype=torch.long).t().contiguous()

# Node counts: jobs occupy ids [0, num_jobs), resumes follow directly after.
num_jobs = len(job_descriptions)
num_resumes = len(resumes)

# Node ids per node type.
job_indices = [job_id for job_id in range(num_jobs)]
resume_indices = [num_jobs + offset for offset in range(num_resumes)]

# Hold out 20% of each node type, with a fixed seed.
job_train_indices, job_test_indices = train_test_split(
    job_indices, test_size=0.2, random_state=42
)
resume_train_indices, resume_test_indices = train_test_split(
    resume_indices, test_size=0.2, random_state=42
)

# Merge the per-type splits into one train list and one test list.
train_indices = [*job_train_indices, *resume_train_indices]
test_indices = [*job_test_indices, *resume_test_indices]

# Boolean masks over all nodes, True where the node belongs to the split.
train_mask = torch.zeros(num_jobs + num_resumes, dtype=torch.bool)
test_mask = torch.zeros_like(train_mask)
train_mask[train_indices] = True
test_mask[test_indices] = True

print(f"train_mask size: {train_mask.size()}")  # Debug: Print the size of train_mask

# Adjust edge_index creation to filter only training data edges
def create_edge_index_for_split(job_descriptions, resumes, train_mask):
    """Build job -> resume skill-overlap edges restricted to one split.

    An edge is kept only when the job and the resume share at least one skill
    and both of their nodes are selected by ``train_mask``, so the result is a
    subset of the graph built from all skill-overlap pairs.

    Node ids are row *positions*: job ``p`` is node ``p`` and resume ``q`` is
    node ``len(job_descriptions) + q``. Positions are used instead of the
    DataFrame index labels because a shuffled or filtered frame keeps its
    original labels, which do not line up with ``train_mask``.

    Args:
        job_descriptions: DataFrame with a 'skills' column of skill collections.
        resumes: DataFrame with a 'skills' column of skill collections.
        train_mask: 1-D boolean tensor over all nodes (jobs first, then
            resumes). Nodes beyond its length are treated as not selected.

    Returns:
        A ``torch.long`` tensor of shape ``(2, num_edges)``; shape ``(2, 0)``
        when no edge qualifies.
    """
    num_jobs = len(job_descriptions)
    mask_size = train_mask.size(0)
    selected = train_mask.tolist()  # plain bools: avoids per-pair tensor indexing

    # Resumes that are part of the split, with their node id and skill set
    # computed once.
    split_resumes = [
        (num_jobs + resume_pos, set(skills))
        for resume_pos, skills in enumerate(resumes['skills'])
        if num_jobs + resume_pos < mask_size and selected[num_jobs + resume_pos]
    ]

    edges = []
    for job_idx, job_skills in enumerate(job_descriptions['skills']):
        if job_idx >= mask_size or not selected[job_idx]:
            continue
        for resume_idx, resume_skills in split_resumes:
            if not resume_skills.isdisjoint(job_skills):
                edges.append([job_idx, resume_idx])  # Offset for bipartite graph

    if not edges:  # Keep the (2, 0) shape expected for an edge index
        return torch.empty((2, 0), dtype=torch.long)

    return torch.tensor(edges, dtype=torch.long).t().contiguous()

# Edge sets: one restricted to the training nodes, one built from all nodes.
train_edge_index = create_edge_index_for_split(job_descriptions, resumes, train_mask)
full_edge_index = create_edge_index(job_descriptions, resumes)

# Both graphs share the same node features and differ only in their edges.
data_train = Data(x=x_embeddings, edge_index=train_edge_index)
data_test = Data(x=x_embeddings, edge_index=full_edge_index)  # Use full edge_index for evaluation

# Node labels: 0 for every job node, 1 for every resume node.
targets = torch.cat([
    torch.zeros(num_jobs, dtype=torch.long),
    torch.ones(num_resumes, dtype=torch.long),
])

# Two-layer graph convolutional network
class GCN(torch.nn.Module):
    """Two stacked GCNConv layers with a ReLU in between.

    The forward pass returns the raw output of the second layer; no softmax
    is applied inside the model.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Create the layers: input_dim -> hidden_dim -> output_dim."""
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        """Run both layers on ``data.x`` using ``data.edge_index``."""
        hidden = F.relu(self.conv1(data.x, data.edge_index))
        return self.conv2(hidden, data.edge_index)

# Layer sizes. The input width is read from the feature matrix
# (50 embedding dimensions + 1 experience column = 51).
hidden_dim = 16  # Size of hidden layers
output_dim = 2  # Size of the output layer
input_dim = x_embeddings.shape[1]

model = GCN(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)

# Cross-entropy over the two output classes, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

# Training loop with detailed debug information
def train(data, model, criterion, optimizer, targets, epochs=100, debug=False, mask=None):
    """Run full-batch training of ``model`` on ``data`` for ``epochs`` steps.

    Args:
        data: object passed unchanged to ``model`` on every step.
        model: module whose output has one row per node.
        criterion: loss called as ``criterion(outputs[mask], targets[mask])``.
        optimizer: optimizer that updates ``model``.
        targets: per-node target tensor.
        epochs: number of optimisation steps.
        debug: when True, print the loss and the masked outputs every epoch.
        mask: boolean node mask selecting the rows used for the loss. Defaults
            to the module-level ``train_mask`` so existing calls keep working.

    Returns:
        None. ``model`` is updated in place and left in training mode.
    """
    if mask is None:
        mask = train_mask  # backward-compatible fallback to the notebook global

    print("Train start")
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        out = model(data)
        loss = criterion(out[mask], targets[mask])
        # retain_graph=True is kept on purpose: if the input features carry an
        # autograd graph that is shared across epochs, a plain backward()
        # would free it and the next epoch would fail.
        loss.backward(retain_graph=True)
        optimizer.step()
        if debug:
            print(f'Epoch {epoch}, Loss: {loss.item()}')
            # .cpu() makes the conversion to NumPy work for non-CPU tensors too.
            print(f'Output at epoch {epoch}: {out[mask].detach().cpu().numpy()}')
    print("Train done")

# Fit the GCN on the training graph for 100 epochs; debug=True prints the
# loss and the masked outputs after every epoch.
train(
    data_train,
    model,
    criterion,
    optimizer,
    targets,
    epochs=100,
    debug=True,
)

# Enhanced Evaluation Function
def evaluate(model, data, job_descriptions, resumes):
    """Score ``model`` on the test nodes, save it, and list a match per job.

    The metrics compare the module-level ``targets`` with the model's argmax
    prediction on the nodes selected by the module-level ``test_mask``.

    Side effects: prints the metrics and one report per job, and writes the
    model's state dict to ``./models/`` (the directory is created if missing).

    NOTE(review): a candidate's "match percentage" is the class-1 probability
    of that resume's node. It does not depend on the job being evaluated, so
    every job is paired with the same best candidate.

    Args:
        model: trained module; called as ``model(data)``.
        data: graph passed to the model.
        job_descriptions: DataFrame with 'job_id' and 'job_title' columns,
            in the same row order as the job nodes.
        resumes: DataFrame with 'candidate_id', 'category', 'job_title' and
            'skills' columns, in the same row order as the resume nodes.

    Returns:
        DataFrame with a single 'Best Matches' column holding one summary
        string per distinct job id.
    """
    import os  # local import: only needed to create the output directory

    model.eval()
    with torch.no_grad():
        out = model(data)
        probabilities = F.softmax(out, dim=1)  # Get probabilities
        pred = probabilities.argmax(dim=1)

        y_true = targets[test_mask]
        y_pred = pred[test_mask]
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)

        print(f'Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}')

        # torch.save does not create missing directories.
        os.makedirs("./models", exist_ok=True)
        torch.save(model.state_dict(), f"./models/gcn-recommendation-system-{accuracy:.2f}-acc-{uuid.uuid4()}-{time.strftime('%Y%m%d-%H%M%S')}-v1.pth")

        # Detailed Matching Results
        num_jobs = len(job_descriptions)
        num_resumes = len(resumes)

        # The candidate scores are the same for every job, so the best
        # candidate is found once instead of once per job. The first resume
        # with the strictly highest positive score wins, as before.
        candidate_scores = probabilities[num_jobs:num_jobs + num_resumes, 1].tolist()
        best_match_percentage = 0
        best_candidate = None
        for j, probability in enumerate(candidate_scores):
            match_percentage = probability * 100  # Match percentage for the positive class
            if match_percentage > best_match_percentage:
                best_match_percentage = match_percentage
                best_candidate = resumes.iloc[j]

        best_matches = {}
        for i in range(num_jobs):
            job_row = job_descriptions.iloc[i]
            print(f"\nEvaluating Job {i}: {job_row['job_title']}")
            if best_candidate is not None:
                best_matches[job_row['job_id']] = (job_row, best_candidate, best_match_percentage)
                print(f"Best candidate for job {i}: {best_candidate['job_title']} with match percentage {best_match_percentage:.2f}%")

        # Creating DataFrame from best_matches
        match_strings = [
            f"Job: {job['job_id']} - {job['job_title']}, Candidate: {candidate['candidate_id']} - {candidate['category']} ({candidate['job_title']}), Match Percentage: {match_percentage:.2f}%, Candidate Skills: {candidate['skills']}"
            for job, candidate, match_percentage in best_matches.values()
        ]

        matches_df = pd.DataFrame(match_strings, columns=['Best Matches'])

        return matches_df

# Evaluate the trained model on the full graph and collect the match table.
best_matches = evaluate(
    model=model,
    data=data_test,
    job_descriptions=job_descriptions,
    resumes=resumes,
)


In [None]:
# Show the first five rows of the match table.
best_matches.head(n=5)