In [47]:
import pandas as pd
import nltk
import warnings
import uuid
warnings.filterwarnings('ignore')

import torch
import torch.nn.functional as F
import torch.nn as nn
from sklearn.preprocessing import MultiLabelBinarizer
from torch_geometric.data import Data
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch_geometric.nn import GCNConv
from ast import literal_eval

In [48]:
nltk.download(['stopwords', 'wordnet'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\idontwannawakeup\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\idontwannawakeup\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [49]:
# Example DataFrame structures for job descriptions and resumes
# job_descriptions = pd.DataFrame({
#     'job_id': [1, 2],
#     'skills': [['python', 'ml'], ['java', 'spring']],
#     'job_title': ['data scientist', 'backend developer'],
#     'min_experience': [12, 24]
# })
#
# resumes = pd.DataFrame({
#     'job_title': ['data scientist', 'backend developer'],
#     'skills': [['python', 'data analysis'], ['java', 'spring boot']],
#     'experience': [24, 36]
# })
# resumes['candidate_id'] = range(1, len(resumes) + 1)

In [50]:
# job_descriptions = pd.read_csv('./data/processed/job_descriptions_processed.csv', converters={'skills': literal_eval})
# resumes = pd.read_csv('./data/processed/resume-dataset-processed.csv', converters={'skills': literal_eval})
job_descriptions = pd.read_csv('./data/processed/job_descriptions_processed.csv')
resumes = pd.read_csv('./data/processed/resume-dataset-processed.csv', converters={'skills': literal_eval})
job_descriptions.head()
resumes.head()
print("Data is loaded")

job_descriptions = job_descriptions.head(1000)
job_descriptions['skills'] = job_descriptions['skills'].apply(literal_eval)

Data is loaded


In [51]:
# One-hot encoding skills
all_skills = list(set(sum(job_descriptions['skills'].tolist() + resumes['skills'].tolist(), [])))
mlb = MultiLabelBinarizer(classes=all_skills)
job_skills_encoded = mlb.fit_transform(job_descriptions['skills'])
resume_skills_encoded = mlb.fit_transform(resumes['skills'])

job_skills_tensor = torch.tensor(job_skills_encoded, dtype=torch.float)
resume_skills_tensor = torch.tensor(resume_skills_encoded, dtype=torch.float)

# Function to create feature matrix from selected columns
def create_feature_matrix(df, feature_columns):
    features = []
    for _, row in df.iterrows():
        features.append(torch.tensor([row[col] for col in feature_columns], dtype=torch.float))
    return torch.stack(features)

# Create feature matrices for job descriptions and resumes
job_exp_features = create_feature_matrix(job_descriptions, ['min_experience'])
resume_exp_features = create_feature_matrix(resumes, ['experience'])

# Ensure the dimensions of tensors match correctly for concatenation
job_features = torch.cat([job_skills_tensor, job_exp_features], dim=1)
resume_features = torch.cat([resume_skills_tensor, resume_exp_features], dim=1)

x_one_hot = torch.cat([job_features, resume_features], dim=0)

# Embedding skills using nn.Embedding
skill_to_index = {skill: idx for idx, skill in enumerate(all_skills)}
embedding_dim = 50
embedding = nn.Embedding(len(all_skills), embedding_dim)

# Function to get average embedding for a list of skills
def get_skill_embedding(skills, embedding, skill_to_index):
    if not skills:  # If the skills list is empty
        return torch.zeros(embedding_dim)
    skill_indices = [skill_to_index[skill] for skill in skills if skill in skill_to_index]
    if not skill_indices:
        return torch.zeros(embedding_dim)
    skill_tensor = torch.tensor(skill_indices, dtype=torch.long)
    skill_embeddings = embedding(skill_tensor)
    return skill_embeddings.mean(dim=0)

# Encode skills as embeddings
job_skills_embedded = torch.stack([get_skill_embedding(skills, embedding, skill_to_index) for skills in job_descriptions['skills']])
resume_skills_embedded = torch.stack([get_skill_embedding(skills, embedding, skill_to_index) for skills in resumes['skills']])

# Ensure the dimensions of tensors match correctly for concatenation
job_features_embedded = torch.cat([job_skills_embedded, job_exp_features], dim=1)
resume_features_embedded = torch.cat([resume_skills_embedded, resume_exp_features], dim=1)

x_embeddings = torch.cat([job_features_embedded, resume_features_embedded], dim=0)

In [52]:
# Function to create edge index for bipartite graph
def create_edge_index(job_descriptions, resumes):
    edges = []
    num_jobs = len(job_descriptions)
    num_resumes = len(resumes)

    for i, job in job_descriptions.iterrows():
        for j, resume in resumes.iterrows():
            if set(job['skills']).intersection(set(resume['skills'])):
                if i < num_jobs and (j + num_jobs) < (num_jobs + num_resumes):
                    edges.append([i, j + num_jobs])  # Offset for bipartite graph

    if not edges:  # Ensure there are edges
        return torch.empty((2, 0), dtype=torch.long)

    return torch.tensor(edges, dtype=torch.long).t().contiguous()

edge_index = create_edge_index(job_descriptions, resumes)

# Create Data objects for GCN
data_embeddings = Data(x=x_embeddings, edge_index=edge_index)

In [53]:
# Define the GCN model
class GCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return x

In [54]:
input_dim = x_embeddings.size(1)  # Number of input features (50 + 1 = 51)
hidden_dim = 16  # Size of hidden layers
output_dim = 2  # Size of the output layer

model = GCN(input_dim, hidden_dim, output_dim)

# Define the loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Create realistic targets for binary classification
num_jobs = len(job_descriptions)
num_resumes = len(resumes)
targets = torch.zeros(num_jobs + num_resumes, dtype=torch.long)
targets[num_jobs:] = 1

In [55]:
# Training loop with detailed debug information
def train(data, model, criterion, optimizer, targets, epochs=100):
    print("Train start")
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        out = model(data)
        print(f'Output at epoch {epoch}: {out}')  # Debug: Print the model output
        loss = criterion(out, targets)
        print(f'Loss at epoch {epoch}: {loss}')  # Debug: Print the loss
        loss.backward(retain_graph=True)  # Backward pass
        optimizer.step()
        print(f'Epoch {epoch}, Loss: {loss.item()}')
    print("Train done")

train(data_embeddings, model, criterion, optimizer, targets)

Train start
Output at epoch 0: tensor([[ -0.8028,  -1.9630],
        [ -0.5606,  -1.4222],
        [ -0.1072,   0.6472],
        ...,
        [ -0.3958,  -3.2646],
        [ -0.1667,   0.4853],
        [ -4.1196, -10.8563]], grad_fn=<AddBackward0>)
Loss at epoch 0: 2.507761001586914
Epoch 0, Loss: 2.507761001586914
Output at epoch 1: tensor([[-1.9773, -0.3470],
        [-1.1136, -0.7303],
        [-0.1406,  0.7823],
        ...,
        [-2.0282, -0.8590],
        [-0.2523,  0.6624],
        [-9.5281, -3.3557]], grad_fn=<AddBackward0>)
Loss at epoch 1: 0.5744983553886414
Epoch 1, Loss: 0.5744983553886414
Output at epoch 2: tensor([[ -2.6845,   0.6221],
        [ -1.4318,  -0.2815],
        [ -0.1643,   0.8625],
        ...,
        [ -2.9917,   0.5603],
        [ -0.3103,   0.7860],
        [-12.7769,   1.1816]], grad_fn=<AddBackward0>)
Loss at epoch 2: 0.8741853833198547
Epoch 2, Loss: 0.8741853833198547
Output at epoch 3: tensor([[-3.1046e+00,  1.1863e+00],
        [-1.6243e+00, -1.0

In [56]:
# Enhanced Evaluation Function
def evaluate(model, data, job_descriptions, resumes):
    model.eval()
    with torch.no_grad():
        out = model(data)
        probabilities = F.softmax(out, dim=1)  # Get probabilities
        pred = probabilities.argmax(dim=1)

        accuracy = accuracy_score(targets, pred)
        precision = precision_score(targets, pred)
        recall = recall_score(targets, pred)
        f1 = f1_score(targets, pred)

        print(f'Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}')
        torch.save(model.state_dict(), f"./models/gcn-recommendation-system-{accuracy}-acc-{uuid.uuid4()}-v1.pth")

        # Detailed Matching Results
        # num_jobs = len(job_descriptions)
        # num_resumes = len(resumes)
        # matches = []
        # for i in range(num_jobs):
        #     for j in range(num_resumes):
        #         candidate_index = j + num_jobs
        #         match_percentage = probabilities[candidate_index][1].item() * 100  # Match percentage for the positive class
        #         if pred[candidate_index] == 1:  # If candidate is predicted to match a job
        #             matches.append((job_descriptions.iloc[i], resumes.iloc[j], match_percentage))
        # 
        # print("Matching Results:")
        # for job, candidate, match_percentage in matches:
        #     print(f"Job: {job['job_id']} - {job['job_title']}, Candidate: {candidate['candidate_id']} - {candidate['job_title']}, Match Percentage: {match_percentage:.2f}%")
        #     break

evaluate(model, data_embeddings, job_descriptions, resumes)

Accuracy: 0.9505606523955148, Precision: 0.926949654491609, Recall: 0.9760914760914761, F1 Score: 0.9508860759493671


In [57]:
import torch
print(torch.cuda.is_available())

True
