# In [None]:
from torch_geometric.graphgym import optim
import torch
import torch.nn.functional as F
import torch.nn as nn
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from torch_geometric.data import Data
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Toy job postings and candidate profiles that drive the rest of the script.
# Each record lists its skills as a plain Python list of strings.
job_descriptions = pd.DataFrame(
    [
        {
            'job_id': 1,
            'skills': ['python', 'ml'],
            'job_title': 'data scientist',
            'min_experience': 12,
            'max_experience': 36,
        },
        {
            'job_id': 2,
            'skills': ['java', 'spring'],
            'job_title': 'backend developer',
            'min_experience': 24,
            'max_experience': 48,
        },
    ]
)

resumes = pd.DataFrame(
    [
        {
            'job_title': 'data scientist',
            'skills': ['python', 'data analysis'],
            'experience': 24,
        },
        {
            'job_title': 'backend developer',
            'skills': ['java', 'spring boot'],
            'experience': 36,
        },
    ]
)
# Candidate ids are 1-based and follow the row order of the frame.
resumes['candidate_id'] = range(1, 1 + len(resumes))

# One-hot encoding skills
# The vocabulary is sorted on purpose: iterating a raw set of strings follows
# hash order, which changes between interpreter runs, so an unsorted list
# would reshuffle the one-hot columns (and any index mapping derived from
# ``all_skills``) from one run to the next.
all_skills = sorted(set().union(*job_descriptions['skills'], *resumes['skills']))
mlb = MultiLabelBinarizer(classes=all_skills)
job_skills_encoded = mlb.fit_transform(job_descriptions['skills'])
resume_skills_encoded = mlb.transform(resumes['skills'])

# Float tensors so they can be concatenated with the other float features.
job_skills_tensor = torch.tensor(job_skills_encoded, dtype=torch.float)
resume_skills_tensor = torch.tensor(resume_skills_encoded, dtype=torch.float)

def create_feature_matrix(df, feature_columns):
    """Return the selected DataFrame columns as a float32 tensor.

    Args:
        df: DataFrame that holds the feature columns.
        feature_columns: Column labels to extract, in output column order.
            Every selected column must be convertible to float.

    Returns:
        A ``torch.float`` tensor of shape ``(len(df), len(feature_columns))``.
        An empty frame yields a ``(0, len(feature_columns))`` tensor instead
        of raising.
    """
    selected = list(feature_columns)
    # One vectorised conversion replaces the per-row loop; unlike stacking a
    # list of row tensors it is also well defined when ``df`` has no rows.
    values = df[selected].to_numpy(dtype='float64')
    return torch.tensor(values, dtype=torch.float)

# Numeric experience columns for both node types.
job_exp_features = create_feature_matrix(job_descriptions, ['min_experience', 'max_experience'])
resume_exp_features = create_feature_matrix(resumes, ['experience'])

# Jobs contribute two experience columns and resumes only one, so a column of
# zeros is appended to the resume block to give both the same width.
dummy_column = torch.zeros((resume_exp_features.shape[0], 1))
resume_exp_features = torch.cat((resume_exp_features, dummy_column), dim=1)

# Per-node features: one-hot skills followed by the experience columns.
job_features = torch.cat((job_skills_tensor, job_exp_features), dim=1)
resume_features = torch.cat((resume_skills_tensor, resume_exp_features), dim=1)

# Node feature matrix: job rows first, resume rows after them.
x_one_hot = torch.cat((job_features, resume_features), dim=0)

# Randomly initialised lookup table holding one vector per known skill.
embedding_dim = 50
# Row index of each skill follows its position in ``all_skills``.
skill_to_index = dict(zip(all_skills, range(len(all_skills))))
embedding = nn.Embedding(num_embeddings=len(all_skills), embedding_dim=embedding_dim)

def get_skill_embedding(skills, embedding, skill_to_index):
    """Return the mean embedding vector of the known skills in ``skills``.

    Args:
        skills: Iterable of skill names. Names that are not keys of
            ``skill_to_index`` are ignored.
        embedding: ``nn.Embedding`` layer used for the lookup.
        skill_to_index: Mapping from skill name to row index in ``embedding``.

    Returns:
        A 1-D tensor of length ``embedding.embedding_dim``. It is all zeros
        when none of the given skills is known.
    """
    weight = embedding.weight
    skill_indices = [skill_to_index[skill] for skill in skills if skill in skill_to_index]
    if not skill_indices:
        # Size the fallback from the layer that was passed in rather than from
        # a module-level constant, so it always matches the non-empty case.
        return torch.zeros(embedding.embedding_dim, dtype=weight.dtype, device=weight.device)
    skill_tensor = torch.tensor(skill_indices, dtype=torch.long, device=weight.device)
    return embedding(skill_tensor).mean(dim=0)

# Encode skills as embeddings
# The lookups run under ``no_grad`` because the resulting features are
# computed once and reused as fixed model inputs. The only optimizer built in
# this script covers the GCN parameters, so tracking gradients through
# ``embedding`` would just attach a stale autograd graph to the inputs that
# every training step has to back-propagate through again.
with torch.no_grad():
    job_skills_embedded = torch.stack(
        [get_skill_embedding(skills, embedding, skill_to_index) for skills in job_descriptions['skills']]
    )
    resume_skills_embedded = torch.stack(
        [get_skill_embedding(skills, embedding, skill_to_index) for skills in resumes['skills']]
    )

# Per-node features: averaged skill embedding followed by the experience columns.
job_features_embedded = torch.cat([job_skills_embedded, job_exp_features], dim=1)
resume_features_embedded = torch.cat([resume_skills_embedded, resume_exp_features], dim=1)

# Node feature matrix: job rows first, resume rows after them.
x_embeddings = torch.cat([job_features_embedded, resume_features_embedded], dim=0)

def create_edge_index(job_descriptions, resumes):
    """Build the job -> resume edge list of the bipartite skill graph.

    A job and a resume are connected when their ``'skills'`` lists share at
    least one entry. Job nodes are numbered ``0 .. len(job_descriptions) - 1``
    by row position; resume nodes follow, offset by the number of jobs.

    Args:
        job_descriptions: DataFrame with a ``'skills'`` column of iterables.
        resumes: DataFrame with a ``'skills'`` column of iterables.

    Returns:
        A contiguous ``torch.long`` tensor of shape ``(2, num_edges)`` whose
        first row holds job node ids and second row resume node ids. When no
        pair shares a skill the shape is ``(2, 0)``.
    """
    num_jobs = len(job_descriptions)
    # Build each skill set once instead of once per (job, resume) pair.
    job_skill_sets = [set(skills) for skills in job_descriptions['skills']]
    resume_skill_sets = [set(skills) for skills in resumes['skills']]

    # Row positions, not DataFrame index labels, define node ids: labels need
    # not be 0..n-1 (e.g. after filtering), whereas the feature rows are
    # stacked in positional order.
    edges = [
        [job_pos, num_jobs + resume_pos]
        for job_pos, job_skills in enumerate(job_skill_sets)
        for resume_pos, resume_skills in enumerate(resume_skill_sets)
        if not job_skills.isdisjoint(resume_skills)
    ]
    if not edges:
        # Transposing an empty 1-D tensor would give shape (0,), not (2, 0).
        return torch.empty((2, 0), dtype=torch.long)
    return torch.tensor(edges, dtype=torch.long).t().contiguous()

# Connectivity shared by both feature variants of the graph.
# NOTE(review): the edges run job -> resume only; if messages are meant to
# flow in both directions the reverse edges would have to be added - confirm.
edge_index = create_edge_index(job_descriptions, resumes)

# One graph per node-feature variant (one-hot skills, embedded skills).
data_one_hot, data_embeddings = (
    Data(x=node_features, edge_index=edge_index)
    for node_features in (x_one_hot, x_embeddings)
)

# Define the GCN model
from torch_geometric.nn import GCNConv

class GCN(nn.Module):
    """Two-layer graph convolutional network built from ``GCNConv`` layers.

    Args:
        input_dim: Number of features per node fed to the first layer.
        hidden_dim: Output width of the first layer.
        output_dim: Output width of the second layer (one score per class).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        """Return the raw (un-normalised) per-node outputs for ``data``.

        ``data`` must expose ``x`` (node features) and ``edge_index``.
        """
        features, connectivity = data.x, data.edge_index
        hidden = F.relu(self.conv1(features, connectivity))
        return self.conv2(hidden, connectivity)

input_dim = x_embeddings.size(1)  # Number of input features (52 in this case)
hidden_dim = 16  # Size of hidden layers
output_dim = 2  # Size of the output layer

model = GCN(input_dim, hidden_dim, output_dim)

# Define the loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
# Adam is taken from ``torch.optim`` directly. Reaching it through
# ``torch_geometric.graphgym.optim`` only works when that module happens to
# re-export the name, which is not part of its documented interface.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Node-type labels used as classification targets: 0 for every job node,
# 1 for every resume node (jobs occupy the first ``num_jobs`` rows).
num_jobs = len(job_descriptions)
num_resumes = len(resumes)
targets = torch.zeros(num_jobs + num_resumes, dtype=torch.long)
targets[num_jobs:] = 1

def train(data, model, criterion, optimizer, targets, epochs=100):
    """Run full-batch training and print verbose progress information.

    Args:
        data: Input passed unchanged to ``model`` on every step.
        model: Module to optimise; it is switched to training mode.
        criterion: Callable computing the loss from ``(model(data), targets)``.
        optimizer: Optimizer that updates the model parameters.
        targets: Ground-truth values handed to ``criterion``.
        epochs: Number of optimisation steps to perform.

    Returns:
        None. The model is updated in place; the raw output and the loss are
        printed on every step, plus a summary line every tenth step.
    """
    model.train()
    for step in range(epochs):
        optimizer.zero_grad()

        logits = model(data)
        print(f'Output at epoch {step}: {logits}')  # Debug: Print the model output

        step_loss = criterion(logits, targets)
        print(f'Loss at epoch {step}: {step_loss}')  # Debug: Print the loss

        # retain_graph=True keeps the autograd graph alive after backward, so
        # any history shared between steps (e.g. input features produced by a
        # module outside this loop) can be back-propagated through again.
        step_loss.backward(retain_graph=True)
        optimizer.step()

        if step % 10:
            continue
        print(f'Epoch {step}, Loss: {step_loss.item()}')

# Fit the GCN on the embedding-based graph for the default 100 epochs.
train(data_embeddings, model, criterion, optimizer, targets)

# Evaluation
def evaluate(model, data, targets):
    """Print accuracy, precision, recall and F1 of ``model`` on ``data``.

    Args:
        model: Module whose output has one row per sample and one column per
            class; it is switched to evaluation mode.
        data: Input passed unchanged to ``model``.
        targets: Ground-truth binary class labels, one per output row.

    Returns:
        None. The four metrics are printed on a single line.
    """
    model.eval()
    with torch.no_grad():
        pred = model(data).argmax(dim=1)

    # scikit-learn works on host arrays, so move both label vectors to CPU
    # NumPy explicitly instead of relying on implicit tensor conversion.
    y_true = torch.as_tensor(targets).detach().cpu().numpy()
    y_pred = pred.cpu().numpy()

    accuracy = accuracy_score(y_true, y_pred)
    # zero_division=0 makes the "class never predicted / never present" case
    # an explicit 0 rather than an undefined-metric warning.
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    print(f'Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}')

# Report metrics on the same graph and labels that were used for training
# (there is no held-out split in this script).
evaluate(model, data_embeddings, targets)
