In [29]:
import spacy
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')

In [30]:
nltk.download(['stopwords', 'wordnet'])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/semeniuk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/semeniuk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [31]:
def parse_experience_expectations(experience_str):
    match = re.match(r'(\d+)\s+to\s+(\d+)\s+Years', experience_str)
    if match:
        min_exp, pref_exp = match.groups()
        return int(min_exp) * 12, int(pref_exp) * 12
    return None, None

def split_skills(skills_str):
    pattern = re.compile(r'([A-Z][a-z]*(?:\s[a-z]+)*)')
    skills = pattern.findall(skills_str)
    skills = [skill.lower() for skill in skills]
    return skills

jobs_df = pd.read_csv('./data/training/job_descriptions.csv')
jobs_df = jobs_df.reindex(np.random.permutation(jobs_df.index))
jobs_df = jobs_df.copy().iloc[0:10,]

experience = jobs_df['Experience'].apply(lambda x: pd.Series(parse_experience_expectations(x)))
jobs_df = pd.DataFrame({
    'job_title': jobs_df['Job Title'].str.lower(),
    'skills': jobs_df['skills'].apply(split_skills),
    'min_experience': experience[0],
    'max_experience': experience[1],
})

In [32]:
def create_spacy_pattern(title):
    words = title.split()
    pattern = [{"LOWER": word.lower()} for word in words]
    return {"label": "TITLE", "pattern": pattern}

nlp = spacy.load("en_core_web_lg")
skill_pattern_path = "./data/preprocessing/jz_skill_patterns.jsonl"
ruler = nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_pattern_path)

titles_df = pd.read_json('./data/preprocessing/job-titles.json')
title_patterns = [create_spacy_pattern(title) for title in titles_df['job-titles'].to_numpy()]
ruler.add_patterns(title_patterns)

def get_skills(doc):
    skills = [ent.text for ent in doc.ents if ent.label_ == "SKILL"]
    return list(set(skills))

def get_title(doc):
    for ent in doc.ents:
        if ent.label_ == "TITLE":
            return ent.text
    return None

def extract_experience(cv_text):
    # Define a pattern for experience
    experience_pattern = r'\b(\d+)\s*(year|month)\b'

    # Find all experience mentions in the CV text
    experience_found = re.findall(experience_pattern, cv_text)

    # Filter out None or empty entries and sum up experience
    total_months = 0
    for value, unit in experience_found:
        if unit == "year":
            total_months += int(value) * 12
        elif unit == "month":
            total_months += int(value)

    return total_months

def clean_text(text):
    review = re.sub('(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"', " ", text)
    review = review.lower()
    review = review.split()
    lm = WordNetLemmatizer()
    review = [lm.lemmatize(word) for word in review if word not in set(stopwords.words("english"))]
    return " ".join(review)

resume_df = pd.read_csv('./data/training/resume-dataset.csv')
resume_df['candidate_id'] = range(1, len(resume_df) + 1)
resume_df = resume_df.reindex(np.random.permutation(resume_df.index))
resume_df = resume_df.copy().iloc[0:10,]

processed_resumes = []
for text in resume_df['Resume']:
    cv_text = clean_text(text)
    doc = nlp(cv_text)
    skills = get_skills(doc)
    title = get_title(doc)
    experience_data = extract_experience(cv_text)
    processed_resumes.append({ "job_title": title, "skills": skills, "experience": experience_data })

resume_df = pd.DataFrame(processed_resumes)
resume_df.head()

Unnamed: 0,job_title,skills,experience
0,system engineer,"[big data, yarn, analytics, linux, shell, tabl...",0
1,php developer,"[search engine, codeigniter, design, framework...",0
2,teacher,[],0
3,programmer,"[support, software, middleware, deployment, da...",0
4,hadoop developer,"[specification, analytics, visualization, java...",0


In [33]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from torch_geometric.data import Data
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch_geometric.nn import GCNConv

# Example DataFrame structures for job descriptions and resumes
# job_descriptions = pd.DataFrame({
#     'job_id': [1, 2],
#     'skills': [['python', 'ml'], ['java', 'spring']],
#     'job_title': ['data scientist', 'backend developer'],
#     'min_experience': [12, 24]
# })
#
# resumes = pd.DataFrame({
#     'job_title': ['data scientist', 'backend developer'],
#     'skills': [['python', 'data analysis'], ['java', 'spring boot']],
#     'experience': [24, 36]
# })
# resumes['candidate_id'] = range(1, len(resumes) + 1)

job_descriptions = jobs_df
resumes = resume_df
job_descriptions.head()
resumes.head()

# One-hot encoding skills
all_skills = list(set(sum(job_descriptions['skills'].tolist() + resumes['skills'].tolist(), [])))
mlb = MultiLabelBinarizer(classes=all_skills)
job_skills_encoded = mlb.fit_transform(job_descriptions['skills'])
resume_skills_encoded = mlb.fit_transform(resumes['skills'])

job_skills_tensor = torch.tensor(job_skills_encoded, dtype=torch.float)
resume_skills_tensor = torch.tensor(resume_skills_encoded, dtype=torch.float)

# Function to create feature matrix from selected columns
def create_feature_matrix(df, feature_columns):
    features = []
    for _, row in df.iterrows():
        features.append(torch.tensor([row[col] for col in feature_columns], dtype=torch.float))
    return torch.stack(features)

# Create feature matrices for job descriptions and resumes
job_exp_features = create_feature_matrix(job_descriptions, ['min_experience'])
resume_exp_features = create_feature_matrix(resumes, ['experience'])

# Ensure the dimensions of tensors match correctly for concatenation
job_features = torch.cat([job_skills_tensor, job_exp_features], dim=1)
resume_features = torch.cat([resume_skills_tensor, resume_exp_features], dim=1)

x_one_hot = torch.cat([job_features, resume_features], dim=0)

# Embedding skills using nn.Embedding
skill_to_index = {skill: idx for idx, skill in enumerate(all_skills)}
embedding_dim = 50
embedding = nn.Embedding(len(all_skills), embedding_dim)

# Debug: Print skill_to_index to check the mapping
print(f'skill_to_index: {skill_to_index}')

# Function to get average embedding for a list of skills
def get_skill_embedding(skills, embedding, skill_to_index):
    if not skills:  # If the skills list is empty
        return torch.zeros(embedding_dim)
    skill_indices = [skill_to_index[skill] for skill in skills if skill in skill_to_index]
    # Debug: Print skill indices to ensure they are within valid range
    print(f'skill_indices: {skill_indices}')
    if not skill_indices:
        return torch.zeros(embedding_dim)
    skill_tensor = torch.tensor(skill_indices, dtype=torch.long)
    skill_embeddings = embedding(skill_tensor)
    return skill_embeddings.mean(dim=0)

# Encode skills as embeddings
job_skills_embedded = torch.stack([get_skill_embedding(skills, embedding, skill_to_index) for skills in job_descriptions['skills']])
resume_skills_embedded = torch.stack([get_skill_embedding(skills, embedding, skill_to_index) for skills in resumes['skills']])

# Ensure the dimensions of tensors match correctly for concatenation
job_features_embedded = torch.cat([job_skills_embedded, job_exp_features], dim=1)
resume_features_embedded = torch.cat([resume_skills_embedded, resume_exp_features], dim=1)

x_embeddings = torch.cat([job_features_embedded, resume_features_embedded], dim=0)

# Function to create edge index for bipartite graph
def create_edge_index(job_descriptions, resumes):
    edges = []
    for i, job in job_descriptions.iterrows():
        for j, resume in resumes.iterrows():
            if set(job['skills']).intersection(set(resume['skills'])):
                edges.append([i, j + len(job_descriptions)])  # Offset for bipartite graph
    if not edges:  # Ensure there are edges
        return torch.empty((2, 0), dtype=torch.long)
    return torch.tensor(edges, dtype=torch.long).t().contiguous()

edge_index = create_edge_index(job_descriptions, resumes)

# Create Data objects for GCN
data_embeddings = Data(x=x_embeddings, edge_index=edge_index)

# Define the GCN model
class GCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return x

input_dim = x_embeddings.size(1)  # Number of input features (50 + 1 = 51)
hidden_dim = 16  # Size of hidden layers
output_dim = 2  # Size of the output layer

model = GCN(input_dim, hidden_dim, output_dim)

# Define the loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Create realistic targets for binary classification
num_jobs = len(job_descriptions)
num_resumes = len(resumes)
targets = torch.zeros(num_jobs + num_resumes, dtype=torch.long)
targets[num_jobs:] = 1

# Training loop with detailed debug information
def train(data, model, criterion, optimizer, targets, epochs=100):
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        out = model(data)
        print(f'Output at epoch {epoch}: {out}')  # Debug: Print the model output
        loss = criterion(out, targets)
        print(f'Loss at epoch {epoch}: {loss}')  # Debug: Print the loss
        loss.backward(retain_graph=True)  # Backward pass without retaining the graph
        optimizer.step()
        if epoch % 10 == 0:
            print(f'Epoch {epoch}, Loss: {loss.item()}')

train(data_embeddings, model, criterion, optimizer, targets)

# Evaluation
def evaluate(model, data, targets):
    model.eval()
    with torch.no_grad():
        out = model(data)
        pred = out.argmax(dim=1)
        accuracy = accuracy_score(targets, pred)
        precision = precision_score(targets, pred)
        recall = recall_score(targets, pred)
        f1 = f1_score(targets, pred)
        print(f'Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}')

evaluate(model, data_embeddings, targets)

skill_to_index: {'specification': 0, 'environmental impact assessment': 1, 'x': 2, 'sketch': 3, 'testing and experimentation': 4, 'hadoop': 5, 'mode': 6, 'i design tools': 7, 'nosql': 8, 'usability testing and user research': 9, 'network security': 10, 'network performance analysis': 11, 'handover': 12, 'information architecture and user flows': 13, 'mysql': 14, 'mongo': 15, 'u': 16, 'b': 17, 'sales techniques': 18, 'bootstrap': 19, 'd': 20, 'github': 21, 'research and development processes': 22, 'network protocols': 23, 'wireshark': 24, 'support': 25, 'green building materials': 26, 'audit and compliance': 27, 'testing': 28, 'certificate': 29, 'p': 30, 'tableau': 31, 'negotiation skills': 32, 'big data': 33, 'analytics': 34, 'visualization': 35, 'quality metrics and': 36, 'server': 37, 'java': 38, 'scalability': 39, 'operating system': 40, 'knowledge base': 41, 'q': 42, 'router': 43, 'communication skills': 44, 'yarn': 45, 'machine learning': 46, 'crisis communication planning': 47, '