In [None]:
import pickle


class Vacancy:
    def __init__(self, uuid: str, name: str, context: str, keys: list):
        self.uuid = uuid
        self.name = name
        self.context = name + '. ' + context
        self.keys = keys
        self.keys.append(name)
        self.keys.extend(name.split(' '))
        self.essense = []

    def add_context(self, context: str | list[str]):
        self.context = context

    def add_keys(self, keys: list):
        self.keys.extend(keys)

    def add_key(self, key: str):
        self.keys.append(key)

    def add_essense(self, essense: list[str]):
        self.essense.extend(essense)

    def save(self, path: str):
        with open(path, 'wb') as fl:
            pickle.dump(self, fl)


class Resume:
    def __init__(self, uuid: str, years: int, keys: list, experience_day,
             experience, edu_keys, result=None):
        self.uuid = uuid
        self.years = years
        self.keys = keys + edu_keys
        self.experience_day = experience_day
        self.experience = experience
        self.target = result

    def add_experience(self, new_exp: str):
        self.experience.append(new_exp)

    def add_keys(self, keys: list):
        self.keys.extend(keys)

    def add_key(self, key: str):
        
        self.keys.append(key)

    def save(self, path: str):
        with open(path, 'wb') as f:
            pickle.dump(self, f)

    @staticmethod
    def load(path: str):
        with open(path, 'rb') as f:
            return pickle.load(f)

In [None]:
from classes import Vacancy, Resume

from translatepy import Translator
from translatepy.translators.google import GoogleTranslate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

import re
from datetime import datetime
import json
import pickle

translator = Translator()


def tranlate_str(text):
    return translator.translate(text, "English").result


def load_vacancy(data):
    keys = []
    if data['keywords'] is not None:
        text = tranlate_str(data['keywords'])
        keys = text.split(', ')

    obj = Vacancy(data['uuid'], tranlate_str(data['name']), tranlate_str(data['description']),
                  keys)
    return obj


def load_resume(data, result=None):
    worked_days = []
    lst_exp = []
    edu_keys = []

    today = datetime.today()

    if 'experienceItem' in data.keys():
        experienceItem = data['experienceItem']

        for i in experienceItem:
            val = ''
            if i['position'] is not None and i['position'].strip() != '':
                val += tranlate_str(i['position'])
            if i['description'] is not None and i['description'].strip() != '':
                val += tranlate_str(i['description'])
            lst_exp.append(val)
            if i['ends'] is None:
                worked_days.append((today - datetime.strptime(i['starts'], '%Y-%m-%d')).days)
            else:
                worked_days.append(
                    (datetime.strptime(i['ends'], '%Y-%m-%d') - datetime.strptime(i['starts'], '%Y-%m-%d')).days)

    if 'educationItem' in data.keys():
        educationItem = data['educationItem']
        for i in educationItem:
            if i['result'] != '' and i['result'] is not None:
                text = tranlate_str(i['result'])
                edu_keys.append(text)
                edu_keys.extend(text.split(' '))
            elif i['specialty'] != '' and i['specialty'] is not None:
                text = tranlate_str(i['specialty'])
                edu_keys.append(text)
                edu_keys.extend(text.split(' '))
            elif i['faculty'] != '' and i['faculty'] is not None:
                text = tranlate_str(i['faculty'])
                edu_keys.append(text)
                edu_keys.extend(text.split(' '))

    date = 33
    years = 0
    if 'birth_date' in data.keys():
        if data['birth_date'] is not None:
            date = datetime.strptime(data['birth_date'], '%Y-%m-%d')
            years = (today - date).days // 365

    keys = []
    if data['key_skills'] is not None:
        text = tranlate_str(data['key_skills'])
        keys.extend(text.split(', '))
    obj = Resume(data['uuid'], years, keys,
                 worked_days, lst_exp, edu_keys, result)
    return obj


def load_train(way):
    with open(way, 'rb') as f:
        data = json.load(f)

    vacancies = []
    lst_failed_resumes = []
    lst_confirmed_resumes = []
    for j, i in enumerate(data):
        vacancy, failed_resumes, confirmed_resumes = i['vacancy'], i['failed_resumes'], i['confirmed_resumes']
        vacancies.append(load_vacancy(vacancy))
        lst_failed = []
        lst_confirmed = []
        for resume in failed_resumes:
            lst_failed.append(load_resume(resume, result=0))
        for resume in confirmed_resumes:
            lst_confirmed.append(load_resume(resume, result=1))
        lst_failed_resumes.append(lst_failed)
        lst_confirmed_resumes.append(lst_confirmed)

    return vacancies, lst_failed_resumes, lst_confirmed_resumes


def load_test(way):
    with open(way, 'rb') as f:
        data = json.load(f)
    vacancies = []
    lst_resumes = []
    vacancy, resumes = data['vacancy'], data['resumes']
    print('start')
    vacancies.append(load_vacancy(vacancy))
    print('vacancy done')
    lst = []
    for resume in resumes:
        lst.append(load_resume(resume))
    lst_resumes.append(lst)
    print('resume done')
    return vacancies, lst_resumes


way_test = 'test_data.json'
test_vacancies, test_lst_resumes = load_test(way_test)
print("add in class(translated)")

tokenizer2 = AutoTokenizer.from_pretrained("Voicelab/vlt5-base-keywords", legacy=False)
model2 = AutoModelForSeq2SeqLM.from_pretrained("Voicelab/vlt5-base-keywords")

with open('all_hard_skills.pkl', 'rb') as f:
    skills = pickle.load(f)


def generate_keywords(text):
    task_prefix = "Keywords: "
    inputs = text.split('.')
    set_key_words = set()

    for sample in inputs:
        input_sequences = [task_prefix + sample]
        input_ids = tokenizer2(
            input_sequences, return_tensors="pt", truncation=True
        ).input_ids
        output = model2.generate(input_ids, no_repeat_ngram_size=3, num_beams=4)
        predicted = tokenizer2.decode(output[0], skip_special_tokens=True)
        set_key_words.update(predicted.split(', '))

    return list(set_key_words)


for i in range(len(test_vacancies)):
    desr = test_vacancies[i].context

    skill = set()
    text = re.sub(r'[^\w\s]', ' ', desr).replace('  ', ' ').lower()
    for word in text.split(' '):
        if word in skills:
            skill.add(word)

    test_vacancies[i].add_essense(list(skill))
    test_vacancies[i].add_essense(generate_keywords(desr))

print("saving")

with open("test_vacancies.pkl", 'wb') as f:
    pickle.dump(test_vacancies, f)

with open("test_lst_resumes.pkl", 'wb') as f:
    pickle.dump(test_lst_resumes, f)


# Model

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch
import torch.nn as nn
import torch.nn.functional as F

class BigBinaryClassifier(nn.Module):
    def __init__(self):
        super(BigBinaryClassifier, self).__init__()
        self.fc1 = nn.Linear(16, 64)   # Первый полносвязный слой с 16 входами и 64 нейронами
        self.fc2 = nn.Linear(64, 128)  # Второй полносвязный слой с 64 входами и 128 нейронами
        self.fc3 = nn.Linear(128, 64)  # Третий полносвязный слой с 128 входами и 64 нейронами
        self.fc4 = nn.Linear(64, 32)   # Четвертый полносвязный слой с 64 входами и 32 нейронами
        self.fc5 = nn.Linear(32, 1)    # Пятый полносвязный слой с 32 входами и одним выходным нейроном

    def forward(self, x):
        # Применяем функции активации ReLU к выходу каждого слоя
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = torch.sigmoid(self.fc5(x))  # Применяем сигмоидальную функцию к выходу пятого слоя
        return x

# Пример использования модели
model = BigBinaryClassifier()
print(model)


# Make_features

In [None]:
from classes import Vacancy, Resume

In [None]:
# модель для сравнения сущностей
from sentence_transformers import SentenceTransformer, util
import tqdm as notebook_tqdm

feature_param = 5
embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [None]:
def text_similar(sentence1, sentence2, model): ## сравнение текста -> int
    embeddings = model.encode([sentence1, sentence2], convert_to_tensor=True)
    # Вычисление косинусного расстояния между векторами
    cosine_scores = util.pytorch_cos_sim(embeddings[0], embeddings[1])
    
    return cosine_scores.item()

def calc_hard_skills_sim(person_skills, vac_skills, param): ## -> сравнение важных скилов list(int)
    # Initialize a matrix to store similarity scores between all pairs of person and vacancy skills
    matrix = []
    for v_skill in vac_skills:
        feature = 1 if v_skill in person_skills else 0
        matrix.append(feature)
    #print(matrix)
    if len(matrix) < param:
        if len(matrix) / param <= 0.5:
            return None
        return matrix + [0] * (param - len(matrix))
    return matrix[:param]

def calc_essence_sim(person_skills, essence_vac, model, param):## -> сравнение важных сущностей list(int)
    # Initialize a matrix to store similarity scores between all pairs of person and vacancy skills
    matrix = []
    for v_skill in essence_vac:
        maxi = -1
        for p_skill in person_skills:
            # Calculate cosine similarity between each pair of person and vacancy skills
            cos_score = text_similar(p_skill, v_skill, model)
            maxi = max(maxi, cos_score)
        matrix.append(maxi)
    if len(matrix) < param:
        return matrix + [0] * (param - len(matrix))
    return matrix[:param]

def rate_skills(name_vac, hard_skills_vac, model):## -> ранжирование списка list(str)
    scores = []
    for i in hard_skills_vac:
        scores.append(text_similar(name_vac, i, model))
    # Получение индексов максимальных элементов (пять первых)
    max_indexes = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    imp_skiils = [hard_skills_vac[i] for i in max_indexes]
    return imp_skiils

def experiance_disсribe_vac(lst_exp_desc, time_exp, discribe_vac, embending_model, param):  
    '''
    lst_exp_desc - список с описанием вакансий,
    time_exp - время работы на определенной вакансии
    discribe_vac - описание вакансии
    embending_model - модель для емб
    return lst[int]
    '''
    scores = []
    for exp, time in zip(lst_exp_desc, time_exp):
        scores.append(text_similar(exp, discribe_vac, embending_model) * time / 365)
    scores.sort(reverse=True)
    if len(scores) < param:
        return scores + [0] * (param - len(scores))
    return scores[:param]

In [None]:
import pickle
#/Users/igorkopylov/Downloads/essence.pkl
with open("/Users/igorkopylov/Downloads/train_lst_confirmed_resumes.pkl", "rb") as f:
    confirmed_resumes = pickle.load(f, encoding = 'utf8')

with open("/Users/igorkopylov/Downloads/train_lst_failed_resumes.pkl", "rb") as f:
    failed_resumes = pickle.load(f, encoding = 'utf8')

with open("/Users/igorkopylov/Downloads/train_vacancies.pkl", "rb") as f:
    vacancies = pickle.load(f, encoding = 'utf8')


for i in range(len(vacancies)):
    resumes.append(confirmed_resumes[i] + failed_resumes[i])

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.tensorboard import SummaryWriter  # Import SummaryWriter for TensorBoard logging
from sklearn.model_selection import train_test_split
# Create a SummaryWriter instance
writer = SummaryWriter()

In [None]:
def make_features(lst_objects, embedding_model, feature_param):
    # Создание пустого списка для тензор
    data = []
    target = []
    for indx, vac in enumerate(lst_objects[1]):
        discribe_vac = vac.context
        name_vac = vac.name
        essence_vac = vac.essense
        hard_skills_vac = vac.keys
        for res in lst_objects[0][indx]:
            lst_exp_desc = res.experience
            hard_skills_per = res.keys
            exp_day = res.experience_day
            age = res.years
            feature = []
            # try:
            # сравнение hard_skills_per с essence_vac
            imp_essence = rate_skills(name_vac, essence_vac, embedding_model)
            essence_emb = calc_essence_sim(hard_skills_per, imp_essence, embedding_model, feature_param)
            # сравнение hard_skills_per с essence_vac
            imp_skills_vac = rate_skills(name_vac, hard_skills_vac, embedding_model)
            skills_emb = calc_hard_skills_sim(hard_skills_per, imp_skills_vac, feature_param)
            if skills_emb is None:
                skills_emb = essence_emb
            # Сравнение рабочего опыта резюмиста и описание вакансии
            exp_emb = experiance_disсribe_vac(lst_exp_desc, exp_day, discribe_vac, embedding_model, feature_param)
            # lst_exp_desc, time_exp, discribe_vac, embending_model, param
            feature.extend(skills_emb + essence_emb + exp_emb + [age])
            print(feature)
            #except:
                #feature = [0] * (feature_param * 3)
            data.append(feature)
            target.append([res.target])
        print(indx)
    return data, target
    

In [None]:
X, y = make_features((resumes, vacancies), embedding_model, feature_param)

In [None]:
import pandas as pd

df = pd.read_csv('fail_feature.csv')
conf = pd.read_csv('conf.csv')
print(conf.shape)
df['target'] = 0
conf['target'] = 1
#df['target']
df['target'] = df['target'].astype(float)
print(df.shape)

df = pd.concat([df, conf])

In [None]:

X_train, X_test, y_train, y_test = train_test_split(df.drop("target", axis=1), df["target"], test_size=0.2, random_state=42)

X_train = torch.tensor(X_train.values, dtype=torch.float32)
X_test = torch.tensor(X_test.values, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.float32)
y_test = torch.tensor(y_test.values, dtype=torch.float32)
print(X_train.shape, y_train.shape)#, dtype=torch.float32)
#print(X_train.shape, X_test.shape, dtype=torch.float32)
#Создание DataLoader для обучения и валидации
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

val_dataset = TensorDataset(X_test, y_test)
val_loader = DataLoader(val_dataset, batch_size=32)

In [66]:
model = BigBinaryClassifier()
print(model)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

BigBinaryClassifier(
  (fc1): Linear(in_features=16, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=64, bias=True)
  (fc4): Linear(in_features=64, out_features=32, bias=True)
  (fc5): Linear(in_features=32, out_features=1, bias=True)
)


In [67]:
# Обучение модели
num_epochs = 40
for epoch in range(num_epochs):
    model.train()  # Установка модели в режим обучения
    train_loss = 0.0
    for inputs, targets in train_loader:
        inputs = inputs.squeeze()
        #print(inputs.shape)
        optimizer.zero_grad()  # Обнуление градиентов
        #print(inputs.dtype, targets.dtype)
        outputs = model(inputs).squeeze()  # Прямой проход
        loss = criterion(outputs, targets)  # Вычисление функции потерь
        loss.backward()  # Обратное распространение
        optimizer.step()  # Обновление параметров
        train_loss += loss.item() * inputs.size(0)
    train_loss /= len(train_loader.dataset)

    # Валидация модели
    model.eval()  # Установка модели в режим оценки
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs = inputs
            outputs = model(inputs).squeeze()#.squeeze()
            loss = criterion(outputs, targets)
            val_loss += loss.item() * inputs.size(0)
            predicted = torch.round(outputs)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()
    val_loss /= len(val_loader.dataset)
    val_accuracy = correct / total

    # Запись в TensorBoard
    writer.add_scalar('Loss/train', train_loss, epoch)
    writer.add_scalar('Loss/val', val_loss, epoch)
    writer.add_scalar('Accuracy/val', val_accuracy, epoch)

    print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')

print('Обучение завершено.')

# Закрытие SummaryWriter после завершения обучения
writer.close() # [1, 32, 16] 


Epoch 1/40, Train Loss: 0.6718, Val Loss: 0.6400, Val Accuracy: 0.7259
Epoch 2/40, Train Loss: 0.6371, Val Loss: 0.6105, Val Accuracy: 0.7259
Epoch 3/40, Train Loss: 0.6216, Val Loss: 0.5970, Val Accuracy: 0.7259
Epoch 4/40, Train Loss: 0.6162, Val Loss: 0.5923, Val Accuracy: 0.7259
Epoch 5/40, Train Loss: 0.6157, Val Loss: 0.5899, Val Accuracy: 0.7259
Epoch 6/40, Train Loss: 0.6158, Val Loss: 0.5900, Val Accuracy: 0.7259
Epoch 7/40, Train Loss: 0.6159, Val Loss: 0.5903, Val Accuracy: 0.7259
Epoch 8/40, Train Loss: 0.6157, Val Loss: 0.5891, Val Accuracy: 0.7259
Epoch 9/40, Train Loss: 0.6153, Val Loss: 0.5911, Val Accuracy: 0.7259
Epoch 10/40, Train Loss: 0.6159, Val Loss: 0.5919, Val Accuracy: 0.7259
Epoch 11/40, Train Loss: 0.6146, Val Loss: 0.5894, Val Accuracy: 0.7259
Epoch 12/40, Train Loss: 0.6148, Val Loss: 0.5889, Val Accuracy: 0.7259
Epoch 13/40, Train Loss: 0.6148, Val Loss: 0.5896, Val Accuracy: 0.7259
Epoch 14/40, Train Loss: 0.6145, Val Loss: 0.5905, Val Accuracy: 0.7259
E

In [None]:
 %load_ext tensorboard
 %tensorboard --logdir=runs

In [68]:
with open("model_Big.pkl", "wb") as f:
    pickle.dump(model, f)

In [58]:
with open("test_lst_resumes.pkl", "rb") as f:
    resumes = pickle.load(f, encoding = 'utf8')

with open("test_vacancies.pkl", "rb") as f:
    vacancies = pickle.load(f, encoding = 'utf8')

In [59]:
X, y = make_features((resumes, vacancies), embedding_model, feature_param)

[0, 0, 0, 0, 0, 0.5625385642051697, 0.6015891432762146, 0.636419415473938, 0.4720681607723236, 0.3762739300727844, 0.9434269696065825, 0.7756954114731044, 0.7622999227210266, 0.16681434136547454, 0.07727329396221735, 32]
[0, 0, 0, 0, 0, 0.585588276386261, 0.6995006203651428, 0.7412429451942444, 0.750575065612793, 0.48060208559036255, 2.7349927660537094, 1.0866385348855632, 0.8867967448822439, 0.5692268995389547, 0, 33]
[0, 0, 0, 0, 0, 0.5319784283638, 0.6450060606002808, 0.7060478925704956, 0.8253291845321655, 0.40796905755996704, 1.3799522256197996, 0.7214927385931146, 0.40830624626107415, 0, 0, 34]
[0, 0, 1, 1, 0, 0.6349282264709473, 0.7856550812721252, 0.9999999403953552, 0.698638916015625, 0.42550888657569885, 1.898953263808603, 0.8865201892918103, 0.7409509951121187, 0.5991556210060642, 0.37904770308977936, 34]
[0, 0, 0, 0, 0, 0.16846641898155212, 0.18248777091503143, 0.27730822563171387, 0.16293789446353912, 0.32951778173446655, 1.3034175513541861, 1.2485530889197571, 1.118334615

In [62]:
X_test = torch.tensor(X, dtype=torch.float32)
X_test.shape

torch.Size([11, 16])

In [64]:
model = torch.load('BigBinary.pth')

# Преобразование данных в тензор
X_test = torch.tensor(X, dtype=torch.float32)

# Предсказание на тестовых данных
with torch.no_grad():
    outputs = model(X_test)
    predicted_labels = torch.round(outputs).squeeze().tolist()

print(predicted_labels)


TypeError: 'collections.OrderedDict' object is not callable