In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.utils import resample
from transformers import BertTokenizer, BertModel
import torch
import json

# Проверка доступности GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#Читаем json файл с резюме и вакансиями
with open('data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

#Извлекаем отклонённые резюме
df_f = pd.json_normalize(data, record_path=['failed_resumes'], meta=[['vacancy', 'uuid'], ['vacancy', 'name'], ['vacancy', 'keywords'], ['vacancy', 'description'], ['vacancy', 'comment']])
#Извлекаем принятые резюме
df_c = pd.json_normalize(data, record_path=['confirmed_resumes'], meta=[['vacancy', 'uuid'], ['vacancy', 'name'], ['vacancy', 'keywords'], ['vacancy', 'description'], ['vacancy', 'comment']])
#Добавляем коды статусов
df_f['status'] = 0
df_c['status'] = 1
#Собираем в единый датасет
frames = [df_f, df_c]
df = pd.concat(frames)

# Преобразование дат рождения в возраст
df['birth_date'] = pd.to_datetime(df['birth_date'], errors='coerce')
df['age'] = (pd.Timestamp('now') - df['birth_date']).dt.days // 365

# Заполнение пропусков
df['city'].fillna('Unknown', inplace=True)
df['age'].fillna(df['age'].mean(), inplace=True)

# Кодирование категориальных данных
label_encoders = {}
categorical_columns = ['country', 'city', 'vacancy.uuid', 'vacancy.name']
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# Загрузка токенизатора и модели BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained('bert-base-multilingual-cased').to(device)

def get_bert_embeddings_batch(texts):
    inputs = tokenizer(texts, return_tensors='pt', truncation=True, padding=True, max_length=128).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()

# Применение BERT для текстовых столбцов партиями
text_columns = ['about', 'key_skills', 'vacancy.description']
batch_size = 32
for col in text_columns:
    embeddings = []
    texts = df[col].astype(str).fillna('').tolist()
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        batch_embeddings = get_bert_embeddings_batch(batch_texts)
        embeddings.append(batch_embeddings)
    embeddings = np.vstack(embeddings)
    df_bert = pd.DataFrame(embeddings, columns=[f"{col}_bert_{i}" for i in range(embeddings.shape[1])])
    
    # Сброс индексов перед конкатенацией
    df.reset_index(drop=True, inplace=True)
    df_bert.reset_index(drop=True, inplace=True)
    
    df = pd.concat([df, df_bert], axis=1)

# Удаление ненужных столбцов
drop_columns = ['uuid', 'first_name', 'last_name', 'birth_date', 'about', 'key_skills', 'experienceItem', 'educationItem', 'languageItems', 'vacancy.description', 'vacancy.keywords', 'vacancy.comment', 'languageItem']
df.drop(columns=drop_columns, inplace=True)

# Балансировка классов
df_majority = df[df.status == 0]
df_minority = df[df.status == 1]

df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # Дублирование меньшинства
                                 n_samples=len(df_majority),    # Доведение до размера большинства
                                 random_state=123) # Случайный сид

df_balanced = pd.concat([df_majority, df_minority_upsampled])

# Разделение данных на признаки и целевую переменную
X = df_balanced.drop(columns=['status'])
y = df_balanced['status']

# Разделение на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Масштабирование признаков
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Оптимизация гиперпараметров с помощью GridSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 7]
}

grid_search = GridSearchCV(estimator=GradientBoostingClassifier(random_state=42), param_grid=param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

# Предсказание и оценка модели
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_proba)}")


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


              precision    recall  f1-score   support

           0       0.89      0.85      0.87        99
           1       0.84      0.89      0.86        87

    accuracy                           0.87       186
   macro avg       0.87      0.87      0.87       186
weighted avg       0.87      0.87      0.87       186

ROC AUC Score: 0.9464762568210844


In [15]:
import joblib

# Save the model
joblib.dump(best_model, 'best_model.pkl')

# Save the scaler
joblib.dump(scaler, 'scaler.pkl')


['scaler.pkl']

In [16]:
import joblib

# Load the model
loaded_model = joblib.load('best_model.pkl')

# Load the scaler
loaded_scaler = joblib.load('scaler.pkl')

# Now you can use loaded_model and loaded_scaler for predictions
X_test_transformed = loaded_scaler.transform(X_test)
y_pred = loaded_model.predict(X_test_transformed)
y_pred_proba = loaded_model.predict_proba(X_test_transformed)[:, 1]

print(classification_report(y_test, y_pred))
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_proba)}")


              precision    recall  f1-score   support

           0       0.52      0.68      0.59        99
           1       0.43      0.28      0.34        87

    accuracy                           0.49       186
   macro avg       0.47      0.48      0.46       186
weighted avg       0.47      0.49      0.47       186

ROC AUC Score: 0.4427609427609428




In [17]:
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_proba)}")

              precision    recall  f1-score   support

           0       0.89      0.85      0.87        99
           1       0.84      0.89      0.86        87

    accuracy                           0.87       186
   macro avg       0.87      0.87      0.87       186
weighted avg       0.87      0.87      0.87       186

ROC AUC Score: 0.9464762568210844


In [18]:
import pickle

# Save the model
with open('best_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

# Save the scaler
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)


In [19]:
import pickle

# Load the model
with open('best_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Load the scaler
with open('scaler.pkl', 'rb') as scaler_file:
    loaded_scaler = pickle.load(scaler_file)

# Now you can use loaded_model and loaded_scaler for predictions
X_test_transformed = loaded_scaler.transform(X_test)
y_pred = loaded_model.predict(X_test_transformed)
y_pred_proba = loaded_model.predict_proba(X_test_transformed)[:, 1]

print(classification_report(y_test, y_pred))
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_proba)}")


              precision    recall  f1-score   support

           0       0.52      0.68      0.59        99
           1       0.43      0.28      0.34        87

    accuracy                           0.49       186
   macro avg       0.47      0.48      0.46       186
weighted avg       0.47      0.49      0.47       186

ROC AUC Score: 0.4427609427609428


