# Model Notebook

Este notebook foi usado para salvar os dados e modelos necessários para o tratamento final dos dados para apresentação

In [1]:
import functools
import json
import pickle
import warnings

import fitz
import joblib
import pandas as pd
import tqdm as notebook_tqdm
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from xgboost import XGBClassifier

  from .autonotebook import tqdm as notebook_tqdm


Lendo o dataframe com os dados mais balanceados do notebook anterior

In [2]:
# Load the labelled frame saved as 'labeled_df_emb'.
df = pd.read_pickle('labeled_df_emb')

# Split the rows by class; label 0 is handled as the majority class below.
df_minority = df[df["label"] == 1]
df_majority = df[df["label"] == 0]

# Downsample label-0 rows (without replacement) to three times the number of
# label-1 rows; the fixed seed keeps the sample reproducible.
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=3 * len(df_minority),
    random_state=42,
)

df_balanced = pd.concat([df_majority_downsampled, df_minority])

## Machine Learning
 Separando atributo e alvo

In [3]:
# Feature matrix: one column, kept two-dimensional (DataFrame) for the estimators.
X = df_balanced.loc[:, ["similarity_score"]]
# Target vector.
y = df_balanced.loc[:, "label"]

Separação de treino e teste

In [4]:
# 80/20 split, stratified on the label so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

Treino do modelo de Regressão Logística com pesos balanceados

In [5]:
# class_weight='balanced' reweights samples inversely to class frequency.
# fit() returns the estimator itself, so construction and training are chained.
logreg = LogisticRegression(class_weight='balanced').fit(X_train, y_train)
logreg

Treino do modelo XGBoost com gerenciador de desbalanceamento

In [6]:
# Negative/positive ratio of the training labels, passed as scale_pos_weight so
# XGBoost up-weights the positive (label 1) class.
class_counts = y_train.value_counts()
scale = class_counts[0] / class_counts[1]

# `use_label_encoder` was dropped from the call: the installed XGBoost ignores it
# and only emitted a "Parameters: { "use_label_encoder" } are not used" warning.
xgb = XGBClassifier(scale_pos_weight=scale, eval_metric='logloss')
xgb.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Obtendo probabilidade e avaliando

In [7]:
# Positive-class probability (column 1) from each model on the held-out split.
logreg_probs = logreg.predict_proba(X_test)[:, 1]
xgb_probs = xgb.predict_proba(X_test)[:, 1]

# Soft-voting ensemble: unweighted mean of the two probabilities.
ensemble_probs = (logreg_probs + xgb_probs) / 2

# Decision threshold applied to the averaged probability.
threshold = 0.5
ensemble_preds = (ensemble_probs >= threshold).astype(int)

# The metric functions come from the notebook's import cell; the mid-cell
# re-import that used to sit here was removed.
print("Ensemble Model Evaluation:")
print(classification_report(y_test, ensemble_preds))
print(f"ROC AUC Score: {roc_auc_score(y_test, ensemble_probs):.4f}")
print(f"F1 Score: {f1_score(y_test, ensemble_preds):.4f}")

Ensemble Model Evaluation:
              precision    recall  f1-score   support

           0       0.82      0.54      0.65       135
           1       0.32      0.64      0.43        45

    accuracy                           0.57       180
   macro avg       0.57      0.59      0.54       180
weighted avg       0.69      0.57      0.60       180

ROC AUC Score: 0.6226
F1 Score: 0.4265


In [8]:
# Display the balanced frame for inspection; the notebook renders a truncated
# view (first and last rows) rather than every row.
df_balanced

Unnamed: 0,applicant_id,job_id,similarity_score,status,label
161176,45043,8922,0.711959,,0
12242,9065,11075,1.000000,,0
92552,10203,11075,1.000000,,0
400084,12571,9421,1.000000,,0
117953,4881,9420,1.000000,,0
...,...,...,...,...,...
416013,43069,13737,0.667984,Encaminhado ao Requisitante,1
416019,43069,13718,0.639748,Encaminhado ao Requisitante,1
416550,43123,13969,0.786721,Encaminhado ao Requisitante,1
416724,43140,11619,0.709323,Encaminhado ao Requisitante,1


Salvando os modelos treinados

In [9]:
# Persist both fitted classifiers; the last call's return value (the list of
# written file names) is what the cell displays.
joblib.dump(value=logreg, filename="logistic_model.pkl")
joblib.dump(value=xgb, filename="xgboost_model.pkl")

['xgboost_model.pkl']

 Salvando o embedding dos dados de vagas

In [10]:
@functools.lru_cache(maxsize=None)
def _portuguese_stop_words():
    """Return NLTK's Portuguese stop words as a frozenset, loading them only once."""
    from nltk.corpus import stopwords
    return frozenset(stopwords.words('portuguese'))


def preprocess(text):
    """Normalise free text before it is embedded.

    Steps, in order: lower-case, delete digit runs, delete ASCII punctuation
    (``string.punctuation``), tokenise with NLTK's Portuguese tokenizer, drop
    Portuguese stop words and tokens of two characters or fewer, and re-join
    the remaining tokens with single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, space-separated token string (possibly empty).

    Raises:
        AttributeError: If ``text`` has no ``lower`` method (e.g. ``None``).
    """
    import re
    import string
    from nltk.tokenize import word_tokenize

    # Cached: this function runs once per job, and the stop-word list never changes.
    stop_words = _portuguese_stop_words()

    text = text.lower()
    text = re.sub(r'\d+', '', text)
    # Punctuation is deleted, not replaced by a space, so "a/b" becomes "ab".
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text, language="portuguese")
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2]
    return ' '.join(tokens)

def extract_job_requirements(job):
    """Build the text that represents a job: its skills followed by its main activities.

    Both fields are read from ``job["perfil_vaga"]``. A missing ``perfil_vaga``
    entry, a missing field, or a field stored as ``None`` is treated as an empty
    string instead of raising, matching how ``predict_jobs_for_cv`` reads the
    same records.

    Args:
        job: Mapping describing one job.

    Returns:
        ``"<skills> <activities>"`` in lower case; ``" "`` when both are absent.
    """
    profile = job.get("perfil_vaga") or {}
    skills = profile.get("competencia_tecnicas_e_comportamentais") or ""
    activities = profile.get("principais_atividades") or ""
    return skills.lower() + " " + activities.lower()


# Read the job postings: a JSON object keyed by job id.
with open(r'vagas.json', encoding='utf-8') as f:
    jobs = json.loads(f.read())

embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# dicts keep insertion order, so iterating jobs.values() lines up with job_ids.
job_ids = list(jobs)
job_texts = [preprocess(extract_job_requirements(job)) for job in jobs.values()]

# One embedding per cleaned job text.
job_embeddings = embedding_model.encode(job_texts, show_progress_bar=True)

job_titles = [job["informacoes_basicas"]["titulo_vaga"] for job in jobs.values()]


# Bundle ids, titles and embeddings into one artifact.
joblib.dump(
    {
        "job_ids": job_ids,
        "job_titles": job_titles,
        "job_embeddings": job_embeddings,
    },
    "job_data.pkl",
)

Batches: 100%|██████████| 441/441 [01:11<00:00,  6.18it/s]


['job_data.pkl']

Salvando os dados

In [11]:
# Serialise the parsed job records to vagas.pkl.
with open("vagas.pkl", "wb") as f:
    f.write(pickle.dumps(jobs))

Preparando os dados para a apresentação no Streamlit:

In [12]:
# Silence every warning raised from this point on.
warnings.simplefilter("ignore")

# Reload the job records. vagas.pkl is written earlier in this notebook;
# unpickling a file from an untrusted source would be unsafe.
with open("vagas.pkl", "rb") as f:
    jobs = pickle.loads(f.read())

# Reload the two saved classifiers and the sentence-embedding model.
logreg, xgb = (
    joblib.load(model_path)
    for model_path in ("logistic_model.pkl", "xgboost_model.pkl")
)
embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

Obtendo os embeddings dos dados de vagas


In [13]:
# Load the saved bundle and unpack its three entries in one step.
job_data = joblib.load("job_data.pkl")
job_ids, job_titles, job_embeddings = (
    job_data[key] for key in ("job_ids", "job_titles", "job_embeddings")
)

Função que processará o texto do Currículo

In [14]:
@functools.lru_cache(maxsize=None)
def _portuguese_stop_words():
    """Return NLTK's Portuguese stop words as a frozenset, loading them only once."""
    from nltk.corpus import stopwords
    return frozenset(stopwords.words('portuguese'))


def preprocess(text):
    """Normalise free text before it is embedded.

    Steps, in order: lower-case, delete digit runs, delete ASCII punctuation
    (``string.punctuation``), tokenise with NLTK's Portuguese tokenizer, drop
    Portuguese stop words and tokens of two characters or fewer, and re-join
    the remaining tokens with single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, space-separated token string (possibly empty).

    Raises:
        AttributeError: If ``text`` has no ``lower`` method (e.g. ``None``).
    """
    import re
    import string
    from nltk.tokenize import word_tokenize

    # Cached: the stop-word list never changes between calls.
    stop_words = _portuguese_stop_words()

    text = text.lower()
    text = re.sub(r'\d+', '', text)
    # Punctuation is deleted, not replaced by a space, so "a/b" becomes "ab".
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text, language="portuguese")
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2]
    return ' '.join(tokens)

A função a seguir usa os modelos e embeddings carregados para calcular a similaridade entre o texto de um currículo e cada vaga, estimar a probabilidade de contratação e exibir as vagas ordenadas por essa probabilidade.


In [15]:
def predict_jobs_for_cv(cv_text, top_n=5):
    """Score every stored job against a CV, print the best matches and return them.

    The CV text is cleaned with ``preprocess`` and embedded with
    ``embedding_model``; its cosine similarity to each row of ``job_embeddings``
    is the single feature given to ``logreg`` and ``xgb``. The two
    positive-class probabilities are averaged into ``hire_prob``, and jobs are
    ranked by that value in descending order.

    Args:
        cv_text: Raw text of the CV.
        top_n: Number of highest-ranked jobs to print and return.

    Returns:
        A list of at most ``top_n`` dicts with the keys ``job_id``, ``title``,
        ``area``, ``skills``, ``activities``, ``similarity`` and ``hire_prob``.
    """
    cleaned_cv = preprocess(cv_text)
    cv_vec = embedding_model.encode([cleaned_cv])

    sims = cosine_similarity(cv_vec, job_embeddings).flatten()

    # Score all jobs with one batched call per model instead of two calls per job.
    features = sims.reshape(-1, 1)
    logreg_probs = logreg.predict_proba(features)[:, 1]
    xgb_probs = xgb.predict_proba(features)[:, 1]
    ensemble_probs = (logreg_probs + xgb_probs) / 2

    results = []
    for i, sim in enumerate(sims):
        job = jobs.get(job_ids[i], {})
        basics = job.get("informacoes_basicas") or {}
        profile = job.get("perfil_vaga") or {}

        results.append({
            "job_id": job_ids[i],
            "title": basics.get("titulo_vaga", "N/A"),
            "area": profile.get("areas_atuacao", "N/A"),
            # `or ""` keeps null fields sliceable in the report below.
            "skills": profile.get("competencia_tecnicas_e_comportamentais") or "",
            "activities": profile.get("principais_atividades") or "",
            "similarity": sim,
            "hire_prob": ensemble_probs[i]
        })

    top_jobs = sorted(results, key=lambda x: x["hire_prob"], reverse=True)[:top_n]

    for idx, job in enumerate(top_jobs, 1):
        print(f"\nVaga recomendada #{idx}")
        print(f"Cargo: {job['title']}")
        print(f"Área: {job['area']}")
        print(f"Score de similaridade: {job['similarity']:.2f}")
        print(f"Probabilidade: {job['hire_prob']:.2%}")
        print(f"Competências: {job['skills'][:200]}...")
        print(f"Atividades: {job['activities'][:200]}...")

    # Hand the ranking back so callers can reuse it instead of receiving None.
    return top_jobs


Buscando os arquivos PDF e convertendo para TXT

In [16]:
def extract_text_from_pdf(file_path):
    """Return the text of every page of the document at ``file_path``, concatenated in page order."""
    with fitz.open(file_path) as pdf:
        page_texts = [page.get_text() for page in pdf]
    return "".join(page_texts)

# Path of the CV (PDF) to score. It is empty here: set it before running.
cv_path = r""

cv_text = extract_text_from_pdf(file_path=cv_path)

Chamando a função de predição:

In [17]:
# predict_jobs_for_cv prints the formatted ranking itself, so its result is
# kept in a variable and not printed a second time.
top_jobs = predict_jobs_for_cv(cv_text, top_n=5)


Vaga recomendada #1
Cargo: PASS THRU - SAp FI 2021-2632813 FAbiano
Área: Gestão e Alocação de Recursos de TI-
Score de similaridade: 0.61
Probabilidade: 79.69%
Competências: Indicacao Fabiano...
Atividades: Indicacao Fabiano...

Vaga recomendada #2
Cargo: 4282616 SAP SCM WM
Área: TI - SAP-
Score de similaridade: 0.61
Probabilidade: 79.58%
Competências: SAP SCM WM...
Atividades: SAP SCM WM...

Vaga recomendada #3
Cargo: Java, Spring Boot, AWS, Agile - 11833067
Área: TI - Projetos-
Score de similaridade: -0.16
Probabilidade: 76.96%
Competências: Java Programming language
Skill Java
Java, Spring Boot, AWS, Agile.
Outros detalhes do trabalho: 1 - Microservices and Light Weight Architecture (P2 - Intermediate) | 2 - Spring Boot (P2 - Intermediat...
Atividades: Java Programming language
Skill Java
Java, Spring Boot, AWS, Agile.
Outros detalhes do trabalho: 1 - Microservices and Light Weight Architecture (P2 - Intermediate) | 2 - Spring Boot (P2 - Intermediat...

Vaga recomendada #4
Cargo: S