In [55]:
import numpy as np
import pandas as pd
import re
import nltk
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download('rslp')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random
import time
import string
import unicodedata
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
import multiprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import glob
import spacy.cli
spacy.cli.download("pt_core_news_sm")
import spacy
nlp = spacy.load("pt_core_news_sm")
from nltk.tokenize import word_tokenize
from nltk.stem import RSLPStemmer
stemmer = RSLPStemmer()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [138]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 1. Carrega base consolidada

In [139]:
df_path = '/content/drive/MyDrive/Pós Tech/Tech Challenges/Tech Challenge 5/Dados/silver/dados_processed/df_join_prospect_base.parquet'
df = pd.read_parquet(df_path)
df = df.head(100)

# 2. Métodos

In [142]:
def remove_person_names(text: str) -> str:
    doc = nlp(text)
    return " ".join([token.text for token in doc if token.ent_type_ != "PER"])

def normalize_accents(text: str) -> str:
    return unicodedata.normalize("NFKD", text).encode("ASCII", "ignore").decode("utf-8")

def remove_punctuation(text: str) -> str:
    table = str.maketrans({key: " " for key in string.punctuation})
    return text.translate(table)

def normalize_str(text: str) -> str:
    text = text.lower()
    text = re.sub(r"\d+", " ", text)           # remove números
    text = remove_punctuation(text)            # remove pontuação
    text = normalize_accents(text)             # remove acentos
    text = re.sub(r"\s+", " ", text).strip()   # normaliza espaços
    return text

def tokenizer(text: str):
    stop_words_br = set(nltk.corpus.stopwords.words("portuguese"))
    if isinstance(text, str):
        text = normalize_str(text)                                              # normaliza string
        text = remove_person_names(text)                                        # remove nomes
        tokens = word_tokenize(text, language="portuguese")                     # tokeniza para a lingua portuguesa
        tokens = [t for t in tokens if t not in stop_words_br and len(t) > 2]
        tokens = [stemmer.stem(t) for t in tokens]                              # stemiza tokens
        return tokens
    return None

def tokenize_and_vectorize_fixed(df, campo_vetor, fitted_vectorizer, filename_prefix, batch_idx):
    """
    Transforma um lote de dados usando um vetorizador TF-IDF já treinado.
    Salva o lote como arquivo Parquet.
    """
    # Transformar o texto em matriz TF-IDF usando vocabulário existente
    vector_matrix = fitted_vectorizer.transform(df[campo_vetor].fillna(""))

    # Criar DataFrame com os vetores
    df_tfidf = pd.DataFrame(
        vector_matrix.toarray(),
        columns=fitted_vectorizer.get_feature_names_out(),
        index=df.index
    )

    # Salvar batch
    output_file = f"{filename_prefix}_batch_{batch_idx}.parquet"
    df_tfidf.to_parquet(output_file)
    print(f"Lote {batch_idx} salvo com formato {df_tfidf.shape} em {output_file}")

    return df_tfidf

def combine_vector_batches(batch_files, output_file):
    """
    Combina todos os arquivos de lote em um único DataFrame e salva como Parquet.
    """
    print("Combinando todos os lotes...")
    combined_dfs = []

    for i, file in enumerate(batch_files):
        df_batch = pd.read_parquet(file)  # ✅ Lê o arquivo Parquet
        combined_dfs.append(df_batch)
        print(f"Lote {i} carregado: {df_batch.shape}")

    # Combinar todos os DataFrames
    df_combined = pd.concat(combined_dfs, ignore_index=True)  # ignore_index=True reinicia os índices
    df_combined.to_parquet(output_file)
    print(f"Conjunto combinado salvo: {df_combined.shape} -> {output_file}")

    return df_combined

def combine_tfidf_batches(df, campo_vetor, vectorizer, batch_size=1000, output_dir="output"):
    """
    Treina o TF-IDF no dataset inteiro, processa em batches e combina todos os batches.
    """
    # Treina o vetorizador em todos os dados
    print("Treinando o vetorizador em todo o conjunto de dados...")
    vectorizer.fit(df[campo_vetor].fillna(""))

    print(f"Tamanho do vocabulário: {len(vectorizer.vocabulary_)}")
    print("Exemplo de features:", list(vectorizer.get_feature_names_out())[:10])

    # Processa em batches
    print("Processando lotes com vocabulário consistente...")
    filename_prefix = f"/content/drive/MyDrive/Pós Tech/Tech Challenges/Tech Challenge 5/notebooks/ramos/application_processed_{campo_vetor}"
    batch_files = []

    for i in range(0, len(df), batch_size):
        batch_df = df.iloc[i:i+batch_size]
        batch_idx = i // batch_size

        # Vetoriza batch
        #tokenize_and_vectorize_fixed(batch_df, campo_vetor, vectorizer, filename_prefix, batch_idx)
        X_tfidf, df_tfidf = vetoriza_input(df, campo_vetor, vectorizer)

        # Salvar batch
        output_file = f"{filename_prefix}_batch_{batch_idx}.parquet"
        df_tfidf.to_parquet(output_file)
        print(f"Lote {batch_idx} salvo com formato {df_tfidf.shape} em {output_file}")

        batch_files.append(f"{filename_prefix}_batch_{batch_idx}.parquet")

        print(f"Lote {batch_idx} concluído (linhas {i} até {min(i+batch_size, len(df))})")

    print(f"\n{len(batch_files)} lotes processados com sucesso!")
    print(f"Todos os lotes agora possuem as mesmas {len(vectorizer.vocabulary_)} features")

    # Combina todos os batches
    combined_output_file = f"/content/drive/MyDrive/Pós Tech/Tech Challenges/Tech Challenge 5/notebooks/ramos/talent_pool_vectors_combined_{campo_vetor}.parquet"
    df_tfidf_combined = combine_vector_batches(batch_files, combined_output_file)

    return df_tfidf_combined, vectorizer, batch_files

def vetoriza_input(df_input, campo_vetor, vectorizer):
    # Fit the vectorizer to the text data
    vectorizer.fit(df_input[campo_vetor].fillna(""))

    # Transform the text into TF-IDF vectors
    X_tfidf = vectorizer.transform(df_input[campo_vetor].fillna(""))

    # Insere vetores numa coluna única num dataframe
    df_tfidf = df_input.copy()
    df_tfidf['vetor_cv'] = [arr for arr in X_tfidf.toarray()]

    return X_tfidf, df_tfidf

def compute_similarity_batched(df_tfidf, campo_vetor, batch_size_sim=500, output_prefix='similarity_batch'):
    """Calcular similaridade do cosseno em lotes para lidar com grandes conjuntos de dados"""
    from sklearn.metrics.pairwise import cosine_similarity
    import numpy as np

    n_samples = len(df_tfidf)
    print(f"Calculando similaridade para {n_samples} amostras em lotes de {batch_size_sim}")

    # Criar matriz de similaridade em lotes para gerenciar memória
    similarity_files = []

    for i in range(0, n_samples, batch_size_sim):
        batch_end = min(i + batch_size_sim, n_samples)
        batch_data = df_tfidf.iloc[i:batch_end]

        # Calcular similaridade entre este lote e TODOS os dados
        batch_similarity = cosine_similarity(batch_data, df_tfidf)

        # Salvar similaridade do lote
        batch_file = f'/content/drive/MyDrive/Pós Tech/Tech Challenges/Tech Challenge 5/notebooks/ramos/{output_prefix}_{i}_{batch_end}_{campo_vetor}.npy'
        np.save(batch_file, batch_similarity)
        similarity_files.append(batch_file)

        print(f"Similaridade do lote {i}-{batch_end} calculada: {batch_similarity.shape}")

    return similarity_files

# 3. Tratamento para input inicial de dados do Streamlit

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Inputs de currículos
cv_input = [
    'Desenvolvedor python senior são paulo',
    'Desenvolvedor pandas senior rio de janeiro',
    'Python junior programador são paulo'
]

# Dataframe com os inputs do streamleat
df_input = pd.DataFrame({'cv_candidato': cv_input})

campo_vetor = 'cv_candidato'

# Define TF-IDF vectorizer
vectorizer = TfidfVectorizer(
    max_features=10000,  # no limit on number of features
    min_df=1,           # include terms appearing in at least 1 document
    max_df=1.0          # include terms appearing in all documents
)

X_tfidf, df_tfidf = vetoriza_input(df_input, campo_vetor)

cosine_sim_matrix = cosine_similarity(X_tfidf)

# 4. Vetorização e cálculo da similaridade

In [152]:
lista_campos_vetor = ['cv_pt'] # trocar para os campos a serem utilizados

for campo_vetor in lista_campos_vetor:
  print(f"Vetorizando campo: {campo_vetor}")
  print("Criando vocabulário a partir de todos os dados...")

  # Ajusta o vetorizador em TODOS os dados para criar vocabulário consistente
  vectorizer = TfidfVectorizer(
      tokenizer=tokenizer,
      max_features=10000,
      min_df=2,
      max_df=0.8
  )

  # Processa e combina batches
  df_tfidf_combined, vectorizer, batch_files = combine_tfidf_batches(df, campo_vetor, vectorizer, batch_size=100, output_dir="output")

  # Salva em parquet
  df_tfidf_combined.to_parquet('/content/drive/MyDrive/Pós Tech/Tech Challenges/Tech Challenge 5/Dados/gold/df_join_tfidf.parquet')

Vetorizando campo: cv_pt
Criando vocabulário a partir de todos os dados...
Treinando o vetorizador em todo o conjunto de dados...




Tamanho do vocabulário: 342
Exemplo de features: ['aasp', 'abril', 'academ', 'acompanh', 'acopl', 'administr', 'ado', 'advent', 'advog', 'agenc']
Processando lotes com vocabulário consistente...
Lote 0 salvo com formato (100, 23) em /content/drive/MyDrive/Pós Tech/Tech Challenges/Tech Challenge 5/notebooks/ramos/application_processed_cv_pt_batch_0.parquet
Lote 0 concluído (linhas 0 até 100)

1 lotes processados com sucesso!
Todos os lotes agora possuem as mesmas 342 features
Combinando todos os lotes...
Lote 0 carregado: (100, 23)
Conjunto combinado salvo: (100, 23) -> /content/drive/MyDrive/Pós Tech/Tech Challenges/Tech Challenge 5/notebooks/ramos/talent_pool_vectors_combined_cv_pt.parquet


# 4. Sistema de Recomendação

In [158]:
# Step 5: Efficient Recommendation System
class TalentRecommendationSystem:
    def __init__(self, df_tfidf, df_application_original, vectorizer):
        self.df_tfidf = df_tfidf
        self.df_application = df_application_original
        self.vectorizer = vectorizer
        self.similarity_cache = {}

    def get_similar_candidates(self, candidate_idx, top_n=10, similarity_threshold=0.1):
        """Get most similar candidates for a given candidate"""
        from sklearn.metrics.pairwise import cosine_similarity

        # Get the TF-IDF vector for the candidate
        candidate_vector = self.df_tfidf.iloc[candidate_idx:candidate_idx+1]

        # Compute similarity with all candidates
        similarities = cosine_similarity(candidate_vector, self.df_tfidf)[0]

        # Get indices of most similar candidates (excluding self)
        similar_indices = np.argsort(similarities)[::-1][1:top_n+1]  # Exclude self (index 0)
        similar_scores = similarities[similar_indices]

        # Filter by threshold
        valid_mask = similar_scores >= similarity_threshold
        similar_indices = similar_indices[valid_mask]
        similar_scores = similar_scores[valid_mask]

        # Create results
        results = []
        for idx, score in zip(similar_indices, similar_scores):
            candidate_info = {
                'index': int(idx),
                'similarity_score': float(score),
                'nivel_profissional': self.df_application.iloc[idx].get('nivel_profissional', 'N/A'),
                'area_atuacao': self.df_application.iloc[idx].get('area_atuacao', 'N/A'),
                'nivel_academico': self.df_application.iloc[idx].get('nivel_academico', 'N/A'),
                'conhecimentos_preview': str(self.df_application.iloc[idx].get(campo_vetor, ''))[:200] + '...'
            }
            results.append(candidate_info)

        return results

    def recommend_for_job_description(self, job_description, top_n=10):
        """Find candidates similar to a job description"""
        from sklearn.metrics.pairwise import cosine_similarity

        # Vectorize the job description using the same vectorizer
        job_vector = self.vectorizer.transform([job_description])

        # Compute similarity with all candidates
        similarities = cosine_similarity(job_vector, self.df_tfidf)[0]

        # Get top candidates
        top_indices = np.argsort(similarities)[::-1][:top_n]
        top_scores = similarities[top_indices]

        # Create results
        results = []
        for idx, score in zip(top_indices, top_scores):
            candidate_info = {
                'index': int(idx),
                'match_score': float(score),
                'nivel_profissional': self.df_application.iloc[idx].get('nivel_profissional', 'N/A'),
                'area_atuacao': self.df_application.iloc[idx].get('area_atuacao', 'N/A'),
                'nivel_academico': self.df_application.iloc[idx].get('nivel_academico', 'N/A'),
                'conhecimentos_preview': str(self.df_application.iloc[idx].get(campo_vetor, ''))[:200] + '...'
            }
            results.append(candidate_info)

        return results

# Initialize the recommendation system
print("Initializing Talent Recommendation System...")
talent_recommender = TalentRecommendationSystem(
    df_tfidf_combined,
    df,
    vectorizer
)

print("✅ Recommendation system ready!")
print(f"Loaded {len(df_tfidf_combined)} candidate profiles")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)} features")

Initializing Talent Recommendation System...
✅ Recommendation system ready!
Loaded 100 candidate profiles
Vocabulary size: 342 features


In [None]:
# Step 6: Test the Recommendation System

print("="*60)
print("TESTING TALENT RECOMMENDATION SYSTEM")
print("="*60)

# Test 1: Find similar candidates to a specific candidate
print("\n🔍 TEST 1: Find Similar Candidates")
test_candidate_idx = 50  # Example candidate
similar_candidates = talent_recommender.get_similar_candidates(
    test_candidate_idx,
    top_n=5,
    similarity_threshold=0.1
)

print(f"\nTop 5 candidates similar to candidate #{test_candidate_idx}:")
for i, candidate in enumerate(similar_candidates, 1):
    print(f"\n{i}. Similarity: {candidate['similarity_score']:.3f}")
    print(f"   Level: {candidate['nivel_profissional']}")
    print(f"   Area: {candidate['area_atuacao']}")
    print(f"   Education: {candidate['nivel_academico']}")
    print(f"   Preview: {candidate['conhecimentos_preview'][:100]}...")

# Test 2: Find candidates for a job description
print(f"\n{'='*60}")
print("🎯 TEST 2: Job Matching")
job_description = """
Procuramos um desenvolvedor Python sênior com experiência em:
- Desenvolvimento web com Django ou Flask
- Bancos de dados PostgreSQL e MongoDB
- APIs REST e microserviços
- Docker e Kubernetes
- Machine Learning com scikit-learn
- Experiência com AWS ou Azure
"""

matching_candidates = talent_recommender.recommend_for_job_description(
    job_description,
    top_n=5
)

print(f"\nTop 5 candidates for the job description:")
for i, candidate in enumerate(matching_candidates, 1):
    print(f"\n{i}. Match Score: {candidate['match_score']:.3f}")
    print(f"   Level: {candidate['nivel_profissional']}")
    print(f"   Area: {candidate['area_atuacao']}")
    print(f"   Education: {candidate['nivel_academico']}")
    print(f"   Preview: {candidate['conhecimentos_preview'][:150]}...")

# Performance metrics
print(f"\n{'='*60}")
print("📊 SYSTEM PERFORMANCE METRICS")
print("="*60)
print(f"Total candidates indexed: {len(df_tfidf_combined):,}")
print(f"Feature dimensions: {df_tfidf_combined.shape[1]:,}")
print(f"Memory usage (TF-IDF matrix): ~{df_tfidf_combined.memory_usage(deep=True).sum() / 1024 / 1024:.1f} MB")
print(f"Vocabulary size: {len(vectorizer.vocabulary_):,} unique terms")

# Save the system for future use
print(f"\n💾 Saving recommendation system components...")
import joblib

# Save vectorizer
joblib.dump(vectorizer, '/content/drive/MyDrive/Pós Tech/Tech Challenges/Tech Challenge 5/notebooks/ramos/talent_vectorizer.pkl')
print("✅ Vectorizer saved")

# Save candidate mapping
candidate_mapping = {
    'indices': df_tfidf_combined.index.tolist(),
    'total_candidates': len(df_tfidf_combined)
}
import json
with open('/content/drive/MyDrive/Pós Tech/Tech Challenges/Tech Challenge 5/notebooks/ramos/candidate_mapping.json', 'w') as f:
    json.dump(candidate_mapping, f)
print("✅ Candidate mapping saved")

print(f"\n🎉 Talent Recommendation System Successfully Implemented!")
print("Key improvements over the original approach:")
print("✅ Consistent vocabulary across all batches")
print("✅ Memory-efficient batch processing")
print("✅ Scalable similarity computation")
print("✅ Fast candidate matching and job description matching")
print("✅ Reusable system components saved")

TESTING TALENT RECOMMENDATION SYSTEM

🔍 TEST 1: Find Similar Candidates

Top 5 candidates similar to candidate #50:

1. Similarity: 0.488
   Level: 
   Area: 
   Education: None
   Preview: revelo casado, 25 anos - brasileiro carteira de habilitação: ab formação acadêmica - bacharel em sis...

2. Similarity: 0.376
   Level: 
   Area: 
   Education: None
   Preview: leandro macris alves de souza revelo dados pessoais brasileiro – 29/09/86 - cnh a/b. jd. capuava – n...

3. Similarity: 0.376
   Level: 
   Area: 
   Education: None
   Preview: fullstack developer revelo resumo trabalhando com desenvolvimento desde 2016, com experiência no des...

4. Similarity: 0.371
   Level: 
   Area: 
   Education: None
   Preview: revelo francisco beltrão – pr resumo profissional profissional formado em sistemas de informação e c...

5. Similarity: 0.371
   Level: 
   Area: 
   Education: None
   Preview: revelo analista de testes perfil profissional sou analista de testes com três anos de experiência.