In [10]:
import numpy as np
import pandas as pd
import re
import nltk
nltk.download("stopwords")
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random
import time
import string
import unicodedata
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
import multiprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import glob

[nltk_data] Downloading package stopwords to /home/lucas-
[nltk_data]     nunes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 1. Carrega base de application

In [26]:
# df_application = pd.read_csv('/content/drive/MyDrive/Pós Tech/Tech Challenges/Tech Challenge 5/Dados/silver/dados_processed/application_processed.csv')
df_application = pd.read_csv('/home/lucas-nunes/workspace/Postech/challenges/5_data/data/silver/processed/application_processed.csv')
df_application.columns

Index(['job_id', 'objetivo_profissional', 'data_criacao', 'local',
       'sabendo_de_nos_por', 'data_atualizacao', 'codigo_profissional',
       'data_aceite', 'fonte_indicacao', 'telefone_celular', 'sexo',
       'estado_civil', 'pcd', 'endereco', 'titulo_profissional',
       'area_atuacao', 'conhecimentos_tecnicos', 'certificacoes',
       'outras_certificacoes', 'remuneracao', 'nivel_profissional',
       'nivel_academico', 'nivel_ingles', 'nivel_espanhol', 'outro_idioma',
       'cv_pt', 'instituicao_ensino_superior', 'cursos', 'ano_conclusao',
       'data_admissao', 'data_ultima_promocao', 'conhecimentos_tecnicos_list',
       'certificacoes_list', 'outras_certificacoes_list',
       'remuneracao_numeric', 'cv_pt_cleaned', 'telefone_celular_normalized'],
      dtype='object')

In [27]:
df_application.shape

(42482, 37)

# 2. Seleciona coluna do currículo

In [28]:
# Count words for each line
df_application['cv_pt_cleaned']


0        assistente administrativo santosbatista itapec...
1        formação acadêmica ensino médio (2º grau) em e...
2        objetivo: área administrativa | financeira res...
3        formação ensino médio completo informática int...
4        última atualização em 09/11/2021 ­ sp ensino s...
                               ...                        
42477                                                  NaN
42478                                                  NaN
42479                                                  NaN
42480                                                  NaN
42481                                                  NaN
Name: cv_pt_cleaned, Length: 42482, dtype: object

# 3. Limpeza do texto

In [29]:
# Create df only with the first 5 lines preserving the column name cv_pt_cleaned
# df_application_teste = df_application[:100]
df_application_teste = df_application

In [6]:
import spacy.cli
spacy.cli.download("pt_core_news_sm")
import spacy

# carregar modelo para português
nlp = spacy.load("pt_core_news_sm")

def remove_person_names(text: str) -> str:
    doc = nlp(text)
    return " ".join([token.text for token in doc if token.ent_type_ != "PER"])

Collecting pt-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.8.0/pt_core_news_sm-3.8.0-py3-none-any.whl (13.0 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m43.3 MB/s[0m  [33m0:00:00[0m[31m40.7 MB/s[0m eta [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [7]:
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download('rslp')

from nltk.tokenize import word_tokenize
from nltk.stem import RSLPStemmer

# inicializa stemmer
stemmer = RSLPStemmer()

def normalize_accents(text: str) -> str:
    return unicodedata.normalize("NFKD", text).encode("ASCII", "ignore").decode("utf-8")

def remove_punctuation(text: str) -> str:
    table = str.maketrans({key: " " for key in string.punctuation})
    return text.translate(table)

def normalize_str(text: str) -> str:
    text = text.lower()
    text = re.sub(r"\d+", " ", text)           # remove números
    text = remove_punctuation(text)            # remove pontuação
    text = normalize_accents(text)             # remove acentos
    text = re.sub(r"\s+", " ", text).strip()   # normaliza espaços
    return text

def tokenizer(text: str):
    stop_words_br = set(nltk.corpus.stopwords.words("portuguese"))
    #stop_words_en = set(nltk.corpus.stopwords.words("english"))
    if isinstance(text, str):
        text = normalize_str(text)                                              # normaliza string
        text = remove_person_names(text)                                        # remove nomes
        tokens = word_tokenize(text, language="portuguese")                     # tokeniza para a lingua portuguesa
        tokens = [t for t in tokens if t not in stop_words_br and len(t) > 2]
        #tokens = [t for t in tokens if t not in stop_words_en and len(t) > 2]
        tokens = [stemmer.stem(t) for t in tokens]                              # stemiza tokens
        return tokens
    return None

[nltk_data] Downloading package punkt to /home/lucas-
[nltk_data]     nunes/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/lucas-
[nltk_data]     nunes/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package rslp to /home/lucas-nunes/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


In [32]:
df_application_teste['cv_tokenizado'] = df_application_teste['cv_pt_cleaned'].apply(tokenizer)

In [36]:
df_application_teste = df_application_teste[['cv_pt_cleaned', 'cv_tokenizado']]

In [38]:
df_application_teste.to_parquet('/home/lucas-nunes/workspace/Postech/challenges/5_data/data/gold/talent_pool_token_2.parquet')

In [3]:
df_application_teste = pd.read_parquet('/home/lucas-nunes/workspace/Postech/challenges/5_data/data/gold/talent_pool_token_2.parquet')

In [4]:
df_application_teste

Unnamed: 0,cv_pt_cleaned,cv_tokenizado
0,assistente administrativo santosbatista itapec...,"[assist, administr, santosbat, itapecer, serr,..."
1,formação acadêmica ensino médio (2º grau) em e...,"[formaca, academ, ensin, medi, grau, ensin, me..."
2,objetivo: área administrativa | financeira res...,"[obje, are, administr, financ, resum, profiss,..."
3,formação ensino médio completo informática int...,"[formaca, ensin, medi, complet, informa, inter..."
4,última atualização em 09/11/2021 ­ sp ensino s...,"[ult, atualizaca, ensin, superi, administraca,..."
...,...,...
42477,,
42478,,
42479,,
42480,,


In [37]:

batch_df_test = df_application_teste.iloc[5:6]
test = tokenize_and_vectorize(batch_df_test, tokenizer, '', '')




In [38]:
test

Unnamed: 0,abert,aca,academ,acompanh,administr,ajust,alcanc,ambip,anal,analis,...,trat,tur,unidad,uninov,unip,vend,vinh,vivenc,volum,word
0,0.043153,0.086306,0.043153,0.258919,0.12946,0.043153,0.043153,0.043153,0.12946,0.086306,...,0.043153,0.043153,0.043153,0.043153,0.043153,0.215766,0.043153,0.043153,0.043153,0.043153


In [39]:
# FIXED APPROACH: Create a single vectorizer on all data first, then process in batches

print("Step 1: Creating vocabulary from all data...")
# Fit vectorizer on ALL data to create consistent vocabulary
vectorizer = TfidfVectorizer(
    tokenizer=tokenizer, 
    max_features=10000,  # Limit vocabulary size to manage memory
    min_df=2,           # Ignore terms that appear in less than 2 documents
    max_df=0.8          # Ignore terms that appear in more than 80% of documents
)

# Fit on all data to create the vocabulary
print("Fitting vectorizer on full dataset...")
vectorizer.fit(df_application_teste["cv_pt_cleaned"].fillna(""))

print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")
print("Sample features:", list(vectorizer.get_feature_names_out())[:10])

def tokenize_and_vectorize_fixed(df, fitted_vectorizer, filename_prefix, batch_idx):
    """Transform batch using the pre-fitted vectorizer"""
    # Transform (not fit_transform) to use existing vocabulary
    vector_matrix = fitted_vectorizer.transform(df["cv_pt_cleaned"].fillna(""))
    
    # Convert to DataFrame with consistent column names
    df_tfidf = pd.DataFrame(
        vector_matrix.toarray(), 
        columns=fitted_vectorizer.get_feature_names_out(),
        index=df.index  # Preserve original indices
    )
    
    # Save batch
    output_file = f"{filename_prefix}_batch_{batch_idx}.parquet"
    df_tfidf.to_parquet(output_file)
    print(f"Saved batch {batch_idx} with shape {df_tfidf.shape} to {output_file}")
    
    return df_tfidf

# Process in batches with consistent vocabulary
print("\nStep 2: Processing batches with consistent vocabulary...")
batch_size = 1000
filename_prefix = '/home/lucas-nunes/workspace/Postech/challenges/5_data/data/gold/talent_pool_vector_fixed'

batch_files = []
for i in range(0, len(df_application_teste), batch_size):
    batch_df = df_application_teste.iloc[i:i+batch_size]
    batch_idx = i // batch_size
    
    # Process batch with consistent vocabulary
    tokenize_and_vectorize_fixed(batch_df, vectorizer, filename_prefix, batch_idx)
    batch_files.append(f"{filename_prefix}_batch_{batch_idx}.parquet")
    
    print(f'Completed batch {batch_idx} (rows {i} to {min(i+batch_size, len(df_application_teste))})')

print(f"\nProcessed {len(batch_files)} batches successfully!")
print("All batches now have the same {len(vectorizer.vocabulary_)} features")

Step 1: Creating vocabulary from all data...
Fitting vectorizer on full dataset...




Vocabulary size: 10000
Sample features: ['...', 'aaa', 'ab', 'aba', 'abac', 'abaix', 'abandon', 'abap', 'abastec', 'abat']

Step 2: Processing batches with consistent vocabulary...
Saved batch 0 with shape (1000, 10000) to /home/lucas-nunes/workspace/Postech/challenges/5_data/data/gold/talent_pool_vector_fixed_batch_0.parquet
Completed batch 0 (rows 0 to 1000)
Saved batch 0 with shape (1000, 10000) to /home/lucas-nunes/workspace/Postech/challenges/5_data/data/gold/talent_pool_vector_fixed_batch_0.parquet
Completed batch 0 (rows 0 to 1000)
Saved batch 1 with shape (1000, 10000) to /home/lucas-nunes/workspace/Postech/challenges/5_data/data/gold/talent_pool_vector_fixed_batch_1.parquet
Completed batch 1 (rows 1000 to 2000)
Saved batch 1 with shape (1000, 10000) to /home/lucas-nunes/workspace/Postech/challenges/5_data/data/gold/talent_pool_vector_fixed_batch_1.parquet
Completed batch 1 (rows 1000 to 2000)
Saved batch 2 with shape (1000, 10000) to /home/lucas-nunes/workspace/Postech/challen

In [40]:
# Step 3: Combine all batches efficiently
def combine_vector_batches(batch_files, output_file):
    """Combine all batch files into a single file efficiently"""
    print("Combining all batches...")
    
    combined_dfs = []
    for i, file in enumerate(batch_files):
        df_batch = pd.read_parquet(file)
        combined_dfs.append(df_batch)
        print(f"Loaded batch {i}: {df_batch.shape}")
    
    # Combine all batches
    df_combined = pd.concat(combined_dfs, ignore_index=False)  # Keep original indices
    
    # Save combined result
    df_combined.to_parquet(output_file)
    print(f"Combined dataset saved: {df_combined.shape} -> {output_file}")
    
    return df_combined

# Combine all batches
combined_output_file = '/home/lucas-nunes/workspace/Postech/challenges/5_data/data/gold/talent_pool_vectors_combined.parquet'
df_tfidf_combined = combine_vector_batches(batch_files, combined_output_file)

print(f"Final TF-IDF matrix shape: {df_tfidf_combined.shape}")

# Step 4: Efficient similarity computation for large datasets
def compute_similarity_batched(df_tfidf, batch_size_sim=500, output_prefix='similarity_batch'):
    """Compute cosine similarity in batches to handle large datasets"""
    from sklearn.metrics.pairwise import cosine_similarity
    import numpy as np
    
    n_samples = len(df_tfidf)
    print(f"Computing similarity for {n_samples} samples in batches of {batch_size_sim}")
    
    # Create similarity matrix in batches to manage memory
    similarity_files = []
    
    for i in range(0, n_samples, batch_size_sim):
        batch_end = min(i + batch_size_sim, n_samples)
        batch_data = df_tfidf.iloc[i:batch_end]
        
        # Compute similarity between this batch and ALL data
        batch_similarity = cosine_similarity(batch_data, df_tfidf)
        
        # Save batch similarity
        batch_file = f'/home/lucas-nunes/workspace/Postech/challenges/5_data/data/gold/{output_prefix}_{i}_{batch_end}.npy'
        np.save(batch_file, batch_similarity)
        similarity_files.append(batch_file)
        
        print(f"Computed similarity batch {i}-{batch_end}: {batch_similarity.shape}")
    
    return similarity_files

# Compute similarity in manageable batches
print("\nStep 4: Computing cosine similarity in batches...")
similarity_files = compute_similarity_batched(df_tfidf_combined, batch_size_sim=500)

Combining all batches...
Loaded batch 0: (1000, 10000)
Loaded batch 0: (1000, 10000)
Loaded batch 1: (1000, 10000)
Loaded batch 1: (1000, 10000)
Loaded batch 2: (1000, 10000)
Loaded batch 2: (1000, 10000)
Loaded batch 3: (1000, 10000)
Loaded batch 3: (1000, 10000)
Loaded batch 4: (1000, 10000)
Loaded batch 4: (1000, 10000)
Loaded batch 5: (1000, 10000)
Loaded batch 5: (1000, 10000)
Loaded batch 6: (1000, 10000)
Loaded batch 6: (1000, 10000)
Loaded batch 7: (1000, 10000)
Loaded batch 7: (1000, 10000)
Loaded batch 8: (1000, 10000)
Loaded batch 8: (1000, 10000)
Loaded batch 9: (1000, 10000)
Loaded batch 9: (1000, 10000)
Loaded batch 10: (1000, 10000)
Loaded batch 10: (1000, 10000)
Loaded batch 11: (1000, 10000)
Loaded batch 11: (1000, 10000)
Loaded batch 12: (1000, 10000)
Loaded batch 12: (1000, 10000)
Loaded batch 13: (1000, 10000)
Loaded batch 13: (1000, 10000)
Loaded batch 14: (1000, 10000)
Loaded batch 14: (1000, 10000)
Loaded batch 15: (1000, 10000)
Loaded batch 15: (1000, 10000)
Loa

KeyboardInterrupt: 

In [None]:
# Step 5: Efficient Recommendation System
class TalentRecommendationSystem:
    def __init__(self, df_tfidf, df_application_original, vectorizer):
        self.df_tfidf = df_tfidf
        self.df_application = df_application_original
        self.vectorizer = vectorizer
        self.similarity_cache = {}
        
    def get_similar_candidates(self, candidate_idx, top_n=10, similarity_threshold=0.1):
        """Get most similar candidates for a given candidate"""
        from sklearn.metrics.pairwise import cosine_similarity
        
        # Get the TF-IDF vector for the candidate
        candidate_vector = self.df_tfidf.iloc[candidate_idx:candidate_idx+1]
        
        # Compute similarity with all candidates
        similarities = cosine_similarity(candidate_vector, self.df_tfidf)[0]
        
        # Get indices of most similar candidates (excluding self)
        similar_indices = np.argsort(similarities)[::-1][1:top_n+1]  # Exclude self (index 0)
        similar_scores = similarities[similar_indices]
        
        # Filter by threshold
        valid_mask = similar_scores >= similarity_threshold
        similar_indices = similar_indices[valid_mask]
        similar_scores = similar_scores[valid_mask]
        
        # Create results
        results = []
        for idx, score in zip(similar_indices, similar_scores):
            candidate_info = {
                'index': int(idx),
                'similarity_score': float(score),
                'nivel_profissional': self.df_application.iloc[idx].get('nivel_profissional', 'N/A'),
                'area_atuacao': self.df_application.iloc[idx].get('area_atuacao', 'N/A'),
                'nivel_academico': self.df_application.iloc[idx].get('nivel_academico', 'N/A'),
                'conhecimentos_preview': str(self.df_application.iloc[idx].get('cv_pt_cleaned', ''))[:200] + '...'
            }
            results.append(candidate_info)
            
        return results
    
    def recommend_for_job_description(self, job_description, top_n=10):
        """Find candidates similar to a job description"""
        from sklearn.metrics.pairwise import cosine_similarity
        
        # Vectorize the job description using the same vectorizer
        job_vector = self.vectorizer.transform([job_description])
        
        # Compute similarity with all candidates
        similarities = cosine_similarity(job_vector, self.df_tfidf)[0]
        
        # Get top candidates
        top_indices = np.argsort(similarities)[::-1][:top_n]
        top_scores = similarities[top_indices]
        
        # Create results
        results = []
        for idx, score in zip(top_indices, top_scores):
            candidate_info = {
                'index': int(idx),
                'match_score': float(score),
                'nivel_profissional': self.df_application.iloc[idx].get('nivel_profissional', 'N/A'),
                'area_atuacao': self.df_application.iloc[idx].get('area_atuacao', 'N/A'),
                'nivel_academico': self.df_application.iloc[idx].get('nivel_academico', 'N/A'),
                'conhecimentos_preview': str(self.df_application.iloc[idx].get('cv_pt_cleaned', ''))[:200] + '...'
            }
            results.append(candidate_info)
            
        return results

# Initialize the recommendation system
print("Initializing Talent Recommendation System...")
talent_recommender = TalentRecommendationSystem(
    df_tfidf_combined, 
    df_application_teste, 
    vectorizer
)

print("✅ Recommendation system ready!")
print(f"Loaded {len(df_tfidf_combined)} candidate profiles")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)} features")

In [None]:
# Step 6: Test the Recommendation System

print("="*60)
print("TESTING TALENT RECOMMENDATION SYSTEM")
print("="*60)

# Test 1: Find similar candidates to a specific candidate
print("\n🔍 TEST 1: Find Similar Candidates")
test_candidate_idx = 100  # Example candidate
similar_candidates = talent_recommender.get_similar_candidates(
    test_candidate_idx, 
    top_n=5, 
    similarity_threshold=0.1
)

print(f"\nTop 5 candidates similar to candidate #{test_candidate_idx}:")
for i, candidate in enumerate(similar_candidates, 1):
    print(f"\n{i}. Similarity: {candidate['similarity_score']:.3f}")
    print(f"   Level: {candidate['nivel_profissional']}")
    print(f"   Area: {candidate['area_atuacao']}")
    print(f"   Education: {candidate['nivel_academico']}")
    print(f"   Preview: {candidate['conhecimentos_preview'][:100]}...")

# Test 2: Find candidates for a job description
print(f"\n{'='*60}")
print("🎯 TEST 2: Job Matching")
job_description = """
Procuramos um desenvolvedor Python sênior com experiência em:
- Desenvolvimento web com Django ou Flask
- Bancos de dados PostgreSQL e MongoDB
- APIs REST e microserviços
- Docker e Kubernetes
- Machine Learning com scikit-learn
- Experiência com AWS ou Azure
"""

matching_candidates = talent_recommender.recommend_for_job_description(
    job_description, 
    top_n=5
)

print(f"\nTop 5 candidates for the job description:")
for i, candidate in enumerate(matching_candidates, 1):
    print(f"\n{i}. Match Score: {candidate['match_score']:.3f}")
    print(f"   Level: {candidate['nivel_profissional']}")
    print(f"   Area: {candidate['area_atuacao']}")
    print(f"   Education: {candidate['nivel_academico']}")
    print(f"   Preview: {candidate['conhecimentos_preview'][:150]}...")

# Performance metrics
print(f"\n{'='*60}")
print("📊 SYSTEM PERFORMANCE METRICS")
print("="*60)
print(f"Total candidates indexed: {len(df_tfidf_combined):,}")
print(f"Feature dimensions: {df_tfidf_combined.shape[1]:,}")
print(f"Memory usage (TF-IDF matrix): ~{df_tfidf_combined.memory_usage(deep=True).sum() / 1024 / 1024:.1f} MB")
print(f"Vocabulary size: {len(vectorizer.vocabulary_):,} unique terms")

# Save the system for future use
print(f"\n💾 Saving recommendation system components...")
import joblib

# Save vectorizer
joblib.dump(vectorizer, '/home/lucas-nunes/workspace/Postech/challenges/5_data/data/gold/talent_vectorizer.pkl')
print("✅ Vectorizer saved")

# Save candidate mapping
candidate_mapping = {
    'indices': df_tfidf_combined.index.tolist(),
    'total_candidates': len(df_tfidf_combined)
}
import json
with open('/home/lucas-nunes/workspace/Postech/challenges/5_data/data/gold/candidate_mapping.json', 'w') as f:
    json.dump(candidate_mapping, f)
print("✅ Candidate mapping saved")

print(f"\n🎉 Talent Recommendation System Successfully Implemented!")
print("Key improvements over the original approach:")
print("✅ Consistent vocabulary across all batches")
print("✅ Memory-efficient batch processing") 
print("✅ Scalable similarity computation")
print("✅ Fast candidate matching and job description matching")
print("✅ Reusable system components saved")

In [None]:
# Calcula a matriz de similaridade do cosseno
cosine_sim = linear_kernel(df_tfidf, df_tfidf)

In [None]:
cosine_sim

array([[1.        , 0.09650452, 0.32610838, ..., 0.04060293, 0.01643154,
        0.03483504],
       [0.09650452, 1.        , 0.20202968, ..., 0.07989363, 0.02201943,
        0.07469299],
       [0.32610838, 0.20202968, 1.        , ..., 0.14084257, 0.01658457,
        0.09780909],
       ...,
       [0.04060293, 0.07989363, 0.14084257, ..., 1.        , 0.01148794,
        0.03139983],
       [0.01643154, 0.02201943, 0.01658457, ..., 0.01148794, 1.        ,
        0.21295057],
       [0.03483504, 0.07469299, 0.09780909, ..., 0.03139983, 0.21295057,
        1.        ]], shape=(100, 100))