In [1]:
from dotenv import load_dotenv

load_dotenv("../.env.prod")
import os
from utils.chunker import chunker, chunk_single_text
import pandas as pd
from transformers import AutoTokenizer, AutoModel

import json
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the training split and show which columns it provides.
train_path = "../files/processed/final_datasets/train.parquet"
ds = pd.read_parquet(train_path)
ds.columns

Index(['candidate_id', 'vacant_id', 't_apply', 'stage_max', 'publish_date',
       'label', 'vacant_city_loc', 'vacant_full_text', 'vacant_city_ids',
       'vacant_remote', 'candidate_full_text', 'candidate_city_loc',
       'candidate_city_id', 'candidate_fourier_features',
       'no_valid_vacant_city_ids', 'selected_city_id', 'selected_distance',
       'exact_match', 'vacant_fourier_feature'],
      dtype='object')

In [3]:
# The checkpoint name is read from the `model_name` environment variable.
# use_fast=False requests the slow (non-fast) tokenizer implementation.
env_model_name = os.getenv("model_name")
tokenizer = AutoTokenizer.from_pretrained(env_model_name, use_fast=False)


In [5]:
# Second tokenizer, loaded from the multi-qa MPNet checkpoint, again with the
# slow (non-fast) implementation.
model_qa = "sentence-transformers/multi-qa-mpnet-base-cos-v1"
tokenizer_qa = AutoTokenizer.from_pretrained(
    model_qa,
    use_fast=False,
)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [12]:
# Display the multi-qa tokenizer's repr to inspect its configuration.
tokenizer_qa

MPNetTokenizer(name_or_path='sentence-transformers/multi-qa-mpnet-base-cos-v1', vocab_size=30527, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '[UNK]', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	104: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	30526: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, 

In [10]:
# Display the repr of the tokenizer loaded from the `model_name` environment variable.
tokenizer

MPNetTokenizer(name_or_path='sentence-transformers/all-mpnet-base-v2', vocab_size=30527, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '[UNK]', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	104: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	30526: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalize

In [None]:
# One row per vacant_id, keeping only the text that has to be chunked.
vacants = ds[["vacant_id", "vacant_full_text"]].drop_duplicates("vacant_id").reset_index(drop=True)

# Chunk every vacancy text with the main tokenizer, then expand the per-row
# results into one DataFrame column per returned field.
job_chunk_records = vacants["vacant_full_text"].apply(
    lambda txt: chunk_single_text(text=txt, name="job", tokenizer=tokenizer)
)
vacant_chunks = job_chunk_records.apply(pd.Series)

# Store both chunk columns as JSON text
# (presumably so the nested values can be written to parquet - TODO confirm).
for job_col in ("job_chunks_input_ids", "job_chunks_attention_mask"):
    vacant_chunks[job_col] = vacant_chunks[job_col].map(
        lambda payload: json.dumps(payload, ensure_ascii=False)
    )

# Both frames share the same fresh 0..n-1 index, so a column-wise concat lines
# each vacant_id up with its own chunks.
vacants_chunked = pd.concat([vacants[["vacant_id"]], vacant_chunks], axis=1)
# columns: vacant_id, job_chunks_input_ids, job_chunks_attention_mask


Token indices sequence length is longer than the specified maximum sequence length for this model (618 > 512). Running this sequence through the model will result in indexing errors


In [6]:


# --- 2.2 Unique candidates ---
# One row per candidate_id, keeping only the text that has to be chunked.
cands = ds[["candidate_id", "candidate_full_text"]].drop_duplicates("candidate_id").reset_index(drop=True)

# Chunk every candidate text with the main tokenizer, then expand the per-row
# results into one DataFrame column per returned field.
cand_chunk_records = cands["candidate_full_text"].apply(
    lambda txt: chunk_single_text(text=txt, name="cand", tokenizer=tokenizer)
)
cand_chunks = cand_chunk_records.apply(pd.Series)

# Store both chunk columns as JSON text
# (presumably so the nested values can be written to parquet - TODO confirm).
for cand_col in ("cand_chunks_input_ids", "cand_chunks_attention_mask"):
    cand_chunks[cand_col] = cand_chunks[cand_col].map(
        lambda payload: json.dumps(payload, ensure_ascii=False)
    )

# Both frames share the same fresh 0..n-1 index, so a column-wise concat lines
# each candidate_id up with its own chunks.
cands_chunked = pd.concat([cands[["candidate_id"]], cand_chunks], axis=1)

In [7]:
# Attach the chunk columns to every application row. Left joins keep every row
# of `ds` even if a key has no match in the chunk tables.
ds_with_job_chunks = ds.merge(vacants_chunked, how="left", on="vacant_id")
ds = ds_with_job_chunks.merge(cands_chunked, how="left", on="candidate_id")

In [10]:
# List the columns of `ds` to check which chunk columns are present.
ds.columns

Index(['candidate_id', 'vacant_id', 't_apply', 'stage_max', 'publish_date',
       'label', 'vacant_city_loc', 'vacant_full_text', 'vacant_city_ids',
       'candidate_full_text', 'candidate_city_loc', 'candidate_city_id',
       'candidate_fourier_features', 'no_valid_vacant_city_ids',
       'selected_city_id', 'selected_distance', 'exact_match',
       'vacant_fourier_feature', 'job_chunks_input_ids',
       'job_chunks_attention_mask', 'cand_chunks_input_ids',
       'cand_chunks_attention_mask'],
      dtype='object')

In [None]:
# Columns carried into the final dataset, in output order. Compared with the
# loaded frame plus the four chunk columns, this leaves out `vacant_city_loc`
# and `candidate_city_loc`.
final_columns = [
    'candidate_id',
    'vacant_id',
    't_apply',
    'stage_max',
    'publish_date',
    'label',
    'vacant_full_text',
    'vacant_city_ids',
    'candidate_full_text',
    'candidate_city_id',
    'candidate_fourier_features',
    'no_valid_vacant_city_ids',
    'selected_city_id',
    'selected_distance',
    'exact_match',
    'vacant_fourier_feature',
    'job_chunks_input_ids',
    'job_chunks_attention_mask',
    'cand_chunks_input_ids',
    'vacant_remote',
    'cand_chunks_attention_mask',
]
ds = ds[final_columns]

In [None]:
# Show the row with index label 0 as a quick look at the selected columns.
ds.loc[0]

candidate_id                                                                 91
vacant_id                                                                245572
t_apply                                        2024-02-15 18:32:30.977000+00:00
stage_max                                                                   1.0
publish_date                                   2024-01-19 12:53:16.588000+00:00
label                                                                         0
vacant_full_text              Estamos en la\nbúsqueda de un Auxiliar Jurídic...
vacant_city_ids                                                         [16963]
candidate_full_text           Abogado especualista en derecho administrativo...
candidate_city_id                                                         17050
candidate_fourier_features    [0.2214912176, -0.46965277190000004, 0.3965891...
no_valid_vacant_city_ids                                                  False
selected_city_id                        

In [None]:
def _to_json_text(value):
    """Return `value` as JSON text, leaving already-encoded strings untouched.

    NumPy arrays are converted with ``tolist()`` first because ``json.dumps``
    cannot serialise ``ndarray`` objects. A value that is already a ``str`` is
    returned unchanged, so re-running this cell does not JSON-encode the
    column a second time.
    """
    if isinstance(value, str):
        return value
    if isinstance(value, np.ndarray):
        value = value.tolist()
    return json.dumps(value, ensure_ascii=False)


# Encode the array-valued columns as JSON text, one column at a time.
for json_col in (
    "vacant_city_ids",
    "vacant_fourier_feature",
    "candidate_fourier_features",
):
    ds[json_col] = ds[json_col].map(_to_json_text)

In [None]:
# Show the row with index label 0 as a plain dict to inspect each value in full.
ds.loc[0].to_dict()

In [24]:
# Now write the parquet file.
# The destination is the same file `ds` was originally read from, so a write
# that fails midway would leave the input dataset truncated. Write to a sibling
# temporary file first and swap it into place only after the write succeeds.
# NOTE(review): once this has run, the file on disk already contains the chunk
# columns, so re-running the notebook from the top starts from processed data -
# confirm that overwriting the input is intended.
out_path = "../files/processed/final_datasets/train.parquet"
tmp_path = out_path + ".tmp"
try:
    ds.to_parquet(
        tmp_path,
        engine="fastparquet",
        index=False,
    )
    os.replace(tmp_path, out_path)
finally:
    # Clean up the partial file after a failed write; after a successful swap
    # the temporary path no longer exists and nothing is removed.
    if os.path.exists(tmp_path):
        os.remove(tmp_path)