In [10]:
# Notebook-wide setup: load environment variables, then import everything the
# cells below rely on.
from dotenv import load_dotenv

# The .env file is loaded before the remaining imports run.
# NOTE(review): presumably so that modules imported below (or the later
# os.getenv("model_name") lookup) can see those variables — confirm before
# reordering these lines.
load_dotenv("../.env.prod")
import os
# NOTE(review): `chunker` and `AutoModel` are imported but not referenced in the
# cells visible here.
from utils.chunker import chunker, chunk_single_text
import pandas as pd
from transformers import AutoTokenizer, AutoModel

# json / numpy are used when list-like columns are serialized to JSON strings.
import json
import numpy as np

In [2]:
# Read the test split and display its column index as the cell output.
test_split_path = "../files/processed/final_datasets/test.parquet"
ds = pd.read_parquet(test_split_path)
ds.columns

Index(['candidate_id', 'vacant_id', 't_apply', 'stage_max', 'publish_date',
       'label', 'scenario', 'vacant_city_loc', 'vacant_full_text',
       'vacant_city_ids', 'candidate_full_text', 'candidate_city_loc',
       'candidate_city_id', 'candidate_fourier_features',
       'no_valid_vacant_city_ids', 'selected_city_id', 'selected_distance',
       'exact_match', 'vacant_fourier_feature'],
      dtype='object')

In [3]:
# Resolve the tokenizer checkpoint from the environment and fail early with a
# clear message: os.getenv returns None when the variable is missing, and passing
# None on to from_pretrained produces a much less obvious error.
model_name = os.getenv("model_name")
if not model_name:
    raise RuntimeError(
        "Environment variable 'model_name' is not set; "
        "make sure it is defined (e.g. in ../.env.prod) before loading the tokenizer."
    )
# use_fast=False is kept exactly as before.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)


In [None]:
# --- Unique vacancies ---
# Chunk each text once per distinct vacant_id instead of once per row of `ds`.
vacants = ds[["vacant_id", "vacant_full_text"]]
vacants = vacants.drop_duplicates("vacant_id").reset_index(drop=True)

# NOTE(review): name="job" presumably drives the "job_*" column names used
# below ("vacant" would be the alternative prefix) — confirm in utils.chunker.
vacant_chunks = (
    vacants["vacant_full_text"]
    .apply(
        lambda job_text: chunk_single_text(
            text=job_text, name="job", tokenizer=tokenizer
        )
    )
    .apply(pd.Series)  # spread each returned record across columns
)
# Serialize both chunk columns to JSON text.
for job_col in ("job_chunks_input_ids", "job_chunks_attention_mask"):
    vacant_chunks[job_col] = vacant_chunks[job_col].map(
        lambda payload: json.dumps(payload, ensure_ascii=False)
    )

# Re-attach the id column; both frames share the same reset positional index.
vacants_chunked = pd.concat([vacants[["vacant_id"]], vacant_chunks], axis=1)


# --- Unique candidates ---
# Same procedure, keyed by candidate_id and using the "cand" prefix.
cands = ds[["candidate_id", "candidate_full_text"]]
cands = cands.drop_duplicates("candidate_id").reset_index(drop=True)

cand_chunks = (
    cands["candidate_full_text"]
    .apply(
        lambda cand_text: chunk_single_text(
            text=cand_text, name="cand", tokenizer=tokenizer
        )
    )
    .apply(pd.Series)
)
for cand_col in ("cand_chunks_input_ids", "cand_chunks_attention_mask"):
    cand_chunks[cand_col] = cand_chunks[cand_col].map(
        lambda payload: json.dumps(payload, ensure_ascii=False)
    )

cands_chunked = pd.concat([cands[["candidate_id"]], cand_chunks], axis=1)

In [7]:
# Attach the per-vacancy and per-candidate chunk columns to every row of `ds`.
# Left joins keep all existing rows; vacancies are joined first, then candidates.
for chunk_table, join_key in (
    (vacants_chunked, "vacant_id"),
    (cands_chunked, "candidate_id"),
):
    ds = ds.merge(chunk_table, on=join_key, how="left")

In [8]:
# Keep only the columns listed here, in this order; every other column is dropped.
keep_columns = [
    'candidate_id',
    'vacant_id',
    't_apply',
    'stage_max',
    'publish_date',
    'label',
    'vacant_full_text',
    'vacant_city_ids',
    'candidate_full_text',
    'candidate_city_id',
    'candidate_fourier_features',
    'no_valid_vacant_city_ids',
    'selected_city_id',
    'selected_distance',
    'exact_match',
    'vacant_fourier_feature',
    'job_chunks_input_ids',
    'job_chunks_attention_mask',
    'cand_chunks_input_ids',
    'cand_chunks_attention_mask',
]
ds = ds[keep_columns]

In [11]:
# Turn the list-like columns into JSON strings. NumPy arrays are converted with
# .tolist() first because json.dumps cannot serialize ndarray objects; any other
# value is passed to json.dumps unchanged.
for array_col in (
    "vacant_city_ids",
    "vacant_fourier_feature",
    "candidate_fourier_features",
):
    ds[array_col] = ds[array_col].map(
        lambda cell: json.dumps(
            cell.tolist() if isinstance(cell, np.ndarray) else cell,
            ensure_ascii=False,
        )
    )

In [12]:
# Now write the enriched test split to parquet, without the DataFrame index.
# This is the same path the test split was read from, so the file is overwritten.
ds.to_parquet(
    "../files/processed/final_datasets/test.parquet",
    index=False,
    engine="fastparquet",
)

In [13]:
# Validation split: read val.parquet, add JSON-encoded tokenizer chunks for every
# vacancy and candidate text, JSON-encode the array columns, and write the result
# back to the same parquet file.
from dotenv import load_dotenv

# The .env file is loaded before the remaining imports run.
# NOTE(review): presumably so the modules imported below / the model_name lookup
# can see those variables — confirm before reordering.
load_dotenv("../.env.prod")
import os
from utils.chunker import chunker, chunk_single_text
import pandas as pd
from transformers import AutoTokenizer, AutoModel

# json and numpy are used below but were not imported in this cell, so it only
# worked when the first notebook cell had already run in the same kernel.
import json
import numpy as np


def _json_encode(value):
    """Return `value` as a JSON string, converting NumPy arrays to lists first.

    json.dumps cannot serialize ndarray objects, hence the .tolist() conversion.
    Any other value is handed to json.dumps unchanged (ensure_ascii=False).
    """
    if isinstance(value, np.ndarray):
        value = value.tolist()
    return json.dumps(value, ensure_ascii=False)


def _chunk_unique_texts(frame, id_col, text_col, name, tokenizer):
    """Chunk each distinct entity's text once and JSON-encode the chunk columns.

    Parameters
    ----------
    frame : pd.DataFrame
        Source frame containing `id_col` and `text_col`.
    id_col, text_col : str
        Column holding the entity id / the text passed to chunk_single_text.
    name : str
        Forwarded to chunk_single_text as `name`; the columns
        "<name>_chunks_input_ids" and "<name>_chunks_attention_mask" are the
        ones JSON-encoded here.
    tokenizer
        Forwarded to chunk_single_text unchanged.

    Returns
    -------
    tuple of (unique, chunks, chunked)
        `unique`  - `frame[[id_col, text_col]]` de-duplicated on `id_col`, index reset;
        `chunks`  - chunk_single_text results expanded into columns, JSON-encoded;
        `chunked` - `id_col` concatenated column-wise with `chunks`.
    """
    unique = (
        frame[[id_col, text_col]]
        .drop_duplicates(id_col)
        .reset_index(drop=True)
    )
    chunks = unique[text_col].apply(
        lambda txt: chunk_single_text(text=txt, name=name, tokenizer=tokenizer)
    ).apply(pd.Series)
    for col in (f"{name}_chunks_input_ids", f"{name}_chunks_attention_mask"):
        chunks[col] = chunks[col].map(_json_encode)
    # `unique` and `chunks` share the same reset positional index, so a plain
    # column-wise concat lines them up.
    chunked = pd.concat([unique[[id_col]], chunks], axis=1)
    return unique, chunks, chunked


ds = pd.read_parquet("../files/processed/final_datasets/val.parquet")

# Fail early with a clear message when the checkpoint name is missing:
# os.getenv returns None for an unset variable.
model_name = os.getenv("model_name")
if not model_name:
    raise RuntimeError(
        "Environment variable 'model_name' is not set; "
        "make sure it is defined (e.g. in ../.env.prod) before loading the tokenizer."
    )
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# --- 2.1 Unique vacancies ---
vacants, vacant_chunks, vacants_chunked = _chunk_unique_texts(
    ds, "vacant_id", "vacant_full_text", "job", tokenizer
)

# --- 2.2 Unique candidates ---
cands, cand_chunks, cands_chunked = _chunk_unique_texts(
    ds, "candidate_id", "candidate_full_text", "cand", tokenizer
)

# Attach the chunk columns to every row; left joins keep all existing rows.
ds = (
    ds
    .merge(vacants_chunked, on="vacant_id", how="left")
    .merge(cands_chunked, on="candidate_id", how="left")
)

# Keep only the columns listed here, in this order.
ds = ds[['candidate_id', 'vacant_id', 't_apply', 'stage_max', 'publish_date',
       'label', 'vacant_full_text', 'vacant_city_ids',
       'candidate_full_text', 'candidate_city_id',
       'candidate_fourier_features', 'no_valid_vacant_city_ids',
       'selected_city_id', 'selected_distance', 'exact_match',
       'vacant_fourier_feature', 'job_chunks_input_ids',
       'job_chunks_attention_mask', 'cand_chunks_input_ids',
       'cand_chunks_attention_mask']]

# Serialize the list-like columns to JSON strings.
for array_col in (
    "vacant_city_ids",
    "vacant_fourier_feature",
    "candidate_fourier_features",
):
    ds[array_col] = ds[array_col].map(_json_encode)

# Now write the parquet file (same path it was read from, so it is overwritten).
ds.to_parquet(
    "../files/processed/final_datasets/val.parquet",
    engine="fastparquet",
    index=False,
)

Token indices sequence length is longer than the specified maximum sequence length for this model (537 > 512). Running this sequence through the model will result in indexing errors
