In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

### Catching off-line data

In [2]:
# after that I ask to gemini to generate new data for categories with less count
all_data = pd.read_csv("data/filtered/normalized.csv")
all_data = all_data.dropna()
all_data = all_data.drop_duplicates()
all_data.shape

(18240, 2)

### Text pre process

In [3]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

def encode_labels(categories):
    label_encoder = LabelEncoder()
    category_encoded = label_encoder.fit_transform(categories)
    print(category_encoded)
    num_categories = len(label_encoder.classes_)
    y = to_categorical(category_encoded)
    return label_encoder, y, num_categories

def decoded_label(label_encoder, y_encoded, categorical=True):
    label = [int(np.argmax(y)) for y in y_encoded] if categorical else y_encoded
    return label_encoder.inverse_transform(label)

2025-06-26 16:23:50.115416: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750965830.226662    6011 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750965830.255898    6011 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750965830.507211    6011 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750965830.507233    6011 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750965830.507237    6011 computation_placer.cc:177] computation placer alr

In [4]:
categories = all_data['category']
label_encoder, y, num_categories = encode_labels(categories)
print(num_categories, y)

[48 70 46 ... 72 72 72]
95 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer()
resumes_vec = tfidf_vec.fit_transform(all_data['resume'])
inputs = len(tfidf_vec.get_feature_names_out())
print("Tamanho:", len(tfidf_vec.get_feature_names_out()), "Vocabulário:", tfidf_vec.get_feature_names_out(), )
resumes_vec

Tamanho: 48295 Vocabulário: ['aa' 'aaa' 'aaacom' ... 'zyvox' 'zz' 'zzxzx']


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2122670 stored elements and shape (18240, 48295)>

In [6]:
X_train_vec, X_test_vec, y_train_vec, y_test_vec = train_test_split(resumes_vec, y, test_size=0.4, random_state=42)
X_train_vec.shape, X_test_vec.shape, y_train_vec.shape, y_test_vec.shape

((10944, 48295), (7296, 48295), (10944, 95), (7296, 95))

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

num_words = 20000
max_length = 500
oov_tok = "<OOV>"
token = Tokenizer(num_words=num_words, oov_token=oov_tok)
token.fit_on_texts(all_data['resume'])
inputs = len(token.word_index)
resumes_token = token.texts_to_sequences(all_data['resume'])
resumes_token = pad_sequences(resumes_token, padding="post", truncating="post", maxlen=max_length)
inputs, resumes_token

(48319,
 array([[  34, 1307, 1032, ...,    0,    0,    0],
        [  34,  350, 2123, ...,    0,    0,    0],
        [  34,  769,   59, ...,    0,    0,    0],
        ...,
        [   9,  621,   61, ...,    0,    0,    0],
        [ 154,  307,  602, ...,   23, 2651,   23],
        [ 154,  307,  602, ...,   23, 2651,   23]], dtype=int32))

In [8]:
X_train_tkn, X_test_tkn, y_train_tkn, y_test_tkn = train_test_split(resumes_token, y, test_size=0.4, random_state=42)
X_train_tkn.shape, X_test_tkn.shape, y_train_tkn.shape, y_test_tkn.shape

((10944, 500), (7296, 500), (10944, 95), (7296, 95))

### Load models

In [9]:
import joblib
import tensorflow as tf
import tensorflow_text as text
import tensorflow_hub as hub

rfc = joblib.load('models/RFC_model.pkl')
svc = joblib.load('models/SVC_model.pkl')
sgd = joblib.load('models/SGD_model.pkl')
nu_svc = joblib.load('models/NuSVC_model.pkl')
mpl = joblib.load('models/MLP_model.pkl')
lr = joblib.load('models/LR_model.pkl')
linear_svc = joblib.load('models/LinearSVC_model.pkl')
knc = joblib.load('models/KNC_model.pkl')
lstm = tf.keras.models.load_model('models/LSTM_model.keras')

I0000 00:00:1750965866.919532    6011 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9977 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3060, pci bus id: 0000:01:00.0, compute capability: 8.6
  saveable.load_own_variables(weights_store.get(inner_path))


In [51]:
import keras
bert = keras.Sequential([
    keras.layers.Input(shape=(), dtype=tf.string),
    keras.layers.TFSMLayer('models/BERT_model.tf', call_endpoint='serving_default')
])

2025-06-26 14:10:31.030057: W tensorflow/core/common_runtime/graph_constructor.cc:846] Node 'add/PartitionedCall' has 1 outputs but the _output_shapes attribute specifies shapes for 5 outputs. Output shapes may be inaccurate.
2025-06-26 14:10:31.030280: W tensorflow/core/common_runtime/graph_constructor.cc:846] Node 'lambda/PartitionedCall' has 1 outputs but the _output_shapes attribute specifies shapes for 3 outputs. Output shapes may be inaccurate.
2025-06-26 14:10:31.254292: W tensorflow/core/common_runtime/graph_constructor.cc:846] Node 'lambda_8/PartitionedCall' has 1 outputs but the _output_shapes attribute specifies shapes for 7 outputs. Output shapes may be inaccurate.


In [48]:
try:
    # An empty list makes all GPUs invisible.
    tf.config.set_visible_devices([], 'GPU')
    print("GPU has been hidden from TensorFlow. All operations will use the CPU.")
except (ValueError, RuntimeError) as e:
    # This can fail if devices are already initialized. Best to run in a fresh script.
    print(f"Could not hide GPU, might be already initialized: {e}")
# --- END OF FIX ---
bert = keras.Sequential([
    keras.layers.Input(shape=(), dtype=tf.string),
    keras.layers.TFSMLayer('models/BERT_model.tf', call_endpoint='serving_default')
])

Could not hide GPU, might be already initialized: Visible devices cannot be modified after being initialized


2025-06-26 14:05:13.575562: W tensorflow/core/common_runtime/graph_constructor.cc:846] Node 'add/PartitionedCall' has 1 outputs but the _output_shapes attribute specifies shapes for 5 outputs. Output shapes may be inaccurate.
2025-06-26 14:05:13.575785: W tensorflow/core/common_runtime/graph_constructor.cc:846] Node 'lambda/PartitionedCall' has 1 outputs but the _output_shapes attribute specifies shapes for 3 outputs. Output shapes may be inaccurate.
2025-06-26 14:05:13.799175: W tensorflow/core/common_runtime/graph_constructor.cc:846] Node 'lambda_8/PartitionedCall' has 1 outputs but the _output_shapes attribute specifies shapes for 7 outputs. Output shapes may be inaccurate.


In [49]:
bert.summary()

In [None]:
input_data = ['this is a test sentence', 'and here is another one']
tensor_input = tf.constant(input_data, dtype=tf.string)
with tf.device('/CPU:0'):
    bert_pred = bert.predict(tensor_input)

In [10]:
pred = {
    # 'target': decoded_label(label_encoder, y_test_vec),
    'rfc': rfc.predict(X_test_vec),
    'svc': svc.predict(X_test_vec),
    'sgd': sgd.predict(X_test_vec),
    'nu_svc': nu_svc.predict(X_test_vec),
    'mpl': mpl.predict(X_test_vec),
    'lr': lr.predict(X_test_vec),
    'linear_svc': linear_svc.predict(X_test_vec),
    'knc': knc.predict(X_test_vec),
    'lstm': [int(np.argmax(y)) for y in lstm.predict(X_test_tkn)],
    # 'bert': bert_pred
}

I0000 00:00:1750965950.086600    6118 cuda_dnn.cc:529] Loaded cuDNN version 91002


[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 24ms/step


In [11]:
results = pd.DataFrame(pred)

### Test decision

In [12]:
from scipy import stats
from sklearn.metrics import accuracy_score

y_test = [int(np.argmax(y)) for y in y_test_vec]
statistic_predictions = []
for x in np.array(results):
    if np.std(x) == 0:
        statistic_predictions.append(x[0])
    else:
        mode = stats.mode(x)
        if mode.count > 5:
            statistic_predictions.append(mode.mode)
        else:
            mean = np.mean(x)
            nearest = len(label_encoder.classes_)
            for v in x:
                diff = abs(v - mean)
                nearest = nearest if nearest < diff else v
            statistic_predictions.append(nearest)

accuracy = accuracy_score(y_test, statistic_predictions)
accuracy

0.9801260964912281

In [19]:
knn = joblib.load('models/KNN_hybrid.pkl')
dynamic_predictions = []
for t in np.array(results):
    distances, indexes = knn.kneighbors(t.reshape(1, -1))
    distances = np.ravel(distances)
    indexes = np.ravel(indexes)
    x_nn = np.array(results.iloc[indexes])

    options = list(set(np.concatenate((np.ravel(x_nn), t)).tolist()))
    best_option = options[0]
    options_weight = {o: 0 for o in options}
    for o in options_weight.keys():
        options_weight[o] += float(np.count_nonzero(t == o))
        for i, x in enumerate(x_nn):
            d = distances[i] if distances[i] != 0 else 1
            options_weight[o] += float(np.count_nonzero(x == o) * 1/d)

        max_option = options_weight[o] if options_weight[o] > best_option else best_option

    dynamic_predictions.append(best_option)
#
accuracy = accuracy_score(y_test, dynamic_predictions)
accuracy

0.4702576754385965