In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

### Catching off-line data

In [15]:
trust_models = pd.read_csv("data/results/trust_model.csv").drop(columns=['BERT'], axis=1)
trust_models.head()

Unnamed: 0,RFC,MLP,SVC,LinearSVC,NuSVC,SGD,LR,KNC,LSTM
0,0.966374,0.953947,0.963816,0.964181,0.954313,0.898757,0.96674,0.950658,0.964547


In [3]:
# after that I ask to gemini to generate new data for categories with less count
all_data = pd.read_csv("data/filtered/normalized.csv")
all_data = all_data.dropna()
all_data = all_data.drop_duplicates()
all_data.shape

(18240, 2)

### Text pre process

In [4]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

def encode_labels(categories):
    label_encoder = LabelEncoder()
    category_encoded = label_encoder.fit_transform(categories)
    print(category_encoded)
    num_categories = len(label_encoder.classes_)
    y = to_categorical(category_encoded)
    return label_encoder, y, num_categories

def decoded_label(label_encoder, y_encoded, categorical=True):
    label = [int(np.argmax(y)) for y in y_encoded] if categorical else y_encoded
    return label_encoder.inverse_transform(label)

2025-06-28 03:04:51.363397: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751090691.476971  131153 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751090691.507257  131153 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1751090691.759273  131153 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1751090691.759294  131153 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1751090691.759296  131153 computation_placer.cc:177] computation placer alr

In [5]:
categories = all_data['category']
label_encoder, y, num_categories = encode_labels(categories)
print(num_categories, y)

[48 70 46 ... 72 72 72]
95 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer()
resumes_vec = tfidf_vec.fit_transform(all_data['resume'])
inputs = len(tfidf_vec.get_feature_names_out())
print("Tamanho:", len(tfidf_vec.get_feature_names_out()), "Vocabulário:", tfidf_vec.get_feature_names_out(), )
resumes_vec

Tamanho: 48295 Vocabulário: ['aa' 'aaa' 'aaacom' ... 'zyvox' 'zz' 'zzxzx']


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2122670 stored elements and shape (18240, 48295)>

In [7]:
X_train_vec, X_test_vec, y_train_vec, y_test_vec = train_test_split(resumes_vec, y, test_size=0.4, random_state=42)
X_train_vec.shape, X_test_vec.shape, y_train_vec.shape, y_test_vec.shape

((10944, 48295), (7296, 48295), (10944, 95), (7296, 95))

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

num_words = 20000
max_length = 500
oov_tok = "<OOV>"
token = Tokenizer(num_words=num_words, oov_token=oov_tok)
token.fit_on_texts(all_data['resume'])
inputs = len(token.word_index)
resumes_token = token.texts_to_sequences(all_data['resume'])
resumes_token = pad_sequences(resumes_token, padding="post", truncating="post", maxlen=max_length)
inputs, resumes_token

(48319,
 array([[  34, 1307, 1032, ...,    0,    0,    0],
        [  34,  350, 2123, ...,    0,    0,    0],
        [  34,  769,   59, ...,    0,    0,    0],
        ...,
        [   9,  621,   61, ...,    0,    0,    0],
        [ 154,  307,  602, ...,   23, 2651,   23],
        [ 154,  307,  602, ...,   23, 2651,   23]], dtype=int32))

In [9]:
X_train_tkn, X_test_tkn, y_train_tkn, y_test_tkn = train_test_split(resumes_token, y, test_size=0.4, random_state=42)
X_train_tkn.shape, X_test_tkn.shape, y_train_tkn.shape, y_test_tkn.shape

((10944, 500), (7296, 500), (10944, 95), (7296, 95))

### Load models

In [10]:
import joblib
import tensorflow as tf
import tensorflow_text as text
import tensorflow_hub as hub

rfc = joblib.load('models/RFC_model.pkl')
svc = joblib.load('models/SVC_model.pkl')
sgd = joblib.load('models/SGD_model.pkl')
nu_svc = joblib.load('models/NuSVC_model.pkl')
mpl = joblib.load('models/MLP_model.pkl')
lr = joblib.load('models/LR_model.pkl')
linear_svc = joblib.load('models/LinearSVC_model.pkl')
knc = joblib.load('models/KNC_model.pkl')
lstm = tf.keras.models.load_model('models/LSTM_model.keras')
# bert = tf.keras.models.load_model('models/BERT_model.keras')

2025-06-28 03:05:18.538798: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE: forward compatibility was attempted on non supported HW
2025-06-28 03:05:18.538820: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:178] verbose logging is disabled. Rerun with verbose logging (usually --v=1 or --vmodule=cuda_diagnostics=1) to get more diagnostic output from this module
2025-06-28 03:05:18.538824: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:183] retrieving CUDA diagnostic information for host: ja1-SSD
2025-06-28 03:05:18.538827: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:190] hostname: ja1-SSD
2025-06-28 03:05:18.539004: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:197] libcuda reported version is: 570.158.1
2025-06-28 03:05:18.539021: I external/local_xla/xla/stream_executor/cuda/cud

In [11]:
pred = {
    # 'target': decoded_label(label_encoder, y_test_vec),
    'rfc': rfc.predict(X_test_vec),
    'svc': svc.predict(X_test_vec),
    'sgd': sgd.predict(X_test_vec),
    'nu_svc': nu_svc.predict(X_test_vec),
    'mpl': mpl.predict(X_test_vec),
    'lr': lr.predict(X_test_vec),
    'linear_svc': linear_svc.predict(X_test_vec),
    'knc': knc.predict(X_test_vec),
    'lstm': [int(np.argmax(y)) for y in lstm.predict(X_test_tkn)],
    # 'bert': bert_pred
}

[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 105ms/step


In [12]:
results = pd.DataFrame(pred)

### Test decision

In [34]:
from scipy import stats
from sklearn.metrics import classification_report

trust_models = np.ravel(np.array(trust_models))
y_test = [int(np.argmax(y)) for y in y_test_vec]
statistic_predictions = []
for x in np.array(results):
    if np.std(x) == 0:
        statistic_predictions.append(x[0])
    else:
        mode = stats.mode(x)
        if mode.count > 5:
            statistic_predictions.append(int(mode.mode))
        else:
            mean = np.average(x, weights=trust_models)
            # mean = np.mean(x)
            nearest = len(label_encoder.classes_)
            for v in x:
                diff = abs(v - mean)
                nearest = nearest if nearest < diff else v
            statistic_predictions.append(int(nearest))

report = classification_report(y_test, statistic_predictions, output_dict=True)
{
    'accuracy': report['accuracy'],
    'macro avg': report['macro avg'],
    'weighted avg': report['weighted avg'],
}

{'accuracy': 0.9801260964912281,
 'macro avg': {'precision': 0.9833448650633813,
  'recall': 0.982827632000708,
  'f1-score': 0.9827130000470556,
  'support': 7296.0},
 'weighted avg': {'precision': 0.9809227207390692,
  'recall': 0.9801260964912281,
  'f1-score': 0.9800934609348122,
  'support': 7296.0}}

In [38]:
knn = joblib.load('models/KNN_hybrid.pkl')
dynamic_predictions = []
for t in np.array(results):
    distances, indexes = knn.kneighbors(t.reshape(1, -1))
    distances = np.ravel(distances)
    indexes = np.ravel(indexes)
    x_nn = np.array(results.iloc[indexes])

    options = list(set(np.concatenate((np.ravel(x_nn), t)).tolist()))
    best_option = options[0]
    options_weight = {o: 0 for o in options}
    for o in options_weight.keys():
        options_weight[o] += float(np.count_nonzero(t == o))
        for i, x in enumerate(x_nn):
            d = distances[i] if distances[i] != 0 else 1
            options_weight[o] += float(np.count_nonzero(x == o) * trust_models[i]/d)

        best_option = options_weight[o] if options_weight[o] > best_option else best_option

    dynamic_predictions.append(int(best_option))
#
accuracy = classification_report(y_test, dynamic_predictions)
print(accuracy)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       107
           1       0.00      0.00      0.00        95
           2       0.00      0.00      0.00        63
           3       0.00      0.00      0.00        76
           4       0.00      0.00      0.00        72
           5       0.09      0.04      0.05        79
           6       0.00      0.00      0.00        64
           7       0.12      0.03      0.05        93
           8       0.00      0.00      0.00        53
           9       0.07      1.00      0.13        57
          10       0.39      0.82      0.53        98
          11       0.37      0.83      0.51        70
          12       0.97      0.76      0.85        95
          13       0.00      0.00      0.00        99
          14       0.00      0.00      0.00        56
          15       0.00      0.00      0.00        58
          16       0.26      0.95      0.41        75
          17       0.24    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
