### Preparing the env dependencies

In [1]:
import warnings
import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import metrics
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Dropout, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization
warnings.simplefilter("ignore")

2025-06-23 12:19:04.716888: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750691944.728905    8422 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750691944.732849    8422 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750691944.743195    8422 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750691944.743210    8422 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750691944.743211    8422 computation_placer.cc:177] computation placer alr

### Catching off-line data

In [2]:
# after that I ask to gemini to generate new data for categories with less count
all_data = pd.read_csv("data/filtered/normalized.csv")
all_data = all_data.dropna()
all_data = all_data.drop_duplicates()
all_data.shape

(18240, 2)

### Vectorizer

#### Label Encoder

In [3]:
def encode_labels(categories):
    label_encoder = LabelEncoder()
    category_encoded = label_encoder.fit_transform(categories)
    print(category_encoded)
    num_categories = len(label_encoder.classes_)
    y = to_categorical(category_encoded)
    return label_encoder, y, num_categories

def decoded_label(label_encoder, y_encoded, categorical=True):
    label = [int(np.argmax(y)) for y in y_encoded] if categorical else y_encoded
    return label_encoder.inverse_transform(label)

In [4]:
categories = all_data['category']
label_encoder, y, num_categories = encode_labels(categories)
print(num_categories, y)

[48 70 46 ... 72 72 72]
95 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


#### TF-IDF

In [5]:
tfidf_vec = TfidfVectorizer()
resumes_vec = tfidf_vec.fit_transform(all_data['resume'])
inputs = len(tfidf_vec.get_feature_names_out())
print("Tamanho:", len(tfidf_vec.get_feature_names_out()), "Vocabulário:", tfidf_vec.get_feature_names_out(), )
resumes_vec

Tamanho: 48295 Vocabulário: ['aa' 'aaa' 'aaacom' ... 'zyvox' 'zz' 'zzxzx']


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2122670 stored elements and shape (18240, 48295)>

In [6]:
X_train_vec, X_temp_vec, y_train_vec, y_temp_vec = train_test_split(resumes_vec, y, test_size=0.3, random_state=42)
X_val_vec, X_test_vec, y_val_vec, y_test_vec = train_test_split(X_temp_vec, y_temp_vec, test_size=0.5, random_state=42)
X_train_vec.shape, X_temp_vec.shape, X_val_vec.shape, X_test_vec.shape

((12768, 48295), (5472, 48295), (2736, 48295), (2736, 48295))

#### Tokens

In [7]:
num_words = 20000
max_length = 1000
token = Tokenizer(num_words=num_words, oov_token="<OOV>")
token.fit_on_texts(all_data['resume'])
inputs = len(token.word_index)
resumes_token = token.texts_to_sequences(all_data['resume'])
resumes_token = pad_sequences(resumes_token, padding="post", truncating="post", maxlen=max_length)
inputs, resumes_token

(48319,
 array([[  34, 1307, 1032, ...,    0,    0,    0],
        [  34,  350, 2123, ...,    0,    0,    0],
        [  34,  769,   59, ...,    0,    0,    0],
        ...,
        [   9,  621,   61, ...,    0,    0,    0],
        [ 154,  307,  602, ...,    0,    0,    0],
        [ 154,  307,  602, ...,    0,    0,    0]], dtype=int32))

In [8]:
X_train_tkn, X_temp_tkn, y_train_tkn, y_temp_tkn = train_test_split(resumes_token, y, test_size=0.3, random_state=42)
X_val_tkn, X_test_tkn, y_val_tkn, y_test_tkn = train_test_split(X_temp_tkn, y_temp_tkn, test_size=0.5, random_state=42)
X_train_tkn.shape, X_temp_tkn.shape, X_val_tkn.shape, X_test_tkn.shape

((12768, 1000), (5472, 1000), (2736, 1000), (2736, 1000))

## Training

In [17]:
# Play this only in the first execution
# result_vec_decoded = decoded_label(label_encoder, y_test_vec)
# result_vec_df = pd.DataFrame(result_vec_decoded, columns=['target'])
# result_vec_df.to_csv('data/results/tf-idf/results.csv', encoding='utf-8', index=False)
#
# result_tkn_decoded = decoded_label(label_encoder, y_test_tkn)
# result_tkn_df = pd.DataFrame(result_tkn_decoded, columns=['target'])
# result_tkn_df.to_csv('data/results/token/results.csv', encoding='utf-8', index=False)

In [10]:
def save_results(prediction, model_name, vec='tf-idf', categorical=True):
    filename = f'data/results/{vec}/results.csv'
    result_decoded = decoded_label(label_encoder, prediction, categorical)
    result = pd.read_csv(filename)
    result[model_name] = result_decoded
    result.to_csv(filename, encoding='utf-8', index=False)
    print('Results saved in {}'.format(filename))

### Random Forests

In [25]:
def random_forest_model(X_train, X_test, X_val, y_train, y_test, y_val, vec='tf-idf'):
    y_train_categorical = [int(np.argmax(y)) for y in y_train]
    y_test_categorical = [int(np.argmax(y)) for y in y_test]
    y_val_categorical = [int(np.argmax(y)) for y in y_val]
    rng = np.random.RandomState(0)
    param_grid = {
        'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
        'criterion': ["gini", "entropy"]
    }
    forest = RandomForestClassifier(n_estimators=700, criterion='gini', bootstrap=False)
    rsh = HalvingGridSearchCV(estimator=forest, param_grid=param_grid, random_state=rng, factor=5, n_jobs=-1)
    rsh.fit(X_val, y_val_categorical)
    score = rsh.score(X_test, y_test_categorical)
    print("Score: ", score, "Params:", rsh.best_params_)

    model = RandomForestClassifier(
        n_estimators=rsh.best_params_['n_estimators'],
        criterion=rsh.best_params_['criterion'],
        bootstrap=False
    )

    model.fit(X_val, y_val_categorical)
    model.fit(X_train, y_train_categorical)
    prediction = model.predict(X_test)
    accuracy = accuracy_score(y_test_categorical, prediction)
    save_results(prediction, 'RFC', vec, False)
    print(accuracy)
    print(metrics.classification_report(y_test_categorical, prediction))
    return model, accuracy

In [19]:
model_vec, acc_vec = random_forest_model(X_train_vec, X_test_vec, X_val_vec, y_train_vec, y_test_vec, y_val_vec)

  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loa

Score:  0.9057017543859649 Params: {'criterion': 'gini', 'n_estimators': 800}
Results saved in data/results/tf-idf/results.csv
0.966374269005848
              precision    recall  f1-score   support

           0       0.89      1.00      0.94        49
           1       0.95      0.88      0.91        40
           2       1.00      0.91      0.95        23
           3       1.00      1.00      1.00        32
           4       1.00      0.84      0.91        31
           5       0.96      0.77      0.86        31
           6       1.00      1.00      1.00        20
           7       0.91      0.59      0.71        34
           8       0.96      1.00      0.98        22
           9       1.00      1.00      1.00        18
          10       0.97      1.00      0.99        39
          11       1.00      1.00      1.00        34
          12       0.92      0.87      0.89        38
          13       1.00      1.00      1.00        42
          14       1.00      1.00      1.00 

In [26]:
model_tkn, acc_tkn = random_forest_model(X_train_tkn, X_test_tkn, X_val_tkn, y_train_tkn, y_test_tkn, y_val_tkn, 'token')

  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loa

Score:  0.5847953216374269 Params: {'criterion': 'gini', 'n_estimators': 600}
Results saved in data/results/token/results.csv
0.7726608187134503
              precision    recall  f1-score   support

           0       0.81      0.45      0.58        49
           1       0.30      0.28      0.29        40
           2       0.78      0.30      0.44        23
           3       0.77      0.84      0.81        32
           4       0.61      0.65      0.62        31
           5       0.67      0.13      0.22        31
           6       0.90      0.90      0.90        20
           7       0.71      0.15      0.24        34
           8       1.00      1.00      1.00        22
           9       1.00      0.72      0.84        18
          10       0.48      0.38      0.43        39
          11       1.00      1.00      1.00        34
          12       0.35      0.24      0.28        38
          13       1.00      0.86      0.92        42
          14       1.00      1.00      1.00 

In [20]:
best_model = model_tkn if acc_tkn > acc_vec else model_vec
filename = 'models/RFC_model.pkl'
joblib.dump(model_vec, filename)
print('Model RF saved in {}'.format(filename))

Model RF saved in models/RFC_model.pkl


### MLP

In [23]:
def mlp_model(X_train, X_test, X_val, y_train, y_test, y_val, vec='tf-idf'):
    y_train_categorical = [int(np.argmax(y)) for y in y_train]
    y_test_categorical = [int(np.argmax(y)) for y in y_test]
    y_val_categorical = [int(np.argmax(y)) for y in y_val]
    rng = np.random.RandomState(0)
    param_grid = {
        'hidden_layer_sizes': [(8,32,16), (16,64,32), (32,128,64), (32,64,128,64,32)], # 56, 112, 224, 320, 640
        'max_iter': [200, 400, 600, 800, 1000],
        'activation': ['relu', 'tanh', 'identity'],
        'solver': ['adam'],
        'alpha': [0.0001, 0.001],
        'learning_rate': ['adaptive', 'constant'],
    }
    model = MLPClassifier(hidden_layer_sizes=200, max_iter=400, learning_rate='adaptive')
    rsh = HalvingGridSearchCV(estimator=model, param_grid=param_grid, random_state=rng, factor=3, n_jobs=-1)
    rsh.fit(X_val, y_val_categorical)
    score = rsh.score(X_test, y_test_categorical)
    print("Score: ", score, "Params:", rsh.best_params_)

    model = MLPClassifier(
        hidden_layer_sizes=rsh.best_params_['hidden_layer_sizes'],
        max_iter=rsh.best_params_['max_iter'],
        activation=rsh.best_params_['activation'],
        solver=rsh.best_params_['solver'],
        alpha=rsh.best_params_['alpha'],
        learning_rate=rsh.best_params_['learning_rate'],
    )
    model.fit(X_val, y_val_categorical)
    model.fit(X_train, y_train_categorical)
    prediction = model.predict(X_test)
    accuracy = accuracy_score(y_test_categorical, prediction)
    print('Accuracy: {} updated'.format(accuracy))
    save_results(prediction, 'MLP', vec, False)
    print(accuracy)
    print(metrics.classification_report(y_test_categorical, prediction))
    return model, accuracy

In [18]:
mlp_model_vec, mlp_acc_vec = mlp_model(X_train_vec, X_test_vec, X_val_vec, y_train_vec, y_test_vec, y_val_vec, 'tf-idf')

  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


Score:  0.8607456140350878 Params: {'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (32, 128, 64), 'learning_rate': 'constant', 'max_iter': 400, 'solver': 'adam'}
0.9539473684210527
              precision    recall  f1-score   support

           0       0.92      0.96      0.94        49
           1       0.94      0.82      0.88        40
           2       0.81      0.91      0.86        23
           3       1.00      1.00      1.00        32
           4       0.97      0.90      0.93        31
           5       0.71      0.77      0.74        31
           6       1.00      1.00      1.00        20
           7       0.75      0.62      0.68        34
           8       1.00      1.00      1.00        22
           9       1.00      1.00      1.00        18
          10       0.97      0.82      0.89        39
          11       1.00      1.00      1.00        34
          12       0.89      0.87      0.88        38
          13       1.00      1.00      1.00      

In [24]:
mlp_model_tkn, mlp_acc_tkn = mlp_model(X_train_tkn, X_test_tkn, X_val_tkn, y_train_tkn, y_test_tkn, y_val_tkn, 'token')

  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


Score:  0.21089181286549707 Params: {'activation': 'identity', 'alpha': 0.0001, 'hidden_layer_sizes': (32, 128, 64), 'learning_rate': 'constant', 'max_iter': 600, 'solver': 'adam'}
Accuracy: 0.27010233918128657 updated
Results saved in data/results/token/results.csv
0.27010233918128657
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        49
           1       0.08      0.10      0.09        40
           2       0.19      0.22      0.20        23
           3       0.00      0.00      0.00        32
           4       0.00      0.00      0.00        31
           5       0.11      0.06      0.08        31
           6       0.11      0.20      0.14        20
           7       0.07      0.09      0.08        34
           8       1.00      1.00      1.00        22
           9       0.32      0.67      0.44        18
          10       0.13      0.05      0.07        39
          11       0.82      0.79      0.81        34
          

In [13]:
best_mlp_model = mlp_model_tkn if mlp_acc_tkn > mlp_acc_vec else mlp_model_vec
filename = 'models/MLP_model.pkl'
joblib.dump(best_mlp_model, filename)
print('Model MLP saved in {}'.format(filename))

Model MLP saved in models/MLP_model.pkl


### SVC

In [34]:
def svc_model(X_train, X_test, X_val, y_train, y_test, y_val, vec='tf-idf'):
    y_train_categorical = [int(np.argmax(y)) for y in y_train]
    y_test_categorical = [int(np.argmax(y)) for y in y_test]
    y_val_categorical = [int(np.argmax(y)) for y in y_val]
    rng = np.random.RandomState(0)
    param_grid = {
        'C': [0.1, 1, 10, 100, 1000],
        'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
        'gamma': [0.1, 1, 10, 100],
        'degree': [0, 1, 2, 3, 4, 5, 6]
    }
    svc = SVC(C=0.1, kernel='poly')
    rsh = HalvingGridSearchCV(estimator=svc, param_grid=param_grid, random_state=rng, factor=3, n_jobs=-1)
    rsh.fit(X_val, y_val_categorical)
    score = rsh.score(X_test, y_test_categorical)
    print("Score: ", score, "Params:", rsh.best_params_)

    model = SVC(
        C=rsh.best_params_['C'],
        kernel=rsh.best_params_['kernel'],
        gamma=rsh.best_params_['gamma'],
        degree=rsh.best_params_['degree'],
    )

    model.fit(X_val, y_val_categorical)
    model.fit(X_train, y_train_categorical)
    prediction = model.predict(X_test)
    accuracy = accuracy_score(y_test_categorical, prediction)
    save_results(prediction, 'SVC', vec, False)
    print(accuracy)
    print(metrics.classification_report(y_test_categorical, prediction))
    return model, accuracy

In [30]:
svc_model_vec, svc_acc_vec = svc_model(X_train_vec, X_test_vec, X_val_vec, y_train_vec, y_test_vec, y_val_vec, 'tf-idf')

  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


Score:  0.8951023391812866 Params: {'C': 10, 'degree': 0, 'gamma': 1, 'kernel': 'sigmoid'}
Accuracy: 0.9638157894736842 updated
Results saved in data/results/tf-idf/results.csv
0.9638157894736842
              precision    recall  f1-score   support

           0       0.92      0.98      0.95        49
           1       0.79      0.85      0.82        40
           2       0.78      0.91      0.84        23
           3       0.97      1.00      0.98        32
           4       0.93      0.81      0.86        31
           5       0.76      0.84      0.80        31
           6       1.00      1.00      1.00        20
           7       0.85      0.65      0.73        34
           8       1.00      1.00      1.00        22
           9       0.95      1.00      0.97        18
          10       0.91      1.00      0.95        39
          11       1.00      1.00      1.00        34
          12       0.92      0.89      0.91        38
          13       1.00      1.00      1.00    

In [35]:
svc_model_tkn, svc_acc_tkn = svc_model(X_train_tkn, X_test_tkn, X_val_tkn, y_train_tkn, y_test_tkn, y_val_tkn, 'token')

Results saved in data/results/token/results.csv
0.015350877192982455
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        49
           1       0.00      0.00      0.00        40
           2       0.00      0.00      0.00        23
           3       0.00      0.00      0.00        32
           4       0.00      0.00      0.00        31
           5       0.00      0.00      0.00        31
           6       0.00      0.00      0.00        20
           7       0.00      0.00      0.00        34
           8       0.00      0.00      0.00        22
           9       0.00      0.00      0.00        18
          10       0.00      0.00      0.00        39
          11       0.00      0.00      0.00        34
          12       0.00      0.00      0.00        38
          13       0.00      0.00      0.00        42
          14       0.00      0.00      0.00        28
          15       0.00      0.00      0.00        23
          16

In [31]:
best_svc_model = svc_model_tkn if svc_acc_tkn > svc_acc_vec else svc_model_vec
filename = 'models/SVC_model.pkl'
joblib.dump(best_svc_model, filename)
print('Model SVC saved in {}'.format(filename))

Model SVC saved in models/SVC_model.pkl


### Linear SVC

In [10]:
def linear_svc_model(X_train, X_test, X_val, y_train, y_test, y_val, vec='tf-idf'):
    y_train_categorical = [int(np.argmax(y)) for y in y_train]
    y_test_categorical = [int(np.argmax(y)) for y in y_test]
    y_val_categorical = [int(np.argmax(y)) for y in y_val]
    param_grid = {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'loss': ['hinge', 'squared_hinge'],
        'tol': [0.0001, 0.001],
        'multi_class': ['ovr', 'crammer_singer'],
    }
    svc = LinearSVC(C=10, loss='squared_hinge', max_iter=100000)
    rsh = HalvingGridSearchCV(estimator=svc, param_grid=param_grid, random_state=42, factor=2, n_jobs=-1)
    rsh.fit(X_val, y_val_categorical)
    score = rsh.score(X_test, y_test_categorical)
    print("Score: ", score, "Params:", rsh.best_params_)

    model = LinearSVC(
        C=rsh.best_params_['C'],
        penalty=rsh.best_params_['penalty'],
        loss=rsh.best_params_['loss'],
        tol=rsh.best_params_['tol'],
        multi_class=rsh.best_params_['multi_class'],
        max_iter=100000,
    )

    model.fit(X_val, y_val_categorical)
    model.fit(X_train, y_train_categorical)
    prediction = model.predict(X_test)
    accuracy = accuracy_score(y_test_categorical, prediction)
    save_results(prediction, 'LinearSVC', vec, False)
    print(accuracy)
    print(metrics.classification_report(y_test_categorical, prediction))
    return model, accuracy

In [13]:
linear_svc_model_vec, linear_svc_acc_vec = linear_svc_model(X_train_vec, X_test_vec, X_val_vec, y_train_vec, y_test_vec, y_val_vec, 'tf-idf')

  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


Score:  0.9071637426900585 Params: {'C': 1, 'loss': 'squared_hinge', 'multi_class': 'crammer_singer', 'penalty': 'l2', 'tol': 0.001}
Results saved in data/results/tf-idf/results.csv
0.9641812865497076
              precision    recall  f1-score   support

           0       0.89      0.98      0.93        49
           1       0.92      0.85      0.88        40
           2       0.95      0.91      0.93        23
           3       1.00      1.00      1.00        32
           4       0.96      0.87      0.92        31
           5       0.83      0.81      0.82        31
           6       1.00      1.00      1.00        20
           7       0.80      0.59      0.68        34
           8       1.00      1.00      1.00        22
           9       1.00      1.00      1.00        18
          10       0.93      1.00      0.96        39
          11       1.00      1.00      1.00        34
          12       0.92      0.89      0.91        38
          13       1.00      1.00      1.0

In [None]:
linear_svc_model_tkn, linear_svc_acc_tkn = linear_svc_model(X_train_tkn, X_test_tkn, X_val_tkn, y_train_tkn, y_test_tkn, y_val_tkn, 'token')

In [15]:
best_linear_svc_model = linear_svc_model_tkn if linear_svc_acc_tkn > linear_svc_acc_vec else linear_svc_model_vec
filename = 'models/LinearSVC_model.pkl'
joblib.dump(best_linear_svc_model, filename)
print('Model Linear SVC saved in {}'.format(filename))

Model Linear SVC saved in models/LinearSVC_model.pkl


### NuSVC

In [9]:
def nu_svc_model(X_train, X_test, X_val, y_train, y_test, y_val, vec='tf-idf'):
    y_train_categorical = [int(np.argmax(y)) for y in y_train]
    y_test_categorical = [int(np.argmax(y)) for y in y_test]
    y_val_categorical = [int(np.argmax(y)) for y in y_val]
    param_grid = {
        'nu': [0.1, 0.3, 0.5, 0.7, 0.9],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
        'tol': [0.0001, 0.001],
        'gamma': ['scale', 'auto'],
    }
    # svc = NuSVC(nu=0.01, kernel='poly', max_iter=100000)
    # rsh = HalvingGridSearchCV(estimator=svc, param_grid=param_grid, random_state=42, factor=2, n_jobs=-1)
    # rsh.fit(X_val, y_val_categorical)
    # score = rsh.score(X_test, y_test_categorical)
    # print("Score: ", score, "Params:", rsh.best_params_)

    model = NuSVC(
        nu=param_grid['nu'][1],
        kernel=param_grid['kernel'][1],
        tol=param_grid['tol'][0],
        gamma=param_grid['gamma'][0],
        max_iter=100000,
    )

    model.fit(X_val, y_val_categorical)
    model.fit(X_train, y_train_categorical)
    prediction = model.predict(X_test)
    accuracy = accuracy_score(y_test_categorical, prediction)
    if accuracy > 0.9539473684210527:
        print('Accuracy: ', accuracy, 'updated')
        save_results(prediction, 'NuSVC', vec, False)
    print(accuracy)
    print(metrics.classification_report(y_test_categorical, prediction))
    return model, accuracy

In [11]:
nu_svc_model_vec, nu_svc_acc_vec = nu_svc_model(X_train_vec, X_test_vec, X_val_vec, y_train_vec, y_test_vec, y_val_vec, 'tf-idf')

Results saved in data/results/tf-idf/results.csv
0.9539473684210527
              precision    recall  f1-score   support

           0       0.92      0.94      0.93        49
           1       0.67      0.90      0.77        40
           2       1.00      0.91      0.95        23
           3       1.00      1.00      1.00        32
           4       0.96      0.87      0.92        31
           5       0.86      0.77      0.81        31
           6       1.00      1.00      1.00        20
           7       0.83      0.59      0.69        34
           8       1.00      1.00      1.00        22
           9       1.00      1.00      1.00        18
          10       0.90      0.90      0.90        39
          11       1.00      1.00      1.00        34
          12       0.92      0.89      0.91        38
          13       1.00      1.00      1.00        42
          14       1.00      1.00      1.00        28
          15       1.00      0.91      0.95        23
          16 

In [None]:
nu_svc_model_tkn, nu_svc_acc_tkn = nu_svc_model(X_train_tkn, X_test_tkn, X_val_tkn, y_train_tkn, y_test_tkn, y_val_tkn, 'token')

In [12]:
best_nu_svc_model = nu_svc_model_tkn if nu_svc_acc_tkn > nu_svc_acc_vec else nu_svc_model_vec
filename = 'models/NuSVC_model.pkl'
joblib.dump(best_nu_svc_model, filename)
print('Model NuSVC saved in {}'.format(filename))

Model NuSVC saved in models/NuSVC_model.pkl


### BERT Classification

### Word Embedding

In [20]:
embedding_dim=144
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")
model = Sequential([
    Embedding(num_words, embedding_dim, input_length=max_length, name="embedding"),
    GlobalAveragePooling1D(),
    Dense(72, activation='relu'),
    Dropout(0.2),
    Dense(num_categories, activation='softmax')
])



In [21]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [22]:
model.fit(X_train_tkn, y_train_tkn, epochs=10, batch_size=30, verbose=True, validation_data=(X_test_tkn, y_test_tkn))

Epoch 1/10


  output, from_logits = _get_logits(









[1m415/426[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.0139 - loss: 0.2035










[1m426/426[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.0140 - loss: 0.2009




[1m426/426[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - accuracy: 0.0140 - loss: 0.2006 - val_accuracy: 0.0267 - val_loss: 0.0544
Epoch 2/10
[1m426/426[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.0226 - loss: 0.0563 - val_accuracy: 0.0424 - val_loss: 0.0532
Epoch 3/10
[1m426/426[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.0249 - loss: 0.0547 - val_accuracy: 0.0435 - val_loss: 0.0521
Epoch 4/10
[1m426/426[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.0362 - loss: 0.0533 - val_accuracy: 0.0680 - val_loss: 0.0502
Epoch 5/10
[1m426/426[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.0582 - loss: 0.0513 - val_accuracy: 0.1663 - val_loss: 0.0481
Epoch 6/10
[1m426/426[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.1152 - loss: 0.0489 - val_accuracy: 0.1912 - val_loss: 0.0460
Epoch 7/10
[1m426/426[0m [32m━━━━━

<keras.src.callbacks.history.History at 0x7eaa3f7e0410>

In [23]:
loss, accuracy = model.evaluate(X_test_tkn, y_test_tkn)
print("Loss: ", loss, "Accuracy: ", accuracy)








[1m66/86[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 2ms/step - accuracy: 0.3336 - loss: 0.0371




[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step - accuracy: 0.3318 - loss: 0.0371
Loss:  0.03700545057654381 Accuracy:  0.3260233998298645
