### Preparing the env dependencies

In [None]:
import os
os.environ['TF_USE_LEGACY_KERAS'] = '1'
import warnings
import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn import metrics
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional, Dropout
warnings.simplefilter("ignore")

### Catching off-line data

In [2]:
# after that I ask to gemini to generate new data for categories with less count
all_data = pd.read_csv("data/filtered/normalized.csv")
all_data = all_data.dropna()
all_data = all_data.drop_duplicates()
all_data.shape

(18240, 2)

### Vectorizer

#### Label Encoder

In [3]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

def encode_labels(categories):
    label_encoder = LabelEncoder()
    category_encoded = label_encoder.fit_transform(categories)
    print(category_encoded)
    num_categories = len(label_encoder.classes_)
    y = to_categorical(category_encoded)
    return label_encoder, y, num_categories

def decoded_label(label_encoder, y_encoded, categorical=True):
    label = [int(np.argmax(y)) for y in y_encoded] if categorical else y_encoded
    return label_encoder.inverse_transform(label)

In [4]:
categories = all_data['category']
label_encoder, y, num_categories = encode_labels(categories)
print(num_categories, y)

[48 70 46 ... 72 72 72]
95 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


#### TF-IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer()
resumes_vec = tfidf_vec.fit_transform(all_data['resume'])
inputs = len(tfidf_vec.get_feature_names_out())
print("Tamanho:", len(tfidf_vec.get_feature_names_out()), "Vocabulário:", tfidf_vec.get_feature_names_out(), )
resumes_vec

Tamanho: 48295 Vocabulário: ['aa' 'aaa' 'aaacom' ... 'zyvox' 'zz' 'zzxzx']


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2122670 stored elements and shape (18240, 48295)>

In [6]:
X_train_vec, X_temp_vec, y_train_vec, y_temp_vec = train_test_split(resumes_vec, y, test_size=0.3, random_state=42)
X_val_vec, X_test_vec, y_val_vec, y_test_vec = train_test_split(X_temp_vec, y_temp_vec, test_size=0.5, random_state=42)
X_train_vec.shape, X_temp_vec.shape, X_val_vec.shape, X_test_vec.shape

((12768, 48295), (5472, 48295), (2736, 48295), (2736, 48295))

#### Tokens

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

num_words = 20000
max_length = 500
oov_tok = "<OOV>"
token = Tokenizer(num_words=num_words, oov_token=oov_tok)
token.fit_on_texts(all_data['resume'])
inputs = len(token.word_index)
resumes_token = token.texts_to_sequences(all_data['resume'])
resumes_token = pad_sequences(resumes_token, padding="post", truncating="post", maxlen=max_length)
inputs, resumes_token

(48319,
 array([[  34, 1307, 1032, ...,    0,    0,    0],
        [  34,  350, 2123, ...,    0,    0,    0],
        [  34,  769,   59, ...,    0,    0,    0],
        ...,
        [   9,  621,   61, ...,    0,    0,    0],
        [ 154,  307,  602, ...,   23, 2651,   23],
        [ 154,  307,  602, ...,   23, 2651,   23]], dtype=int32))

In [6]:
X_train_tkn, X_temp_tkn, y_train_tkn, y_temp_tkn = train_test_split(resumes_token, y, test_size=0.3, random_state=42)
X_val_tkn, X_test_tkn, y_val_tkn, y_test_tkn = train_test_split(X_temp_tkn, y_temp_tkn, test_size=0.5, random_state=42)
X_train_tkn.shape, X_temp_tkn.shape, X_val_tkn.shape, X_test_tkn.shape

((12768, 500), (5472, 500), (2736, 500), (2736, 500))

## Training

In [17]:
# Play this only in the first execution
# result_vec_decoded = decoded_label(label_encoder, y_test_vec)
# result_vec_df = pd.DataFrame(result_vec_decoded, columns=['target'])
# result_vec_df.to_csv('data/results/tf-idf/classifiers-results.csv', encoding='utf-8', index=False)
#
# result_tkn_decoded = decoded_label(label_encoder, y_test_tkn)
# result_tkn_df = pd.DataFrame(result_tkn_decoded, columns=['target'])
# result_tkn_df.to_csv('data/results/token/classifiers-results.csv', encoding='utf-8', index=False)

In [7]:
def save_results(prediction, model_name, categorical=True):
    filename = f'data/results/classifiers-results.csv'
    result_decoded = decoded_label(label_encoder, prediction, categorical)
    result = pd.read_csv(filename)
    result[model_name] = result_decoded
    result.to_csv(filename, encoding='utf-8', index=False)
    print('Results saved in {}'.format(filename))

### Random Forests

In [25]:
from sklearn.ensemble import RandomForestClassifier

def random_forest_model(X_train, X_test, X_val, y_train, y_test, y_val):
    y_train_categorical = [int(np.argmax(y)) for y in y_train]
    y_test_categorical = [int(np.argmax(y)) for y in y_test]
    y_val_categorical = [int(np.argmax(y)) for y in y_val]
    rng = np.random.RandomState(0)
    param_grid = {
        'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
        'criterion': ["gini", "entropy"]
    }
    forest = RandomForestClassifier(n_estimators=700, criterion='gini', bootstrap=False)
    rsh = HalvingGridSearchCV(estimator=forest, param_grid=param_grid, random_state=rng, factor=5, n_jobs=-1)
    rsh.fit(X_val, y_val_categorical)
    score = rsh.score(X_test, y_test_categorical)
    print("Score: ", score, "Params:", rsh.best_params_)

    model = RandomForestClassifier(
        n_estimators=rsh.best_params_['n_estimators'],
        criterion=rsh.best_params_['criterion'],
        bootstrap=False
    )

    model.fit(X_val, y_val_categorical)
    model.fit(X_train, y_train_categorical)
    prediction = model.predict(X_test)
    accuracy = accuracy_score(y_test_categorical, prediction)
    save_results(prediction, 'RFC', False)
    print(accuracy)
    print(metrics.classification_report(y_test_categorical, prediction))
    return model, accuracy

In [19]:
model_vec, acc_vec = random_forest_model(X_train_vec, X_test_vec, X_val_vec, y_train_vec, y_test_vec, y_val_vec)

  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loa

Score:  0.9057017543859649 Params: {'criterion': 'gini', 'n_estimators': 800}
Results saved in data/results/tf-idf/results.csv
0.966374269005848
              precision    recall  f1-score   support

           0       0.89      1.00      0.94        49
           1       0.95      0.88      0.91        40
           2       1.00      0.91      0.95        23
           3       1.00      1.00      1.00        32
           4       1.00      0.84      0.91        31
           5       0.96      0.77      0.86        31
           6       1.00      1.00      1.00        20
           7       0.91      0.59      0.71        34
           8       0.96      1.00      0.98        22
           9       1.00      1.00      1.00        18
          10       0.97      1.00      0.99        39
          11       1.00      1.00      1.00        34
          12       0.92      0.87      0.89        38
          13       1.00      1.00      1.00        42
          14       1.00      1.00      1.00 

In [20]:
filename = 'models/RFC_model.pkl'
joblib.dump(model_vec, filename)
print('Model RF saved in {}'.format(filename))

Model RF saved in models/RFC_model.pkl


### MLP

In [23]:
from sklearn.neural_network import MLPClassifier

def mlp_model(X_train, X_test, X_val, y_train, y_test, y_val):
    y_train_categorical = [int(np.argmax(y)) for y in y_train]
    y_test_categorical = [int(np.argmax(y)) for y in y_test]
    y_val_categorical = [int(np.argmax(y)) for y in y_val]
    rng = np.random.RandomState(0)
    param_grid = {
        'hidden_layer_sizes': [(8,32,16), (16,64,32), (32,128,64), (32,64,128,64,32)], # 56, 112, 224, 320, 640
        'max_iter': [200, 400, 600, 800, 1000],
        'activation': ['relu', 'tanh', 'identity'],
        'solver': ['adam'],
        'alpha': [0.0001, 0.001],
        'learning_rate': ['adaptive', 'constant'],
    }
    model = MLPClassifier(hidden_layer_sizes=200, max_iter=400, learning_rate='adaptive')
    rsh = HalvingGridSearchCV(estimator=model, param_grid=param_grid, random_state=rng, factor=3, n_jobs=-1)
    rsh.fit(X_val, y_val_categorical)
    score = rsh.score(X_test, y_test_categorical)
    print("Score: ", score, "Params:", rsh.best_params_)

    model = MLPClassifier(
        hidden_layer_sizes=rsh.best_params_['hidden_layer_sizes'],
        max_iter=rsh.best_params_['max_iter'],
        activation=rsh.best_params_['activation'],
        solver=rsh.best_params_['solver'],
        alpha=rsh.best_params_['alpha'],
        learning_rate=rsh.best_params_['learning_rate'],
    )
    model.fit(X_val, y_val_categorical)
    model.fit(X_train, y_train_categorical)
    prediction = model.predict(X_test)
    accuracy = accuracy_score(y_test_categorical, prediction)
    print('Accuracy: {} updated'.format(accuracy))
    save_results(prediction, 'MLP', False)
    print(accuracy)
    print(metrics.classification_report(y_test_categorical, prediction))
    return model, accuracy

In [18]:
mlp_model_vec, mlp_acc_vec = mlp_model(X_train_vec, X_test_vec, X_val_vec, y_train_vec, y_test_vec, y_val_vec)

  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


Score:  0.8607456140350878 Params: {'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (32, 128, 64), 'learning_rate': 'constant', 'max_iter': 400, 'solver': 'adam'}
0.9539473684210527
              precision    recall  f1-score   support

           0       0.92      0.96      0.94        49
           1       0.94      0.82      0.88        40
           2       0.81      0.91      0.86        23
           3       1.00      1.00      1.00        32
           4       0.97      0.90      0.93        31
           5       0.71      0.77      0.74        31
           6       1.00      1.00      1.00        20
           7       0.75      0.62      0.68        34
           8       1.00      1.00      1.00        22
           9       1.00      1.00      1.00        18
          10       0.97      0.82      0.89        39
          11       1.00      1.00      1.00        34
          12       0.89      0.87      0.88        38
          13       1.00      1.00      1.00      

In [13]:
filename = 'models/MLP_model.pkl'
joblib.dump(mlp_model_vec, filename)
print('Model MLP saved in {}'.format(filename))

Model MLP saved in models/MLP_model.pkl


### SVC

In [34]:
from sklearn.svm import SVC

def svc_model(X_train, X_test, X_val, y_train, y_test, y_val):
    y_train_categorical = [int(np.argmax(y)) for y in y_train]
    y_test_categorical = [int(np.argmax(y)) for y in y_test]
    y_val_categorical = [int(np.argmax(y)) for y in y_val]
    rng = np.random.RandomState(0)
    param_grid = {
        'C': [0.1, 1, 10, 100, 1000],
        'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
        'gamma': [0.1, 1, 10, 100],
        'degree': [0, 1, 2, 3, 4, 5, 6]
    }
    svc = SVC(C=0.1, kernel='poly')
    rsh = HalvingGridSearchCV(estimator=svc, param_grid=param_grid, random_state=rng, factor=3, n_jobs=-1)
    rsh.fit(X_val, y_val_categorical)
    score = rsh.score(X_test, y_test_categorical)
    print("Score: ", score, "Params:", rsh.best_params_)

    model = SVC(
        C=rsh.best_params_['C'],
        kernel=rsh.best_params_['kernel'],
        gamma=rsh.best_params_['gamma'],
        degree=rsh.best_params_['degree'],
    )

    model.fit(X_val, y_val_categorical)
    model.fit(X_train, y_train_categorical)
    prediction = model.predict(X_test)
    accuracy = accuracy_score(y_test_categorical, prediction)
    save_results(prediction, 'SVC', False)
    print(accuracy)
    print(metrics.classification_report(y_test_categorical, prediction))
    return model, accuracy

In [30]:
svc_model_vec, svc_acc_vec = svc_model(X_train_vec, X_test_vec, X_val_vec, y_train_vec, y_test_vec, y_val_vec)

  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


Score:  0.8951023391812866 Params: {'C': 10, 'degree': 0, 'gamma': 1, 'kernel': 'sigmoid'}
Accuracy: 0.9638157894736842 updated
Results saved in data/results/tf-idf/results.csv
0.9638157894736842
              precision    recall  f1-score   support

           0       0.92      0.98      0.95        49
           1       0.79      0.85      0.82        40
           2       0.78      0.91      0.84        23
           3       0.97      1.00      0.98        32
           4       0.93      0.81      0.86        31
           5       0.76      0.84      0.80        31
           6       1.00      1.00      1.00        20
           7       0.85      0.65      0.73        34
           8       1.00      1.00      1.00        22
           9       0.95      1.00      0.97        18
          10       0.91      1.00      0.95        39
          11       1.00      1.00      1.00        34
          12       0.92      0.89      0.91        38
          13       1.00      1.00      1.00    

In [31]:
filename = 'models/SVC_model.pkl'
joblib.dump(svc_model_vec, filename)
print('Model SVC saved in {}'.format(filename))

Model SVC saved in models/SVC_model.pkl


### Linear SVC

In [84]:
from sklearn.svm import LinearSVC

def linear_svc_model(X_train, X_test, X_val, y_train, y_test, y_val):
    y_train_categorical = [int(np.argmax(y)) for y in y_train]
    y_test_categorical = [int(np.argmax(y)) for y in y_test]
    y_val_categorical = [int(np.argmax(y)) for y in y_val]
    param_grid = {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'loss': ['hinge', 'squared_hinge'],
        'tol': [0.0001, 0.001],
        'multi_class': ['ovr', 'crammer_singer'],
    }
    # svc = LinearSVC(C=10, loss='squared_hinge', max_iter=100000)
    # rsh = HalvingGridSearchCV(estimator=svc, param_grid=param_grid, random_state=42, factor=2, n_jobs=-1)
    # rsh.fit(X_val, y_val_categorical)
    # score = rsh.score(X_test, y_test_categorical)
    # print("Score: ", score, "Params:", rsh.best_params_)

    model = LinearSVC(
        C=param_grid['C'][3],
        penalty=param_grid['penalty'][1],
        loss=param_grid['loss'][1],
        tol=param_grid['tol'][1],
        multi_class=param_grid['multi_class'][1],
        max_iter=100000,
    )

    model.fit(X_val, y_val_categorical)
    model.fit(X_train, y_train_categorical)
    prediction = model.predict(X_test)
    accuracy = accuracy_score(y_test_categorical, prediction)
    save_results(prediction, 'LinearSVC', False)
    print(accuracy)
    print(metrics.classification_report(y_test_categorical, prediction))
    return model, accuracy

In [13]:
linear_svc_model_vec, linear_svc_acc_vec = linear_svc_model(X_train_vec, X_test_vec, X_val_vec, y_train_vec, y_test_vec, y_val_vec)

  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


Score:  0.9071637426900585 Params: {'C': 1, 'loss': 'squared_hinge', 'multi_class': 'crammer_singer', 'penalty': 'l2', 'tol': 0.001}
Results saved in data/results/tf-idf/results.csv
0.9641812865497076
              precision    recall  f1-score   support

           0       0.89      0.98      0.93        49
           1       0.92      0.85      0.88        40
           2       0.95      0.91      0.93        23
           3       1.00      1.00      1.00        32
           4       0.96      0.87      0.92        31
           5       0.83      0.81      0.82        31
           6       1.00      1.00      1.00        20
           7       0.80      0.59      0.68        34
           8       1.00      1.00      1.00        22
           9       1.00      1.00      1.00        18
          10       0.93      1.00      0.96        39
          11       1.00      1.00      1.00        34
          12       0.92      0.89      0.91        38
          13       1.00      1.00      1.0

In [15]:
filename = 'models/LinearSVC_model.pkl'
joblib.dump(linear_svc_model_vec, filename)
print('Model Linear SVC saved in {}'.format(filename))

Model Linear SVC saved in models/LinearSVC_model.pkl


### NuSVC

In [41]:
from sklearn.svm import NuSVC

def nu_svc_model(X_train, X_test, X_val, y_train, y_test, y_val):
    y_train_categorical = [int(np.argmax(y)) for y in y_train]
    y_test_categorical = [int(np.argmax(y)) for y in y_test]
    y_val_categorical = [int(np.argmax(y)) for y in y_val]
    param_grid = {
        'nu': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'tol': [0.0001, 0.001],
        'gamma': ['scale', 'auto', 0.1, 1],
    }
    svc = NuSVC(nu=0.01, kernel='poly', max_iter=100000)
    rsh = HalvingGridSearchCV(estimator=svc, param_grid=param_grid, random_state=42, factor=2, n_jobs=-1)
    rsh.fit(X_val, y_val_categorical)
    score = rsh.score(X_test, y_test_categorical)
    print("Score: ", score, "Params:", rsh.best_params_)

    model = NuSVC(
        nu=rsh.best_params_['nu'],
        kernel=rsh.best_params_['kernel'],
        tol=rsh.best_params_['tol'],
        gamma=rsh.best_params_['gamma'],
        max_iter=100000,
    )

    model.fit(X_val, y_val_categorical)
    model.fit(X_train, y_train_categorical)
    prediction = model.predict(X_test)
    accuracy = accuracy_score(y_test_categorical, prediction)
    save_results(prediction, 'NuSVC', False)
    print(accuracy)
    print(metrics.classification_report(y_test_categorical, prediction))
    return model, accuracy

In [26]:
nu_svc_model_vec, nu_svc_acc_vec = nu_svc_model(X_train_vec, X_test_vec, X_val_vec, y_train_vec, y_test_vec, y_val_vec)

  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


Score:  0.8961988304093568 Params: {'gamma': 0.1, 'kernel': 'rbf', 'nu': 0.2, 'tol': 0.0001}
0.9543128654970761
              precision    recall  f1-score   support

           0       0.90      0.96      0.93        49
           1       0.82      0.82      0.82        40
           2       0.87      0.87      0.87        23
           3       1.00      1.00      1.00        32
           4       0.97      0.90      0.93        31
           5       0.76      0.81      0.78        31
           6       1.00      1.00      1.00        20
           7       0.80      0.59      0.68        34
           8       0.92      1.00      0.96        22
           9       1.00      1.00      1.00        18
          10       0.95      0.92      0.94        39
          11       1.00      1.00      1.00        34
          12       0.92      0.92      0.92        38
          13       1.00      1.00      1.00        42
          14       1.00      1.00      1.00        28
          15       1.00

In [62]:
# best_nu_svc_model = nu_svc_model_tkn if nu_svc_acc_tkn > nu_svc_acc_vec else nu_svc_model_vec
filename = 'models/NuSVC_model.pkl'
joblib.dump(nu_svc_model_vec, filename)
print('Model NuSVC saved in {}'.format(filename))

Model NuSVC saved in models/NuSVC_model.pkl


### SGD

In [83]:
from sklearn.linear_model import SGDClassifier

def sgd_model(X_train, X_test, X_val, y_train, y_test, y_val):
    y_train_categorical = [int(np.argmax(y)) for y in y_train]
    y_test_categorical = [int(np.argmax(y)) for y in y_test]
    y_val_categorical = [int(np.argmax(y)) for y in y_val]
    param_grid = {
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1],
        'loss': ['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_error', 'huber'],
        'penalty': ['l1', 'l2', 'elasticnet'],
        'learning_rate': ['adaptive', 'invscaling', 'optimal'],
        'epsilon': [0.001, 0.01, 0.1],
    }
    sgd = SGDClassifier(alpha=0.001, loss='perceptron', learning_rate='adaptive', max_iter=5000)
    rsh = GridSearchCV(estimator=sgd, param_grid=param_grid, cv=2, n_jobs=-1)
    rsh.fit(X_val, y_val_categorical)
    score = rsh.score(X_test, y_test_categorical)
    print("Score: ", score, "Params:", rsh.best_params_)

    model = SGDClassifier(
        alpha=rsh.best_params_['alpha'][0],
        loss=rsh.best_params_['loss'][4],
        penalty=rsh.best_params_['penalty'][1],
        learning_rate=rsh.best_params_['learning_rate'][2],
        epsilon=rsh.best_params_['epsilon'][0],
        max_iter=100000,
    )

    model.fit(X_val, y_val_categorical)
    model.fit(X_train, y_train_categorical)
    prediction = model.predict(X_test)
    accuracy = accuracy_score(y_test_categorical, prediction)
    # if accuracy > 0.9638157894736842: parameters code 0-4-1-2-0
    save_results(prediction, 'SGD', False)
    print(accuracy)
    print(metrics.classification_report(y_test_categorical, prediction))
    return model, accuracy

In [75]:
sgd_model_vec, sgd_acc_vec = sgd_model(X_train_vec, X_test_vec, X_val_vec, y_train_vec, y_test_vec, y_val_vec)

  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


Score:  0.8866959064327485 Params: {'alpha': 0.0001, 'epsilon': 0.01, 'learning_rate': 'optimal', 'loss': 'log_loss', 'penalty': 'l1'}
0.8987573099415205
              precision    recall  f1-score   support

           0       0.83      0.92      0.87        49
           1       0.65      0.78      0.70        40
           2       0.80      0.17      0.29        23
           3       1.00      0.97      0.98        32
           4       0.90      0.90      0.90        31
           5       0.56      0.29      0.38        31
           6       1.00      1.00      1.00        20
           7       0.60      0.35      0.44        34
           8       0.92      1.00      0.96        22
           9       0.00      0.00      0.00        18
          10       0.91      0.77      0.83        39
          11       1.00      1.00      1.00        34
          12       0.72      0.61      0.66        38
          13       1.00      1.00      1.00        42
          14       1.00      1.00  

In [61]:
# best_sgd_model = sgd_model_tkn if sgd_acc_tkn > sgd_acc_vec else sgd_model_vec
filename = 'models/SGD_model.pkl'
joblib.dump(sgd_model_vec, filename)
print('Model SGD saved in {}'.format(filename))

Model SGD saved in models/SGD_model.pkl


### Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression

def lr_model(X_train, X_test, X_val, y_train, y_test, y_val):
    y_train_categorical = [int(np.argmax(y)) for y in y_train]
    y_test_categorical = [int(np.argmax(y)) for y in y_test]
    y_val_categorical = [int(np.argmax(y)) for y in y_val]
    param_grid = {
        'penalty': ['l2', 'elasticnet'],
        'tol': [0.0001, 0.001],
        'C': [0.1, 1, 10, 100],
        'solver': ['lbfgs', 'newton-cg', 'newton-cholesky', 'sag'],
    }
    lr_model = LogisticRegression(max_iter=5000)
    rsh = HalvingGridSearchCV(estimator=lr_model, param_grid=param_grid, factor=3, n_jobs=-1)
    rsh.fit(X_val, y_val_categorical)
    score = rsh.score(X_test, y_test_categorical)
    print("Score: ", score, "Params:", rsh.best_params_)

    model = LogisticRegression(
        penalty=rsh.best_params_['penalty'],
        tol=rsh.best_params_['tol'],
        C=rsh.best_params_['C'],
        solver=rsh.best_params_['solver'],
        max_iter=5000,
    )

    model.fit(X_val, y_val_categorical)
    model.fit(X_train, y_train_categorical)
    prediction = model.predict(X_test)
    accuracy = accuracy_score(y_test_categorical, prediction)
    save_results(prediction, 'LR', False)
    print(accuracy)
    print(metrics.classification_report(y_test_categorical, prediction))
    return model, accuracy

In [12]:
lr_model_vec, lr_acc_vec = lr_model(X_train_vec, X_test_vec, X_val_vec, y_train_vec, y_test_vec, y_val_vec)

  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


Score:  0.9064327485380117 Params: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg', 'tol': 0.0001}
Results saved in data/results/tf-idf/results.csv
0.9667397660818714
              precision    recall  f1-score   support

           0       0.92      0.98      0.95        49
           1       0.95      0.90      0.92        40
           2       0.88      0.91      0.89        23
           3       1.00      1.00      1.00        32
           4       0.97      0.90      0.93        31
           5       0.76      0.81      0.78        31
           6       1.00      1.00      1.00        20
           7       0.81      0.65      0.72        34
           8       1.00      1.00      1.00        22
           9       1.00      1.00      1.00        18
          10       0.93      1.00      0.96        39
          11       1.00      1.00      1.00        34
          12       0.92      0.95      0.94        38
          13       1.00      1.00      1.00        42
          14       1

In [21]:
# best_lr_model = lr_model_tkn if lr_acc_tkn > lr_acc_vec else lr_model_vec
filename = 'models/LR_model.pkl'
joblib.dump(lr_model_vec, filename)
print('Model LR saved in {}'.format(filename))

Model LR saved in models/LR_model.pkl


### KNeighbors

In [38]:
from sklearn.neighbors import KNeighborsClassifier

def knc_model(X_train, X_test, X_val, y_train, y_test, y_val):
    y_train_categorical = [int(np.argmax(y)) for y in y_train]
    y_test_categorical = [int(np.argmax(y)) for y in y_test]
    y_val_categorical = [int(np.argmax(y)) for y in y_val]
    param_grid = {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'leaf_size': [10, 20, 30, 40, 50],
        'metric': ['euclidean', 'manhattan', 'cityblock', 'cosine'],
        'p': [0.1, 0.2, 0.5, 1, 2],
    }
    knc = KNeighborsClassifier(n_neighbors=5, algorithm='kd_tree', weights='distance', leaf_size=10)
    rsh = HalvingGridSearchCV(estimator=knc, param_grid=param_grid, factor=5, n_jobs=-1)
    rsh.fit(X_val, y_val_categorical)
    score = rsh.score(X_test, y_test_categorical)
    print("Score: ", score, "Params:", rsh.best_params_)

    model = KNeighborsClassifier(
        n_neighbors=rsh.best_params_['n_neighbors'],
        weights=rsh.best_params_['weights'],
        algorithm=rsh.best_params_['algorithm'],
        leaf_size=rsh.best_params_['leaf_size'],
        metric=rsh.best_params_['metric'],
        p=rsh.best_params_['p'],
    )

    model.fit(X_val, y_val_categorical)
    model.fit(X_train, y_train_categorical)
    prediction = model.predict(X_test)
    accuracy = accuracy_score(y_test_categorical, prediction)
    if accuracy > 0.9506578947368421: # parameters code 1-1-0-0-0-2
        save_results(prediction, 'KNC', False)
    print(accuracy)
    print(metrics.classification_report(y_test_categorical, prediction))
    return model, accuracy

In [39]:
knc_model_vec, knc_acc_vec = knc_model(X_train_vec, X_test_vec, X_val_vec, y_train_vec, y_test_vec, y_val_vec)

  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


Score:  0.8603801169590644 Params: {'algorithm': 'auto', 'leaf_size': 10, 'metric': 'cosine', 'n_neighbors': 5, 'p': 0.1, 'weights': 'distance'}
0.9506578947368421
              precision    recall  f1-score   support

           0       0.81      0.94      0.87        49
           1       0.88      0.75      0.81        40
           2       0.81      0.91      0.86        23
           3       1.00      1.00      1.00        32
           4       0.96      0.87      0.92        31
           5       0.84      0.84      0.84        31
           6       1.00      1.00      1.00        20
           7       0.80      0.59      0.68        34
           8       1.00      1.00      1.00        22
           9       0.78      1.00      0.88        18
          10       0.97      0.79      0.87        39
          11       1.00      1.00      1.00        34
          12       0.89      0.84      0.86        38
          13       1.00      1.00      1.00        42
          14       1.00  

In [40]:
# best_knc_model = knc_model_tkn if knc_acc_tkn > knc_acc_vec else knc_model_vec
filename = 'models/KNC_model.pkl'
joblib.dump(knc_model_vec, filename)
print('Model KNC saved in {}'.format(filename))

Model KNC saved in models/KNC_model.pkl


### LSTM

In [45]:
embeddings_index = {}
with open('models/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print(f"Found {len(embeddings_index)} word vectors.")

Found 400000 word vectors.


In [48]:
embedding_dim = 100  # Must match the GloVe file dimension
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in token.word_index.items():
    if i < num_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [52]:
from tensorflow.keras.models import Sequential

def lstm_model(X_train, X_test, X_val, y_train, y_test, y_val):
    model = Sequential([
        Embedding(num_words, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False),
        Bidirectional(LSTM(128, return_sequences=True)),
        Bidirectional(LSTM(64)),
        Dense(128, activation='relu'),
        Dropout(0.4),
        Dense(num_categories, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    X = np.concatenate((X_train, X_val))
    y = np.concatenate((y_train, y_val))
    model.fit(X, y, epochs=30, batch_size=50, verbose=True, validation_data=(X_test, y_test))
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    print(f"Test Accuracy: {accuracy}")
    print(f"Test Loss: {loss}")
    prediction = model.predict(X_test)
    y_pred = [int(np.argmax(y)) for y in prediction]
    y_categorical = [int(np.argmax(y)) for y in y_test]
    save_results(y_pred, 'LSTM', False)
    print(classification_report(y_categorical, y_pred))
    return model, accuracy

In [107]:
lstm_model_vec, lstm_acc_vec = lstm_model(X_train_tkn, X_test_tkn, X_val_tkn, y_train_tkn, y_test_tkn, y_val_tkn)

Epoch 1/30
[1m311/311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 61ms/step - accuracy: 0.1927 - loss: 3.4239 - val_accuracy: 0.7591 - val_loss: 0.9209
Epoch 2/30
[1m311/311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 66ms/step - accuracy: 0.7290 - loss: 0.9809 - val_accuracy: 0.8695 - val_loss: 0.4974
Epoch 3/30
[1m311/311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 72ms/step - accuracy: 0.8498 - loss: 0.5700 - val_accuracy: 0.8988 - val_loss: 0.3815
Epoch 4/30
[1m311/311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 79ms/step - accuracy: 0.8939 - loss: 0.4060 - val_accuracy: 0.9097 - val_loss: 0.3212
Epoch 5/30
[1m311/311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 74ms/step - accuracy: 0.9170 - loss: 0.3171 - val_accuracy: 0.9240 - val_loss: 0.2910
Epoch 6/30
[1m311/311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 66ms/step - accuracy: 0.9157 - loss: 0.3034 - val_accuracy: 0.9196 - val_loss: 0.2811
Epoch 7/30
[1m3

In [109]:
lstm_model_vec.save('models/LSTM_model.keras')

### BERT Classification

In [8]:
hub_preprocessor = "https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-preprocess/2"
hub_encoder = "https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-base/1"

In [13]:
def build_classifier_model(X_train, X_test, y_train, y_test):
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing = hub.KerasLayer(hub_preprocessor, name='preprocessing')(text_input)
    encoder = hub.KerasLayer(hub_encoder, name='BERT_encoder')(preprocessing)
    x = tf.keras.layers.Dense(128, activation='relu')(encoder['pooled_output'])
    x = tf.keras.layers.Dropout(0.2, name="dropout")(x)
    x = tf.keras.layers.Dense(num_categories, activation='softmax', name="output")(x)

    model = tf.keras.Model(text_input, x)
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=50, batch_size=16, verbose=True, validation_data=(X_test, y_test))
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    print(f"Test Accuracy: {accuracy}")
    print(f"Test Loss: {loss}")
    prediction = model.predict(X_test)
    y_pred = [int(np.argmax(y)) for y in prediction]
    y_categorical = [int(np.argmax(y)) for y in y_test]
    if accuracy > 0.9543128609657288: # loss: 0.1895463764667511 (sigmoid + adamw)
        save_results(y_pred, 'BERT', False)
    print(classification_report(y_categorical, y_pred))
    return model, accuracy

In [10]:
X_train = np.array(token.sequences_to_texts(np.concatenate((X_train_tkn, X_val_tkn))))
X_test = np.array(token.sequences_to_texts(X_test_tkn))
y_train = np.concatenate((y_train_tkn, y_val_tkn))
y_train.shape, type(X_train), type(y_train)

((15504, 95), numpy.ndarray, numpy.ndarray)

In [14]:
bert_model_vec, bert_acc_vec = build_classifier_model(X_train, X_test, y_train, y_test_tkn)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Accuracy: 0.9543128609657288
Test Loss: 0.1895463764667511
Results saved in data/results/classifiers-results.csv
              precision    recall  f1-score   support

           0       0.90      0.92      0.91        49
           1       0.86      0.80      0.83        40
           2       0.95      0.78      0.86        23
           3       1.00      1.00      1.00        32
           4       0

In [15]:
bert_model_vec.save('models/BERT_model.keras')

### Word Embedding

In [72]:
embedding_dim=100
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")
model = Sequential([
    Embedding(num_words, embedding_dim, input_length=max_length, name="embedding"),
    Bidirectional(LSTM(256, return_sequences=True, activation='tanh', recurrent_activation='sigmoid')),
    Bidirectional(LSTM(128)),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(num_categories, activation='softmax')
])

In [73]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [74]:
model.fit(X_val_tkn, y_val_tkn, epochs=10, batch_size=30, verbose=True, validation_data=(X_test_tkn, y_test_tkn))

Epoch 1/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 100ms/step - accuracy: 0.0074 - loss: 0.2956 - val_accuracy: 0.0069 - val_loss: 0.0597
Epoch 2/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 107ms/step - accuracy: 0.0132 - loss: 0.0678 - val_accuracy: 0.0069 - val_loss: 0.0590
Epoch 3/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 115ms/step - accuracy: 0.0132 - loss: 0.0646 - val_accuracy: 0.0216 - val_loss: 0.0587
Epoch 4/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 123ms/step - accuracy: 0.0061 - loss: 0.0632 - val_accuracy: 0.0077 - val_loss: 0.0586
Epoch 5/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 130ms/step - accuracy: 0.0092 - loss: 0.0627 - val_accuracy: 0.0128 - val_loss: 0.0583
Epoch 6/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 133ms/step - accuracy: 0.0165 - loss: 0.0611 - val_accuracy: 0.0249 - val_loss: 0.0567
Epoch 7/10
[1m92/92[

<keras.src.callbacks.history.History at 0x792a783a65a0>

In [75]:
loss, accuracy = model.evaluate(X_test_tkn, y_test_tkn)
print("Loss: ", loss, "Accuracy: ", accuracy)

[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 29ms/step - accuracy: 0.0455 - loss: 0.0521
Loss:  0.0518878735601902 Accuracy:  0.05190058425068855
