In [120]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

### Catching off-line data

In [125]:
trust_models = pd.read_csv("data/results/trust_model.csv").drop(columns=['BERT'], axis=1)
np_trust_model = np.array(trust_models)
min_val = np_trust_model.min()
max_val = np_trust_model.max()
normalized_trust_models = (np_trust_model - min_val) / (max_val - min_val)
trust_models.head()

Unnamed: 0,RFC,MLP,SVC,LinearSVC,NuSVC,SGD,LR,KNC,LSTM
0,0.966374,0.953947,0.963816,0.964181,0.954313,0.898757,0.96674,0.950658,0.964547


In [3]:
# after that I ask to gemini to generate new data for categories with less count
all_data = pd.read_csv("data/filtered/normalized.csv")
all_data = all_data.dropna()
all_data = all_data.drop_duplicates()
all_data.shape

(18240, 2)

### Text pre process

In [4]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

def encode_labels(categories):
    label_encoder = LabelEncoder()
    category_encoded = label_encoder.fit_transform(categories)
    print(category_encoded)
    num_categories = len(label_encoder.classes_)
    y = to_categorical(category_encoded)
    return label_encoder, y, num_categories

def decoded_label(label_encoder, y_encoded, categorical=True):
    label = [int(np.argmax(y)) for y in y_encoded] if categorical else y_encoded
    return label_encoder.inverse_transform(label)

2025-06-29 00:02:21.977080: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751166141.989376   43752 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751166141.993314   43752 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1751166142.004194   43752 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1751166142.004210   43752 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1751166142.004211   43752 computation_placer.cc:177] computation placer alr

In [5]:
categories = all_data['category']
label_encoder, y, num_categories = encode_labels(categories)
print(num_categories, y)

[48 70 46 ... 72 72 72]
95 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer()
resumes_vec = tfidf_vec.fit_transform(all_data['resume'])
inputs = len(tfidf_vec.get_feature_names_out())
print("Tamanho:", len(tfidf_vec.get_feature_names_out()), "Vocabulário:", tfidf_vec.get_feature_names_out(), )
resumes_vec

Tamanho: 48295 Vocabulário: ['aa' 'aaa' 'aaacom' ... 'zyvox' 'zz' 'zzxzx']


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2122670 stored elements and shape (18240, 48295)>

In [7]:
X_train_vec, X_test_vec, y_train_vec, y_test_vec = train_test_split(resumes_vec, y, test_size=0.4, random_state=42)
X_train_vec.shape, X_test_vec.shape, y_train_vec.shape, y_test_vec.shape

((10944, 48295), (7296, 48295), (10944, 95), (7296, 95))

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

num_words = 20000
max_length = 500
oov_tok = "<OOV>"
token = Tokenizer(num_words=num_words, oov_token=oov_tok)
token.fit_on_texts(all_data['resume'])
inputs = len(token.word_index)
resumes_token = token.texts_to_sequences(all_data['resume'])
resumes_token = pad_sequences(resumes_token, padding="post", truncating="post", maxlen=max_length)
inputs, resumes_token

(48319,
 array([[  34, 1307, 1032, ...,    0,    0,    0],
        [  34,  350, 2123, ...,    0,    0,    0],
        [  34,  769,   59, ...,    0,    0,    0],
        ...,
        [   9,  621,   61, ...,    0,    0,    0],
        [ 154,  307,  602, ...,   23, 2651,   23],
        [ 154,  307,  602, ...,   23, 2651,   23]], dtype=int32))

In [9]:
X_train_tkn, X_test_tkn, y_train_tkn, y_test_tkn = train_test_split(resumes_token, y, test_size=0.4, random_state=42)
X_train_tkn.shape, X_test_tkn.shape, y_train_tkn.shape, y_test_tkn.shape

((10944, 500), (7296, 500), (10944, 95), (7296, 95))

### Load models

In [10]:
import joblib
import tensorflow as tf
import tensorflow_text as text
import tensorflow_hub as hub

rfc = joblib.load('models/RFC_model.pkl')
svc = joblib.load('models/SVC_model.pkl')
sgd = joblib.load('models/SGD_model.pkl')
nu_svc = joblib.load('models/NuSVC_model.pkl')
mpl = joblib.load('models/MLP_model.pkl')
lr = joblib.load('models/LR_model.pkl')
linear_svc = joblib.load('models/LinearSVC_model.pkl')
knc = joblib.load('models/KNC_model.pkl')
lstm = tf.keras.models.load_model('models/LSTM_model.keras')
# bert = tf.keras.models.load_model('models/BERT_model.keras')

I0000 00:00:1751166159.356693   43752 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9712 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3060, pci bus id: 0000:01:00.0, compute capability: 8.6
  saveable.load_own_variables(weights_store.get(inner_path))


In [11]:
pred = {
    # 'target': decoded_label(label_encoder, y_test_vec),
    'rfc': rfc.predict(X_test_vec),
    'svc': svc.predict(X_test_vec),
    'sgd': sgd.predict(X_test_vec),
    'nu_svc': nu_svc.predict(X_test_vec),
    'mpl': mpl.predict(X_test_vec),
    'lr': lr.predict(X_test_vec),
    'linear_svc': linear_svc.predict(X_test_vec),
    'knc': knc.predict(X_test_vec),
    'lstm': [int(np.argmax(y)) for y in lstm.predict(X_test_tkn)],
    # 'bert': bert_pred
}

[1m  1/228[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:51[0m 755ms/step

I0000 00:00:1751166223.532613   43863 cuda_dnn.cc:529] Loaded cuDNN version 91002


[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step


In [12]:
results = pd.DataFrame(pred)
results

Unnamed: 0,rfc,svc,sgd,nu_svc,mpl,lr,linear_svc,knc,lstm
0,73,73,73,73,73,73,73,73,73
1,92,92,92,92,92,92,92,92,92
2,52,52,52,52,52,52,52,52,52
3,81,81,81,81,81,81,81,81,81
4,78,78,78,78,78,78,78,78,78
...,...,...,...,...,...,...,...,...,...
7291,24,24,24,24,24,24,24,24,24
7292,49,49,49,49,49,49,49,49,49
7293,16,16,16,16,16,16,16,16,16
7294,23,23,23,23,23,23,23,23,23


### Test decision

In [173]:
def get_nearest(point):
    central_point = point
    nearest = float('inf')
    for v in x:
        diff = abs(v - central_point)
        central_point = central_point if nearest < diff else v

    return central_point

In [184]:
def pred_ensemble(x, weights):
    if np.std(x) == 0:
        return x[0]
    else:
        mode = stats.mode(x)
        if mode.count == 8:
            return round(mode.mode)
        else:
            average = round(np.average(x, weights=weights))
            median = round(np.median(x))
            if average == median:
                return int(average)
            central_point_average = get_nearest(average)
            count_central_point_average = np.count_nonzero(x == central_point_average)
            central_point_median = get_nearest(median)
            count_central_point_median = np.count_nonzero(x == central_point_median)
            central_point = central_point_average if count_central_point_average > count_central_point_median else central_point_median
            return int(central_point)

In [185]:
from scipy import stats
from sklearn.metrics import classification_report

trust_models = np.ravel(np.array(trust_models))
y_test = [int(np.argmax(y)) for y in y_test_vec]
statistic_predictions = []
for x in np.array(results):
    statistic_predictions.append(pred_ensemble(x, trust_models))

report = classification_report(y_test, statistic_predictions, output_dict=True)
{
    'accuracy': report['accuracy'],
    'macro avg': report['macro avg'],
    'weighted avg': report['weighted avg'],
}

{'accuracy': 0.9897203947368421,
 'macro avg': {'precision': 0.9913524351525886,
  'recall': 0.9909332967596383,
  'f1-score': 0.9910352308390453,
  'support': 7296.0},
 'weighted avg': {'precision': 0.9899479593291411,
  'recall': 0.9897203947368421,
  'f1-score': 0.9897115242629699,
  'support': 7296.0}}

### Real test

In [186]:
resume_text = """
Detail-oriented and proactive Marketing Assistant with a strong foundation in digital marketing, content creation, and market research. Eager to contribute to a dynamic marketing team, bringing a solid work ethic and a passion for developing and executing innovative marketing campaigns that drive brand awareness and engagement. Proficient in a variety of marketing software and social media platforms, with a proven ability to manage multiple projects and deadlines effectively.

---

Education

Bachelor of Science in Marketing
[University Name], [City, State]
[Month, Year] of Graduation

---

Experience

Marketing Intern | [Previous Company Name] | [City, State]
[Month, Year] – [Month, Year]

* Assisted in the development and implementation of a comprehensive social media strategy across Instagram, Facebook, and Twitter, resulting in a 15% increase in follower engagement over three months.
* Conducted market research and competitive analysis to identify emerging trends and inform campaign planning.
* Created and scheduled engaging content for various social media platforms, including graphics, videos, and blog posts.
* Monitored and reported on key performance indicators (KPIs) for social media campaigns, providing insights and recommendations for optimization.
* Supported the marketing team in organizing and promoting a major virtual event, which attracted over 500 attendees.
* Assisted with email marketing campaigns by helping to create newsletters and promotional emails using Mailchimp.

Sales Associate | [Retail Company Name] | [City, State]
[Month, Year] – [Month, Year]

* Provided excellent customer service, resulting in a high rate of customer satisfaction and repeat business.
* Assisted with in-store promotional events and product demonstrations.
* Gained valuable insights into consumer behavior and sales techniques.

---

Skills

Marketing Skills:
* Social Media Management (Instagram, Facebook, Twitter, LinkedIn)
* Content Creation (Canva, Adobe Spark)
* Email Marketing (Mailchimp)
* Market Research
* Search Engine Optimization (SEO) Basics
* Data Analysis & Reporting

Technical Skills:
* Microsoft Office Suite (Word, Excel, PowerPoint)
* Google Analytics (Basic)
* Hootsuite

Soft Skills:
* Strong written and verbal communication
* Excellent organizational and time management skills
* Detail-oriented
* Creative and analytical thinker
* Team collaboration
* Adaptability
"""

In [187]:
import re
import string
import spacy

nlp = spacy.load('en_core_web_sm')

resume_text = str(resume_text).lower()  # Convert to lowercase
resume_text = re.sub(r'\n', ' ', resume_text)  # Remove newlines
resume_text = re.sub(r'\[.*?\]', '', resume_text)  # Remove text in brackets
resume_text = re.sub(r'[%s]' % re.escape(string.punctuation), '', resume_text)  # Remove punctuation
resume_text = re.sub(r'\w*\d\w*', '', resume_text)  # Remove words with numbers
resume_text = re.sub(r'https?://\S+|www\.\S+|ftp\.\S+', '', resume_text) # remove ulrs
resume_text = re.sub(r'\S+@\S+', '', resume_text) # remove emails
resume_text = re.sub('[^A-Za-z0-9 ]+', '', resume_text) # no special character but space
resume_text = re.sub(r'\s+', ' ', resume_text)  # Remove extra spaces
resume_text = resume_text.strip()
doc = nlp(resume_text)
tokens = [token.text for token in doc if not token.is_stop]
resume_text = ' '.join(tokens)
doc = nlp(resume_text)
lemmatized_tokens = [token.lemma_ for token in doc]
resume_text = ' '.join(lemmatized_tokens)
resume_text

'detailoriente proactive marketing assistant strong foundation digital marketing content creation market research eager contribute dynamic marketing team bring solid work ethic passion develop execute innovative marketing campaign drive brand awareness engagement proficient variety marketing software social medium platform prove ability manage multiple project deadline effectively education bachelor science marketing graduation experience marketing intern assist development implementation comprehensive social medium strategy instagram facebook twitter result increase follower engagement month conduct market research competitive analysis identify emerge trend inform campaign planning create schedule engage content social medium platform include graphic video blog post monitor report key performance indicator kpi social medium campaign provide insight recommendation optimization support marketing team organize promote major virtual event attract attendee assist email marketing campaign h

In [188]:
resume_vec = tfidf_vec.transform([resume_text])
resume_vec.shape

(1, 48295)

In [189]:
resume_token = token.texts_to_sequences([resume_text])
resume_token = pad_sequences(resume_token, padding="post", truncating="post", maxlen=max_length)
resume_token.shape

(1, 500)

In [190]:
pred_resume = {
    # 'target': decoded_label(label_encoder, y_test_vec),
    'rfc': rfc.predict(resume_vec),
    'svc': svc.predict(resume_vec),
    'sgd': sgd.predict(resume_vec),
    'nu_svc': nu_svc.predict(resume_vec),
    'mpl': mpl.predict(resume_vec),
    'lr': lr.predict(resume_vec),
    'linear_svc': linear_svc.predict(resume_vec),
    'knc': knc.predict(resume_vec),
    'lstm': [int(np.argmax(y)) for y in lstm.predict(resume_token)],
    # 'bert': bert_pred
}

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 207ms/step


In [192]:
result_resume = pd.DataFrame(pred_resume)
encode_category_pred = pred_ensemble(result_resume.iloc[0], trust_models)
encode_category_pred

37

In [193]:
category = decoded_label(label_encoder, [encode_category_pred], False)
category[0]

'digitalmedia'