In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

### Catching off-line data

In [2]:
trust_models = pd.read_csv("data/results/trust_model.csv").drop(columns=['BERT'], axis=1)
trust_models.head()

Unnamed: 0,RFC,MLP,SVC,LinearSVC,NuSVC,SGD,LR,KNC,LSTM
0,0.966374,0.953947,0.963816,0.964181,0.954313,0.898757,0.96674,0.950658,0.964547


In [3]:
# after that I ask to gemini to generate new data for categories with less count
all_data = pd.read_csv("data/filtered/normalized.csv")
all_data = all_data.dropna()
all_data = all_data.drop_duplicates()
all_data.shape

(18240, 2)

### Text pre process

In [28]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

def encode_labels(categories):
    label_encoder = LabelEncoder()
    category_encoded = label_encoder.fit_transform(categories)
    print(category_encoded)
    num_categories = len(label_encoder.classes_)
    y = to_categorical(category_encoded)
    return label_encoder, y, num_categories

def decoded_label(label_encoder, y_encoded, categorical=True):
    label = [int(np.argmax(y)) for y in y_encoded] if categorical else y_encoded
    return label_encoder.inverse_transform(label)

In [5]:
categories = all_data['category']
label_encoder, y, num_categories = encode_labels(categories)
print(num_categories, y)

[48 70 46 ... 72 72 72]
95 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer()
resumes_vec = tfidf_vec.fit_transform(all_data['resume'])
inputs = len(tfidf_vec.get_feature_names_out())
print("Tamanho:", len(tfidf_vec.get_feature_names_out()), "Vocabulário:", tfidf_vec.get_feature_names_out(), )
resumes_vec

Tamanho: 48295 Vocabulário: ['aa' 'aaa' 'aaacom' ... 'zyvox' 'zz' 'zzxzx']


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2122670 stored elements and shape (18240, 48295)>

In [37]:
X_train_vec, X_test_vec, y_train_vec, y_test_vec = train_test_split(resumes_vec, y, test_size=0.4, random_state=42)
X_train_vec.shape, X_test_vec.shape, y_train_vec.shape, y_test_vec.shape

((10944, 48295), (7296, 48295), (10944, 95), (7296, 95))

In [38]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

num_words = 20000
max_length = 500
oov_tok = "<OOV>"
token = Tokenizer(num_words=num_words, oov_token=oov_tok)
token.fit_on_texts(all_data['resume'])
inputs = len(token.word_index)
resumes_token = token.texts_to_sequences(all_data['resume'])
resumes_token = pad_sequences(resumes_token, padding="post", truncating="post", maxlen=max_length)
inputs, resumes_token

(48319,
 array([[  34, 1307, 1032, ...,    0,    0,    0],
        [  34,  350, 2123, ...,    0,    0,    0],
        [  34,  769,   59, ...,    0,    0,    0],
        ...,
        [   9,  621,   61, ...,    0,    0,    0],
        [ 154,  307,  602, ...,   23, 2651,   23],
        [ 154,  307,  602, ...,   23, 2651,   23]], dtype=int32))

In [39]:
X_train_tkn, X_test_tkn, y_train_tkn, y_test_tkn = train_test_split(resumes_token, y, test_size=0.4, random_state=42)
X_train_tkn.shape, X_test_tkn.shape, y_train_tkn.shape, y_test_tkn.shape

((10944, 500), (7296, 500), (10944, 95), (7296, 95))

### Load models

In [10]:
import joblib
import tensorflow as tf
import tensorflow_text as text
import tensorflow_hub as hub

rfc = joblib.load('models/RFC_model.pkl')
svc = joblib.load('models/SVC_model.pkl')
sgd = joblib.load('models/SGD_model.pkl')
nu_svc = joblib.load('models/NuSVC_model.pkl')
mpl = joblib.load('models/MLP_model.pkl')
lr = joblib.load('models/LR_model.pkl')
linear_svc = joblib.load('models/LinearSVC_model.pkl')
knc = joblib.load('models/KNC_model.pkl')
lstm = tf.keras.models.load_model('models/LSTM_model.keras')
# bert = tf.keras.models.load_model('models/BERT_model.keras')

I0000 00:00:1751160918.842222   37362 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9695 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3060, pci bus id: 0000:01:00.0, compute capability: 8.6
  saveable.load_own_variables(weights_store.get(inner_path))


In [11]:
pred = {
    # 'target': decoded_label(label_encoder, y_test_vec),
    'rfc': rfc.predict(X_test_vec),
    'svc': svc.predict(X_test_vec),
    'sgd': sgd.predict(X_test_vec),
    'nu_svc': nu_svc.predict(X_test_vec),
    'mpl': mpl.predict(X_test_vec),
    'lr': lr.predict(X_test_vec),
    'linear_svc': linear_svc.predict(X_test_vec),
    'knc': knc.predict(X_test_vec),
    'lstm': [int(np.argmax(y)) for y in lstm.predict(X_test_tkn)],
    # 'bert': bert_pred
}

[1m  1/228[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:48[0m 743ms/step

I0000 00:00:1751160981.778646   37457 cuda_dnn.cc:529] Loaded cuDNN version 91002


[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step


In [52]:
results = pd.DataFrame(pred)
results

Unnamed: 0,rfc,svc,sgd,nu_svc,mpl,lr,linear_svc,knc,lstm
0,73,73,73,73,73,73,73,73,73
1,92,92,92,92,92,92,92,92,92
2,52,52,52,52,52,52,52,52,52
3,81,81,81,81,81,81,81,81,81
4,78,78,78,78,78,78,78,78,78
...,...,...,...,...,...,...,...,...,...
7291,24,24,24,24,24,24,24,24,24
7292,49,49,49,49,49,49,49,49,49
7293,16,16,16,16,16,16,16,16,16
7294,23,23,23,23,23,23,23,23,23


### Test decision

In [48]:
def pred_ensemble(x):
    if np.std(x) == 0:
        return x[0]
    else:
        mode = stats.mode(x)
        if mode.count > 5:
            return int(mode.mode)
        else:
            mean = np.average(x, weights=trust_models)
            # mean = np.mean(x)
            nearest = len(label_encoder.classes_)
            for v in x:
                diff = abs(v - mean)
                nearest = nearest if nearest < diff else v
            return int(nearest)

In [49]:
from scipy import stats
from sklearn.metrics import classification_report

trust_models = np.ravel(np.array(trust_models))
y_test = [int(np.argmax(y)) for y in y_test_vec]
statistic_predictions = []
for x in np.array(results):
    statistic_predictions.append(pred_ensemble(x))

report = classification_report(y_test, statistic_predictions, output_dict=True)
{
    'accuracy': report['accuracy'],
    'macro avg': report['macro avg'],
    'weighted avg': report['weighted avg'],
}

{'accuracy': 0.9801260964912281,
 'macro avg': {'precision': 0.9833448650633813,
  'recall': 0.982827632000708,
  'f1-score': 0.9827130000470556,
  'support': 7296.0},
 'weighted avg': {'precision': 0.9809227207390692,
  'recall': 0.9801260964912281,
  'f1-score': 0.9800934609348122,
  'support': 7296.0}}

In [81]:
resume_text = """
lead accountant highlights quickbooks peachtree house accounting systems financial reporting mas far wawf great plains integration manager frx reporting hyperion workspace planning enterprise essbase schedules reports crystal reports ms office strong excel skill financial accounting database management budget administration performance reporting tax preparation compliance forecasting trend analysis strategic planning cash flow analysis variance analysis sarbanes research compliance experience lead accountant november current company name city state experience various areas including limited operating budgets financial analysis planning financial statements reporting accounting policies procedure cash flow taxes auditing business process improvements manage various areas financial accounting budgeting financial analysis execute qualitative quantitative analysis techniques minimize risks liabilities develop implement system processes achieve financial discipline improve overall efficiency organization provide strong analytical skills good relationship management negotiation skills liaising various large corporate entities financial institutions various regulatory authorities monitor manage daily processes multiple complex revenues streams review contracts new pricing schedules researching implementing new accounts accounting systems provide subject matter expertise analyzing revenue trends apply corrective measures risk findings maintain ongoing communication key stakeholders include accounts operations managers new markets support teams emerging accounting issues recommend corrective actions test internal controls compliance preventing fraud improving accounting processes meeting full compliance gaap federal state law stay abreast relevant new market events new product pipeline track daily activity firm new contracts provide recommendations financial related issues maintain accuracy completeness deferred revenues enforce timely revenue recognition lead develop methods implement revenue test models complex billing structures perform trends analysis revenues using quantitative methodology forecast future contingencies maintain new revenue backlog inventories periodically testing appropriateness new pricing conditions well accounting policy determinations identify risky accounts default communicate account managers corrective actions prepare various ad hoc reports request senior management requiring knowledge general ledger supporting sub ledger details lead support assist external financial audit process prepare maintain various incentive payment schedules prescriber program lead support billing automation effort streamlining monthly billing process prompt accurate timely billing customers improve cash flows meeting targeted projections senior accountant september october company name city state compiled analyzed financial data used preparation corporate financial statements provided support business operations analysis key performance indicators trends maintained supervised various systems internal controls financial reporting assessing remediating deficiencies discovered periodic testing making easier reconcile accounts collaborated compiled prepared budgets forecast operational activities maintained general ledger sub ledger accounts posting documenting financial journal entries managed maintained corporate closing schedules periodic financial reporting monitored maintained fixed assets depreciation schedules ongoing basis filed property taxes county ensured compliance corporate sales taxes federal state lead team preparation coordination financial audit process reconciled bank statements procured recommendations bank irregularities issues key stake holders staff accountant march august company name city state performed monthly closing cycle responsible preparing analyzing reconciling correcting financial statements management reports homebuilding divisions acted primary contact operations personnel accounting related matters maintained company systems internal controls financial reporting posted financial statement entries including reconciling documented p l classification entries verified preliminary sales report composed final corporate reporting reviewed reconciled inventory status posted reconciling entries prepared corrected preliminary escrow analysis including extensive analysis escrow account allocation entries performed analysis direct profits reviewed created roll forward reports builder reserve runs summary reconciled builder proceeds mortgage proceeds researched accounting procedures policy statures including sarbanes oxley sec compliance irs state local tax laws education accounting business administration university maryland university college city state accounting business administration cpa candidate skills accounting accounting systems ad analytical skills auditing automation billing budgeting budgets budget business operations business process cash flow cash flow closing contracts cpa crystal reports database management essbase senior management financial financial accounting financial analysis financial analysis planning financial reporting financial statements fixed assets forecasting general ledger great plains hyperion inventory ledger law market mas excel ms office negotiation enterprise peachtree personnel policies pricing processes quantitative analysis quickbooks reconciling relationship management reporting researching research sales sarbanes oxley strategic planning tax preparation tax taxes trend
"""

In [82]:
import re
import string
import spacy

nlp = spacy.load('en_core_web_sm')

resume_text = str(resume_text).lower()  # Convert to lowercase
resume_text = re.sub(r'\n', ' ', resume_text)  # Remove newlines
resume_text = re.sub(r'\[.*?\]', '', resume_text)  # Remove text in brackets
resume_text = re.sub(r'[%s]' % re.escape(string.punctuation), '', resume_text)  # Remove punctuation
resume_text = re.sub(r'\w*\d\w*', '', resume_text)  # Remove words with numbers
resume_text = re.sub(r'https?://\S+|www\.\S+|ftp\.\S+', '', resume_text) # remove ulrs
resume_text = re.sub(r'\S+@\S+', '', resume_text) # remove emails
resume_text = re.sub('[^A-Za-z0-9 ]+', '', resume_text) # no special character but space
resume_text = re.sub(r'\s+', ' ', resume_text)  # Remove extra spaces
resume_text = resume_text.strip()
doc = nlp(resume_text)
tokens = [token.text for token in doc if not token.is_stop]
resume_text = ' '.join(tokens)
doc = nlp(resume_text)
lemmatized_tokens = [token.lemma_ for token in doc]
resume_text = ' '.join(lemmatized_tokens)
resume_text

'lead accountant highlight quickbook peachtree house accounting system financial reporting mas far wawf great plain integration manager frx reporting hyperion workspace planning enterprise essbase schedule report crystal report ms office strong excel skill financial accounting database management budget administration performance report tax preparation compliance forecasting trend analysis strategic planning cash flow analysis variance analysis sarbane research compliance experience lead accountant november current company city state experience area include limited operating budget financial analysis plan financial statement report accounting policy procedure cash flow taxis audit business process improvement manage area financial accounting budget financial analysis execute qualitative quantitative analysis technique minimize risk liability develop implement system process achieve financial discipline improve overall efficiency organization provide strong analytical skill good relatio

In [83]:
resume_vec = tfidf_vec.transform([resume_text])
resume_vec.shape

(1, 48295)

In [84]:
resume_token = token.texts_to_sequences([resume_text])
resume_token = pad_sequences(resume_token, padding="post", truncating="post", maxlen=max_length)
resume_token.shape

(1, 500)

In [85]:
pred_resume = {
    # 'target': decoded_label(label_encoder, y_test_vec),
    'rfc': rfc.predict(resume_vec),
    'svc': svc.predict(resume_vec),
    'sgd': sgd.predict(resume_vec),
    'nu_svc': nu_svc.predict(resume_vec),
    'mpl': mpl.predict(resume_vec),
    'lr': lr.predict(resume_vec),
    'linear_svc': linear_svc.predict(resume_vec),
    'knc': knc.predict(resume_vec),
    'lstm': [int(np.argmax(y)) for y in lstm.predict(resume_token)],
    # 'bert': bert_pred
}

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step


In [86]:
result_resume = pd.DataFrame(pred_resume)
encode_category_pred = pred_ensemble(result_resume.iloc[0])
encode_category_pred

0

In [87]:
category = decoded_label(label_encoder, [encode_category_pred], False)
category[0]

'accountant'