In [1]:
# Number of worker processes handed to the ProcessPoolExecutor cells further down
max_workers = 4

## Configure NLTK and spaCy

In [2]:
import nltk

# Fetch the NLTK stopword lists (skipped by NLTK when the package is already up to date)
nltk.download('stopwords')
# Install the Portuguese spaCy pipeline that spacy.load('pt_core_news_sm') needs later on
!python -m spacy download pt_core_news_sm

[nltk_data] Downloading package stopwords to /home/iohans/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Collecting pt-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.7.0/pt_core_news_sm-3.7.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')


## Load data from file

In [20]:
import pandas as pd
from sqlalchemy import create_engine

# Load the raw dataset; 'data.csv' is resolved relative to the current working directory
data = pd.read_csv('data.csv')

# Summarize column names, dtypes, and non-null counts
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62726 entries, 0 to 62725
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            62726 non-null  int64 
 1   name          62725 non-null  object
 2   content       62723 non-null  object
 3   category      62229 non-null  object
 4   fullcategory  62229 non-null  object
dtypes: int64(1), object(4)
memory usage: 2.4+ MB


## Category analysis

In [4]:
# One row per distinct full category path, together with its leaf category
categories = data.drop_duplicates(subset=['fullcategory'])[['category', 'fullcategory']]

# Break each "A > B > C" path into one column per hierarchy level
levels = categories['fullcategory'].str.split(' > ', expand=True)
level_names = {0: 'category1', 1: 'category2', 2: 'category3'}
categories_split = (
    categories
    .join(levels)
    .rename(columns=level_names)
    [['category1', 'category2', 'category3', 'category', 'fullcategory']]
)

# Attach the level columns to every row of the dataset by joining on the full category path
level_lookup = (
    categories_split[['fullcategory', 'category1', 'category2', 'category3']]
    .set_index('fullcategory')
)
data_category = data.set_index('fullcategory').join(level_lookup)

# Turn 'fullcategory' back into a regular column
data_category = data_category.reset_index()

data_category.head()

Unnamed: 0,fullcategory,id,name,content,category,category1,category2,category3
0,Equipamentos de informática (falha/substituiçã...,47,Apenas um teste,apenas um teste,Equipamentos de informática (falha/substituiçã...,Equipamentos de informática (falha/substituiçã...,,
1,Habilitação/Acesso a sistemas,48,Desbloqueio de conta de usuária,Solicito o desbloqueio da minha conta para ace...,Habilitação/Acesso a sistemas,Habilitação/Acesso a sistemas,,
2,Habilitação/Acesso a sistemas,49,IMPOSSIBILIDADE DE USO DO PAE E SRHonline,"Bom dia,\r\n\r\n Estou sem consegui acessar P...",Habilitação/Acesso a sistemas,Habilitação/Acesso a sistemas,,
3,Habilitação/Acesso a sistemas,50,IMPOSSIBILIDADE DE ENTRAR NO E-MAIL e ATENDIME...,"Bom dia,\r\n\r\n Estava sem conseguir acessar,...",Habilitação/Acesso a sistemas,Habilitação/Acesso a sistemas,,
4,Habilitação/Acesso a sistemas,51,Desabilitação,Desabilitação de todos os acessos dos e-mails ...,Habilitação/Acesso a sistemas,Habilitação/Acesso a sistemas,,


## Clean data

In [5]:
# Keep only the rows that have a top-level category
has_top_level = data_category['category1'].notna()
data_category = data_category[has_top_level]


In [6]:
# def remove_noise(text):
#     import re

#     # Converte para minúsculas
#     text = text.lower()

#     # Remove pontuação
#     # []: colchetes são usados para definir uma classe de caracteres.
#     # ^: quando usado no início de uma classe de caracteres, o ^ nega a classe, ou seja, seleciona tudo que não está na classe.
#     # \w: corresponde a qualquer caractere alfanumérico (letras e números, incluindo o caractere de sublinhado _)
#     # \s: corresponde a qualquer espaço em branco (espaços, tabulações, quebras de linha).
#     text = re.sub(r'[^\w\s]', '', text)

#     return text

In [7]:
# columns_to_clean = ['name','content']

# for column in columns_to_clean:
#     data[column] = data[column].apply(remove_noise)
# data.head(5)

In [8]:
# def remove_stopwords(text):
#     from nltk.corpus import stopwords
#     # Obtém a lista de stopwords em português usando o NLTK e as converte para um conjunto para melhorar a eficiência da busca
#     stop_words = set(stopwords.words('portuguese'))

#     # Divide o texto em palavras, remove as stopwords e então junta as palavras restantes de volta em uma string
#     text = ' '.join([word for word in text.split() if word not in stop_words])

#     return text

In [9]:
# for column in columns_to_clean:
#     data[column] = data[column].apply(remove_stopwords)

# data.head(5)

## Splitting dataset

In [10]:
# from sklearn.preprocessing import OneHotEncoder

# # Cria um objeto OneHotEncoder
# encoder = OneHotEncoder(sparse_output=False)

# enc_category1 = encoder.fit_transform(data_category[['category1']])

# data_category = pd.concat([data_category, pd.DataFrame(enc_category1, columns=encoder.get_feature_names_out(['category1']))], axis=1)

# data_category.head()

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 50% for test, then split the remaining half evenly into train and validation
features = data_category[['name', 'content']]
labels = data_category['category1']

X_train_val, X_test, y_train_val, y_test = train_test_split(
    features, labels, test_size=0.5, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.5, random_state=42
)

# Re-attach the label column to the text columns of each split
data_train = pd.concat([X_train, y_train], axis=1)
data_val = pd.concat([X_val, y_val], axis=1)
data_test = pd.concat([X_test, y_test], axis=1)

# Print row counts and null counts per split
for split_frame in (data_train, data_val, data_test):
    split_frame.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15557 entries, 25531 to 12872
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   name       15557 non-null  object
 1   content    15557 non-null  object
 2   category1  15557 non-null  object
dtypes: object(3)
memory usage: 486.2+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 15557 entries, 39663 to 12571
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   name       15556 non-null  object
 1   content    15557 non-null  object
 2   category1  15557 non-null  object
dtypes: object(3)
memory usage: 486.2+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 31115 entries, 49915 to 7510
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   name       31115 non-null  object
 1   content    31114 non-null  object
 2   category1  31115 non-null  object
dtypes: obj

## Lemmatization and vectorization

In [12]:
import spacy, regex as re
nlp = spacy.load('pt_core_news_sm')

def tokenize_and_lemmatize(text):
    """Return the lemmas of `text`, skipping stop words and noisy tokens.

    The text is run through the module-level spaCy pipeline `nlp`. A token is
    dropped when spaCy flags it as a stop word, or when its lemma matches the
    noise pattern below (any character that is neither a word character nor
    whitespace, a digit, an e-mail-like string, or a CR/LF/tab).

    Returns a list of lemma strings in document order.
    """
    noise_pattern = r'[^\w\s]|[\d]|[\w\-\.]+@([\w-]+\.)+[\w-]{2,}|[\r\n\t]'

    lemmas = []
    for token in nlp(text):
        if token.is_stop:
            continue
        if re.search(noise_pattern, token.lemma_):
            continue
        lemmas.append(token.lemma_)
    return lemmas

# Smoke-test the tokenizer on a single 'content' value.
# Null contents are dropped first because nlp() needs a string, and the draw is
# seeded so that re-running the cell shows the same sample.
teste = data['content'].dropna().sample(1, random_state=42).iloc[0]

tokenize_and_lemmatize(teste)


['Instalar', 'Mozila', 'Máquina', 'IP']

In [13]:
# # importa TfidfVectorizer para criar vetores TF-IDF
# from sklearn.feature_extraction.text import TfidfVectorizer

# save_dir = './vectorizers'

# import os
# os.makedirs(save_dir, exist_ok=True)

# vectorizers = {}

# for min_df in [0.01, 0.025, 0.05, 0.075, 0.1]:
#     # instancia um objeto TfidfVectorizer
#     tfidf_vectorizer = TfidfVectorizer(max_df=1.0, max_features=200000,
#                                     min_df=min_df,
#                                     use_idf=True, tokenizer=tokenize_and_lemmatize,
#                                     ngram_range=(1,3))

#     # Check if the vectorizer file exists, if so, load it, if not, create it
#     if os.path.exists(f'{save_dir}/tfidf_vectorizer_{min_df}.pkl'):
#         with open(f'{save_dir}/tfidf_vectorizer_{min_df}.pkl', 'rb') as f:
#             print(f'Loading tfidf_vectorizer_{min_df}')
#             import pickle
#             tfidf_vectorizer = pickle.load(f)
#     else:
#         print(f'Creating tfidf_vectorizer_{min_df}')
#         # Create the vectorizer
#         tfidf_matrix = tfidf_vectorizer.fit_transform([x for x in data_train[['name', 'content']].apply(lambda x: ' '.join(x), axis=1)])

#         print(tfidf_matrix.shape)

#         print(f'min_df={min_df}: {len(tfidf_vectorizer.get_feature_names_out())} features')

#         # save the model to disk
#         # from joblib import dump
#         import pickle
#         with open(f'{save_dir}/tfidf_vectorizer_{min_df}.pkl', 'wb') as f:
#             pickle.dump(tfidf_vectorizer, f)


#     vectorizers.update({f'tfidf_vectorizer_{min_df}': tfidf_vectorizer})

In [19]:
# importa TfidfVectorizer para criar vetores TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
from concurrent.futures import ProcessPoolExecutor
import os
import pickle

# Directory where fitted vectorizers are pickled; created if it does not exist yet
save_dir = './vectorizers'
os.makedirs(save_dir, exist_ok=True)

# Filled by the executor loop below: vectorizer name -> vectorizer object
vectorizers = {}

def create_or_load_vectorizer(min_df, data_train, save_dir, max_features=2000):
    """Return a fitted TF-IDF vectorizer for `min_df`, backed by a pickle cache.

    Parameters
    ----------
    min_df : float, int or None
        Forwarded to ``TfidfVectorizer(min_df=...)``; when ``None`` the
        argument is omitted so the library default applies.
    data_train : pandas.DataFrame
        Frame with 'name' and 'content' text columns. Only read on a cache miss.
    save_dir : str
        Directory that holds ``tfidf_vectorizer_{min_df}.pkl``.
    max_features : int, default 2000
        Forwarded to ``TfidfVectorizer(max_features=...)``.

    Returns
    -------
    tuple
        ``(f'tfidf_vectorizer_{min_df}', vectorizer)``.

    Notes
    -----
    The cache file name depends on `min_df` only, so an existing pickle is
    reused even if `max_features` differs from the value it was fitted with.
    """
    import warnings
    # Presumably silences scikit-learn's UserWarning about a custom tokenizer
    # overriding token_pattern -- TODO confirm
    warnings.filterwarnings('ignore', category=UserWarning)

    cache_path = f'{save_dir}/tfidf_vectorizer_{min_df}.pkl'

    if os.path.exists(cache_path):
        # Cache hit: no vectorizer is constructed and data_train is never touched.
        # NOTE: pickle.load executes code embedded in the file; only load caches you wrote yourself.
        with open(cache_path, 'rb') as f:
            print(f'Loading tfidf_vectorizer_{min_df}')
            tfidf_vectorizer = pickle.load(f)
            print(f'min_df={min_df}: {len(tfidf_vectorizer.get_feature_names_out())} features')
    else:
        print(f'Creating tfidf_vectorizer_{min_df}')

        # Single construction site; min_df is only passed when the caller supplied one
        params = dict(max_df=1.0,
                      max_features=max_features,
                      use_idf=True, tokenizer=tokenize_and_lemmatize,
                      ngram_range=(1, 3))
        if min_df is not None:
            params['min_df'] = min_df
        tfidf_vectorizer = TfidfVectorizer(**params)

        # Rows with a missing 'name' or 'content' are skipped: ' '.join cannot
        # handle NaN, and get_vectorized_df likewise drops incomplete rows
        text_rows = data_train[['name', 'content']].dropna()
        corpus = [' '.join(row) for row in text_rows.itertuples(index=False, name=None)]
        tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

        print(tfidf_matrix.shape)
        print(f'min_df={min_df}: {len(tfidf_vectorizer.get_feature_names_out())} features')

        # Persist the fitted vectorizer so later runs take the cache-hit branch
        with open(cache_path, 'wb') as f:
            pickle.dump(tfidf_vectorizer, f)

    return f'tfidf_vectorizer_{min_df}', tfidf_vectorizer

min_dfs = [0.01, 0.025, 0.05, 0.075, 0.1]

# Fit (or load from cache) one vectorizer per min_df threshold, each task in a worker process
with ProcessPoolExecutor(max_workers=max_workers) as executor:
    futures = []
    for min_df in min_dfs:
        futures.append(
            executor.submit(create_or_load_vectorizer, min_df, data_train, save_dir)
        )

    # Results are consumed in submission order; each one is a (name, vectorizer) pair
    for future in futures:
        vectorizer_name, vectorizer = future.result()
        vectorizers[vectorizer_name] = vectorizer

Loading tfidf_vectorizer_0.01
min_df=0.01: 377 features
Loading tfidf_vectorizer_0.025
min_df=0.025: 138 features
Loading tfidf_vectorizer_0.05
min_df=0.05: 54 features
Loading tfidf_vectorizer_0.075
min_df=0.075: 30 features
Loading tfidf_vectorizer_0.1
min_df=0.1: 19 features


## Transform and create feature columns

In [15]:
def get_feature_columns(data, vectorizer, columns, feature_names=False):
    """Vectorize the joined text columns of `data` into a dense feature frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Source rows; the returned frame reuses ``data.index``.
    vectorizer : object
        Fitted vectorizer exposing ``transform(documents)`` (whose result has
        ``toarray()``) and ``get_feature_names_out()``.
    columns : list of str
        Text columns joined with a single space to form each document.
        Every value must be a string; a missing value makes the join raise
        ``TypeError``.
    feature_names : bool, default False
        When true, columns are labelled with ``get_feature_names_out()``;
        otherwise they are numbered 0..n-1.

    Returns
    -------
    pandas.DataFrame
        One row per input row, one column per vectorizer feature.
    """
    # One document per row: the selected text columns joined with a single space
    documents = [' '.join(row) for row in data[columns].itertuples(index=False, name=None)]

    # Vectorize the whole batch in a single call rather than once per row
    tfidf_matrix = vectorizer.transform(documents).toarray()

    column_labels = vectorizer.get_feature_names_out() if feature_names else None
    return pd.DataFrame(tfidf_matrix, index=data.index, columns=column_labels)

def get_vectorized_df(data, vectorizer, label_column, text_columns, feature_names=False):
    """Build a frame holding the label column followed by the vectorized text features.

    Rows of `data` containing any missing value are excluded before
    vectorization; because labels and features are merged on the index, those
    rows are absent from the result as well.
    """
    complete_rows = data.dropna()
    features = get_feature_columns(complete_rows, vectorizer, text_columns, feature_names)

    # Index-on-index merge (inner by default) lines each label up with its feature row
    labels = data[label_column].to_frame()
    return labels.merge(features, left_index=True, right_index=True)

In [16]:
# # Get the feature columns for the training data
# datasets = {
#     'train': data_train,
#     'val': data_val,
# }

# save_dir = './vectorized_data'
# os.makedirs(save_dir, exist_ok=True)

# vectorized_dfs = {}

# for vectorizer in vectorizers.keys():
#     for dataset in datasets.keys():
#         file_path = f'{save_dir}/{vectorizer}_{dataset}.csv'
#         if(f'{vectorizer}_{dataset}.csv' in os.listdir('./vectorized_data')):
#             print(f'Loading {dataset} with {vectorizer}')
#             vectorized_dfs.update({f'{vectorizer}': 
#                                 {f'{dataset}': pd.read_csv(file_path)}
#                                 })
#         else:
#             print(f'Processing {dataset} with {vectorizer}')
#             vectorized_dfs.update({f'{vectorizer}': 
#                                 {f'{dataset}': get_vectorized_df(datasets[dataset], vectorizers[vectorizer], 'category1', ['name', 'content'], feature_names=True)}
#                                 })
#             vectorized_dfs[vectorizer][dataset].to_csv(f'./vectorized_data/{vectorizer}_{dataset}.csv', index=False)


In [17]:
import os
import pandas as pd
from concurrent.futures import ProcessPoolExecutor

# Splits to vectorize, keyed by the name that appears in the cache file names
datasets = {
    'train': data_train,
    'val': data_val,
}

# NOTE: rebinds save_dir, which the vectorizer cell set to './vectorizers'
save_dir = './vectorized_data'
os.makedirs(save_dir, exist_ok=True)


def process_vectorizer_dataset(vectorizer, dataset, save_dir, datasets, vectorizers):
    """Load or compute the vectorized frame for one (vectorizer, dataset) pair.

    Parameters
    ----------
    vectorizer : str
        Key into `vectorizers`; also part of the cache file name.
    dataset : str
        Key into `datasets`; also part of the cache file name.
    save_dir : str
        Directory holding the ``{vectorizer}_{dataset}.csv`` cache files.
    datasets : dict
        Dataset name -> DataFrame. Only read on a cache miss.
    vectorizers : dict
        Vectorizer name -> fitted vectorizer. Only read on a cache miss.

    Returns
    -------
    dict
        ``{vectorizer: {dataset: DataFrame}}``.

    Notes
    -----
    The cache is written with ``index=True`` and read back with
    ``index_col=0``, so a cached frame has the same index and columns as a
    freshly computed one. Files written without an index column (for example
    by an older revision of this notebook) must be regenerated.
    """
    file_path = f'{save_dir}/{vectorizer}_{dataset}.csv'

    if os.path.exists(file_path):
        print(f'Loading {dataset} with {vectorizer}')
        # Column 0 is the saved row index, not a feature
        vectorized_df = pd.read_csv(file_path, index_col=0)
    else:
        print(f'Processing {dataset} with {vectorizer}')
        vectorized_df = get_vectorized_df(
            datasets[dataset], vectorizers[vectorizer],
            'category1', ['name', 'content'], feature_names=True,
        )
        # Write to the same path that the cache check above looks at
        os.makedirs(save_dir, exist_ok=True)
        vectorized_df.to_csv(file_path, index=True)

    return {f'{vectorizer}': {f'{dataset}': vectorized_df}}

# Every (vectorizer name, dataset name) combination to process
tasks = [(vectorizer, dataset) for vectorizer in vectorizers.keys() for dataset in datasets.keys()]

# Run the tasks in worker processes and merge their results into
# vectorized_dfs[vectorizer name][dataset name] -> DataFrame
vectorized_dfs = {}
with ProcessPoolExecutor(max_workers=max_workers) as executor:
    futures = [executor.submit(process_vectorizer_dataset, vectorizer, dataset, save_dir, datasets, vectorizers) for vectorizer, dataset in tasks]

    # Collect results in submission order
    for future in futures:
        result = future.result()
        # The inner mapping gets its own name so the global `datasets`
        # (dataset name -> DataFrame) is not overwritten by this loop
        for vectorizer, dataset_frames in result.items():
            vectorized_dfs.setdefault(vectorizer, {}).update(dataset_frames)

Loading train with tfidf_vectorizer_0.01
Loading val with tfidf_vectorizer_0.01
Loading train with tfidf_vectorizer_0.025
Loading val with tfidf_vectorizer_0.025
Loading train with tfidf_vectorizer_0.05
Loading val with tfidf_vectorizer_0.05
Loading train with tfidf_vectorizer_0.075
Loading val with tfidf_vectorizer_0.075
Loading train with tfidf_vectorizer_0.1
Loading val with tfidf_vectorizer_0.1


In [18]:
# Show the (vectorizer, dataset) pairs that were scheduled
print(tasks)

# Run the same tasks one by one in the notebook process; the return values are
# discarded, so only the printed messages and any written cache files remain
for vectorizer, dataset in tasks:
    process_vectorizer_dataset( vectorizer, dataset, save_dir, datasets, vectorizers)

[('tfidf_vectorizer_0.01', 'train'), ('tfidf_vectorizer_0.01', 'val'), ('tfidf_vectorizer_0.025', 'train'), ('tfidf_vectorizer_0.025', 'val'), ('tfidf_vectorizer_0.05', 'train'), ('tfidf_vectorizer_0.05', 'val'), ('tfidf_vectorizer_0.075', 'train'), ('tfidf_vectorizer_0.075', 'val'), ('tfidf_vectorizer_0.1', 'train'), ('tfidf_vectorizer_0.1', 'val')]
Loading train with tfidf_vectorizer_0.01
Loading val with tfidf_vectorizer_0.01
Loading train with tfidf_vectorizer_0.025
Loading val with tfidf_vectorizer_0.025
Loading train with tfidf_vectorizer_0.05
Loading val with tfidf_vectorizer_0.05
Loading train with tfidf_vectorizer_0.075
Loading val with tfidf_vectorizer_0.075
Loading train with tfidf_vectorizer_0.1
Loading val with tfidf_vectorizer_0.1
