## Configure NLTK and spaCy

In [None]:
# One-time environment setup: fetch the language resources used by this notebook.
import nltk

# Download the NLTK stop-word corpus (referenced by the commented-out
# remove_stopwords helper further down).
nltk.download('stopwords')
# IPython shell escape: install spaCy's small Portuguese pipeline, which
# tokenize_and_lemmatize loads by the name 'pt_core_news_sm'.
!python -m spacy download pt_core_news_sm

## Load data from file

In [None]:
import pandas as pd
from sqlalchemy import create_engine

# Read the raw dataset from the CSV file in the working directory
# into the `data` DataFrame used by every later cell.
data = pd.read_csv(filepath_or_buffer='data.csv')

## Category analysis

In [None]:
# One row per distinct full category path (first occurrence is kept),
# together with the category label stored on that row.
categories = data[['category', 'fullcategory']].drop_duplicates(subset=['fullcategory'])

# Break each ' > '-separated path into one column per level and keep the
# first three levels next to the original category columns.
categories_split = (
    categories
    .join(categories['fullcategory'].str.split(' > ', expand=True))
    .rename(columns={0: 'category1', 1: 'category2', 2: 'category3'})
    .loc[:, ['category1', 'category2', 'category3', 'category', 'fullcategory']]
)

# Exploratory checks, kept disabled:
# print(categories_split.drop_duplicates(subset=['category1'])[['category1']].dropna().count())
# print(categories_split.drop_duplicates(subset=['category2'])[['category2']].dropna().count())
# print(categories_split.drop_duplicates(subset=['category3'])[['category3']].dropna().count())

# comparison = categories_split[['category1', 'category2', 'category3', 'category']].copy()

# for column in ['category1', 'category2', 'category3']:
#     comparison[column] = categories_split[['category',column]].dropna()['category'] == categories_split[column].dropna()

# Attach the per-level columns to every row of the dataset by looking each
# row's full category path up in the split table, then turn the
# 'fullcategory' index back into a regular column.
data_category = (
    data
    .set_index('fullcategory')
    .join(
        categories_split
        .set_index('fullcategory')[['category1', 'category2', 'category3']]
    )
    .reset_index()
)

data_category.head()

## Clean data

In [None]:
# Keep only the rows that have a top-level category (category1 is not null).
data_category = data_category.loc[data_category['category1'].notna()]


In [None]:
# def remove_noise(text):
#     import re

#     # Convert to lowercase
#     text = text.lower()

#     # Remove punctuation
#     # []: square brackets define a character class.
#     # ^: when used at the start of a character class, ^ negates the class, i.e. it selects everything that is not in the class.
#     # \w: matches any alphanumeric character (letters and digits, including the underscore character _)
#     # \s: matches any whitespace (spaces, tabs, line breaks).
#     text = re.sub(r'[^\w\s]', '', text)

#     return text

In [None]:
# columns_to_clean = ['name','content']

# for column in columns_to_clean:
#     data[column] = data[column].apply(remove_noise)
# data.head(5)

In [None]:
# def remove_stopwords(text):
#     from nltk.corpus import stopwords
#     # Get the list of Portuguese stopwords from NLTK and convert it to a set to make lookups more efficient
#     stop_words = set(stopwords.words('portuguese'))

#     # Split the text into words, remove the stopwords, then join the remaining words back into a string
#     text = ' '.join([word for word in text.split() if word not in stop_words])

#     return text

In [None]:
# for column in columns_to_clean:
#     data[column] = data[column].apply(remove_stopwords)

# data.head(5)

## Splitting dataset

In [None]:
# from sklearn.preprocessing import OneHotEncoder

# # Create a OneHotEncoder object
# encoder = OneHotEncoder(sparse_output=False)

# enc_category1 = encoder.fit_transform(data_category[['category1']])

# data_category = pd.concat([data_category, pd.DataFrame(enc_category1, columns=encoder.get_feature_names_out(['category1']))], axis=1)

# data_category.head()

In [None]:
from sklearn.model_selection import train_test_split

# First split: hold out about 30% of the rows as the test set.
# Features are the 'name' and 'content' columns; the label is 'category1'.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    data_category[['name', 'content']],
    data_category['category1'],
    test_size=0.3,
    random_state=42,
)

# Second split: divide the remaining rows evenly into training and
# validation sets (about 35% of the full dataset each).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.5,
    random_state=42,
)

# Re-attach the label column to each feature frame so every subset is
# available as a single DataFrame.
data_train, data_val, data_test = (
    pd.concat([features, labels], axis=1)
    for features, labels in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

# data_train.head()
# data_val.head()
# data_test.head()

## Lemmatization and vectorization

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _load_portuguese_pipeline():
    """Load spaCy's 'pt_core_news_sm' pipeline once and reuse it afterwards."""
    import spacy

    return spacy.load('pt_core_news_sm')


def tokenize_and_lemmatize(text):
    """Return the lemmas of *text*, without stop words or noisy tokens.

    The text is processed with spaCy's 'pt_core_news_sm' pipeline. A token
    is dropped when spaCy flags it as a stop word, or when its lemma
    contains any of the following: a character that is neither a word
    character nor whitespace (punctuation, symbols), a digit, an
    e-mail-like string, or a carriage return, newline or tab.

    The pipeline is loaded lazily on the first call and cached: this
    function is used as the TfidfVectorizer tokenizer and is therefore
    called once per document, so reloading the model on every call would
    dominate the running time.

    Args:
        text: The raw document to tokenize.

    Returns:
        A list with the lemma of every token that was kept, in document order.
    """
    import regex as re

    doc = _load_portuguese_pipeline()(text)
    return [
        token.lemma_
        for token in doc
        if not token.is_stop
        and not re.search(r'[^\w\s]|[\d]|[\w\-\.]+@([\w-]+\.)+[\w-]{2,}|[\r\n\t]', token.lemma_)
    ]

# Smoke-test the tokenizer on one randomly chosen document.
# A single draw is enough: sampling 100 rows only to keep the first one
# discards the other 99 and raises ValueError when the dataset has fewer
# than 100 rows.
teste = data['content'].sample(n=1).iloc[0]

tokenize_and_lemmatize(teste)


In [None]:
# Import TfidfVectorizer to build TF-IDF vectors
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate the TF-IDF vectorizer:
# - max_df / min_df: ignore terms found in more than 80% or in fewer than
#   5% of the documents;
# - max_features: cap the vocabulary at 200,000 terms;
# - ngram_range: build unigrams, bigrams and trigrams from the lemmas
#   returned by tokenize_and_lemmatize;
# - token_pattern=None: the default pattern is never used when a custom
#   tokenizer is supplied, and leaving it set makes scikit-learn emit a
#   UserWarning on fit.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.8,
    max_features=200000,
    min_df=0.05,
    use_idf=True,
    tokenizer=tokenize_and_lemmatize,
    token_pattern=None,
    ngram_range=(1, 3),
)

In [None]:
# Fit the vectorizer on a random sample of at most 1000 training documents.
# The sample size is capped at the number of available rows (sampling more
# rows than exist raises ValueError), and the draw is seeded with the same
# random_state as the dataset splits so the fitted vocabulary is
# reproducible. The sampled Series is passed directly: fit_transform only
# needs an iterable of strings.
tfidf_matrix = tfidf_vectorizer.fit_transform(
    data_train['content'].sample(n=min(1000, len(data_train)), random_state=42)
)

In [None]:
# Inspect the fitted TF-IDF representation, one item per line:
# the matrix dimensions, the learned vocabulary, and the matrix itself.
print(
    tfidf_matrix.shape,
    tfidf_vectorizer.get_feature_names_out(),
    tfidf_matrix,
    sep='\n',
)

## Save data

In [None]:
# data = data[['category1', 'category2', 'category3', 'name', 'content']]

# data.to_csv('data.csv', index=False)
# # data.to_parquet('data.parquet', index=False)