## Configure NLTK and spaCy

In [None]:
import nltk

nltk.download('stopwords')
!python -m spacy download pt_core_news_sm

## Load data from file

In [None]:
import pandas as pd
from sqlalchemy import create_engine

# Load the raw dataset from a CSV file located relative to the working directory.
# Later cells read the 'name', 'content', 'category' and 'fullcategory' columns from it.
data = pd.read_csv('data.csv')

## Category analysis

In [None]:
### Get all distinct full categories
categories = data.drop_duplicates(subset=['fullcategory'])[['category', 'fullcategory']]

# Names of the level columns extracted from each "level1 > level2 > level3" path.
CATEGORY_LEVELS = ['category1', 'category2', 'category3']

# Split full categories by level.
# reindex() guarantees exactly one column per expected level: levels that never occur
# in the data become all-NaN columns (instead of a KeyError on the selection below),
# and levels deeper than the third are discarded, as before.
category_levels = (
    categories['fullcategory']
    .str.split(' > ', expand=True)
    .reindex(columns=range(len(CATEGORY_LEVELS)))
)
category_levels.columns = CATEGORY_LEVELS

categories_split = categories.join(category_levels)[CATEGORY_LEVELS + ['category', 'fullcategory']]

# Optional diagnostics (disabled): number of distinct values per category level.
# print(categories_split.drop_duplicates(subset=['category1'])[['category1']].dropna().count())
# print(categories_split.drop_duplicates(subset=['category2'])[['category2']].dropna().count())
# print(categories_split.drop_duplicates(subset=['category3'])[['category3']].dropna().count())

# comparison = categories_split[['category1', 'category2', 'category3', 'category']].copy()

# for column in ['category1', 'category2', 'category3']:
#     comparison[column] = categories_split[['category',column]].dropna()['category'] == categories_split[column].dropna()

# Merge data with split categories: every row receives the level columns of its
# 'fullcategory', then 'fullcategory' is turned back into a regular column.
data_category = (
    data.set_index('fullcategory')
    .join(categories_split[['fullcategory'] + CATEGORY_LEVELS].set_index('fullcategory'))
    .reset_index()
)

data_category.head()

## Clean data

In [None]:
# Drop rows whose top-level category (category1) is missing; category1 is the
# target column used when the dataset is split into train/validation/test sets.
data_category = data_category.dropna(subset=['category1'])


In [None]:
# def remove_noise(text):
#     import re

#     # Convert to lowercase
#     text = text.lower()

#     # Remove punctuation
#     # []: square brackets define a character class.
#     # ^: at the start of a character class, ^ negates the class, i.e. it selects everything that is not in the class.
#     # \w: matches any alphanumeric character (letters and digits, including the underscore _)
#     # \s: matches any whitespace (spaces, tabs, line breaks).
#     text = re.sub(r'[^\w\s]', '', text)

#     return text

In [None]:
# columns_to_clean = ['name','content']

# for column in columns_to_clean:
#     data[column] = data[column].apply(remove_noise)
# data.head(5)

In [None]:
# def remove_stopwords(text):
#     from nltk.corpus import stopwords
#     # Get the Portuguese stop-word list from NLTK and convert it to a set to make lookups more efficient
#     stop_words = set(stopwords.words('portuguese'))

#     # Split the text into words, remove the stop words, then join the remaining words back into a string
#     text = ' '.join([word for word in text.split() if word not in stop_words])

#     return text

In [None]:
# for column in columns_to_clean:
#     data[column] = data[column].apply(remove_stopwords)

# data.head(5)

## Splitting dataset

In [None]:
# from sklearn.preprocessing import OneHotEncoder

# # Create a OneHotEncoder object
# encoder = OneHotEncoder(sparse_output=False)

# enc_category1 = encoder.fit_transform(data_category[['category1']])

# data_category = pd.concat([data_category, pd.DataFrame(enc_category1, columns=encoder.get_feature_names_out(['category1']))], axis=1)

# data_category.head()

In [None]:
from sklearn.model_selection import train_test_split

# Inputs are the two text columns; the target is the top-level category.
text_features = data_category[['name', 'content']]
category_target = data_category['category1']

# First cut: hold out half of the rows as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    text_features, category_target, test_size=0.5, random_state=42
)

# Second cut: divide the remaining half evenly into training and validation sets.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.5, random_state=42
)

# Put each target back next to its features (pd.concat aligns them on the index).
data_train = pd.concat([X_train, y_train], axis=1)
data_val = pd.concat([X_val, y_val], axis=1)
data_test = pd.concat([X_test, y_test], axis=1)

# Summarise the three splits in the same order as before: train, validation, test.
for split_frame in (data_train, data_val, data_test):
    split_frame.info()

## Lemmatization and vectorization

In [None]:
from functools import lru_cache

# A lemma is discarded when this pattern matches anywhere in it: any character that is
# neither a word character nor whitespace, any digit, an e-mail-like string, or a
# carriage return / newline / tab.
_NOISE_PATTERN = r'[^\w\s]|[\d]|[\w\-\.]+@([\w-]+\.)+[\w-]{2,}|[\r\n\t]'


@lru_cache(maxsize=1)
def _load_pt_pipeline():
    """Load the 'pt_core_news_sm' spaCy pipeline once and reuse it on later calls."""
    import spacy
    return spacy.load('pt_core_news_sm')


def tokenize_and_lemmatize(text):
    """Tokenize ``text`` with spaCy and return the lemmas worth keeping.

    The spaCy pipeline is loaded on the first call only and then reused. Loading it
    inside every call (as was done before) repeats the model load for every single
    document when this function is used as a vectorizer tokenizer.

    Args:
        text: The string to process.

    Returns:
        A list with the lemma of every token that is not flagged as a stop word by
        spaCy and whose lemma does not match ``_NOISE_PATTERN``.
    """
    import regex as re

    nlp = _load_pt_pipeline()
    return [
        token.lemma_
        for token in nlp(text)
        if not token.is_stop and not re.search(_NOISE_PATTERN, token.lemma_)
    ]

# Smoke-test the tokenizer on one randomly drawn document.
# - sample(1) works on datasets of any size (sample(100) raises on fewer than 100 rows);
# - missing contents are skipped so a NaN is never handed to the tokenizer;
# - the draw is seeded so the displayed example is the same after a kernel restart.
teste = data['content'].dropna().sample(1, random_state=42).iloc[0]

tokenize_and_lemmatize(teste)


In [None]:
# Import TfidfVectorizer to build TF-IDF vectors
import os
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer

save_dir = './vectorizers'
os.makedirs(save_dir, exist_ok=True)

vectorizers = {}

# Training corpus (one "name content" string per row). Built lazily so nothing is
# computed when every vectorizer can be loaded from disk, and built only once instead
# of once per min_df value.
train_documents = None

for min_df in [0.01, 0.025, 0.05, 0.075, 0.1]:
    vectorizer_name = f'tfidf_vectorizer_{min_df}'
    vectorizer_path = f'{save_dir}/{vectorizer_name}.pkl'

    # Check if the vectorizer file exists: if so, load it; if not, fit and save it.
    if os.path.exists(vectorizer_path):
        with open(vectorizer_path, 'rb') as f:
            print(f'Loading tfidf_vectorizer_{min_df}')
            # NOTE(security): pickle.load can execute arbitrary code. Only load cache
            # files that were written by this notebook.
            tfidf_vectorizer = pickle.load(f)
    else:
        print(f'Creating tfidf_vectorizer_{min_df}')

        if train_documents is None:
            # Rows with a missing text field are skipped (' '.join raises TypeError on
            # NaN); get_vectorized_df likewise drops incomplete rows before transforming.
            train_documents = (
                data_train[['name', 'content']]
                .dropna()
                .apply(lambda row: ' '.join(row), axis=1)
                .tolist()
            )

        # Instantiate a TfidfVectorizer. token_pattern=None only silences sklearn's
        # "token_pattern will not be used" warning, since a custom tokenizer is supplied.
        tfidf_vectorizer = TfidfVectorizer(max_df=1.0, max_features=200000,
                                           min_df=min_df,
                                           use_idf=True, tokenizer=tokenize_and_lemmatize,
                                           token_pattern=None,
                                           ngram_range=(1, 3))

        # Fit the vectorizer on the training corpus
        tfidf_matrix = tfidf_vectorizer.fit_transform(train_documents)

        print(tfidf_matrix.shape)

        print(f'min_df={min_df}: {len(tfidf_vectorizer.get_feature_names_out())} features')

        # Save the fitted vectorizer to disk
        with open(vectorizer_path, 'wb') as f:
            pickle.dump(tfidf_vectorizer, f)

    vectorizers[vectorizer_name] = tfidf_vectorizer


## Transform and create feature columns

In [None]:
def get_feature_columns(data, vectorizer, columns, feature_names=False):
    """Vectorize the joined text columns of ``data`` into a feature DataFrame.

    The values of ``columns`` are joined with a single space for every row and the
    resulting documents are passed to ``vectorizer.transform`` in ONE batch call
    (instead of one call per row), then densified with ``.toarray()``.

    Args:
        data: DataFrame holding the text columns. The values in ``columns`` must be
            strings; a missing value makes ``' '.join`` raise ``TypeError``.
        vectorizer: Fitted object exposing ``transform(documents)`` (whose result has
            ``.toarray()``) and ``get_feature_names_out()``.
        columns: List of column names to join, in order.
        feature_names: When True, the output columns are labelled with
            ``vectorizer.get_feature_names_out()``; otherwise they are numbered 0..n-1.

    Returns:
        A DataFrame with one row per row of ``data`` (same index) and one column per
        feature produced by the vectorizer.
    """
    # Join the columns into a single document per row
    documents = [' '.join(row) for row in data[columns].itertuples(index=False, name=None)]

    column_labels = vectorizer.get_feature_names_out() if feature_names else None

    # Nothing to transform: return an empty frame with the expected labels rather than
    # handing an empty batch to the vectorizer.
    if not documents:
        return pd.DataFrame(index=data.index, columns=column_labels)

    # Transform all documents at once and convert the result to a dense array
    tfidf_matrix = vectorizer.transform(documents).toarray()

    return pd.DataFrame(tfidf_matrix, index=data.index, columns=column_labels)

def get_vectorized_df(data, vectorizer, label_column, text_columns, feature_names=False):
    """Return ``data`` extended with the vectorizer's feature columns.

    Rows containing any missing value are dropped before the features are computed.
    Because the features are then attached with an index-on-index merge (pandas'
    default inner join), those rows are absent from the result as well.

    Args:
        data: DataFrame (or Series, which is converted to a one-column frame for the
            merge) holding the text columns.
        vectorizer: Fitted vectorizer forwarded to ``get_feature_columns``.
        label_column: Accepted for interface compatibility; the body does not use it.
        text_columns: Column names forwarded to ``get_feature_columns``.
        feature_names: Forwarded to ``get_feature_columns``.

    Returns:
        A DataFrame containing every original column of ``data`` followed by the
        feature columns.
    """
    complete_rows = data.dropna()
    features = get_feature_columns(complete_rows, vectorizer, text_columns, feature_names)

    # Merge the original data with the feature columns on their shared index
    base = data.to_frame() if isinstance(data, pd.Series) else data
    return base.merge(features, left_index=True, right_index=True)

In [None]:
# Vectorize the training and validation sets with every vectorizer, caching each
# result as a CSV file so the transform only has to run once per combination.
datasets = {
    'train': data_train,
    'val': data_val,
}

save_dir = './vectorized_data'
os.makedirs(save_dir, exist_ok=True)

# Nested mapping: vectorized_dfs[vectorizer_name][dataset_name] -> DataFrame
vectorized_dfs = {}

for vectorizer in vectorizers:
    for dataset in datasets:
        file_path = f'{save_dir}/{vectorizer}_{dataset}.csv'
        if os.path.exists(file_path):
            print(f'Loading {dataset} with {vectorizer}')
            vectorized_df = pd.read_csv(file_path)
        else:
            print(f'Processing {dataset} with {vectorizer}')
            vectorized_df = get_vectorized_df(datasets[dataset], vectorizers[vectorizer], 'category1', ['name', 'content'], feature_names=True)
            vectorized_df.to_csv(file_path, index=False)

        # Add this split to the vectorizer's entry. The previous
        # vectorized_dfs.update({vectorizer: {dataset: ...}}) replaced the whole inner
        # dict, so the 'train' frame was discarded as soon as 'val' was stored.
        vectorized_dfs.setdefault(vectorizer, {})[dataset] = vectorized_df


## Classification

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# NOTE(review): data_train_vectorized is not defined in the cells visible here;
# presumably it is one of the vectorized training frames built earlier - confirm
# before relying on Restart & Run All.

# Feature matrix: every column except the target label.
data_train_vectorized_no_category = data_train_vectorized.drop(columns=['category1'])

data_train_vectorized_no_category.info()

# 1-nearest-neighbour classifier using cosine distance between feature vectors.
knn = KNeighborsClassifier(n_neighbors=1, metric='cosine')

knn.fit(data_train_vectorized_no_category, data_train_vectorized['category1'])

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# NOTE(review): data_val_vectorized is not defined in the cells visible here;
# presumably it is one of the vectorized validation frames built earlier - confirm.

# Feature matrix of the validation set: every column except the target label.
data_val_vectorized_no_category = data_val_vectorized.drop(columns=['category1'])

data_val_vectorized_no_category.info()

print(data_val_vectorized_no_category.shape)

# Predict on the DataFrame itself rather than on `.values`: the classifier was fitted
# on a DataFrame, so passing a bare array triggers scikit-learn's "X does not have
# valid feature names" warning and skips its check that the columns match the
# training columns.
y_pred = knn.predict(data_val_vectorized_no_category)

In [None]:
# Confusion matrix of the validation predictions (rows: true label, columns: predicted).
fig, ax = plt.subplots(figsize=(20, 20))
ConfusionMatrixDisplay.from_predictions(
    data_val_vectorized['category1'],
    y_pred,
    ax=ax,
    xticks_rotation='vertical',  # keep long category names from overlapping
)
ax.set_title('KNN (k=1, cosine) - validation confusion matrix')
plt.show()