In [131]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

In [132]:
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer

In [133]:
def carregando_csv(caminho_txt="train.txt", caminho_csv="train.csv", header="infer"):
    """Load the ``;``-separated dataset and write a CSV copy of it.

    Args:
        caminho_txt: Path of the semicolon-separated input file.
        caminho_csv: Path the CSV copy is written to (without the index).
        header: Forwarded to ``pandas.read_csv``. The default ``"infer"``
            keeps the previous behaviour: the first line of the file is
            consumed as a header row and then replaced by the column names
            set below. Pass ``None`` for a file without a header line so
            that its first record is kept as data.

    Returns:
        The loaded DataFrame with its two columns named ``mensagem`` and
        ``emocao``.

    Raises:
        ValueError: If the file does not contain exactly two columns.
    """
    # NOTE(review): if train.txt has no header line, the default drops the
    # first sample silently -- confirm against the data file and call this
    # with header=None if that is the case.
    df = pd.read_csv(caminho_txt, sep=";", header=header)

    # Replace whatever column labels were read with the names used downstream.
    df.columns = ["mensagem", "emocao"]

    # Persist a CSV copy next to the original text file.
    df.to_csv(caminho_csv, index=False)

    print("Arquivo convertido e salvo como csv")

    return df

### Pré-processamento dos Dados

In [134]:
def processando_dados(df):
    """Normalize the text of the ``mensagem`` column.

    Each message is lower-cased, stripped of English stop words and then
    lemmatized token by token with NLTK's ``WordNetLemmatizer``. Tokens are
    obtained by splitting on whitespace.

    The input frame is left untouched; the cleaning is applied to a copy so
    that re-running the cell (or reusing the raw frame) is safe.

    Args:
        df: DataFrame with a string column named ``mensagem``.

    Returns:
        A new DataFrame with the same columns and a cleaned ``mensagem``.
    """
    # Make sure the NLTK resources used below are available locally.
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('omw-1.4')

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def _normalizar(texto):
        # Drop stop words, then lemmatize every remaining token (single pass).
        return ' '.join(
            lemmatizer.lemmatize(palavra)
            for palavra in texto.split()
            if palavra not in stop_words
        )

    # Work on a copy so the caller's DataFrame is never mutated.
    resultado = df.copy()
    resultado["mensagem"] = resultado["mensagem"].str.lower().apply(_normalizar)

    return resultado

## Modelos

### Bag-of-Words

In [135]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

def execute_bow(df):
    """Train and evaluate a Multinomial Naive Bayes model on Bag-of-Words counts.

    The data is split 80/20 (``random_state=42``) before vectorizing, so the
    ``CountVectorizer`` vocabulary is learned from the training texts only.

    Args:
        df: DataFrame with the columns ``mensagem`` (text) and ``emocao`` (label).

    Returns:
        Tuple ``(y_test, y_pred)``: the held-out labels and the model's
        predictions for them.
    """
    textos_treino, textos_teste, rotulos_treino, rotulos_teste = train_test_split(
        df["mensagem"], df["emocao"], test_size=0.2, random_state=42
    )

    # Bag-of-Words representation: fit on train, reuse the vocabulary on test.
    bow = CountVectorizer()
    matriz_treino = bow.fit_transform(textos_treino)
    matriz_teste = bow.transform(textos_teste)

    # Fit the classifier on the training counts.
    classificador = MultinomialNB()
    classificador.fit(matriz_treino, rotulos_treino)

    # Predict the held-out messages.
    previsoes = classificador.predict(matriz_teste)

    return rotulos_teste, previsoes

### TF-IDF

In [136]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

def execute_tfidf(df):
    """Train and evaluate a Random Forest on TF-IDF features.

    The data is split 80/20 (``random_state=42``) *before* vectorizing and
    the ``TfidfVectorizer`` is fitted on the training texts only. Fitting it
    on the full corpus would let the held-out messages influence the
    vocabulary and the IDF weights, which leaks test information into
    training and inflates the reported metrics.

    Args:
        df: DataFrame with the columns ``mensagem`` (text) and ``emocao`` (label).

    Returns:
        Tuple ``(y_test, y_pred)``: the held-out labels and the model's
        predictions for them.
    """
    X = df['mensagem']
    y = df['emocao']

    # Split the raw texts first so the vectorizer never sees the test set.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Learn vocabulary and IDF weights from the training texts only.
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Build and train the model (RandomForest here; any classifier would do).
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train_tfidf, y_train)

    # Predict the held-out messages.
    y_pred = model.predict(X_test_tfidf)

    return y_test, y_pred

### N-grams

In [137]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

def execute_ngrams(df, ngram_range=(1, 2)):
    """Train and evaluate a Multinomial Naive Bayes model on n-gram counts.

    The data is split 80/20 (``random_state=42``) *before* vectorizing and
    the ``CountVectorizer`` is fitted on the training texts only, so n-grams
    that occur only in the held-out messages do not enter the vocabulary.

    Args:
        df: DataFrame with the columns ``mensagem`` (text) and ``emocao`` (label).
        ngram_range: ``(min_n, max_n)`` passed to ``CountVectorizer``. The
            default ``(1, 2)`` (unigrams and bigrams) is the value that was
            previously hard-coded; other ranges can be tried without editing
            the function.

    Returns:
        Tuple ``(y_test, y_pred)``: the held-out labels and the model's
        predictions for them.
    """
    X = df['mensagem']
    y = df['emocao']

    # Split the raw texts first so the vectorizer never sees the test set.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Learn the n-gram vocabulary from the training texts only.
    vectorizer = CountVectorizer(ngram_range=ngram_range)
    X_train_ngrams = vectorizer.fit_transform(X_train)
    X_test_ngrams = vectorizer.transform(X_test)

    # Build and train the model.
    model = MultinomialNB()
    model.fit(X_train_ngrams, y_train)

    # Predict the held-out messages.
    y_pred = model.predict(X_test_ngrams)

    return y_test, y_pred

### Word embeddings

In [138]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

def execute_word_embeddings(df, vocab_size=5000, max_len=100, epochs=5):
    """Train and evaluate an Embedding + LSTM classifier built with Keras.

    The data is split 80/20 (``random_state=42``) before the tokenizer is
    fitted, so the word index is learned from the training texts only.

    Args:
        df: DataFrame with the columns ``mensagem`` (text) and ``emocao`` (label).
        vocab_size: Used both as the tokenizer's ``num_words`` and as the
            embedding layer's ``input_dim``; the two must agree, which is why
            they share a single parameter.
        max_len: Length every token sequence is padded/truncated to.
        epochs: Number of training epochs.

    Returns:
        Tuple ``(y_test_labels, y_pred_labels)`` of integer class indices.
        An index is the position of the class among the columns produced by
        ``pd.get_dummies(df['emocao'])``.
    """
    X = df['mensagem']
    # One-hot encode the classes. The explicit cast gives Keras float targets
    # (get_dummies may return a boolean array depending on the pandas version).
    y = pd.get_dummies(df['emocao']).values.astype("float32")

    # Split the raw texts first so the tokenizer never sees the test set.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Tokenization: word index learned from the training texts only.
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(X_train)
    X_train_pad = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_len)
    X_test_pad = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=max_len)

    # Model with a trainable embedding layer. `input_length` is intentionally
    # not passed: it is deprecated in Keras 3 and the sequence length is
    # already fixed by the padded input.
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=100),
        LSTM(64),
        Dense(32, activation='relu'),
        Dense(y_train.shape[1], activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Training (20% of the training portion is held out for validation).
    model.fit(X_train_pad, y_train, epochs=epochs, validation_split=0.2)

    # Predictions: turn probability rows / one-hot rows back into class indices.
    y_pred = model.predict(X_test_pad)
    y_pred_labels = np.argmax(y_pred, axis=1)
    y_test_labels = np.argmax(y_test, axis=1)

    return y_test_labels, y_pred_labels

### Subword embeddings

In [139]:
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

def execute_subword_embeddings(df, epochs=3, batch_size=16):
    """Fine-tune ``bert-base-uncased`` (WordPiece subword tokens) and evaluate it.

    Args:
        df: DataFrame with the columns ``mensagem`` (text) and ``emocao`` (label).
        epochs: Number of fine-tuning epochs.
        batch_size: Batch size used for training and for inference on the
            held-out set.

    Returns:
        Tuple ``(y_test_labels, y_pred)`` of integer class indices. An index
        is the position of the class among the columns produced by
        ``pd.get_dummies(df['emocao'])``.
    """
    X = df['mensagem']
    # One-hot encode the classes. The explicit cast gives Keras float targets
    # (get_dummies may return a boolean array depending on the pandas version).
    y = pd.get_dummies(df['emocao']).values.astype("float32")

    # Train/test split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Tokenization with the pretrained BERT tokenizer.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=100, return_tensors="tf")
    test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=100, return_tensors="tf")

    # BERT model with a classification head.
    model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=y.shape[1])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
        # The classification head outputs raw logits (see `outputs.logits`
        # below), so the loss must apply the softmax itself. The string alias
        # 'categorical_crossentropy' would treat the logits as probabilities.
        loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'],
    )

    # Training (20% of the training portion is held out for validation).
    model.fit(
        {'input_ids': train_encodings['input_ids'], 'attention_mask': train_encodings['attention_mask']},
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=0.2
    )

    # Predictions, computed batch by batch: a single forward pass over the
    # whole test set can exhaust memory.
    test_ids = test_encodings['input_ids']
    test_mask = test_encodings['attention_mask']
    batch_preds = []
    for start in range(0, len(y_test), batch_size):
        end = start + batch_size
        outputs = model({'input_ids': test_ids[start:end], 'attention_mask': test_mask[start:end]})
        batch_preds.append(np.argmax(np.asarray(outputs.logits), axis=1))
    y_pred = np.concatenate(batch_preds) if batch_preds else np.array([], dtype=int)
    y_test_labels = np.argmax(y_test, axis=1)

    # Accuracy
    acc = accuracy_score(y_test_labels, y_pred)
    print(f"Acurácia com subword embeddings: {acc:.2f}")

    return y_test_labels, y_pred

ModuleNotFoundError: No module named 'transformers'

### BERT

## Main

In [None]:
def main():
    """Interactive menu: load and clean the data once, then run techniques on demand.

    Options 1-5 run the matching ``execute_*`` function on the cleaned frame
    and print the accuracy and the classification report. Options 6 and 7
    are placeholders, 8 leaves the loop, and anything else is rejected.
    """
    df = processando_dados(carregando_csv())

    # Option -> technique. Lambdas postpone the name lookup until an option
    # is actually chosen, so a technique that failed to be defined only
    # breaks when it is selected.
    tecnicas = {
        '1': lambda: execute_bow(df),
        '2': lambda: execute_tfidf(df),
        '3': lambda: execute_ngrams(df),
        '4': lambda: execute_word_embeddings(df),
        '5': lambda: execute_subword_embeddings(df),
    }
    em_construcao = ('6', '7')
    menu = (
        'Selecione uma das técnicas para prosseguir:',
        '1 - Bag-of-Words',
        '2 - TF-IDF',
        '3 - N-grams',
        '4 - Word embeddings',
        '5 - Subword embeddings',
        '6 - BERT',
        '7 - Ver estatísticas',
        '8 - Sair',
    )

    while True:
        for linha in menu:
            print(linha)

        opcao = input()

        if opcao == '8':
            break

        if opcao in tecnicas:
            y_test, y_pred = tecnicas[opcao]()

            print("Acurácia:", accuracy_score(y_test, y_pred))
            print("\nRelatório de Classificação:\n", classification_report(y_test, y_pred))
        elif opcao in em_construcao:
            print('Em construção')
        else:
            print('Opção inválida :O tente de novo!!')
        

# Start the interactive menu only when this code runs as the main program
# (i.e. __name__ is "__main__"), not when it is imported from elsewhere.
if __name__ == "__main__":
    main()
    

Arquivo convertido e salvo como csv


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Samsung\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Samsung\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Samsung\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Selecione uma das técnicas para prosseguir:
1 - Bag-of-Words
2 - TF-IDF
3 - N-grams
4 - Word embeddings
5 - Subword embeddings
6 - BERT
7 - Ver estatísticas
8 - Sair




Epoch 1/5
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 64ms/step - accuracy: 0.3970 - loss: 1.5065 - val_accuracy: 0.7930 - val_loss: 0.6319
Epoch 2/5
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 58ms/step - accuracy: 0.8617 - loss: 0.4061 - val_accuracy: 0.8801 - val_loss: 0.3448
Epoch 3/5
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 67ms/step - accuracy: 0.9602 - loss: 0.1252 - val_accuracy: 0.8887 - val_loss: 0.3209
Epoch 4/5
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 71ms/step - accuracy: 0.9757 - loss: 0.0728 - val_accuracy: 0.8926 - val_loss: 0.3625
Epoch 5/5
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 62ms/step - accuracy: 0.9835 - loss: 0.0461 - val_accuracy: 0.8941 - val_loss: 0.4038
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step
Acurácia: 0.885625

Relatório de Classificação:
               precision    recall  f1-score   support

     