## Análise de Sentimentos com Transformers

#### Importando dataset da Amazon

In [1]:
from datasets import load_dataset

dataset = load_dataset("amazon_polarity")

#### Converter para pandas DataFrame

In [3]:
import pandas as pd

df = pd.DataFrame(dataset["train"])

#### Verificar a distribuição das classes

In [6]:
class_counts = df['label'].value_counts()
class_counts

label
1    1800000
0    1800000
Name: count, dtype: int64

#### Obtendo uma amostra dos dados balanceado

In [12]:
# Balancear para 10k dados, pegando 5k de cada classe
df_balanced = pd.concat([
    df[df['label'] == 0].sample(n=5000, random_state=42),  # Classe negativa
    df[df['label'] == 1].sample(n=5000, random_state=42)   # Classe positiva
])

# Selecionar as primeiras 10k amostras balanceadas
df_balanced = df_balanced.sample(n=10000, random_state=42)

#Salvando em um arquivo do tipo 'sample_amazon_polarity.csv'
df_balanced.to_csv('../data/processed/sample_amazon_polarity.csv', index=False)

#### Lendo a amostra que criamos no salva na pasta processed

In [32]:
import pandas as pd

df_samples = pd.read_csv('../data/processed/sample_amazon_polarity.csv')
df_samples.head()

Unnamed: 0,label,title,content
0,1,Road to getting things done,I am a great believer in valueing employees an...
1,0,The Weight Loss Cure,This would be a good book. Good Ideas if every...
2,0,Royal Velvet Pillows,These pillows were over rated and the descript...
3,0,Great Book For Kiddies,Robert Sawyer's books have featured an assortm...
4,0,Digimortal,"You know, Fear Factory used to be my favorite ..."


#### Verificando quantidade de amostras, quantidade de colunas e quantidade de classes

Aqui temos uma amostra de 10 mil observações, três colunas: label, title e content duas classes [0, 1].

In [4]:
df_samples.shape

(10000, 3)

In [6]:
df_samples.label.unique()

array([1, 0])

#### Análise Exploratória dos dados

##### Função para limpeza dos dados

In [33]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Baixando o conjunto de stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

# Inicializando o lemmatizer
lemmatizer = WordNetLemmatizer()

# Função para limpar o texto
def clean_text(text):
    # 1. Remover links e emails
    text = re.sub(r'http\S+|www\S+|@\S+', '', text)

    # 2. Remover caracteres especiais e números
    text = re.sub(r'[^A-Za-z\s]', '', text)

    # 3. Converter o texto para minúsculas
    text = text.lower()

    # 4. Tokenização (separar em palavras)
    words = nltk.word_tokenize(text)

    # 5. Remover stopwords
    stop_words = set(stopwords.words('english'))  # Para inglês. Para outro idioma, substitua aqui.
    words = [word for word in words if word not in stop_words]

    # 6. Lemmatização
    words = [lemmatizer.lemmatize(word) for word in words]

    # 7. Reconstruir o texto
    return ' '.join(words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jonnathann/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jonnathann/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/jonnathann/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/jonnathann/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


##### Relizando limpeza dos dados 

In [34]:
# Aplicando a limpeza tanto no 'title' quanto no 'context'
df_samples['cleaned_title']  = df_samples['title'].apply(clean_text)
df_samples['cleaned_content'] = df_samples['content'].apply(clean_text)

df_samples[['title', 'cleaned_title', 'content', 'cleaned_content']].head()

Unnamed: 0,title,cleaned_title,content,cleaned_content
0,Road to getting things done,road getting thing done,I am a great believer in valueing employees an...,great believer valueing employee rewarding pro...
1,The Weight Loss Cure,weight loss cure,This would be a good book. Good Ideas if every...,would good book good idea everyone needed lose...
2,Royal Velvet Pillows,royal velvet pillow,These pillows were over rated and the descript...,pillow rated description amamzon accuratewe di...
3,Great Book For Kiddies,great book kiddy,Robert Sawyer's books have featured an assortm...,robert sawyer book featured assortment funny e...
4,Digimortal,digimortal,"You know, Fear Factory used to be my favorite ...",know fear factory used favorite band album rea...


##### Concatenando o cleaned_title com o cleaned_content

In [42]:
df_samples['title_content'] = df_samples['cleaned_title'] +' '+df_samples['cleaned_content']
df_samples[['title_content', 'label']]

Unnamed: 0,title_content,label
0,road getting thing done great believer valuein...,1
1,weight loss cure would good book good idea eve...,0
2,royal velvet pillow pillow rated description a...,0
3,great book kiddy robert sawyer book featured a...,0
4,digimortal know fear factory used favorite ban...,0
...,...,...
9995,good nice trainer noisier expected also conven...,1
9996,autobiography excoloured man would recommend b...,1
9997,wonderful book coulnt put one best book ever r...,1
9998,lasted week bought item based pivoting plug gl...,0


##### Vetorizando o texto

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Inicializando o vetor TF-IDF
tfidf = TfidfVectorizer(max_features=1000)  # max_features=5000 para pegar as 5000 palavras mais relevantes

# Ajustando e transformando o texto (título + contexto) em uma matriz TF-IDF
X = tfidf.fit_transform(df_samples['title_content'])

# Convertendo para um DataFrame para visualização
X_df = pd.DataFrame(X.toarray(), columns=tfidf.get_feature_names_out())

# Visualizando as primeiras linhas
X_df.head(5)

Unnamed: 0,able,absolutely,account,accurate,across,act,acting,action,actor,actual,...,written,wrong,wrote,year,yes,yet,youll,young,youre,youve
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.337877,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.129794,0.0


#### Treinamento e análises dos modelos

##### Separação do conjunto de dados em treino e teste

In [None]:
from sklearn.model_selection import train_test_split

X = X  # Matriz de características gerada pelo TfidfVectorizer
y = df_samples['label']  # As labels (classes) que você está tentando prever

# Dividir os dados em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

##### Modelo Logistic Regression

In [53]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Instanciando o modelo
model = LogisticRegression(max_iter=1000)

# Treinando o modelo com os dados de treino
model.fit(X_train, y_train)

# Fazendo previsões com os dados de teste
y_pred = model.predict(X_test)

# Exibindo a acurácia
print(f'Acurácia do modelo: {accuracy_score(y_test, y_pred)}')

# Exibindo o relatório de classificação (precisão, recall, F1-score)
print(classification_report(y_test, y_pred))

Acurácia do modelo: 0.836
              precision    recall  f1-score   support

           0       0.83      0.85      0.84      1013
           1       0.84      0.83      0.83       987

    accuracy                           0.84      2000
   macro avg       0.84      0.84      0.84      2000
weighted avg       0.84      0.84      0.84      2000



##### Modelo Support Vector Machine

In [55]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Criação do modelo SVM com parâmetros padrão
svm = SVC(random_state=42)

# Treinamento do modelo com os dados de treinamento
svm.fit(X_train, y_train)

# Previsões no conjunto de teste
y_pred = svm.predict(X_test)

# Avaliação do modelo
print(f"Acurácia do modelo: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

Acurácia do modelo: 0.83
              precision    recall  f1-score   support

           0       0.82      0.85      0.84      1013
           1       0.84      0.81      0.82       987

    accuracy                           0.83      2000
   macro avg       0.83      0.83      0.83      2000
weighted avg       0.83      0.83      0.83      2000

