## Análise de Sentimentos com Transformers

#### Importando dataset da Amazon

In [1]:
from datasets import load_dataset

dataset = load_dataset("amazon_polarity")

#### Converter para pandas DataFrame

In [3]:
import pandas as pd

df = pd.DataFrame(dataset["train"])

#### Verificar a distribuição das classes

In [6]:
class_counts = df['label'].value_counts()
class_counts

label
1    1800000
0    1800000
Name: count, dtype: int64

#### Obtendo uma amostra dos dados balanceado

In [12]:
# Balancear para 10k dados, pegando 5k de cada classe
df_balanced = pd.concat([
    df[df['label'] == 0].sample(n=5000, random_state=42),  # Classe negativa
    df[df['label'] == 1].sample(n=5000, random_state=42)   # Classe positiva
])

# Selecionar as primeiras 10k amostras balanceadas
df_balanced = df_balanced.sample(n=10000, random_state=42)

#Salvando em um arquivo do tipo 'sample_amazon_polarity.csv'
df_balanced.to_csv('../data/processed/sample_amazon_polarity.csv', index=False)

#### Lendo a amostra que criamos no salva na pasta processed

In [1]:
import pandas as pd

df_samples = pd.read_csv('../data/processed/sample_amazon_polarity.csv')
df_samples.head()

Unnamed: 0,label,title,content
0,1,Road to getting things done,I am a great believer in valueing employees an...
1,0,The Weight Loss Cure,This would be a good book. Good Ideas if every...
2,0,Royal Velvet Pillows,These pillows were over rated and the descript...
3,0,Great Book For Kiddies,Robert Sawyer's books have featured an assortm...
4,0,Digimortal,"You know, Fear Factory used to be my favorite ..."


#### Verificando quantidade de amostras, quantidade de colunas e quantidade de classes

Aqui temos uma amostra de 10 mil observações, três colunas: label, title e content duas classes [0, 1].

In [4]:
df_samples.shape

(10000, 3)

In [6]:
df_samples.label.unique()

array([1, 0])

#### Pre-processando os dados

##### Função para limpezad dos dados

In [14]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Baixando o conjunto de stopwords
nltk.download('stopwords')
nltk.download('punkt', download_dir='/home/jonnathann/nltk_data')
nltk.download('wordnet')

# Inicializando o lemmatizer
lemmatizer = WordNetLemmatizer()

# Função para limpar o texto
def clean_text(text):
    # 1. Remover links e emails
    text = re.sub(r'http\S+|www\S+|@\S+', '', text)

    # 2. Remover caracteres especiais e números
    text = re.sub(r'[^A-Za-z\s]', '', text)

    # 3. Converter o texto para minúsculas
    text = text.lower()

    # 4. Tokenização (separar em palavras)
    words = nltk.word_tokenize(text)

    # 5. Remover stopwords
    stop_words = set(stopwords.words('english'))  # Para inglês. Para outro idioma, substitua aqui.
    words = [word for word in words if word not in stop_words]

    # 6. Lemmatização
    words = [lemmatizer.lemmatize(word) for word in words]

    # 7. Reconstruir o texto
    return ' '.join(words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jonnathann/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jonnathann/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/jonnathann/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


##### Relizando limpeza dos dados 

In [15]:
# Aplicando a limpeza tanto no 'title' quanto no 'context'
df_samples['cleaned_title']  = df_samples['title'].apply(clean_text)
df_samples['cleaned_context'] = df_samples['context'].apply(clean_text)

df_samples[['title', 'cleaned_title', 'context', 'cleaned_context']].head()

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/home/jonnathann/nltk_data'
    - '/home/jonnathann/Documentos/portifolio_jonnathann/env_ra/nltk_data'
    - '/home/jonnathann/Documentos/portifolio_jonnathann/env_ra/share/nltk_data'
    - '/home/jonnathann/Documentos/portifolio_jonnathann/env_ra/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [16]:
import nltk
print(nltk.data.path)

['/home/jonnathann/nltk_data', '/home/jonnathann/Documentos/portifolio_jonnathann/env_ra/nltk_data', '/home/jonnathann/Documentos/portifolio_jonnathann/env_ra/share/nltk_data', '/home/jonnathann/Documentos/portifolio_jonnathann/env_ra/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data']
