# Importação de Bibliotecas e Leitura do Dataset

In [None]:
import pandas as pd
import numpy as np
import re

# Biblioteca NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\55119\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\55119\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\55119\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\55119\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
df = pd.read_csv("../../../data/raw/IMDB Dataset.csv")

print(df.head(10), '\n')
print(df.info(), '\n')
print(df.describe(), '\n')
print(df.isnull().sum(), '\n')
print(df['sentiment'].value_counts())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
5  Probably my all-time favorite movie, a story o...  positive
6  I sure would like to see a resurrection of a u...  positive
7  This show was an amazing, fresh & innovative i...  negative
8  Encouraged by the positive comments about this...  negative
9  If you like original gut wrenching laughter yo...  positive 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB

In [None]:
# 49.582 reviews em 50.000 linhas -> tem 418 duplicadas
df = df.drop_duplicates(subset='review').reset_index(drop=True)
print(df.info(), '\n')
print(df.describe(), '\n')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49582 entries, 0 to 49581
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     49582 non-null  object
 1   sentiment  49582 non-null  object
dtypes: object(2)
memory usage: 774.8+ KB
None 

                                                   review sentiment
count                                               49582     49582
unique                                              49582         2
top     No one expects the Star Trek movies to be high...  positive
freq                                                    1     24884 



# **Pré-processamento**

1. Limpeza de texto -> remover elmentos não textuais, como numeros, pontuação, caracteres especiais...

2. Conversão para minúsculas -> padronizar todo o texto em letras minusculas, a fim de evitar que "Texto" seja diferente de "texto"

3. Remoção de stopwords -> remover palavras que nao agrega tanto significado semantico ("the", "and", "a"

4. Lematização ->  reduzir as palavras à sua raiz, envolve a remoção dos sufixos das palavras, por exemplo: filming seria derivada de film


5. Tokenização ->  Dividir o texto em unidades menores (tokens), que podem ser palavras, frases ou outros elementos.





- https://www.datacamp.com/pt/tutorial/text-analytics-beginners-nltk

- https://medium.com/@khalidassalafy/sentiment-analysis-with-nltk-4afbb0bf6a49  



In [7]:

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Função de limpeza completa
def clean_dataFrame(text):
    text = re.sub(r'<.*?>', '', text)     # remove tags html
    text = re.sub(r'[^\w\s]', '', text)   # remove pontuacao
    text = re.sub(r'\d+', '', text)       # remove numeros
    text = re.sub(r'\s+', ' ', text)      # remove espacos em branco
    text = text.lower()                   # converte tudo para minusculas
    tokens = word_tokenize(text)          # tranforma em tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]  #remove stopwords e lematiza
    return ' '.join(tokens)

df['processed_review'] = df['review'].apply(clean_dataFrame)
df.to_csv("../../../data/processed/IMDB_cleaned_NLTK.csv", index=False)
print(df[['review', 'processed_review']].head())


                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                    processed_review  
0  one reviewer mentioned watching oz episode you...  
1  wonderful little production filming technique ...  
2  thought wonderful way spend time hot summer we...  
3  basically there family little boy jake think t...  
4  petter matteis love time money visually stunni...  
