In [3]:
import numpy as np
import pandas as pd

In [4]:
data_imdb = pd.read_csv("imdb_labelled.txt", delimiter='\t', header=None)
data_imdb.columns = ["Review_text", "Review_class"]

data_amazon = pd.read_csv("amazon_cells_labelled.txt", delimiter='\t', header=None)
data_amazon.columns = ["Review_text", "Review_class"]

data_yelp = pd.read_csv("yelp_labelled.txt", delimiter='\t', header=None)
data_yelp.columns = ["Review_text", "Review_class"]

data = pd.concat([data_imdb, data_amazon, data_yelp])
data

Unnamed: 0,Review_text,Review_class
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [7]:
import re
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/janaina/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/janaina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
def clean_text(df):
    all_reviews = list()
    lines = df["Review_text"].values.tolist()
    for text in lines:
        text = text.lower()
        # Elimina links
        pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        text = pattern.sub('', text)
        # Remove os símbolos
        text = re.sub(r"[,.\"!@#$%^&*(){}?/;`~:<>+=-]", "", text)
        # Essas palavras foram tiradas pois trata-se de uma análise de sentimento
        text = re.sub(r"i'm", "i am", text)
        text = re.sub(r"he's", "he is", text)
        text = re.sub(r"she's", "she is", text)
        text = re.sub(r"that's", "that is", text)        
        text = re.sub(r"what's", "what is", text)
        text = re.sub(r"where's", "where is", text) 
        text = re.sub(r"\'ll", " will", text)  
        text = re.sub(r"\'ve", " have", text)  
        text = re.sub(r"\'re", " are", text)
        text = re.sub(r"\'d", " would", text)
        text = re.sub(r"\'ve", " have", text)
        text = re.sub(r"won't", "will not", text)
        text = re.sub(r"don't", "do not", text)
        text = re.sub(r"did't", "did not", text)
        text = re.sub(r"can't", "can not", text)
        text = re.sub(r"it's", "it is", text)
        text = re.sub(r"couldn't", "could not", text)
        text = re.sub(r"have't", "have not", text)
        
        tokens = word_tokenize(text)
        # remover pontuação
        table = str.maketrans('', '', string.punctuation)
        # transforma as palavras em tokens
        stripped = [w.translate(table) for w in tokens]
        # tira letras que não são do alfabeto
        words = [word for word in stripped if word.isalpha()]
        # conjunto de palavras mais usadas em inglês
        stop_words = set(stopwords.words("english"))
        # remove a palavra 'not'
        stop_words.discard("not")
        # Usado para transformar palavras em suas formas básicas
        PS = PorterStemmer()
        # Lista sem as palavras mais usadas e em suas formas básicas
        words = [PS.stem(w) for w in words if not w in stop_words]
        words = ' '.join(words)
        all_reviews.append(words)
    return all_reviews

all_reviews = clean_text(data)
all_reviews[0:20]

['slowmov aimless movi distress drift young man',
 'not sure lost flat charact audienc nearli half walk',
 'attempt arti black white clever camera angl movi disappoint becam even ridicul act poor plot line almost nonexist',
 'littl music anyth speak',
 'best scene movi gerardo tri find song keep run head',
 'rest movi lack art charm mean empti work guess empti',
 'wast two hour',
 'saw movi today thought good effort good messag kid',
 'bit predict',
 'love cast jimmi buffet scienc teacher',
 'babi owl ador',
 'movi show lot florida best made look appeal',
 'song best muppet hilari',
 'cool',
 'right case movi deliv everyth almost right face',
 'averag act main person low budget clearli see',
 'review long overdu sinc consid tale two sister singl greatest film ever made',
 'put gem movi term screenplay cinematographi act postproduct edit direct aspect filmmak',
 'practic perfect true masterpiec sea faux masterpiec',
 'structur film easili tightli construct histori cinema think film some

In [10]:
import csv

# Lista de reviews
all_reviews = [...]  # Sua lista com reviews

# Exportando para um arquivo CSV
with open('reviews.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Review'])  # Cabeçalho (se necessário)
    for review in all_reviews:
        writer.writerow([review])


In [7]:
stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [8]:
# Utiliza o CountVectorizer para converter uma coleção de textos em uma matriz numérica (matriz esparsa), onde cada
#  coluna representa uma palavra distinta do vocabulário e cada linha representa um documento (review).
from sklearn.feature_extraction.text import CountVectorizer
CV = CountVectorizer(min_df=3) # qualquer palavra que aparecer em menos de 3 documentos (reviews) será ignorada. 
X = CV.fit_transform(all_reviews).toarray()
y = data[["Review_class"]].to_numpy()
print(np.shape(X))
print(np.shape(y))

(2748, 1190)
(2748, 1)


In [9]:
#  TF-IDF (Term Frequency-Inverse Document Frequency), uma técnica que não apenas conta as palavras, mas também ajusta a 
# importância delas com base na frequência de ocorrência em diferentes documentos.
from sklearn.feature_extraction.text import TfidfVectorizer
TV = TfidfVectorizer(min_df=3)
X = TV.fit_transform(all_reviews).toarray()
y = data[["Review_class"]].to_numpy()
print(np.shape(X))
print(np.shape(y))

(2748, 1190)
(2748, 1)


In [10]:
X[0]
# the fractions are actually the product od TF and IDF --> this helps our model to learn better

array([0.        , 0.        , 0.        , ..., 0.69902343, 0.        ,
       0.        ])

In [11]:
from sklearn.model_selection import train_test_split
# test_size: Define que 20% dos dados serão usados para teste, e os 80% restantes serão usados para treinamento.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
# treinando o modelo
model.fit(X_train, y_train)

# fazer previsões nos dados de teste.
y_pred = model.predict(X_test)

from sklearn.metrics import accuracy_score, f1_score, precision_score

# razão entre o número de previsões corretas e o total de previsões feitas.
print(accuracy_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
# quão preciso o modelo é ao prever as classes positivas.
print(precision_score(y_test, y_pred))

0.7072727272727273
0.7099099099099099
0.6769759450171822


  y = column_or_1d(y, warn=True)
