## Modelo para prever quão útil é uma review

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
df = pd.read_csv('book_reviews.csv')
df.shape

(11639, 6)

## EDA

In [14]:
df.shape

(11639, 6)

In [15]:
df.head()

Unnamed: 0,bookID,helpful,rating,reviewText,reviewerID,summary
0,B000F83SZQ,,4,A beautiful in-depth character description mak...,A1RK2OCZDSGC6R,Review
1,B000F83SZQ,1.0,5,I'd never read any of the Amy Brewster mysteri...,A1FV0SX13TWVXQ,I really liked it.
2,B000F83SZQ,,5,I enjoy vintage books and movies so I enjoyed ...,A1F6404F1VG29J,Nice vintage story
3,B000F83SZQ,1.0,4,Never heard of Amy Brewster. But I don't need ...,A3DE6XGZ2EPADS,Enjoyable reading and reminding the old times
4,B000F83SZQ,,4,I enjoyed this one tho I'm not sure why it's c...,A2HSAKHC3IBRE6,Nice old fashioned story


In [16]:
df.dtypes

bookID         object
helpful       float64
rating          int64
reviewText     object
reviewerID     object
summary        object
dtype: object

In [17]:
df.isnull().sum()

bookID           0
helpful       6571
rating           0
reviewText       0
reviewerID       0
summary          2
dtype: int64

### Remover os registos em que 'helpful' é null ou NaN

In [18]:
df = df.dropna(subset=['helpful'])
print(df['helpful'].unique())

[1.         0.88888889 0.85964912 0.66666667 0.92307692 0.85714286
 0.43478261 0.22222222 0.31818182 0.75       0.875      0.01666667
 0.7        0.5        0.88709677 0.55       0.25       0.125
 0.33333333 0.97787611 0.97101449 0.95       0.2        0.1
 0.96610169 0.71428571 0.83333333 0.97142857 0.9673913  0.93333333
 0.95652174 0.96666667 0.95454545 0.375      0.91666667 0.8
 0.98130841 0.98309859 0.84615385 0.91044776 0.92857143 0.46666667
 0.41666667 0.79545455 0.69047619 0.54545455 0.27272727 0.88
 0.85       0.6        0.78571429 0.58823529 0.73684211 0.4
 0.86363636 0.57142857 0.9375     0.73333333 0.77777778 0.35714286
 0.96       0.42857143 0.84848485 0.90196078 0.9        0.69230769
 0.94444444 0.81818182 0.86666667 0.94736842 0.16666667 0.90909091
 0.80434783 0.92156863 0.91566265 0.65       0.625      0.82051282
 0.92340426 0.88235294 0.76190476 0.9787234  0.10204082 0.11111111
 0.18181818 0.9047619  0.38888889 0.97222222 0.89473684 0.90322581
 0.87096774 0.84375    0.98

In [19]:
df.isnull().sum()

bookID        0
helpful       0
rating        0
reviewText    0
reviewerID    0
summary       0
dtype: int64

In [20]:
# Keep only the review text and the helpfulness score; every other column is dropped.
df = df[['reviewText', 'helpful']]

# Check the result
print(df.shape)  # Shows the new shape of the DataFrame (rows, columns)
df.head()        # Displays the first rows

(5068, 2)


Unnamed: 0,reviewText,helpful
1,I'd never read any of the Amy Brewster mysteri...,1.0
3,Never heard of Amy Brewster. But I don't need ...,1.0
6,This was a fairly interesting read. It had ol...,1.0
7,This book is a reissue of an old one; the auth...,1.0
20,"Another well written eBook by Troy Denning, bu...",1.0


In [10]:
df.describe()

Unnamed: 0,rating
count,5068.0
mean,2.966259
std,1.478431
min,1.0
25%,2.0
50%,3.0
75%,4.0
max,5.0


## Pre-processar

In [11]:
exemplo = df.iloc[1].reviewText
exemplo

"Never heard of Amy Brewster. But I don't need to like Amy Brewster to like this book. Actually, Amy Brewster is a side kick in this story, who added mystery to the story not the one resolved it. The story brings back the old times, simple life, simple people and straight relationships."

### Remoção de marcação HTML

In [None]:
!pip install bs4

In [None]:
from bs4 import BeautifulSoup

In [None]:
soup = BeautifulSoup(exemplo, 'html.parser')
exemplo = soup.get_text()
exemplo

### Usar regex para limpar texto

In [None]:
import re

# Strip any leftover <...> tags.
exemplo = re.sub(r'<.*?>', '', exemplo)
# Strip square-bracketed fragments such as "[...]".
# The previous pattern ended in "/]" and therefore required a literal slash
# before the closing bracket, so ordinary "[text]" spans were never removed.
exemplo = re.sub(r'\[[^]]*\]', '', exemplo)
# Drop digit runs entirely (no space is inserted in their place).
exemplo = re.sub(r'\d+', '', exemplo)
# Replace every remaining non-letter character with a space.
exemplo = re.sub(r'[^a-zA-Z]', ' ', exemplo)
exemplo

### Converter para lowercase

In [None]:
exemplo = exemplo.lower()
exemplo

### Remover stopwords
### converter texto para lista de palavras

In [None]:
!pip install nltk

In [None]:
import nltk

In [None]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

from nltk.corpus import stopwords

In [None]:
stopwords.words('english')

In [None]:
exemplo.split()

In [None]:
# Build the stop-word set once; the previous version rebuilt it for every word
# inside the comprehension.
english_stopwords = set(stopwords.words('english'))

# Tokenise on whitespace and keep only the words that are not stop words.
exemplo = [word for word in exemplo.split() if word not in english_stopwords]
exemplo

### Stemming

In [None]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [None]:
exemplo_stemmer = [stemmer.stem(word) for word in exemplo]
exemplo_stemmer = ' '.join(exemplo_stemmer)
exemplo_stemmer

### Lematização

In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [None]:
exemplo_lemmatize = [lemmatizer.lemmatize(word) for word in exemplo]
exemplo_lemmatize = ' '.join(exemplo_lemmatize)
exemplo_lemmatize

### Criação de funções do pipeline de pre-processamento

In [None]:
# Remove HTML
def remove_html(text):
    """Return the plain text of `text` after parsing it with BeautifulSoup's 'html.parser'."""
    return BeautifulSoup(text, 'html.parser').get_text()

# Remove square-bracketed fragments
def remove_punctuation(text):
    """Return `text` with every "[...]" span (brackets included) deleted.

    Despite its name, this function does not touch ordinary punctuation;
    it only strips square-bracketed fragments.
    """
    return re.sub(r'\[[^]]*\]', '', text)

# Remove special characters
def remove_specialChars(text):
    """Return `text` with every character outside a-z / A-Z replaced by a single space."""
    return re.sub(r'[^a-zA-Z]', ' ', text)

# Remove stop words and lemmatize
def remove_stopwords_and_lemmatization(text):
    """Lower-case `text`, drop English stop words and lemmatize what remains.

    Args:
        text: the string to process.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    # Build these once per call: the previous version rebuilt the stop-word set
    # and created a new lemmatizer for every token inside the loop.
    english_stopwords = set(stopwords.words('english'))
    lemmatizer = nltk.WordNetLemmatizer()

    tokens = nltk.word_tokenize(text.lower())
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in english_stopwords
    )

# Full pipeline
def cleaning(text):
    """Run `text` through every pre-processing step, in order, and return the result."""
    pipeline = (
        remove_html,
        remove_punctuation,
        remove_specialChars,
        remove_stopwords_and_lemmatization,
    )
    for step in pipeline:
        text = step(text)
    return text

### Aplicar o pre-processamento ao dataset

In [None]:
import time

# Rebuild a contiguous 0..n-1 index; the earlier row filtering left gaps in it.
df = df.reset_index(drop=True)

# perf_counter() is the high-resolution clock intended for measuring elapsed
# time; time.time() follows the wall clock, which can be adjusted mid-run.
inicio = time.perf_counter()

# Run the full pre-processing pipeline on every review.
df['text'] = df['reviewText'].apply(cleaning)

fim = time.perf_counter()
duracao = fim - inicio
print("Tempo de execução:", duracao)

In [None]:
df.head(10)

### Utilização de wordclouds para visualização das palavras mais frequentes

In [None]:
!pip install wordcloud

In [None]:
from wordcloud import WordCloud, STOPWORDS

### Ver as palavras mais usadas nas reviews com helpful >= 0.5

In [None]:
# All cleaned reviews with helpful >= 0.5, joined into one corpus string.
helpful_corpus = ' '.join(df.loc[df['helpful'] >= 0.5, 'text'])

# Word cloud of at most 500 words, ignoring the wordcloud package's STOPWORDS.
wc = WordCloud(max_words=500, width=1000, height=500, stopwords=STOPWORDS)
wc.generate(helpful_corpus)

plt.figure(figsize=(15, 15))
plt.imshow(wc, interpolation='bilinear')
plt.show()

### Ver as palavras mais usadas nas reviews com helpful < 0.5

In [None]:
# All cleaned reviews with helpful < 0.5, joined into one corpus string.
unhelpful_corpus = ' '.join(df.loc[df['helpful'] < 0.5, 'text'])

# Word cloud of at most 500 words, ignoring the wordcloud package's STOPWORDS.
wc = WordCloud(max_words=500, width=1000, height=500, stopwords=STOPWORDS)
wc.generate(unhelpful_corpus)

plt.figure(figsize=(15, 15))
plt.imshow(wc, interpolation='bilinear')
plt.show()

In [None]:
# Length (in characters) of each cleaned review; a missing text counts as length 0.
df['msg_len'] = df.text.fillna('').apply(len)

plt.figure(figsize=(12, 8))

# Split on the 'helpful' column. The previous code read 'helpful_filled',
# a column that is never created in this notebook, so the cell raised an
# AttributeError. Rows with a missing 'helpful' were already dropped earlier,
# which is why the second label no longer mentions NaN.
plt.hist(df[df.helpful >= 0.5].msg_len, color='blue', alpha=0.6, label='Útil (Helpful >= 0.5)', bins=36)
plt.hist(df[df.helpful < 0.5].msg_len, color='red', alpha=0.6, label='Não útil (Helpful < 0.5)', bins=36)

plt.legend()
plt.ylabel('Frequência')
plt.xlabel('Tamanho da Mensagem')
plt.title('Distribuição do Tamanho das Mensagens por Helpful')
plt.show()

### N-Gram Analysis

In [None]:
# Concatenate every cleaned review into one long string.
# NOTE: because the reviews are joined, n-grams built from the resulting token
# list can span the boundary between two consecutive reviews.
texts = ' '.join(df['text'].fillna('').astype(str))

# split() with no argument collapses runs of whitespace. split(" ") produced
# empty-string tokens whenever a cleaned review was empty (two adjacent spaces
# after the join), and those '' tokens were then counted in the n-grams.
string = texts.split()


#### vamos fazer uma análise de unigrams, bigrams e trigrams que existem no nosso dataset de reviews

In [None]:
def draw_n_gram(string, i):
    """Print the five most frequent `i`-grams found in the token list `string`.

    The 15 most frequent n-grams are collected into a two-column frame
    ("word", "count"); only its head is printed. Nothing is plotted and the
    function returns None.
    """
    n_gram_df = (
        pd.Series(nltk.ngrams(string, i))
        .value_counts()
        .head(15)
        .to_frame()
        .reset_index()
        .rename(columns={"index": "word", 0: "count"})
    )
    print(n_gram_df.head())

    # A bar chart was sketched here but left disabled:
    # sns.barplot(y='count', x='word', data=n_gram_df)
    return None

#### Unigram Analysis

In [None]:
draw_n_gram(string,1)

#### Bigram Analysis

In [None]:
draw_n_gram(string,2)

#### Trigram Analysis

In [None]:
draw_n_gram(string,3)

### Vectorizing

To vectorize we will apply -

    Bag of Words model ( CountVectorizer)
    TF-IDF model (TfidfVectorizer)

#### Bag of Words model ( CountVectorizer)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

exemplo_cv = cv.fit_transform(exemplo).toarray()
exemplo_cv

In [None]:
cv.get_feature_names_out()

In [None]:
df_cv = pd.DataFrame(exemplo_cv.transpose(), index = cv.get_feature_names_out())
df_cv

#### TF-IDF model (TfidfVectorizer)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf  =  TfidfVectorizer()

exemplo_tfidf = tfidf.fit_transform(exemplo).toarray()
exemplo_tfidf

In [None]:
# e a respectiva lista de palavras
tfidf.get_feature_names_out()

In [None]:
df_tfidf = pd.DataFrame(exemplo_tfidf.transpose(), index = tfidf.get_feature_names_out())
df_tfidf

In [None]:
exemplo_cv.shape, exemplo_tfidf.shape

In [None]:
# Vectorization using the TF-IDF technique.
# This word-to-number conversion (it builds a matrix) is where we choose:
#   - the number of features wanted (number of columns, i.e. number of terms)
#   - the type of n-gram to use (left unset here; e.g. ngram_range=(1, 1))
tfidf = TfidfVectorizer(max_features=1000)

df_tfidf_1 = tfidf.fit_transform(df['text'])

### vai escolher as max_features mais frequentes no corpus entre todos os uni e bi-grams existentes (ngram_range = (1, 2))

In [None]:
tfidf = TfidfVectorizer(max_features=5000,ngram_range = (1, 2))

df_tfidf= tfidf.fit_transform(df.text).toarray()

In [None]:
df_tfidf.shape

In [None]:
tfidf.get_feature_names_out()

In [None]:
df_tfidf_real=pd.DataFrame(df_tfidf,columns=tfidf.get_feature_names_out())

In [None]:
df_tfidf_real

In [None]:
#### ACABAMOS O PRÉ-PROCESSAMENTO
#### a nossa matriz tem valores que podem ser manuseados pelos algoritmos de IA

In [None]:
## Treino