# Análise de avaliações de hotéis

In [1]:
### 1. Observação inicial dos dados

In [2]:
#!pip install liac-arff
import arff
import pandas as pd

# Parse the ARFF file inside a context manager so the file handle is closed
# as soon as loading finishes (the original left it open).
with open('hoteis.arff', 'r', encoding="ISO-8859-1") as arff_file:
    dataset = arff.load(arff_file)

# One row per review: the raw text and its sentiment label
data = pd.DataFrame(dataset['data'])
data.columns = ['text', 'class']

# Data structure

data.head()

Unnamed: 0,text,class
0,vocês são o melhor,pos
1,foi muito grato trabalhar com voces,pos
2,realizei o aniversario de meu sobrinho na terc...,pos
3,gostei muito do atendimento personalizado/n,pos
4,"grande hotel com otimas acomodacoes, a beira d...",pos


In [None]:
# Show the value at row position 100, column position 0 (the 'text' column)
data.iat[100, 0]

In [None]:
# Number of rows in the dataset

print(len(data))

In [None]:
# Does any column contain missing values? (one boolean per column)

print(data.isna().any())

In [None]:
# Number of positive and negative reviews

data.loc[:, 'class'].value_counts()

### 2. Separação dos dados

In [None]:
from sklearn.model_selection import train_test_split

# Label vector and feature matrix (every column except the label)
y = data['class'].values
X = data.drop(columns=['class']).values

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
train_features, test_features, class_train, class_test = train_test_split(
    X, y,
    test_size=0.20,
    random_state=10,
)

### 3. StopWords

In [None]:
import nltk
# Download the NLTK 'stopwords' resource; later cells read it through
# stopwords.words('portuguese').
nltk.download('stopwords')
from nltk.corpus import stopwords

### 4. CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Flatten the single-column feature arrays into 1-D sequences of review texts
train_texts = train_features.ravel()
test_texts = test_features.ravel()

# Token-count matrix WITHOUT any stop-word filtering (stop words are kept as
# features); the vocabulary is capped at 10000 terms
cv = CountVectorizer(max_features=10000)
cv_train_features = cv.fit_transform(train_texts)
cv_test_features = cv.transform(test_texts)

print('Shape das features de treino:', cv_train_features.shape, ' Shape das features de teste:', cv_test_features.shape)

# Token-count matrix WITH the stop-word filter applied: NLTK's Portuguese
# stop words are dropped from the vocabulary
portuguese_stop_words = stopwords.words('portuguese')
cv_sw = CountVectorizer(stop_words=portuguese_stop_words, max_features=10000)
cv_sw_train_features = cv_sw.fit_transform(train_texts)
cv_sw_test_features = cv_sw.transform(test_texts)


print('Shape das features de treino:', cv_sw_train_features.shape, ' Shape das features de teste:', cv_sw_test_features.shape)

### 5. Funções Auxiliares

In [None]:
from sklearn import metrics

def train_predict_model(classifier,
                        train_features, train_labels,
                        test_features, test_labels):
    """Fit ``classifier`` on the training data and predict the test data.

    Args:
        classifier: object exposing ``fit(features, labels)`` and
            ``predict(features)``.
        train_features: features passed to ``fit``.
        train_labels: labels passed to ``fit``.
        test_features: features passed to ``predict``.
        test_labels: accepted for call-site symmetry; never read here.

    Returns:
        Whatever ``classifier.predict(test_features)`` returns.
    """
    classifier.fit(train_features, train_labels)
    return classifier.predict(test_features)

In [None]:
def display_confusion_matrix(true_labels, predicted_labels, classes=[1,0]):
    """Print the confusion matrix of a set of predictions.

    Args:
        true_labels: ground-truth labels.
        predicted_labels: labels produced by the model.
        classes: label values, in the order used for both the rows
            ('Actual') and the columns ('Predicted') of the printed table.
            The default list is only read, never mutated.
    """
    # The unused `total_classes` / `level_labels` locals of the original
    # were dead code and have been removed.
    cm = metrics.confusion_matrix(y_true=true_labels, y_pred=predicted_labels,
                                  labels=classes)

    cm_frame = pd.DataFrame(cm, index=classes, columns=classes)
    cm_frame.index.name = 'Actual'
    cm_frame.columns.name = 'Predicted'

    print(cm_frame)

def display_classification_report(true_labels, predicted_labels, classes=[1,0]):
    """Print the classification report for the given predictions.

    Args:
        true_labels: ground-truth labels.
        predicted_labels: labels produced by the model.
        classes: label values to include in the report.
    """
    print(metrics.classification_report(y_true=true_labels,
                                        y_pred=predicted_labels,
                                        labels=classes))

def display_model_performance_metrics(true_labels, predicted_labels, classes=[1,0]):
    """Print the headline metrics, classification report and confusion matrix.

    Args:
        true_labels: ground-truth labels.
        predicted_labels: labels produced by the model.
        classes: label values forwarded to the report and the confusion matrix.
    """
    rule = '-'*30  # separator line printed under each section title

    print('Model Performance metrics:')
    print(rule)
    get_metrics(true_labels=true_labels, predicted_labels=predicted_labels)

    print('\nModel Classification report:')
    print(rule)
    display_classification_report(true_labels=true_labels,
                                  predicted_labels=predicted_labels,
                                  classes=classes)

    print('\nPrediction Confusion Matrix:')
    print(rule)
    display_confusion_matrix(true_labels=true_labels,
                             predicted_labels=predicted_labels,
                             classes=classes)

def get_metrics(true_labels, predicted_labels):
    """Print accuracy plus weighted precision, recall and F1, rounded to 4 places.

    Args:
        true_labels: ground-truth labels.
        predicted_labels: labels produced by the model.
    """
    accuracy = metrics.accuracy_score(true_labels, predicted_labels)
    print('Accuracy:', np.round(accuracy, 4))

    # The remaining scores all use the same weighted averaging across classes
    weighted_scorers = (
        ('Precision:', metrics.precision_score),
        ('Recall:', metrics.recall_score),
        ('F1 Score:', metrics.f1_score),
    )
    for caption, scorer in weighted_scorers:
        score = scorer(true_labels, predicted_labels, average='weighted')
        print(caption, np.round(score, 4))


### 6. Random Forest

#### 6.1 RF com remoção de stop words

In [None]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Fix the seed so the forest, and therefore every metric reported below, is
# the same on each run; without it the results change between executions.
rfc = RandomForestClassifier(n_jobs=10, random_state=10)

# Train and evaluate on the count features built with the Portuguese
# stop-word filter (cv_sw). The result is named after those features; it was
# previously labelled "tfidf" although no TF-IDF is involved here.
rfc_cv_sw_predictions = train_predict_model(classifier=rfc,
                                            train_features=cv_sw_train_features, train_labels=class_train,
                                            test_features=cv_sw_test_features, test_labels=class_test)

display_model_performance_metrics(true_labels=class_test, predicted_labels=rfc_cv_sw_predictions, classes=['pos', 'neg'])

#### 6.2 RF sem remoção de stop words

In [None]:
# Refit the same forest on the count features that keep every token
# (no stop-word filtering) and report its test-set performance
rfc_tfidf_predictions = train_predict_model(
    rfc,
    cv_train_features, class_train,
    cv_test_features, class_test,
)

display_model_performance_metrics(
    true_labels=class_test,
    predicted_labels=rfc_tfidf_predictions,
    classes=['pos', 'neg'],
)

### 7. TF-IDF

O valor tf–idf é uma medida estatística que indica a importância de uma palavra em um documento em relação a uma coleção de documentos (corpus). Ela é frequentemente utilizada como fator de ponderação na recuperação de informações e na mineração de dados.

O valor tf–idf de uma palavra aumenta proporcionalmente ao número de ocorrências dela em um documento; no entanto, esse valor é equilibrado pela frequência da palavra no corpus. Isso ajuda a compensar o fato de algumas palavras serem, em geral, mais comuns que outras.

In [None]:
from sklearn.feature_extraction.text import  TfidfVectorizer

# Raw review texts as 1-D sequences
tfidf_train_texts = train_features.ravel()
tfidf_test_texts = test_features.ravel()

# TF-IDF over unigrams and bigrams with sublinear term-frequency scaling;
# the min_df / max_df bounds are set so that no term is discarded
tv = TfidfVectorizer(
    min_df=0.0,
    max_df=1.0,
    ngram_range=(1,2),
    sublinear_tf=True,
)
tv_train_features = tv.fit_transform(tfidf_train_texts)
tv_test_features = tv.transform(tfidf_test_texts)

print('Shape das features de treino:', tv_train_features.shape, ' Shape das features de teste:', tv_test_features.shape, '\n')

# Refit the forest on the TF-IDF features and report test-set performance
rfc_tfidf_predictions = train_predict_model(
    rfc,
    tv_train_features, class_train,
    tv_test_features, class_test,
)
display_model_performance_metrics(
    true_labels=class_test,
    predicted_labels=rfc_tfidf_predictions,
    classes=['pos', 'neg'],
)

### 8. STEMMER

In [None]:
nltk.download('rslp')
stemmer = nltk.stem.RSLPStemmer()

y = data['class'].values

# Stem every whitespace-separated token and join each review back into one
# string: the vectorizer below expects raw text documents, not token lists.
corpus = [
    ' '.join(stemmer.stem(word) for word in review.split())
    for review in data['text']
]

# Split the stemmed corpus with the same proportions and seed as before.
# Dedicated names are used so the earlier train/test arrays are not clobbered.
stem_train_texts, stem_test_texts, class_train, class_test = train_test_split(
    corpus, y, test_size=0.20, random_state=10)

# Vectorize the STEMMED texts. The original cell reused the TF-IDF features
# computed from the unstemmed reviews, so stemming never reached the model.
tv_stem = TfidfVectorizer(min_df=0.0, max_df=1.0, ngram_range=(1,2),
                          sublinear_tf=True)
tv_stem_train_features = tv_stem.fit_transform(stem_train_texts)
tv_stem_test_features = tv_stem.transform(stem_test_texts)

print('Shape das features de treino:', tv_stem_train_features.shape, ' Shape das features de teste:', tv_stem_test_features.shape, '\n')

rfc_stem_predictions = train_predict_model(classifier=rfc,
                                           train_features=tv_stem_train_features, train_labels=class_train,
                                           test_features=tv_stem_test_features, test_labels=class_test)
display_model_performance_metrics(true_labels=class_test, predicted_labels=rfc_stem_predictions, classes=['pos', 'neg'])

### 9. Separação de palavras positivas e negativas

In [None]:
# Negative words, with stop words removed

# Load the stop-word list once and keep it as a set: the original re-read the
# list and scanned it linearly for every single word of every review.
portuguese_stop_words = set(stopwords.words('portuguese'))

neg_phrases = data[data['class'] == 'neg']
neg_string = [
    word
    for phrase in neg_phrases['text']
    for word in phrase.split()
    if word not in portuguese_stop_words
]

# Plain join gives the same space-separated text and also copes with an
# empty word list.
neg_text = ' '.join(neg_string)

In [None]:
# Positive words, with stop words removed

# Load the stop-word list once and keep it as a set: the original re-read the
# list and scanned it linearly for every single word of every review.
portuguese_stop_words = set(stopwords.words('portuguese'))

pos_phrases = data[data['class'] == 'pos']
pos_string = [
    word
    for phrase in pos_phrases['text']
    for word in phrase.split()
    if word not in portuguese_stop_words
]

# Plain join gives the same space-separated text and also copes with an
# empty word list.
pos_text = ' '.join(pos_string)

### 10. WordCloud

Word cloud de palavras negativas
# !pip install wordcloud

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Build the cloud from the negative-review words and draw it without axes
cloud_options = dict(width=1600, height=800, max_font_size=200)
wordcloud = WordCloud(**cloud_options).generate(neg_text)

plt.figure(figsize=(12, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

Word cloud de palavras positivas

In [None]:
# Build the cloud from the positive-review words and draw it without axes
cloud_options = dict(width=1600, height=800, max_font_size=200)
wordcloud = WordCloud(**cloud_options).generate(pos_text)

plt.figure(figsize=(12, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()