# Exploring a Transformer-based approach on Sentiment Analysis for Application Review
**Naïve Bayes**

COMP 550 - Group Project

Renchi Zhang | ID: 261110529

In [8]:
# load the data
!pip install datasets
from datasets import load_dataset
dataset = load_dataset("app_reviews")

import pandas as pd
df = dataset['train'].to_pandas()

data = df[["review", "star"]]
# data = data.loc[:100]



In [9]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
import time
from datetime import timedelta
def format_time(seconds):
    return str(timedelta(seconds=int(seconds)))

In [11]:
# preprocessing
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import time

def review_preprocess(text):
    text = text.lower()
    text = word_tokenize(text)
    lemmatize = WordNetLemmatizer()
    text = [lemmatize.lemmatize(word) for word in text]
    stop_words = set(stopwords.words('english'))
    text = [word for word in text if word not in stop_words]
    return text

preprocess_start_time = time.time()
data['review'] = data['review'].apply(lambda x: review_preprocess(x))
data['review'] = data['review'].astype(str)
preprocess_time = time.time()-preprocess_start_time
print(f"Time for preprocessing: {preprocess_time}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['review'] = data['review'].apply(lambda x: review_preprocess(x))


Time for preprocessing: 134.30165910720825


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['review'] = data['review'].astype(str)


In [12]:
# train_test_split
from sklearn.model_selection import train_test_split, GridSearchCV

X, y = data['review'], data['star']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)  # 25% for testing

In [13]:
# grid search on naïve bayes
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import numpy as np


review_cls_mnb = Pipeline([('vectorizer', CountVectorizer()),
                           ('feature extractor', TfidfTransformer()),
                           ('cls-mnb', MultinomialNB(force_alpha=True))
                           ])

parameters_mnb = {'vectorizer__ngram_range': [(1, 1), (2, 2), (1, 2), (3, 3), (1, 3), (2, 3)],
                  'feature extractor__use_idf': (True, False),
                  'feature extractor__smooth_idf': (True, False),
                  # 'cls-mnb__alpha': np.geomspace(1e-10, 1e10, num=21, endpoint=True)
                  'cls-mnb__alpha': np.geomspace(1e-2, 1e0, num=10, endpoint=True)
                  }

grid_start_time = time.time()
gs_cls_mnb = GridSearchCV(review_cls_mnb, parameters_mnb, scoring='accuracy')
gs_cls_mnb = gs_cls_mnb.fit(X_train, y_train)
grid_search_training_time = time.time() - grid_start_time
print(f"Time for grid search and training: {grid_search_training_time}")

print('The best accuracy for training set is: ' + str(gs_cls_mnb.best_score_))
cls_mnb_best_params = gs_cls_mnb.best_params_

print("\nHyper-parameters of Naive Bayes classifier:")
vectorizer = CountVectorizer(
    ngram_range=cls_mnb_best_params['vectorizer__ngram_range']
)
tfidf_transformer = TfidfTransformer(
    use_idf=cls_mnb_best_params['feature extractor__use_idf'],
    smooth_idf=cls_mnb_best_params['feature extractor__smooth_idf']
)
classifier = MultinomialNB(
    force_alpha=True,
    alpha=cls_mnb_best_params['cls-mnb__alpha']
)

print("ngram_range of vectorizer: ", str(cls_mnb_best_params['vectorizer__ngram_range']))
print("use_idf of tfidf_transformer: ", str(cls_mnb_best_params['feature extractor__use_idf']))
print("smooth_idf of tfidf_transformer: ", str(cls_mnb_best_params['feature extractor__smooth_idf']))
print("alpha of MultinomialNB: ", str(cls_mnb_best_params['cls-mnb__alpha']))


Time for grid search and training: 9935.305357694626
The best accuracy for training set is: 0.6859679334828763

Hyper-parameters of Naive Bayes classifier:
ngram_range of vectorizer:  (1, 2)
use_idf of tfidf_transformer:  False
smooth_idf of tfidf_transformer:  True
alpha of MultinomialNB:  0.046415888336127774


In [14]:
fit_start_time = time.time()
X_train_transformed = vectorizer.fit_transform(X_train)
X_train_transformed = tfidf_transformer.fit_transform(X_train_transformed)
classifier.fit(X_train_transformed, y_train)
fit_end_time = time.time()
print(f"Time for training and fitting: {fit_end_time-fit_start_time}")

predict_start_time = time.time()
X_test_transformed = vectorizer.transform(X_test)
X_test_transformed = tfidf_transformer.transform(X_test_transformed)
y_test_pred = classifier.predict(X_test_transformed)
predict_end_time = time.time()
print(f"Time for prediction: {predict_end_time-predict_start_time}")

accuracy = accuracy_score(y_test, y_test_pred)
print('The accuracy for the testing set is: ' + str(accuracy))


log_probabilities = classifier.predict_log_proba(X_test_transformed)
cross_entropy_loss = log_loss(y_test, log_probabilities)
print("Cross Entropy Loss:", cross_entropy_loss)

Time for training and fitting: 7.994792938232422
Time for prediction: 1.5407071113586426
The accuracy for the testing set is: 0.6870738853326298
Cross Entropy Loss: 1.6094379124340998
