In [None]:
import re
import string
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import torch


plt.style.use('ggplot')

pd.set_option('display.max_colwidth', 100)

df = pd.read_excel('data/bestbuy_reviews.xlsx')
# An empty TITLE or CONTENT cell is read as NaN, and `str + NaN` is NaN, which
# would later reach the tokenizer as a float. Fill blanks and coerce to str
# first, then trim the separator left behind when one side is empty.
df['REVIEW'] = (
    df['TITLE'].fillna('').astype(str) + ' ' + df['CONTENT'].fillna('').astype(str)
).str.strip()
df['POST DATE'] = pd.to_datetime(df['POST DATE'])
df = df.drop(columns=['TITLE', 'CONTENT'])
# Sequential 0-based review ID, used later as the merge key for model scores.
df.insert(0, 'ID', range(len(df)))
df.shape

In [None]:
print(df.shape)
# Number of reviews per star rating, ordered by rating value.
rating_counts = df['RATING'].value_counts().sort_index()
ax = rating_counts.plot(
    kind='bar',
    title='Count of Reviews by Stars',
    figsize=(10, 5),
)
ax.set_xlabel('Review Stars')

In [None]:
# The start of this cell was a truncated paste of the load and plot cells
# above: it began mid-statement (a syntax error) and repeated
# `df.insert(0, 'ID', ...)`, which raises once the ID column exists. Only the
# NLTK setup is kept.
# NOTE: the stop-word list needs the NLTK 'stopwords' corpus to be installed
# locally (e.g. via nltk.download('stopwords')).
ps = nltk.PorterStemmer()
stopwords = nltk.corpus.stopwords.words('english')


def clean_text(text, stop_words=None):
    """Strip punctuation from `text`, tokenise it and drop stop words.

    Args:
        text: Raw review text. Anything that is not a string (e.g. None or a
            NaN from an empty spreadsheet cell) yields an empty token list.
        stop_words: Optional iterable of words to remove. Defaults to the
            module-level `stopwords` list.

    Returns:
        list[str]: Remaining tokens in their original order and casing.
    """
    if not isinstance(text, str):
        return []
    if stop_words is None:
        stop_words = stopwords
    # Set membership is O(1); lower-casing both sides makes the match
    # case-insensitive, so a capitalised "The" is removed like "the".
    stop_set = {word.lower() for word in stop_words}
    no_punct = ''.join(ch for ch in text if ch not in string.punctuation)
    # Raw string avoids the invalid-escape warning for '\W'.
    tokens = re.split(r'\W+', no_punct)
    # re.split yields '' at the edges when the text starts or ends with
    # whitespace; those are not real tokens.
    return [tok for tok in tokens if tok and tok.lower() not in stop_set]




# Tokenise every review. The lemmatizer and n-gram steps below read
# df['review_clean'], which was otherwise never created on a fresh kernel.
df['review_clean'] = df['REVIEW'].apply(clean_text)
df.head()
LEMMATIZER:
# Stemmer and lemmatizer instances; lemmatizing() below uses `wn`.
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()


def lemmatizing(tokenized_text):
    """Return a list with `wn.lemmatize` applied to each token, in order."""
    lemmas = []
    for token in tokenized_text:
        lemmas.append(wn.lemmatize(token))
    return lemmas


# Lemmatise the token lists while 'review_clean' still holds lists.
df['lemmatized'] = df['review_clean'].apply(lemmatizing)

df.head(10)
# From here on 'review_clean' is a single space-separated string per review.
# NOTE: re-running this step after the join would iterate over characters
# instead of tokens.
df['review_clean'] = df['review_clean'].apply(' '.join)
df.head()
N-Gram
from nltk.util import ngrams


def extract_ngrams(data, num):
    """Return the `num`-grams of `data` as space-joined strings.

    `data` is tokenised with nltk.word_tokenize; every n-gram produced by
    nltk.util.ngrams is joined with single spaces.
    """
    tokens = nltk.word_tokenize(data)
    phrases = []
    for gram in ngrams(tokens, num):
        phrases.append(' '.join(gram))
    return phrases


# Bigrams of every cleaned review; 3-gram and 4-gram columns are disabled.
df['ngram2'] = df['review_clean'].apply(extract_ngrams, args=(2,))
# df['ngram3'] = df['review_clean'].apply(extract_ngrams, args=(3,))
# df['ngram4'] = df['review_clean'].apply(extract_ngrams, args=(4,))

df.head()

Hugging Face RoBERTa Model Sentiment Analysis
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
from tqdm.notebook import tqdm

# Checkpoint name shared by both from_pretrained() calls.
model_name = 'cardiffnlp/twitter-roberta-base-sentiment'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)


def polarity_scores_roberta(review):
    """Score one review with the module-level `tokenizer` and `model`.

    Args:
        review: Review text. The tokenizer truncates it to at most 512 tokens.

    Returns:
        dict: keys 'negative', 'neutral' and 'positive', holding the softmax
        of the model's first output row at indices 0, 1 and 2 respectively.
    """
    encoded_text = tokenizer(review, padding=True, truncation=True, max_length=512, return_tensors='pt')
    # Inference only: no_grad skips autograd bookkeeping, so the logits can be
    # converted to NumPy without a detach().
    with torch.no_grad():
        output = model(**encoded_text)
    scores = softmax(output[0][0].numpy())
    return {
        'negative': scores[0],
        'neutral': scores[1],
        'positive': scores[2],
    }


res = {}

# Score every review, keyed by ID. A RuntimeError raised while scoring is
# reported and that review is left out of `res`.
for i, row in tqdm(df.iterrows(), total=len(df)):
    try:
        myid, text = row['ID'], row['REVIEW']
        roberta_result = polarity_scores_roberta(text)
    except RuntimeError:
        print(f'Broke for id {myid}')
    else:
        res[myid] = dict(roberta_result)

# One row per scored review: sentiment scores joined back onto the review
# columns through the columns the two frames share.
results_df = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .rename(columns={'index': 'ID'})
    .merge(df, how='left')
)
results_df.head()
# Pairwise relationships between the three sentiment scores, coloured by
# star rating.
sns.pairplot(
    data=results_df,
    vars=['negative', 'neutral', 'positive'],
    hue='RATING',
    palette='tab10',
)

# plt.show must be called; the bare `plt.show` reference did nothing.
plt.show()
[Adding a 'positivity' column to sort reviews into positive, neutral and negative]

SORTING METHOD
- positivity == 1   ->  reviews with user rating 4 or 5 AND a 'positive' score greater than 0.5 (i.e. greater than 'negative' + 'neutral')
- positivity == 0   ->  all remaining reviews: user rating 3, or a rating whose matching sentiment score is not above 0.5 (ambiguous)
- positivity == -1  ->  reviews with user rating 1 or 2 AND a 'negative' score greater than 0.5 (i.e. greater than 'positive' + 'neutral')

# The two conditions cannot both hold (rating >= 4 vs rating <= 2), so their
# order is irrelevant; every other row gets 0.
is_positive = (results_df['RATING'] >= 4) & (results_df['positive'] > 0.5)
is_negative = (results_df['RATING'] <= 2) & (results_df['negative'] > 0.5)
results_df['positivity'] = np.select([is_positive, is_negative], [1, -1], default=0)

# Keep only the listed columns, in this order.
column_order = [
    'ID', 'negative', 'neutral', 'positive', 'RATING', 'positivity',
    'POST DATE', 'AUTHOR', 'REVIEW', 'review_clean', 'lemmatized', 'ngram2',
]
results_df = results_df.reindex(columns=column_order)
results_df.head()

# Flatten each review's lemma / bigram list into one ', '-separated string.
results_df['lemmatized_s'] = [
    ', '.join(str(lemma) for lemma in lemmas) for lemmas in results_df['lemmatized']
]
results_df['ngram2_s'] = [
    ', '.join(str(gram) for gram in grams) for grams in results_df['ngram2']
]
# (3-gram and 4-gram variants are disabled.)


def _concat(parts):
    """Join the per-review strings of one group with ', '."""
    return ', '.join(parts)


# One row per positivity class; each cell is that class's reviews concatenated.
d = results_df.groupby('positivity').agg({'lemmatized_s': _concat, 'ngram2_s': _concat})

lem_pos = d.loc[1, 'lemmatized_s']
lem_neu = d.loc[0, 'lemmatized_s']
lem_neg = d.loc[-1, 'lemmatized_s']

# Back to flat token lists, one per class (1, 0, -1).
tags_pos, tags_neu, tags_neg = (blob.split(', ') for blob in (lem_pos, lem_neu, lem_neg))
res_pos, res_neu, res_neg = {}, {}, {}


def word_count(tags, res):
    """Count occurrences of each tag and store the counts in `res`.

    Args:
        tags: Iterable of hashable tags (words or n-grams).
        res: Dict updated in place; existing entries for the same tag are
            overwritten with the new count.

    Returns:
        dict: The same `res` object, for convenience.
    """
    # A single pass with Counter replaces calling tags.count() per element,
    # which was quadratic in the number of tags.
    res.update(Counter(tags))
    return res


res_pos = word_count(tags_pos, res_pos)
res_neu = word_count(tags_neu, res_neu)
res_neg = word_count(tags_neg, res_neg)

# Rows = words, columns = sentiment class; words absent from a class count 0.
lemmatized_count = (
    pd.DataFrame([res_pos, res_neu, res_neg])
    .astype('Int64')
    .T
    .fillna(0)
    .set_axis(['POS(1)', 'NEU(0)', 'NEG(-1)'], axis=1)
    .sort_values(by='POS(1)', ascending=False)
)
lemmatized_count.name = 'Word Count by Sentiment'
lemmatized_count  # most frequent in 'positive' first

# Same preparation for bigrams: per-class string -> list of bigrams.
ngram2_pos = d.loc[1, 'ngram2_s']
ngram2_neu = d.loc[0, 'ngram2_s']
ngram2_neg = d.loc[-1, 'ngram2_s']

tags_bi_pos, tags_bi_neu, tags_bi_neg = (
    blob.split(', ') for blob in (ngram2_pos, ngram2_neu, ngram2_neg)
)

res_bi_pos, res_bi_neu, res_bi_neg = {}, {}, {}


# NOTE(review): this repeats the word_count definition given earlier in the
# notebook; kept so this section still runs on its own.
def word_count(tags, res):
    """Count occurrences of each tag and store the counts in `res`.

    Args:
        tags: Iterable of hashable tags (words or n-grams).
        res: Dict updated in place; existing entries for the same tag are
            overwritten with the new count.

    Returns:
        dict: The same `res` object, for convenience.
    """
    # A single pass with Counter replaces calling tags.count() per element,
    # which was quadratic in the number of tags.
    res.update(Counter(tags))
    return res


res_bi_pos = word_count(tags_bi_pos, res_bi_pos)
res_bi_neu = word_count(tags_bi_neu, res_bi_neu)
res_bi_neg = word_count(tags_bi_neg, res_bi_neg)

# Rows = bigrams, columns = sentiment class; missing combinations count 0.
bigram_count = (
    pd.DataFrame([res_bi_pos, res_bi_neu, res_bi_neg])
    .astype('Int64')
    .T
    .fillna(0)
    .set_axis(['POS(1)', 'NEU(0)', 'NEG(-1)'], axis=1)
    .sort_values(by='POS(1)', ascending=False)
)
bigram_count.name = 'Bigram (2 adjacent words) Count by Sentiment'
bigram_count  # most frequent in 'positive' first
from wordcloud import WordCloud

# word = lem_pos
def _render_cloud(cloud):
    """Draw one word cloud on its own 15x8 figure with the axes hidden."""
    plt.figure(figsize=(15, 8))
    plt.imshow(cloud)
    plt.axis("off")
    plt.show()


# Extra words excluded from the clouds built from raw text.
stopwords_c = ['vacuum', 'x000d']

# Positive-review lemmas.
wordcloud_pos = WordCloud(stopwords=stopwords_c, width=1000, height=500).generate(lem_pos)
_render_cloud(wordcloud_pos)

# Negative-review lemmas, drawn with a different colour map.
wordcloud_neg = WordCloud(stopwords=stopwords_c, width=1000, height=500, colormap='RdPu').generate(lem_neg)
_render_cloud(wordcloud_neg)

# Bigram cloud built from precomputed counts.
# NOTE(review): the variable is named "neg" but the frequencies are the
# positive bigram counts (res_bi_pos) -- confirm which was intended.
# NOTE(review): `stopwords` is presumably ignored by generate_from_frequencies;
# verify against the wordcloud documentation.
wordcloud_bi_neg = WordCloud(stopwords=stopwords_c, width=1000, height=500).generate_from_frequencies(res_bi_pos)
_render_cloud(wordcloud_bi_neg)