In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

import re
import string


# Notebook-wide display settings.
plt.style.use('ggplot')

pd.set_option('display.max_colwidth', 100)

# Load the raw reviews and build one text field per review.
df = pd.read_excel('data/bestbuy_reviews.xlsx')
# A missing TITLE or CONTENT would make the concatenation NaN, which later
# breaks the string handling (`x.lower()`); non-string cells would break the
# `+` itself. Normalise both columns to plain strings first.
title_text = df['TITLE'].fillna('').astype(str)
content_text = df['CONTENT'].fillna('').astype(str)
df['REVIEW'] = title_text + ' ' + content_text
df['POST DATE'] = pd.to_datetime(df['POST DATE'])
df = df.drop(columns=['TITLE', 'CONTENT'])
# Sequential 0-based identifier, used further down to join model scores back.
df.insert(0, 'ID', range(len(df)))
df.shape

FileNotFoundError: [Errno 2] No such file or directory: 'data/bestbuy_reviews.xlsx'

In [None]:
# Report the frame dimensions, then chart how many reviews each star rating has.
print(df.shape)
reviews_per_star = df['RATING'].value_counts().sort_index()
ax = reviews_per_star.plot(
    kind='bar',
    title='Count of Reviews by Stars',
    figsize=(10, 5),
)
ax.set_xlabel('Review Stars')

In [None]:

# English stop-word list from the NLTK corpus; clean_text drops these tokens.
stopwords = nltk.corpus.stopwords.words('english')
# Porter stemmer instance (not referenced by the cells visible here).
ps = nltk.PorterStemmer()


def clean_text(text, stop_words=None):
    """Strip punctuation from ``text`` and return its non-stop-word tokens.

    Args:
        text: String to tokenize. Case is not changed here, so lower-case it
            beforehand if the stop words are lower-case.
        stop_words: Optional iterable of words to drop. When omitted, the
            module-level ``stopwords`` list is used.

    Returns:
        list[str]: Remaining tokens in their original order. Empty strings are
        never returned, even for blank input or surrounding whitespace.
    """
    if stop_words is None:
        stop_words = stopwords
    # A set gives constant-time membership tests instead of scanning a list
    # once per token.
    excluded = set(stop_words)

    # Punctuation characters are deleted outright (so "don't" becomes "dont").
    without_punct = text.translate(str.maketrans('', '', string.punctuation))

    # findall(r'\w+') yields the same tokens as splitting on \W+ but without
    # the '' entries that split produces at the edges of the string.
    return [token for token in re.findall(r'\w+', without_punct) if token not in excluded]

# Lower-case each review, then tokenize and filter it with clean_text.
reviews_lower = df['REVIEW'].apply(lambda review: review.lower())
df['review_clean'] = reviews_lower.apply(clean_text)


df.head()

In [None]:
# Porter stemmer instance (not referenced by the cells visible here).
ps = nltk.PorterStemmer()
# WordNet lemmatizer used by lemmatizing().
wn = nltk.WordNetLemmatizer()

def lemmatizing(tokenized_text):
    """Return a list with every item of ``tokenized_text`` run through ``wn.lemmatize``."""
    return list(map(wn.lemmatize, tokenized_text))

# Lemmatize each review's token list into its own column.
df['lemmatized'] = df['review_clean'].apply(lemmatizing)

df.head(10)

In [None]:
df['review_clean'] = df.review_clean.apply(' '.join)
df.head()

In [None]:
from nltk.util import ngrams

def extract_ngrams(data, num):
    """Tokenize ``data`` with ``nltk.word_tokenize`` and return its ``num``-grams.

    Each n-gram is returned as a single string with its tokens separated by
    one space.
    """
    tokens = nltk.word_tokenize(data)
    phrases = []
    for gram in ngrams(tokens, num):
        phrases.append(' '.join(gram))
    return phrases

# Bigrams of each cleaned review. For longer phrases, pass 3 or 4 instead of 2
# and store the result in e.g. 'ngram3' / 'ngram4'.
df['ngram2'] = df['review_clean'].apply(extract_ngrams, args=(2,))

df.head()

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
from tqdm.notebook import tqdm

# Hugging Face checkpoint used for the sentiment scores.
model_name = 'cardiffnlp/twitter-roberta-base-sentiment'

# Tokenizer and sequence-classification model are loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def polarity_scores_roberta(review):
    """Score one review with the module-level ``tokenizer`` and ``model``.

    Args:
        review: Text handed to ``tokenizer``; inputs are truncated to 512 tokens.

    Returns:
        dict: Keys ``'negative'``, ``'neutral'`` and ``'positive'`` holding the
        softmax of entries 0, 1 and 2 of the first row of the model's first
        output (presumably the class logits -- confirm against the checkpoint's
        label order).
    """
    import torch  # local import: torch is not imported in the cells visible here

    encoded_text = tokenizer(review, padding=True, truncation=True, max_length=512, return_tensors='pt')
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        output = model(**encoded_text)
    scores = softmax(output[0][0].detach().numpy())
    return {
        'negative': scores[0],
        'neutral': scores[1],
        'positive': scores[2],
    }

# Maps review ID -> score dict returned by polarity_scores_roberta.
res = {}

# Walk the two needed columns directly instead of building a Series per row
# with iterrows().
for myid, text in tqdm(zip(df['ID'], df['REVIEW']), total=len(df)):
    try:
        res[myid] = polarity_scores_roberta(text)
    except RuntimeError:
        # Best-effort: report the review that could not be scored and carry on.
        print(f'Broke for id {myid}')

# One row per scored review, keyed by ID, then joined back onto the review columns.
results_df = pd.DataFrame.from_dict(res, orient='index')
results_df = results_df.rename_axis('ID').reset_index()
# Name the join key explicitly so the merge cannot silently widen to any other
# column name the two frames happen to share.
results_df = results_df.merge(df, how='left', on='ID')
results_df.head()

In [None]:
# Pairwise scatter of the three sentiment scores, coloured by star rating.
sns.pairplot(
    data=results_df,
    vars=['negative', 'neutral', 'positive'],
    hue='RATING',
    palette='tab10',
)

# Must be called: a bare `plt.show` only references the function.
plt.show()

In [None]:
# Agreement flag between the star rating and the model's sentiment:
#    1 -> RATING >= 4 and positive score > 0.5
#   -1 -> RATING <= 2 and negative score > 0.5
#    0 -> everything else
agrees_negative = (results_df['RATING'] <= 2) & (results_df['negative'] > 0.5)
agrees_positive = (results_df['RATING'] >= 4) & (results_df['positive'] > 0.5)
results_df['positivity'] = np.select([agrees_negative, agrees_positive], [-1, 1], default=0)

# Keep only these columns, in this order.
column_order = [
    'ID', 'negative', 'neutral', 'positive', 'RATING', 'positivity', 'POST DATE',
    'AUTHOR', 'REVIEW', 'review_clean', 'lemmatized', 'ngram2',
]
results_df = results_df.reindex(columns=column_order)
results_df.head()