In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply matplotlib's 'ggplot' style sheet to the figures drawn after this point.
plt.style.use('ggplot')

import nltk
# Download the NLTK resources named below; presumably these back the
# word_tokenize / pos_tag / ne_chunk calls used further down — confirm if trimming.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')

In [None]:
# Load the reviews CSV (path is relative to the notebook) and preview the first rows.
df = pd.read_csv("../data/Reviews.csv")
df.head()

# Quick EDA

In [None]:
# Bar chart of how many reviews exist for each star rating, ordered by rating.
star_counts = df["Score"].value_counts().sort_index()
ax = star_counts.plot(kind='bar', figsize=(10, 5), title='Count of reviews by stars')
ax.set_xlabel('Review Stars')
plt.show()

## Basic NLTK

In [None]:
# Take one review (key 50 of the 'Text' column) as the running example for the cells below.
exp = df['Text'][50]
print(exp)

In [None]:
# Tokenize the example review with NLTK and show the first ten tokens.
tokens = nltk.word_tokenize(exp)
tokens[:10]

In [None]:
# Run NLTK's part-of-speech tagger over the tokens and show the first ten results.
tagged = nltk.pos_tag(tokens)
tagged[:10]

In [None]:
# Fetch the 'words' resource before chunking (presumably required by ne_chunk — confirm),
# then group the tagged tokens into named-entity chunks and pretty-print the result.
nltk.download('words')
entities = nltk.chunk.ne_chunk(tagged)
entities.pprint()

## VADER (Valence Aware Dictionary and sEntiment Reasoner)
It's an NLTK module that provides sentiment scores based on the words used.

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
# Download the VADER lexicon, then build the analyzer that the cells below reuse.
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()

In [None]:
# Sanity check: score a short sentence with the VADER analyzer.
sia.polarity_scores('I am so happy')

In [None]:
# Score the example review selected earlier.
sia.polarity_scores(exp)

In [None]:
# Score every review with VADER; the result dict is keyed by the review's Id.
res = {}
for _, review in tqdm(df.iterrows(), total=len(df)):
    res[review['Id']] = sia.polarity_scores(review['Text'])

In [None]:
# Turn the {Id: scores} dict into one row per review, expose the Id as a
# column, and left-join the original review columns onto the scores.
vaders = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .rename(columns={"index": 'Id'})
    .merge(df, how='left')
)

In [None]:
# `vaders` now holds the sentiment scores together with the review metadata.
vaders.head()

## Plot VADER results

In [None]:
# VADER compound score for each star rating.
ax = sns.barplot(x='Score', y='compound', data=vaders)
ax.set(title='Compound Score by Amazon Star review')
plt.show()

In [None]:
# One panel per VADER component (positive / neutral / negative) by star rating.
fig, axs = plt.subplots(1, 3, figsize=(12, 3))
panels = [('pos', 'Positive'), ('neu', 'Neutral'), ('neg', 'Negative')]
for panel_ax, (column, title) in zip(axs, panels):
    sns.barplot(data=vaders, x='Score', y=column, ax=panel_ax)
    panel_ax.set_title(title)
plt.tight_layout()
plt.show()

## Roberta Model

In [None]:
# NOTE(review): the package version is not pinned, so installs may differ between runs.
%pip install transformers

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
# Checkpoint name shared by the tokenizer and the classification model.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
# VADER results on the example review
print(exp)
sia.polarity_scores(exp)

In [None]:
# Run the RoBERTa model on the same example review: tokenize, forward pass,
# then softmax the first row of the first output into three scores.
encoded_text = tokenizer(exp, return_tensors='pt')
output = model(**encoded_text)
scores = softmax(output[0][0].detach().numpy())
score_names = ('roberta_neg', 'roberta_neu', 'roberta_pos')
scores_dict = {name: scores[position] for position, name in enumerate(score_names)}
print(scores_dict)

In [None]:
def polarity_scores_roberta(exp):
  """Score one text with the notebook-level RoBERTa tokenizer and model.

  Args:
    exp: Text to score; passed straight to the tokenizer.

  Returns:
    dict with keys 'roberta_neg', 'roberta_neu' and 'roberta_pos', holding the
    softmax of the first row of the model's first output, in that order.
  """
  model_output = model(**tokenizer(exp, return_tensors='pt'))
  # Detach from the autograd graph before converting to NumPy for softmax.
  probabilities = softmax(model_output[0][0].detach().numpy())
  score_names = ('roberta_neg', 'roberta_neu', 'roberta_pos')
  return {name: probabilities[position] for position, name in enumerate(score_names)}

In [None]:
# Score every review with both VADER and RoBERTa, keyed by the review's Id.
res = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
  # Read the id outside the try so the failure message always names this row.
  myid = row['Id']
  text = row['Text']
  try:
    vader_result = sia.polarity_scores(text)
    # Prefix the VADER keys so they are distinguishable from the roberta_*
    # keys and match the vader_* column names used by the comparison plot.
    vader_result_rename = {f'vader_{key}': value
                           for key, value in vader_result.items()}
    roberta_result = polarity_scores_roberta(text)
    res[myid] = {**vader_result_rename, **roberta_result}
  except RuntimeError:
    # Best effort: report the review the model could not score (presumably
    # because the text is too long for it — confirm) and move on to the next.
    print(f'Broke for id {myid}')

In [None]:
# Build the combined-scores frame from `res` (one row per review Id), expose
# the Id as a column, and left-join the original review columns onto it.
# Each step continues from results_df so the scores built from `res` are kept.
results_df = pd.DataFrame(res).T
results_df = results_df.reset_index().rename(columns={"index":'Id'})
results_df = results_df.merge(df,how='left')

In [None]:
# NOTE(review): only the last expression of a cell is displayed, so the bare
# type(...) call below produces no visible output.
type(results_df)
results_df.head()

## Compare Scores between models

In [None]:
# List the columns available in the combined results frame.
results_df.columns

## Combine and Compare

In [None]:
# Pairwise plots of the VADER and RoBERTa scores, coloured by star rating.
# The column selection must be passed as `vars`; pairplot has no `var` keyword.
# Requires results_df to contain the vader_* and roberta_* columns listed here.
sns.pairplot(data=results_df,
             vars=['vader_neg','vader_neu','vader_pos',
                   'roberta_neg','roberta_neu','roberta_pos'],
             hue='Score',
             palette='tab10')
plt.show()

In [None]:
# Text of the 1-star review with the highest 'roberta_pos' score.
results_df.query('Score == 1') \
  .sort_values('roberta_pos', ascending=False)['Text'].values[0]

### Negative sentiment 5-star review

In [None]:
# Text of the 5-star review with the highest 'roberta_neg' score.
results_df.query('Score == 5') \
  .sort_values('roberta_neg', ascending=False)['Text'].values[0]

## Transformers Pipeline

In [None]:
from transformers import pipeline

# Build a sentiment-analysis pipeline; no model is named here, so whatever
# default the library selects for this task is used.
sent_pipeline = pipeline('sentiment-analysis')

In [None]:
# Try the pipeline on a short example sentence.
sent_pipeline('I love sentiment analysis')

# The End