In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns


# Apply matplotlib's 'ggplot' style sheet to the figures drawn in this notebook.
plt.style.use('ggplot')

import nltk

In [2]:
# Only the first 500 reviews are analysed. Passing `nrows` makes pandas stop
# parsing after those rows instead of loading the whole CSV into memory and
# then throwing most of it away.
df = pd.read_csv('../../Reviews.csv', nrows=500)

In [None]:
# Preview the first rows of the loaded reviews.
df.head()

In [None]:
# Bar chart of how many reviews fall into each star rating.
ax = (
    df['Score']
    .value_counts()
    .sort_index()
    .plot(kind='bar', title='Count of Reviews by Stars', figsize=(10, 5))
)
# Label both axes so the figure can be read on its own.
ax.set_xlabel('Review Stars')
ax.set_ylabel('Number of Reviews')
plt.show()

In [None]:
# Pick the review stored under index label 50 as the running example,
# then echo it as the cell output.
example = df.loc[50, 'Text']
example

### Data preprocessing 


In [None]:
# Split the sample review into word tokens (runs of \w characters).
# The tokenizer gets its own name so this cell never rebinds the global
# `tokenizer`, which the RoBERTa cells further down assign and depend on.
word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
tokens = word_tokenizer.tokenize(example)
tokens[:10]

In [None]:
# `nltk` is imported once in the notebook's first cell, so it is not
# re-imported here.
# Download the English perceptron tagger resource, then tag each token with
# its part of speech.
nltk.download('averaged_perceptron_tagger_eng')
tagged = nltk.pos_tag(tokens, lang="eng")

In [None]:
# Download the resources requested for named-entity chunking.
for resource_name in ('maxent_ne_chunker_tab', 'words'):
    nltk.download(resource_name)

# Chunk the POS-tagged tokens and pretty-print the result.
entities = nltk.chunk.ne_chunk(tagged)
entities.pprint()

### Sentiment scoring with pretrained models (VADER and RoBERTa)

In [None]:
# VADER Sentiment Scoring

from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

# Download the 'vader_lexicon' resource through NLTK before creating the analyzer.
nltk.download('vader_lexicon')

# Analyzer instance reused by the scoring cells that follow.
sia = SentimentIntensityAnalyzer()

In [None]:
# Sanity-check VADER on one clearly positive and one clearly negative sentence.
# A notebook cell renders only its final expression, so both results are
# collected into a single dict; as two bare calls, the first sentence's scores
# would be computed and then never shown.
{
    sentence: sia.polarity_scores(sentence)
    for sentence in ("i am so happy", "this is the worst thing ever")
}

In [None]:
# Score the sample review held in `example` with VADER.
sia.polarity_scores(example)

In [None]:
# Run VADER over every review and collect the score dicts keyed by review Id.
res = {}
for my_id, text in tqdm(zip(df['Id'], df['Text']), total=len(df)):
    res[my_id] = sia.polarity_scores(text)

In [None]:
# Turn the {Id: scores} mapping into one row per review, expose the Id as a
# regular column, and attach the original review columns with a left merge.
vaders = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .rename(columns={'index': 'Id'})
    .merge(df, how='left')
)

vaders.head()

In [None]:
# VADER compound score plotted against the star rating.
ax = sns.barplot(x='Score', y='compound', data=vaders)
ax.set(title='score')
plt.show()

In [None]:
# Single-panel bar plot of the VADER 'pos' column against the star rating
# (bars use seaborn's default aggregation).
sns.barplot(data=vaders, x='Score', y='pos')

In [None]:
# One panel per VADER component, side by side, each plotted against the
# star rating and titled with the component's name.
fig, axs = plt.subplots(1, 3, figsize=(12, 3))

panel_specs = (
    ('pos', 'Positive'),
    ('neu', 'Neutral'),
    ('neg', 'Negative'),
)
for panel, (column, panel_title) in zip(axs, panel_specs):
    sns.barplot(data=vaders, x='Score', y=column, ax=panel)
    panel.set_title(panel_title)

plt.show()

In [16]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax


In [None]:
# Hugging Face Hub id of the pretrained sentiment checkpoint. This is a plain
# string literal: nothing is interpolated, so no f-prefix is needed.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"

# Load the matching tokenizer and sequence-classification model; both are
# used as globals by the RoBERTa scoring cells below.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
# Show the VADER scores for `example` again (presumably as a baseline to
# compare with the RoBERTa scores computed in the next cell).
sia.polarity_scores(example)

In [None]:
# RoBERTa on the sample review: tokenize, run the model, take the first
# element of the first output, and convert it to probabilities with softmax.
encoded_text = tokenizer(example, return_tensors='pt')
output = model(**encoded_text)
scores = softmax(output[0][0].detach().numpy())

# Pair each probability with its sentiment label, in the order the notebook
# assumes for this checkpoint (negative, neutral, positive).
scores_dict = dict(zip(('negative', 'neutral', 'positive'), scores))
scores_dict

In [None]:
def polarity_scores_roberta(text, max_length=512):
    """Score ``text`` with the RoBERTa sentiment model.

    Relies on the notebook-level ``tokenizer`` and ``model`` globals.

    Args:
        text: The review text to classify.
        max_length: Maximum number of tokens passed to the model. Longer
            inputs are truncated instead of making the forward pass fail
            on over-long sequences. Defaults to 512.

    Returns:
        dict: ``'negative'``, ``'neutral'`` and ``'positive'`` mapped to the
        softmax probabilities of the model's first three output logits.
    """
    # Local import keeps this cell self-contained; torch is already required
    # by return_tensors='pt'.
    import torch

    encoded_text = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        # Passed explicitly so truncation does not depend on the tokenizer
        # config declaring a maximum length.
        max_length=max_length,
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = model(**encoded_text)
    scores = softmax(output[0][0].detach().numpy())
    return {
        'negative': scores[0],
        'neutral': scores[1],
        'positive': scores[2],
    }

In [None]:
# Score every review with both VADER and RoBERTa.
# `result` maps review Id -> one flat dict whose keys are prefixed with
# 'vader_' / 'roberta_' so the two models' scores can share a DataFrame.
vader_res = {}
roberta_res = {}
result = {}

for _idx, row in tqdm(df.iterrows(), total=len(df)):
    # Read the row outside the try block so the error message below always
    # names the row that actually failed.
    my_id = row['Id']
    text = row['Text']

    try:
        vader_res[my_id] = sia.polarity_scores(text)
        roberta_res[my_id] = polarity_scores_roberta(text)
    except Exception as e:
        # Best effort: a review that cannot be scored is skipped, but the
        # cause is reported instead of being silently discarded.
        print(f'Error with {my_id}: {type(e).__name__}: {e}')
        continue

    vader_results = {f'vader_{key}': value for key, value in vader_res[my_id].items()}
    roberta_results = {f'roberta_{key}': value for key, value in roberta_res[my_id].items()}

    both_results = {**vader_results, **roberta_results}
    result[my_id] = both_results

# Show a small sample rather than dumping every scored review into the output.
dict(list(result.items())[:3])



In [66]:
# One row per scored review: transpose the {Id: scores} mapping, turn the Id
# index into a regular column, and left-merge the original review columns.
results_df = (
    pd.DataFrame(result)
    .T
    .reset_index()
    .rename(columns={'index': 'Id'})
    .merge(df, how='left')
)


In [None]:
# Preview the first rows of the combined VADER / RoBERTa results.
results_df.head()


In [None]:
# Pairwise scatter matrix of all VADER and RoBERTa scores, coloured by the
# review's star rating.
pairplot_columns = [
    'vader_pos', 'vader_neu', 'vader_neg',
    'roberta_positive', 'roberta_neutral', 'roberta_negative',
    'Score',
]
sns.pairplot(data=results_df[pairplot_columns], hue='Score', palette='tab10')
plt.show()

In [None]:
# Among 5-star reviews, pick the text each model rates as most negative:
# `test` by VADER's 'neg' score, `test2` by RoBERTa's 'negative' score.
five_star_reviews = results_df.query('Score == 5')
test = five_star_reviews.sort_values('vader_neg', ascending=False)['Text'].values[0]
test2 = five_star_reviews.sort_values('roberta_negative', ascending=False)['Text'].values[0]
print(f"{test}\n-----------------\n{test2}")

In [None]:
from transformers import pipeline

# Name the checkpoint explicitly instead of relying on the task's implicit
# default, so results stay the same if that default changes between
# transformers releases.
# NOTE(review): this id is the checkpoint transformers reports as the
# 'sentiment-analysis' default -- confirm against the installed version.
sent_pipeline = pipeline(
    'sentiment-analysis',
    model='distilbert/distilbert-base-uncased-finetuned-sst-2-english',
)

In [None]:
# Classify the review picked above. `truncation=True` cuts over-long text to
# the model's maximum input length so it is scored instead of raising.
sent_pipeline(test, truncation=True)