In [43]:
# libraries
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import re
import string

In [44]:
# we load the reviews we have scrap and left only the content column
df = pd.read_csv('reviews_ingles.csv')
df = df.drop(["Review", "Title","Content"], axis=1)
df = df.dropna(subset=['reviews_english'])
df['reviews_english'] = df['reviews_english'].astype(str)


In [45]:
def preprocess_text(text):
    '''a function for preprocess the text: token, lemmas, stopwords'''
    # Step 1: Convert to lowercase and tokenize
    tokens = word_tokenize(text.lower())
    
    # Step 2: Remove stopwords
    filtered_tokens = [token for token in tokens if token not in stopwords.words("english")]
    
    # Step 3: remove punctuation
    list_no_punctuation = [re.sub(r'['+string.punctuation+']+', ' ', i) for i in filtered_tokens]
    
    # Step 4: Lemmatize each token (FIXED: using 'token' instead of 'tokens')
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in list_no_punctuation]
    
    # Step 5: Join back into a string
    processed_text = " ".join(lemmatized_tokens)
    
    return processed_text

In [46]:
df['reviews_english'] = df['reviews_english'].apply(preprocess_text)
df

Unnamed: 0,reviews_english
0,recommend s super tasty also good s gluten...
1,better protein bar completely satisfied orde...
2,bar taste great
3,tasty üòç 10 10 ‚úåÔ∏è
4,totally worth delicious say s awesome...
...,...
123,super tasty
124,ideal go top consistency taste 1a
125,top tasty
126,perfect energy kick little sweetness enough pr...


In [47]:
# noew let's analyze the sentiment of the words
analyzer = SentimentIntensityAnalyzer()
'''this way we will obtain a dictionary with four keys: neg, neu, pos, and compound'''
# neg = negative sentiment score (between 0 and 1)
# neu = neutral sentiment score (between 0 and 1)
# pos = positive sentiment score (between 0 and 1)
# compound = overall sentiment (between -1 and 1)
def get_sentiment(text):
    scores = analyzer.polarity_scores(text)
    return scores

In [48]:
df["sentiment"] = df["reviews_english"].apply(get_sentiment)
df = pd.concat([
    df.drop("sentiment", axis=1),
    df["sentiment"].apply(pd.Series)
], axis = 1)

df

Unnamed: 0,reviews_english,neg,neu,pos,compound
0,recommend s super tasty also good s gluten...,0.000,0.340,0.660,0.9382
1,better protein bar completely satisfied orde...,0.000,0.500,0.500,0.7178
2,bar taste great,0.000,0.328,0.672,0.6249
3,tasty üòç 10 10 ‚úåÔ∏è,0.000,1.000,0.000,0.0000
4,totally worth delicious say s awesome...,0.000,0.301,0.699,0.9616
...,...,...,...,...,...
123,super tasty,0.000,0.204,0.796,0.5994
124,ideal go top consistency taste 1a,0.000,0.435,0.565,0.6369
125,top tasty,0.000,0.357,0.643,0.2023
126,perfect energy kick little sweetness enough pr...,0.000,0.487,0.513,0.8598


In [49]:
df.to_csv("sentiment_analysis_scores.csv", sep= ";")

In [50]:
# Print summary statistics with proper f-strings
print(f"Total reviews: {len(df)}")
print(f"Positive reviews: {len(df[df['compound'] > 0])} ({(len(df[df['compound'] > 0])/len(df)*100):.1f}%)")
print(f"Negative reviews: {len(df[df['compound'] < 0])} ({(len(df[df['compound'] < 0])/len(df)*100):.1f}%)")

Total reviews: 125
Positive reviews: 111 (88.8%)
Negative reviews: 3 (2.4%)


In [51]:
# Filter positive/negative reviews
positive_text = " ".join(df[df['compound'] > 0]['reviews_english'])
negative_text = " ".join(df[df['compound'] < 0]['reviews_english'])

In [None]:
# Generate word clouds
wordcloud_pos = WordCloud(width=800, height=400, background_color='white').generate(positive_text)
# wordcloud_neg = WordCloud(width=800, height=400, background_color='black').generate(negative_text)

# Plot
plt.figure(figsize=(15, 7))

# Positive word cloud
plt.subplot(1, 2, 1)
plt.imshow(wordcloud_pos, interpolation='bilinear')
plt.title('Positive Reviews', fontsize=14, pad=20)  # Fixed title formatting
plt.axis('off')

# Negative word cloud
# plt.subplot(1, 2, 2)
# plt.imshow(wordcloud_neg, interpolation='bilinear')
# plt.title('Negative Reviews', fontsize=14, pad=20)  # Fixed title formatting
# plt.axis('off')

plt.tight_layout()
plt.show()

In [52]:
# Filter positive/negative reviews
positive_texts = df[df['compound'] > 0]['reviews_english'].tolist()
negative_texts = df[df['compound'] < 0]['reviews_english'].tolist()

In [53]:
# Combine all documents but mark their class
documents = positive_texts + negative_texts
labels = ['positive'] * len(positive_texts) + ['negative'] * len(negative_texts)

# Compute TF-IDF
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(documents)

# Create a DataFrame of TF-IDF scores
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out()
)
tfidf_df['sentiment'] = labels  # Add labels for grouping

In [54]:
# Group by sentiment and get top words
top_words_by_sentiment = {
    sentiment: tfidf_df[tfidf_df['sentiment'] == sentiment]
              .drop(columns='sentiment')
              .mean()
              .nlargest(50)
    for sentiment in ['positive', 'negative']
}

In [55]:
top_words_by_sentiment

{'positive': taste          0.092687
 good           0.087782
 bar            0.080717
 tasty          0.071256
 delicious      0.068881
 great          0.063753
 super          0.047377
 consistency    0.045710
 really         0.043338
 sweet          0.037359
 protein        0.036014
 value          0.035295
 like           0.033326
 perfect        0.031651
 bit            0.030217
 snack          0.029081
 nutritional    0.028953
 ideal          0.022738
 training       0.022471
 best           0.021625
 filling        0.019747
 ingredient     0.018789
 berry          0.017300
 hard           0.017013
 energy         0.016937
 natural        0.016906
 gym            0.015942
 right          0.015852
 recommend      0.015751
 goat           0.015665
 meal           0.014940
 workout        0.014724
 little         0.014267
 provides       0.013923
 yummy          0.013310
 satisfied      0.013236
 dry            0.013176
 smaak          0.012670
 chew           0.012449
 fruity      

In [56]:
import pandas as pd

# Convert to DataFrame and print
for sentiment, word_scores in top_words_by_sentiment.items():
    print(f"\nüî† Top 20 {sentiment.upper()} Words (TF-IDF Scores):")
    print(
        pd.DataFrame(word_scores.nlargest(20),  # Show top 20 words
        columns=['TF-IDF Score']
    ).to_markdown(tablefmt="grid", floatfmt=".3f"))  # Grid format with 3 decimal places


üî† Top 20 POSITIVE Words (TF-IDF Scores):
+-------------+----------------+
|             |   TF-IDF Score |
| taste       |          0.093 |
+-------------+----------------+
| good        |          0.088 |
+-------------+----------------+
| bar         |          0.081 |
+-------------+----------------+
| tasty       |          0.071 |
+-------------+----------------+
| delicious   |          0.069 |
+-------------+----------------+
| great       |          0.064 |
+-------------+----------------+
| super       |          0.047 |
+-------------+----------------+
| consistency |          0.046 |
+-------------+----------------+
| really      |          0.043 |
+-------------+----------------+
| sweet       |          0.037 |
+-------------+----------------+
| protein     |          0.036 |
+-------------+----------------+
| value       |          0.035 |
+-------------+----------------+
| like        |          0.033 |
+-------------+----------------+
| perfect     |          0.032 