In [3]:

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
from collections import Counter
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
file_path = 'for_lda_reviews1.csv'
reviews_df = pd.read_csv(file_path)

In [5]:
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    words = [word for word in text.split() if word not in stop_words]
    return words

In [6]:
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

In [7]:
all_words = []
for review in reviews_df['review']:
    if pd.notna(review):
        all_words.extend(preprocess_text(review))


word_freq = FreqDist(all_words)

most_common_words = word_freq.most_common(5)
print("Most Common Words:", most_common_words)


top_aspects = [word for word, freq in most_common_words[:5]]
print("Selected Aspects for Sentiment Analysis:", top_aspects)

Most Common Words: [('hotel', 895), ('room', 791), ('clean', 683), ('nice', 656), ('great', 640)]
Selected Aspects for Sentiment Analysis: ['hotel', 'room', 'clean', 'nice', 'great']


In [8]:
def compute_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = model(**inputs)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    sentiment_label = torch.argmax(probabilities, dim=1).item()
    sentiment_labels = {0: 'very negative', 1: 'negative', 2: 'neutral', 3: 'positive', 4: 'very positive'}
    return sentiment_labels[sentiment_label]

In [9]:
top_aspects=['room', 'staff','stay','pool','breakfast']
aspect_sentiments = {aspect: [] for aspect in top_aspects}


In [11]:

for review in reviews_df['review']:
    if pd.notna(review):
        for aspect in top_aspects:
            if aspect in review.lower():
                sentiment = compute_sentiment(review)
                aspect_sentiments[aspect].append(sentiment)

aspect_sentiment_summary = {}
for aspect, sentiments in aspect_sentiments.items():
    sentiment_counts = Counter(sentiments)
    total = sum(sentiment_counts.values())
    sentiment_distribution = {sent: count / total * 100 for sent, count in sentiment_counts.items()}
    aspect_sentiment_summary[aspect] = sentiment_distribution

aspect_sentiment_df = pd.DataFrame(aspect_sentiment_summary).fillna(0)
print(aspect_sentiment_df)


                    room      staff       stay       pool  breakfast
negative       10.098039   9.031199   7.808219   8.555133   7.824427
very negative  10.686275   5.582923   9.041096   6.844106   3.053435
neutral        14.607843   9.852217   9.178082  14.068441  15.076336
positive       30.098039  28.571429  28.219178  30.038023  34.541985
very positive  34.509804  46.962233  45.753425  40.494297  39.503817


In [12]:
def __init__(self):
        super().__init__()
        # Initialize model and tokenizer
        self.model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name)
