In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F
from transformers import pipeline

In [2]:
import pandas as pd

# df = pd.read_csv('combined_English_scrap_result.csv')
df = pd.read_csv('final_data.csv')
df.head()

Unnamed: 0,name,star,time,Place Id,review_id,reviews,final_text
0,Satria Sihombing,5.0,2024-07-11 23:00:00,14,0,view nya menyala 🤩🔥🔥,the view is on
1,silas nainggolan,5.0,2024-07-11 23:00:00,14,1,penginapan yang paling menyatu dengan alam. ca...,accommodation that is most united with nature ...
2,Ari Setiawan,5.0,2024-07-11 22:00:00,14,2,"bagus, hotel bernuasna baru yang berteknologi ...",nice new hightech hotel with very friendly ser...
3,mutiara saragih,5.0,2024-07-11 22:00:00,14,3,good,good
4,hendro sebayang,5.0,2024-07-11 22:00:00,14,4,so an exciting stay experience,so an exciting stay experience


In [3]:
df[df.review_id == 5649]

Unnamed: 0,name,star,time,Place Id,review_id,reviews,final_text
5626,Tiara,5.0,2024-03-14 00:00:00,32,5649,berkali2 ke jakarta nginepnya di sini kalau la...,ive been to jakarta many time and stayed here ...


In [4]:
df.dropna(inplace=True)
df.isna().sum()

name          0
star          0
time          0
Place Id      0
review_id     0
reviews       0
final_text    0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6751 entries, 0 to 6751
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        6751 non-null   object 
 1   star        6751 non-null   float64
 2   time        6751 non-null   object 
 3   Place Id    6751 non-null   int64  
 4   review_id   6751 non-null   int64  
 5   reviews     6751 non-null   object 
 6   final_text  6751 non-null   object 
dtypes: float64(1), int64(2), object(4)
memory usage: 421.9+ KB


In [6]:
import pandas as pd
import spacy
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch.nn.functional as F
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

nlp = spacy.load("en_core_web_sm")

absa_tokenizer = AutoTokenizer.from_pretrained("yangheng/deberta-v3-base-absa-v1.1")
absa_model = AutoModelForSequenceClassification.from_pretrained("yangheng/deberta-v3-base-absa-v1.1")

def extract_aspects(review):
    doc = nlp(review)
    aspects = set()
    for token in doc:
        if token.dep_ == "nsubj" and token.pos_ == "NOUN":
            aspects.add(token.text.lower())
        elif token.dep_ == "amod" and token.head.pos_ == "NOUN":
            aspects.add(token.head.text.lower())
    return list(aspects)

def analyze_aspect_sentiment(sentence, aspect):
    inputs = absa_tokenizer(f"[CLS] {sentence} [SEP] {aspect} [SEP]", return_tensors="pt")
    outputs = absa_model(**inputs)
    probs = F.softmax(outputs.logits, dim=1)
    probs = probs.detach().numpy()[0]
    return {label: prob for label, prob in zip(["negative", "neutral", "positive"], probs)}

def truncate_sentence(sentence, max_length=512):
    tokens = sentence.split()
    if len(tokens) > max_length:
        return ' '.join(tokens[:max_length])
    return sentence

def process_reviews(df, id_column, text_column):
    results = []
    for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing reviews"):
        review_id = row[id_column]
        sentence = row[text_column]
        truncated_sentence = truncate_sentence(sentence)
        aspects = extract_aspects(truncated_sentence)
        for aspect in aspects:
            sentiment_scores = analyze_aspect_sentiment(truncated_sentence, aspect)
            highest_sentiment_label = max(sentiment_scores, key=sentiment_scores.get)
            highest_sentiment_score = sentiment_scores[highest_sentiment_label]
            results.append({
                'review_id': review_id,
                'aspect': aspect,
                'sentiment_label': highest_sentiment_label,
                'sentiment_score': highest_sentiment_score
            })
    return pd.DataFrame(results)

df_ = df.copy()

processed_df = process_reviews(df_, 'review_id', 'final_text')

print(processed_df.head())

processed_df.to_csv('aspect_sentiment_analysis.csv', index=False)

Processing reviews: 100%|██████████| 6751/6751 [2:18:20<00:00,  1.23s/it]   


   review_id      aspect sentiment_label  sentiment_score
0          0        view         neutral         0.926652
1          1      thanks        positive         0.994938
2          1      option        positive         0.995657
3          1  technology        positive         0.997838
4          1  experience        positive         0.987064
