In [1]:
from transformers import AutoTokenizer, pipeline
from datasets import Dataset
import pandas as pd
import numpy as np
import re

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Sentiment Analysis

In [3]:
# Load data
path = '/content/drive/MyDrive/Final Project BBS/data/'
file_name = 'steam_reviews_clean.csv'
df = pd.read_csv(f"{path}{file_name}", encoding = "latin1")
df.shape

(9182, 7)

In [4]:
# Define model for sentiment analyis
model = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model)
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model,
    device=0
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

Device set to use cuda:0


In [5]:
# Truncate review lenght to 512 tokes
def truncate_text(text, max_tokens=512):
    """Truncate text safely to the model's max token length."""
    if not isinstance(text, str):
        return ""
    # Encode and decode back to string to keep token-level truncation safe
    tokens = tokenizer.encode(text, truncation=True, max_length=max_tokens)
    return tokenizer.decode(tokens, skip_special_tokens=True)

df["trunc_review"] = df["clean_review"].astype(str).apply(lambda x: truncate_text(x, max_tokens=512))


In [6]:
# Batch-process text reviews using a sentiment analysis pipeline
texts = df["trunc_review"].tolist()
batch_size = 32

labels = []
scores = []

for i in range(0, len(texts), batch_size):
    batch = texts[i:i+batch_size]
    results = sentiment_pipeline(batch)
    labels.extend([r["label"] for r in results])
    scores.extend([r["score"] for r in results])

df["sentiment_label"] = labels
df["confidence_score"] = scores

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [7]:
# Classify reviews into negative, positive and neutral
df["sentiment_numeric"] = df["sentiment_label"].str.extract(r'(\d)').astype(float)

df["sentiment_category"] = pd.cut(
    df["sentiment_numeric"],
    bins=[0, 2, 3, 5],
    labels=["Negative", "Neutral", "Positive"]
)

# Add a binary column for positive sentiment
df["positive"] = np.where(df["sentiment_category"] == "Positive", 1, 0)

In [8]:
df.head()

Unnamed: 0,game,app_id,recommended,timestamp_created,author_playtime,date,clean_review,trunc_review,sentiment_label,confidence_score,sentiment_numeric,sentiment_category,positive
0,PAYDAY 2,218620,True,1765018293,1394,2025-12-06 10:51:33,"I rodded bank,then i start game","i rodded bank, then i start game",1 star,0.248288,1.0,Negative,0
1,PAYDAY 2,218620,False,1765005087,886,2025-12-06 07:11:27,Game is a steaming pile that doesn't even work...,game is a steaming pile that doesn't even work...,1 star,0.952912,1.0,Negative,0
2,PAYDAY 2,218620,False,1764993460,13063,2025-12-06 03:57:40,clicked no because i was bored,clicked no because i was bored,1 star,0.601361,1.0,Negative,0
3,PAYDAY 2,218620,True,1764991399,44,2025-12-06 03:23:19,better than PAYDAY 3,better than payday 3,5 stars,0.603237,5.0,Positive,1
4,PAYDAY 2,218620,True,1764976971,9650,2025-12-05 23:22:51,"I highly recommend this game as it is great, c...","i highly recommend this game as it is great, c...",4 stars,0.480363,4.0,Positive,1


In [9]:
# Save intermediate table
df.to_csv(f"{path}steam_reviews_sentiment.csv", index=False)

# Aspect-Based Sentiment Analysis

## Manually extraction

In [10]:
# Define keywords by aspect
aspects = {
    "graphics": ["graphics", "visuals", "art", "animation", "design", "textures"],
    "performance": ["performance", "fps", "lag", "crash", "bugs", "optimization"],
    "story": ["story", "plot", "narrative", "dialogue", "quest", "ending"],
    "gameplay": ["gameplay", "controls", "mechanics", "combat", "difficulty", "balance"],
    "price": ["price", "expensive", "cheap", "worth", "value"],
}

In [11]:
# Detect aspect through word matching
for aspect, keywords in aspects.items():
    pattern = r"\b(" + "|".join(map(re.escape, keywords)) + r")\b"
    df[aspect] = df["clean_review"].str.contains(pattern, regex=True, case=False).astype(int)

  df[aspect] = df["clean_review"].str.contains(pattern, regex=True, case=False).astype(int)
  df[aspect] = df["clean_review"].str.contains(pattern, regex=True, case=False).astype(int)
  df[aspect] = df["clean_review"].str.contains(pattern, regex=True, case=False).astype(int)
  df[aspect] = df["clean_review"].str.contains(pattern, regex=True, case=False).astype(int)
  df[aspect] = df["clean_review"].str.contains(pattern, regex=True, case=False).astype(int)


In [12]:
# Calculate the number of reviews that contain at least one match for each aspect
df[['graphics', 'performance', 'story', 'gameplay', 'price']].mean()

Unnamed: 0,0
graphics,0.044544
performance,0.032782
story,0.057939
gameplay,0.082771
price,0.046504


Detecting review aspects through word matching yielded very few matches, so a zero-shot aspect detection approach will be performed to achieve a more comprehensive analysis.

## Zero-shot aspect detection

In [13]:
# Filter reviews with at least 5 words length
df["word_count"] = df["trunc_review"].str.split().str.len()
df["index"] = df.index
df_len_5 = df[df["word_count"]>=5].copy()
dataset = Dataset.from_pandas(df_len_5)


In [14]:
# Define zero shot classification model
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=0
)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


In [15]:
def classify_aspects(batch, aspects):
    """
    Perform zero-shot classification on a batch of reviews for given aspects and return per-aspect scores.
    """
    texts = list(batch["trunc_review"])
    results = classifier(texts, candidate_labels=aspects, multi_label=True)

    if isinstance(results, dict):
        results = [results]

    output = {f"zs_{label}_score": [] for label in aspects}

    for r in results:
        scores_dict = dict(zip(r["labels"], r["scores"]))
        for label in aspects:
            output[f"zs_{label}_score"].append(float(scores_dict.get(label, 0.0)))

    return output

In [16]:
# Apply zero-shot aspect classification across the dataset in batches
aspects_list = list(aspects.keys())
dataset = dataset.map(classify_aspects, batched=True, batch_size=8,
                      fn_kwargs={"aspects": aspects_list})

Map:   0%|          | 0/6631 [00:00<?, ? examples/s]

In [17]:
# Merge zero shot classification output with origin dataframe
df_result = dataset.to_pandas()
zs_columns = [col for col in df_result.columns if col.startswith("zs_")]
df_result = df_result.set_index("index")
df_result = df_result[zs_columns]

df_merged = df.copy()
for col in zs_columns:
    df_merged[col] = df_result[col]

df_merged[zs_columns] = df_merged[zs_columns].fillna(0)

In [18]:
df_merged.head()

Unnamed: 0,game,app_id,recommended,timestamp_created,author_playtime,date,clean_review,trunc_review,sentiment_label,confidence_score,...,story,gameplay,price,word_count,index,zs_graphics_score,zs_performance_score,zs_story_score,zs_gameplay_score,zs_price_score
0,PAYDAY 2,218620,True,1765018293,1394,2025-12-06 10:51:33,"I rodded bank,then i start game","i rodded bank, then i start game",1 star,0.248288,...,0,0,0,7,0,0.001864,0.789873,0.072499,0.996646,0.070791
1,PAYDAY 2,218620,False,1765005087,886,2025-12-06 07:11:27,Game is a steaming pile that doesn't even work...,game is a steaming pile that doesn't even work...,1 star,0.952912,...,0,0,0,69,1,0.238581,0.657618,0.359851,0.842916,0.241639
2,PAYDAY 2,218620,False,1764993460,13063,2025-12-06 03:57:40,clicked no because i was bored,clicked no because i was bored,1 star,0.601361,...,0,0,0,6,2,0.333301,0.008606,0.00989,0.042483,0.336748
3,PAYDAY 2,218620,True,1764991399,44,2025-12-06 03:23:19,better than PAYDAY 3,better than payday 3,5 stars,0.603237,...,0,0,0,4,3,0.0,0.0,0.0,0.0,0.0
4,PAYDAY 2,218620,True,1764976971,9650,2025-12-05 23:22:51,"I highly recommend this game as it is great, c...","i highly recommend this game as it is great, c...",4 stars,0.480363,...,0,0,1,406,4,0.239323,0.991853,0.49938,0.963858,0.36405


In [19]:
# Save data with sentiment and aspect classification
df_merged.to_csv(f"{path}steam_reviews_sentiment_aspects.csv", index=False)