In [33]:
import pandas as pd
import requests
from pprint import pprint
import datetime

In [2]:
appid = 24780  

# Open a file to write the scraped data
with open("reviews_simcity4_extended.csv", "w+", encoding='utf-8') as fo:
    # Write the header for the CSV file
    print("review,author_vote,other_votes,weighted_vote_score,votes_funny,comment_count,timestamp_created,playtime_at_review", file=fo)

    cursor = '*'  # Start with the initial cursor
    base_url = f'https://store.steampowered.com/appreviews/{appid}'

    curr_page = 0 # Initialize the current page counter
    while True:
        # Set the parameters for the API request
        params = { # https://partner.steamgames.com/doc/store/getreviews
            'json' : 1,
            'filter' : 'all', # sort by: recent, updated, all (helpfulness)
            'language' : 'english', # https://partner.steamgames.com/doc/store/localization # Onlu fetch reviews in English
            'day_range' : 9223372036854775807, # shows reviews from all time 
            'review_type' : 'all', # include all reviews (positive and negative)
            'purchase_type' : 'all', # all, non_steam_purchase, steam
            'num_per_page' : 100, # Number of reviews per page
            'cursor': cursor,  # Use the cursor returned from the last request
        }
        response = requests.get(base_url, params=params) #Make the API request
        data = response.json() # Parse the JSON response

        print(curr_page, end=" ") # Print the current page number
        curr_page += 1 # Increment the current page counter

        # Break the loop if there are no more reviews
        if 'reviews' in data and not data['reviews']:
            break
        
        page_list = data['reviews'] # Get the list of reviews from the response

        for i, page in enumerate(page_list):
            words =  page["review"].split() # Split the review text into words

            if len(words) < 30: 
                continue # Skip reviews with less than 30 words
            
            # Prepare the review text for CSV
            rev = '"' + page["review"].replace('"',"") + '"'
            author_vote = 1 if page['voted_up'] else 0 # Convert voted_up to binary
            other_votes = page['votes_up']
            weighted_vote_score = page["weighted_vote_score"] 
            votes_funny = page["votes_funny"] # Get votes_funny or default to 0
            comment_count = page["comment_count"] # Get comment_count or default to 0
            timestamp_created = datetime.datetime.fromtimestamp(page["timestamp_created"]).strftime('%Y-%m-%d %H:%M:%S') # Convert timestamp to human-readable format
            playtime_at_review = page["author"].get("playtime_at_review", 0) #Get playtime_at_review or default to 0

            # Write the extracted data to the CSV file
            print(rev, ',', author_vote, ',', other_votes, ',', weighted_vote_score, ',', votes_funny, ',', comment_count, ',', timestamp_created, ',', playtime_at_review, file=fo)

        # Update the cursor for the next API request
        cursor = data['cursor']

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 

In [None]:
%pip install nltk --upgrade

In [7]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the VADER lexicon
nltk.download('vader_lexicon');


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\charding\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [45]:
def sentiment_analysis_vader(text):
    sia = SentimentIntensityAnalyzer()
    sentmt = sia.polarity_scores(text)   
    return sentmt["pos"], sentmt["neg"], sentmt["neu"], sentmt["compound"]

In [46]:
# this would be the review text from the csv file

text = """I don't know about you, but I've been hearing that the mayor rocks according to city blocks.
But in all seriousness, this game is a classic and for good reason. The city simulation that runs behind the scenes, while a bit outdated in its ideas for what good urban planning actually looks like, has yet to be matched by any successor. The region mode allows you to network individual city tiles together to form one large megaopolis, and RCI demand and transportation networks seamlessly carry over into your neighboring cities when establishing connections. It's a great idea, and really lets you build massive cities.
Biggest drawback to playing Simcity 4 nowadays is that it's notoriously difficult to get it running smoothly. As of right now, you're mostly limited to 1080p. Any higher and you'll experience bizarre graphical glitches every now and then but it's still playable overall. Modding is essential to get the full experience, but that's ok because the modding community is one of the best out there, rivaling the likes of Mount and Blade and Rimworld. Haven't tested this game on Windows 11, but can report that it's worked just fine for me on both Windows 7 and Windows 10."""

In [47]:
# Example for using the sentiment analysis function

pos, neg, neu, comp = sentiment_analysis_vader(text)
print(f"Positive: {pos}, Negative: {neg}, Neutral: {neu}, Compound: {comp}")


Positive: 0.155, Negative: 0.061, Neutral: 0.784, Compound: 0.9769


In [48]:
# vader scores for simcity4 reviews

# read in the csv file
import math

df = pd.read_csv('reviews_simcity4_extended.csv')
avg_pos = 0
avg_neg = 0
avg_neu = 0
M2_pos = 0
M2_neg = 0
M2_neu = 0

# apply the sentiment analysis function to the review column using a for loop
for n, review in enumerate(df['review']):
    pos, neg, neu, comp = sentiment_analysis_vader(review)
    #print(f"{i}: Positive: {pos}, Negative: {neg}, Neutral: {neu}, Compound: {comp}")

    # calculate a running average of the scores is 3 variables
    delta_pos = pos - avg_pos
    avg_pos += delta_pos / (n + 1)
    delta2 = pos - avg_pos
    M2_pos += delta_pos * delta2

    delta_neg = neg - avg_neg
    avg_neg += delta_neg / (n + 1)
    delta2 = neg - avg_neg
    M2_neg += delta_neg * delta2

    delta_neu = neu - avg_neu
    avg_neu += delta_neu / (n + 1)
    delta2 = neu - avg_neu
    M2_neu += delta_neu * delta2

    print(n, avg_pos, avg_neg, avg_neu)
    #print(".", end="")

variance_pos = M2_pos / n
std_pos = math.sqrt(variance_pos)

variance_neg = M2_neg / n
std_neg = math.sqrt(variance_neg)

variance_neu = M2_neu / n
std_neu = math.sqrt(variance_neu)
print()

print(f"Average Positive: {round(avg_pos, 3)}+-{round(std_pos, 3)}\nAverage Negative: {round(avg_neg, 3)}+-{round(std_neg, 3)}\nAverage Neutral: {round(avg_neu, 3)}+-{round(std_neu, 3)}")

0 0.11 0.014 0.875
1 0.0895 0.0695 0.8405
2 0.10366666666666667 0.08800000000000001 0.808
3 0.10650000000000001 0.07 0.82325
4 0.11100000000000002 0.0582 0.8308
5 0.12016666666666669 0.06783333333333333 0.8121666666666667
6 0.1365714285714286 0.06214285714285714 0.8014285714285715
7 0.12662500000000002 0.05975 0.8137500000000001
8 0.1238888888888889 0.05611111111111111 0.8201111111111112
9 0.1273 0.0505 0.8223000000000001
10 0.12081818181818181 0.05263636363636364 0.8266363636363637
11 0.1215 0.052083333333333336 0.8265833333333334
12 0.11776923076923076 0.05530769230769231 0.8270769230769232
13 0.1207142857142857 0.05557142857142857 0.823857142857143
14 0.12126666666666666 0.0528 0.8260666666666667
15 0.1188125 0.056625 0.8246875
16 0.12223529411764705 0.05488235294117647 0.8230000000000001
17 0.12255555555555556 0.05416666666666667 0.8234444444444445
18 0.12294736842105264 0.05573684210526316 0.8214736842105264
19 0.12350000000000001 0.052950000000000004 0.8237
20 0.12080952380952382

In [44]:
# average of author_vote
print(f"Average author_vote: {df['author_vote'].mean()}")

Average author_vote: 0.810304449648712


In [None]:
# Using the SentiWordNet lexicon instead of VADER
import nltk
from nltk.corpus import sentiwordnet as swn

# Download the SentiWordNet and WordNet resources
nltk.download('sentiwordnet');
nltk.download('wordnet');
nltk.download('averaged_perceptron_tagger');

In [49]:
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn

def get_sentiment(word, pos):
    synsets = list(swn.senti_synsets(word, pos))
    if not synsets:
        return 0, 0, 0
    synset = synsets[0]
    return synset.pos_score(), synset.neg_score(), synset.obj_score()

def sentiment_analysis_SentiWordNet(sentence):
    tokens = nltk.word_tokenize(sentence)
    pos_tags = nltk.pos_tag(tokens)
    sentiment_scores = {'positive': 0, 'negative': 0, 'objective': 0}
    word_count = 0

    for word, tag in pos_tags:
        wn_tag = get_wordnet_pos(tag)
        
        if wn_tag is not None:
            print(f"{word}={wn_tag}", end=", ")
            pos_score, neg_score, obj_score = get_sentiment(word, wn_tag)
            sentiment_scores['positive'] += pos_score
            sentiment_scores['negative'] += neg_score
            sentiment_scores['objective'] += obj_score
            word_count += 1

    # Normalize the scores by the number of words
    if word_count > 0:
        sentiment_scores['positive'] /= word_count
        sentiment_scores['negative'] /= word_count
        sentiment_scores['objective'] /= word_count

    return sentiment_scores['positive'], sentiment_scores['negative'], sentiment_scores['objective'], word_count

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wn.ADJ
    elif treebank_tag.startswith('V'):
        return wn.VERB
    elif treebank_tag.startswith('N'):
        return wn.NOUN
    elif treebank_tag.startswith('R'):
        return wn.ADV
    else:
        return None

# Example usage
pos, neg, obj, wc = sentiment_analysis_SentiWordNet(text)
print()
print(f"Positive: {pos}, Negative: {neg}, Objective: {obj}, Word count: {wc}")

do=v, n't=r, know=v, 've=v, been=v, hearing=v, mayor=n, rocks=v, according=v, city=n, blocks=n, seriousness=n, game=n, is=v, classic=a, good=a, reason=n, city=n, simulation=n, runs=v, scenes=n, bit=n, outdated=v, ideas=n, good=a, urban=a, planning=v, actually=r, looks=v, has=v, yet=r, be=v, matched=v, successor=n, region=n, mode=n, allows=v, network=n, individual=a, city=n, tiles=v, together=r, form=v, large=a, megaopolis=n, RCI=n, demand=n, transportation=n, networks=n, seamlessly=r, carry=v, over=r, neighboring=a, cities=n, establishing=v, connections=n, 's=v, great=a, idea=n, really=r, lets=v, build=v, massive=a, cities=n, Biggest=n, drawback=n, playing=v, Simcity=n, nowadays=n, is=v, 's=v, notoriously=r, difficult=a, get=v, running=v, smoothly=r, right=n, now=r, 're=v, mostly=r, limited=v, Any=v, higher=a, experience=v, bizarre=a, graphical=a, glitches=n, now=r, then=r, 's=v, still=r, playable=a, overall=a, Modding=n, is=v, essential=a, get=v, full=a, experience=n, 's=v, ok=a, modd

In [4]:
%pip install Textblob

Collecting Textblob
  Downloading textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
   ---------------------------------------- 626.3/626.3 kB 4.9 MB/s eta 0:00:00
Installing collected packages: Textblob
Successfully installed Textblob-0.18.0.post0
Note: you may need to restart the kernel to use updated packages.
