In [None]:
%pip install requests pandas seaborn matplotlib plotly nbformat --upgrade

In [None]:
import csv
import datetime
import os
import time
from pprint import pprint

import matplotlib.pyplot as plt # pip install matplotlib
import pandas as pd # pip install pandas
import plotly.express as px  # pip install plotly
import requests # pip install requests
import seaborn as sns # pip install seaborn
from IPython.display import clear_output

In [None]:
# define which game to scrape
# NOTE: the << ... >> markers on the appid and base_name lines are placeholders,
# not Python. Replace each one (markers included) with a real value before
# running this cell, otherwise the cell fails with a SyntaxError.
# appid is the numeric Steam app id (see the examples below).
# NOTE(review): on a game's store page the id appears to be the number in
# store.steampowered.com/app/<appid>/ -- the /appreviews/<appid> form mentioned
# on the next line is the endpoint this notebook queries. TODO confirm wording.
appid = << your game's app id >>   # will be in the steam store URL for your game, like this: store.steampowered.com/appreviews/<<appid>>

# Examples of app ids:
#appid = 24780  # SimCity 4
#appid = 2357570 # Overwatch 2
#appid = 413150 # Stardew Valley

# set base name for the file
# base_name must be a string: it is used as the output folder name and is
# embedded in every output file name, so keep it to characters valid in a path.
base_name = << your games name >> #

# All outputs of a run live in a folder named after the game.
# exist_ok=True makes the cell safe to re-run and removes the gap between
# "does the folder exist?" and "create it" that a separate check would leave.
os.makedirs(base_name, exist_ok=True)

# Output files, already prefixed with the folder name:
#   scrape_file   - raw reviews written by the scraping cell
#   results_file  - per-review sentiment metrics
#   extended_file - reviews and metrics combined, after de-duplication
scrape_file = f'{base_name}/reviews_{base_name}.csv'
results_file = f'{base_name}/results_{base_name}.csv'
extended_file = f'{base_name}/reviews_{base_name}_extended.csv'

### Scrape Steam reviews

In [None]:
# Page through the Steam reviews endpoint for `appid` and stream every review
# that is long enough into `scrape_file` as CSV (one row per review).
MIN_REVIEW_WORDS = 30    # reviews with fewer whitespace-separated words are skipped
REQUEST_TIMEOUT_S = 30   # seconds to wait for a page before giving up

CSV_HEADER = [
    "review", "author_vote", "other_votes", "weighted_vote_score",
    "votes_funny", "comment_count", "timestamp_created", "playtime_at_review",
]

base_url = f'https://store.steampowered.com/appreviews/{appid}'

# newline='' hands line-ending control to the csv module, so reviews that
# contain line breaks are written (and later read back) as a single field.
with open(scrape_file, "w", encoding='utf-8', newline='') as fo:
    # csv.writer quotes and escapes fields itself: no padding spaces around the
    # delimiters and no need to strip double quotes out of the review text.
    writer = csv.writer(fo)
    writer.writerow(CSV_HEADER)

    cursor = '*'          # Start with the initial cursor
    seen_cursors = set()  # cursors already requested, to detect a repeat
    curr_page = 0         # Initialize the current page counter

    # Stop if the endpoint hands back a cursor that was already used;
    # requesting it again would fetch the same page forever.
    while cursor not in seen_cursors:
        seen_cursors.add(cursor)

        # Set the parameters for the API request
        params = { # https://partner.steamgames.com/doc/store/getreviews
            'json' : 1,
            'filter' : 'all', # sort by: recent, updated, all (helpfulness)
            'language' : 'english', # https://partner.steamgames.com/doc/store/localization # Only fetch reviews in English
            'day_range' : 9223372036854775807, # shows reviews from all time
            'review_type' : 'all', # include all reviews (positive and negative)
            'purchase_type' : 'all', # all, non_steam_purchase, steam
            'num_per_page' : 100, # Number of reviews per page
            'cursor': cursor,  # Use the cursor returned from the last request
        }
        response = requests.get(base_url, params=params, timeout=REQUEST_TIMEOUT_S)
        response.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
        data = response.json()

        print(curr_page, end=" ") # Print the current page number
        curr_page += 1

        # A missing or empty 'reviews' list means there is nothing (more) to read
        page_list = data.get('reviews')
        if not page_list:
            break

        for review in page_list:
            text = review["review"]
            if len(text.split()) < MIN_REVIEW_WORDS:
                continue # Skip reviews that are too short

            # fromtimestamp() without a tz argument renders the time in the
            # local time zone of the machine running the notebook
            created = datetime.datetime.fromtimestamp(review["timestamp_created"])

            writer.writerow([
                text,
                1 if review['voted_up'] else 0,                 # author_vote as 0/1
                review['votes_up'],                             # other_votes
                review["weighted_vote_score"],
                review.get("votes_funny", 0),                   # default to 0 when absent
                review.get("comment_count", 0),                 # default to 0 when absent
                created.strftime('%Y-%m-%d %H:%M:%S'),
                review["author"].get("playtime_at_review", 0),  # default to 0 when absent
            ])

        # Update the cursor for the next API request; stop when none is returned
        cursor = data.get('cursor')
        if not cursor:
            break

### Sentiment Analysis of the Reviews

In [None]:
# Using the SentiWordNet lexicon
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
import pandas as pd

# Download the SentiWordNet and WordNet lexicons plus the tokenizer and
# POS-tagger models used by nltk.word_tokenize / nltk.pos_tag.
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resource names that
# newer NLTK releases (3.9+) look up; 'punkt' and 'averaged_perceptron_tagger'
# are kept so the notebook also works with earlier releases.
for resource in (
    'sentiwordnet',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'punkt',
    'punkt_tab',
):
    nltk.download(resource)

In [None]:
def get_sentiment(word, pos):
    """
    Get the sentiment scores (positive, negative, objective) for a given word and part of speech.

    Only the first synset that SentiWordNet returns for the word/POS pair is
    used; any further senses are ignored.

    Args:
        word (str): The word to analyze.
        pos (str): The WordNet part of speech tag for the word.

    Returns:
        tuple: Positive score, negative score, and objective score of the first
               synset, or (0, 0, 0) when SentiWordNet has no synset for the word.
    """
    # Pull just the first synset instead of materialising the whole list
    first_synset = next(iter(swn.senti_synsets(word, pos)), None)
    if first_synset is None:
        return 0, 0, 0
    return first_synset.pos_score(), first_synset.neg_score(), first_synset.obj_score()

def sentiment_analysis_SentiWordNet(sentence):
    """
    Perform sentiment analysis on a given sentence using SentiWordNet.

    The text is tokenized and POS-tagged with NLTK. Only tokens whose tag maps
    to a WordNet part of speech (adjective, verb, noun, adverb) are scored and
    counted; the three scores are averaged over those counted tokens.

    Args:
        sentence (str): The sentence to analyze.

    Returns:
        tuple: Positive score, negative score, objective score, total word count,
               count of adjectives, count of verbs, count of nouns, count of adverbs.
               The scores stay 0 when no token could be counted.
    """
    tokens = nltk.word_tokenize(sentence)
    pos_tags = nltk.pos_tag(tokens)

    sentiment_scores = {'positive': 0, 'negative': 0, 'objective': 0}
    # one counter per WordNet part of speech that is reported back
    pos_counts = {wn.ADJ: 0, wn.VERB: 0, wn.NOUN: 0, wn.ADV: 0}
    word_count = 0

    for word, tag in pos_tags:
        wn_tag = get_wordnet_pos(tag)
        if wn_tag is None:
            continue  # tag has no WordNet equivalent: neither scored nor counted

        pos_score, neg_score, obj_score = get_sentiment(word, wn_tag)
        sentiment_scores['positive'] += pos_score
        sentiment_scores['negative'] += neg_score
        sentiment_scores['objective'] += obj_score
        word_count += 1
        if wn_tag in pos_counts:
            pos_counts[wn_tag] += 1

    # Normalize the scores by the number of counted words
    if word_count > 0:
        for key in sentiment_scores:
            sentiment_scores[key] /= word_count

    return (
        sentiment_scores['positive'],
        sentiment_scores['negative'],
        sentiment_scores['objective'],
        word_count,
        pos_counts[wn.ADJ],
        pos_counts[wn.VERB],
        pos_counts[wn.NOUN],
        pos_counts[wn.ADV],
    )

def get_wordnet_pos(treebank_tag):
    """
    Convert Treebank part of speech tags to WordNet part of speech tags.

    The decision is made on the first letter of the tag: J -> adjective,
    V -> verb, N -> noun, R -> adverb.

    Args:
        treebank_tag (str): The Treebank part of speech tag.

    Returns:
        str or None: The corresponding WordNet part of speech tag, or None when
                     the tag does not start with one of the letters above.
    """
    first_letter_to_pos = {
        'J': wn.ADJ,
        'V': wn.VERB,
        'N': wn.NOUN,
        'R': wn.ADV,
    }
    # slicing (rather than indexing) keeps an empty tag from raising
    return first_letter_to_pos.get(treebank_tag[:1])

In [None]:
# Load the scraped reviews back from disk into a pandas DataFrame,
# report how many rows were read, then show the frame
df = pd.read_csv(filepath_or_buffer=scrape_file, encoding='utf-8')
review_total = len(df)
print(review_total, "reviews loaded")
display(df)

In [None]:
# Score every review with SentiWordNet, one row of metrics per review.
# If a results file from an earlier run already exists, this cell can be
# skipped: the next cell reloads that file and builds the extended file.
score_columns = ('positive_score', 'negative_score', 'objective_score')
count_columns = ('word_count', 'adj_count', 'verb_count', 'noun_count', 'adv_count')

results = []

reviews = df['review'].tolist()

for idx, text in enumerate(reviews):
    metrics = sentiment_analysis_SentiWordNet(text)

    # the three scores are rounded to 3 decimals, the five counts are kept as-is;
    # the review text itself is not repeated in the results
    row = {name: round(value, 3) for name, value in zip(score_columns, metrics[:3])}
    row.update(zip(count_columns, metrics[3:]))
    results.append(row)

    # progress marker every 100 reviews
    if idx % 100 == 0:
        print(idx, end=" ")

# Turn the collected rows into a DataFrame
results_df = pd.DataFrame(results)

# Persist the metrics so they survive even if the later concat step fails
results_df.to_csv(results_file, index=False)

In [None]:
# Reload the saved sentiment metrics and attach them column-wise to the reviews
results_df = pd.read_csv(filepath_or_buffer=results_file)
frames = [df, results_df]
combined_df = pd.concat(frames, axis='columns')

In [None]:
# Remove duplicate reviews, then save the cleaned frame to `extended_file`.
# The review text itself is the duplicate key: matching only on the three
# rounded sentiment scores would also throw away different reviews that happen
# to score the same.
combined_df = (
    combined_df
    .drop_duplicates(subset=['review'])
    .reset_index(drop=True)  # re-index so rows are numbered 0..n-1 again
)

display(combined_df)

# write combined data to a new file
# this is so we can load the data back in later without having to re-compute the sentiment analysis
combined_df.to_csv(extended_file, index=False, encoding='utf-8')