In [None]:
%pip install nltk --upgrade

In [None]:
import csv
import datetime
import time
from pprint import pprint

import pandas as pd
import requests
from IPython.display import clear_output

In [None]:
# Steam app id of the game whose reviews are scraped (enable exactly one)
#appid = 24780  # simcity4
appid = 2357570 # Overwatch 2

# Base name from which all output file names are derived (keep in sync with appid)
#base_name = 'simcity4'
base_name = 'overwatch2'

# raw scraped reviews
scrape_file = f'reviews_{base_name}.csv'
# sentiment scores and part-of-speech counts per review
results_file = f'results_{base_name}.csv'
# scraped reviews combined with their sentiment results
extended_file = f'reviews_{base_name}_extended.csv'

In [None]:
# Scrape the English Steam reviews for `appid` into `scrape_file` (CSV).
MIN_WORDS = 30        # reviews with fewer whitespace-separated words are skipped
REQUEST_TIMEOUT = 30  # seconds to wait for the store API before giving up

base_url = f'https://store.steampowered.com/appreviews/{appid}'

# newline='' lets the csv module control line endings (needed for multi-line reviews)
with open(scrape_file, "w", encoding='utf-8', newline='') as fo:
    # csv.writer quotes/escapes commas, quotes and newlines inside the review text,
    # so the text is stored unmodified and no padding spaces end up in the fields.
    writer = csv.writer(fo)
    writer.writerow([
        "review", "author_vote", "other_votes", "weighted_vote_score",
        "votes_funny", "comment_count", "timestamp_created", "playtime_at_review",
    ])

    cursor = '*'   # Start with the initial cursor
    curr_page = 0  # Initialize the current page counter
    while True:
        # Set the parameters for the API request
        params = { # https://partner.steamgames.com/doc/store/getreviews
            'json' : 1,
            'filter' : 'all', # sort by: recent, updated, all (helpfulness)
            'language' : 'english', # https://partner.steamgames.com/doc/store/localization # Only fetch reviews in English
            'day_range' : 9223372036854775807, # shows reviews from all time
            'review_type' : 'all', # include all reviews (positive and negative)
            'purchase_type' : 'all', # all, non_steam_purchase, steam
            'num_per_page' : 100, # Number of reviews per page
            'cursor': cursor,  # Use the cursor returned from the last request
        }
        response = requests.get(base_url, params=params, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()  # surface HTTP errors instead of parsing an error page
        data = response.json()

        print(curr_page, end=" ") # Print the current page number
        curr_page += 1

        # Stop when there are no more reviews; .get() also covers payloads
        # that carry no 'reviews' key at all.
        page_list = data.get('reviews')
        if not page_list:
            break

        for page in page_list:
            if len(page["review"].split()) < MIN_WORDS:
                continue # Skip reviews with less than MIN_WORDS words

            writer.writerow([
                page["review"],
                1 if page['voted_up'] else 0, # Convert voted_up to binary
                page['votes_up'],
                page["weighted_vote_score"],
                page["votes_funny"],
                page["comment_count"],
                # Unix timestamp -> human-readable string (local time zone of this machine)
                datetime.datetime.fromtimestamp(page["timestamp_created"]).strftime('%Y-%m-%d %H:%M:%S'),
                page["author"].get("playtime_at_review", 0), # default to 0 when missing
            ])

        # Update the cursor for the next API request; a missing or unchanged
        # cursor means there is nothing further to fetch, so stop rather than
        # request the same page forever.
        next_cursor = data.get('cursor')
        if not next_cursor or next_cursor == cursor:
            break
        cursor = next_cursor

In [None]:
# Using the SentiWordNet lexicon
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
import pandas as pd

# Download the lexicons and models used by the sentiment analysis below.
# NLTK 3.9+ loads the tokenizer and tagger from 'punkt_tab' and
# 'averaged_perceptron_tagger_eng'; the older resource names are kept so the
# notebook also runs on earlier NLTK releases.
NLTK_RESOURCES = (
    'sentiwordnet',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'punkt',
    'punkt_tab',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

In [None]:
def get_sentiment(word, pos):
    """Look up the SentiWordNet scores of a word for a given part of speech.

    Only the first synset returned by SentiWordNet is used.

    Returns:
        tuple: (positive, negative, objective) scores of that synset, or
        (0, 0, 0) when the lookup yields no synsets.
    """
    first_synset = next(iter(swn.senti_synsets(word, pos)), None)
    if first_synset is None:
        return 0, 0, 0
    return first_synset.pos_score(), first_synset.neg_score(), first_synset.obj_score()

def sentiment_analysis_SentiWordNet(sentence):
    """Score a text with SentiWordNet and count its words by part of speech.

    The text is tokenized and POS-tagged with NLTK. Only tokens whose tag maps
    to a WordNet part of speech (via get_wordnet_pos) are scored and counted;
    each contributes the scores returned by get_sentiment. The three score
    totals are divided by the number of counted tokens.

    Returns:
        tuple: (positive, negative, objective, word_count, adj_count,
        verb_count, noun_count, adv_count). The scores stay 0 when no token
        maps to a WordNet part of speech.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))

    pos_total = 0
    neg_total = 0
    obj_total = 0
    word_count = 0
    # one counter per WordNet part of speech of interest
    pos_counts = {wn.ADJ: 0, wn.VERB: 0, wn.NOUN: 0, wn.ADV: 0}

    for token, treebank_tag in tagged_tokens:
        wordnet_tag = get_wordnet_pos(treebank_tag)
        if wordnet_tag is None:
            continue  # tag has no WordNet equivalent: neither scored nor counted

        token_pos, token_neg, token_obj = get_sentiment(token, wordnet_tag)
        pos_total += token_pos
        neg_total += token_neg
        obj_total += token_obj
        word_count += 1
        if wordnet_tag in pos_counts:
            pos_counts[wordnet_tag] += 1

    # Normalize the totals by the number of counted words
    if word_count > 0:
        pos_total /= word_count
        neg_total /= word_count
        obj_total /= word_count

    return (
        pos_total,
        neg_total,
        obj_total,
        word_count,
        pos_counts[wn.ADJ],
        pos_counts[wn.VERB],
        pos_counts[wn.NOUN],
        pos_counts[wn.ADV],
    )

def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    Tags beginning with J, V, N and R map to wn.ADJ, wn.VERB, wn.NOUN and
    wn.ADV respectively; every other tag yields None.
    """
    prefix_to_pos = (
        ('J', wn.ADJ),
        ('V', wn.VERB),
        ('N', wn.NOUN),
        ('R', wn.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return None

In [None]:
# Read the scraped CSV file and report how many reviews it contains
df = pd.read_csv(scrape_file, encoding='utf-8'  )
print(len(df), "reviews loaded")
display(df)

In [None]:
# Perform sentiment analysis on each review.
# This is the slow step: if there is already a results file from an earlier run,
# skip this cell and create the extended file in the next cell.
results = []  # one dict of scores and counts per review, in the row order of df

reviews = df['review'].tolist()

for i, review in enumerate(reviews):
    pos_score, neg_score, obj_score, word_count, adj_count, verb_count, noun_count, adv_count = sentiment_analysis_SentiWordNet(review)
    results.append({
        #'review': review, # CH no need for the reviews after this stage
        'positive_score': round(pos_score, 3),
        'negative_score': round(neg_score, 3),
        'objective_score': round(obj_score, 3),
        'word_count': word_count,
        'adj_count': adj_count,
        'verb_count': verb_count,
        'noun_count': noun_count,
        'adv_count': adv_count,
    })
    # progress indicator: print the index of every 100th review
    if i % 100 == 0:
        print(i, end=" ")
    
# Convert the results to a DataFrame for better readability
results_df = pd.DataFrame(results)

# Save the results to a CSV file in case the concat in the next cell does not work
results_df.to_csv(results_file, index=False)

In [None]:
# Combine the scraped reviews with their sentiment analysis results, column-wise.
# The results are read back from disk so this cell does not depend on the
# sentiment-analysis cell having been run in the current kernel session.
results_df = pd.read_csv(results_file)

# pd.concat(axis=1) aligns rows on the index and pads any mismatch with NaN, so a
# stale `df` or a results file from another scrape would be combined silently.
# Fail loudly instead.
if len(df) != len(results_df):
    raise ValueError(
        f"{len(df)} reviews in df but {len(results_df)} rows in {results_file}; "
        "re-run the sentiment analysis for the current scrape"
    )

# Rows are paired by position: row i of the results belongs to review i.
combined_df = pd.concat([df.reset_index(drop=True), results_df], axis=1)

In [None]:
# Show the reviews together with their sentiment scores and part-of-speech counts
display(combined_df)

In [None]:
# Write the combined data to a new file so it can be loaded back in later
# without having to re-compute the sentiment analysis
combined_df.to_csv(extended_file, index=False, encoding='utf-8')

In [None]:
# Load the extended CSV file (reviews plus sentiment results)
df = pd.read_csv(extended_file)

# Make sure we don't have duplicate reviews. De-duplicate on the review text
# itself: the rounded sentiment scores are not a unique fingerprint, so using
# them as the key would also discard distinct reviews that share the same scores.
df = df.drop_duplicates(subset=['review'])

# re-index the dataframe
df = df.reset_index(drop=True)

print(len(df), "unique reviews loaded")

# Display basic information about the dataset
# (info() prints its report itself and returns None, so it is not wrapped in print)
df.info()

In [None]:
# Display the first few rows of the de-duplicated dataset
display(df.head())

In [None]:
# Display summary statistics for every column of the dataset
# (include='all' also covers the non-numeric columns)
# CH note that all reviews now are listed as unique i.e. no duplicates
display(df.describe(include='all'))

In [None]:
# Check for missing values: number of nulls in each column
print(df.isnull().sum())

In [None]:
# CH print out the timespan of the reviews (earliest and latest timestamp)
# this is useful for deciding what timespan to aggregate to
print(df['timestamp_created'].min(), df['timestamp_created'].max())

In [None]:
# CH aggregate means of values over time (monthly)
# Built as one chain from `df` so the cell can be re-run without side effects.
df_agg = (
    df
    .assign(timestamp_created=pd.to_datetime(df['timestamp_created']))  # parse to datetimes
    .set_index('timestamp_created')   # resample() needs a datetime index
    .drop(columns=['review'])         # free text cannot be averaged
    .resample('MS')                   # 'MS' = month start; use '3MS' / '6MS' for 3- / 6-month bins
    .mean(numeric_only=True)          # skip non-numeric columns (e.g. POS_ratio, added to df further down)
    .reset_index()                    # make timestamp_created a column again
)
df_agg['timestamp_created'] = df_agg['timestamp_created'].dt.strftime('%Y-%m') # format as year-month
display(df_agg)



In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def plot_over_time(columns, df, logy=False):
    """Draw one line per column against 'timestamp_created' on a new figure.

    Args:
        columns: list of column names of `df` to plot.
        df: DataFrame holding those columns and a 'timestamp_created' column.
        logy: when true, the y-axis is switched to a logarithmic scale.
    """
    selected = df[columns + ['timestamp_created']]
    # long format: one row per (timestamp, variable) pair, as seaborn's hue expects
    long_form = selected.melt('timestamp_created', var_name='variable', value_name='value')

    plt.figure(figsize=(20, 8))
    sns.lineplot(data=long_form, x='timestamp_created', y='value', hue='variable')

    if logy:
        plt.yscale('log')

In [None]:
# Mean sentiment scores per aggregation period over time
plot_over_time(['positive_score', 'negative_score', 'objective_score'], df_agg, logy=False)

In [None]:
# Mean word and part-of-speech counts per aggregation period (log-scaled y-axis)
plot_over_time(['word_count', 'adj_count', 'verb_count', 'noun_count', 'adv_count'], df_agg, logy=True)

In [None]:
# Mean vote and comment figures per aggregation period (log-scaled y-axis)
plot_over_time(['author_vote', 'other_votes', 'votes_funny', 'comment_count'], df_agg, logy=True)

In [None]:
# Distribution of Numerical Variables
# Visualize the distribution of numerical variables using histograms and boxplots.
import matplotlib.pyplot as plt
import seaborn as sns

# Histograms of the three sentiment scores (one subplot per score)

dfs = df[['positive_score', 'negative_score', 'objective_score']]
dfs.hist(bins=100, figsize=(20, 10))
plt.show()

# Boxplot for numerical columns (currently disabled)
#plt.figure(figsize=(15, 10))
#sns.boxplot(data=df)
#plt.xticks(rotation=90)
#plt.show()

In [None]:
# Histograms of the word and part-of-speech counts per review
dfc = df[['word_count', 'adj_count', 'verb_count', 'noun_count', 'adv_count']]
axes = dfc.hist(bins=100, figsize=(20, 10))

# DataFrame.hist returns an array of Axes (one per subplot), not a single Axes,
# so the x-limit has to be applied to every subplot individually.
for ax in axes.flatten():
    ax.set_xlim(0, 500)  # Limit the x-axis to 500
    # ax.set_xscale('log')  # alternative: log-scaled x-axis


In [None]:
# Add one column per part of speech to df holding its share of the counted words:
# ratio column -> count column it is derived from
ratio_sources = {
    'adj_ratio': 'adj_count',
    'verb_ratio': 'verb_count',
    'noun_ratio': 'noun_count',
    'adv_ratio': 'adv_count',
}
for ratio_col, count_col in ratio_sources.items():
    df[ratio_col] = df[count_col] / df['word_count']

# Histogram of each ratio
dfr = df[list(ratio_sources)]
axes = dfr.hist(bins=100, figsize=(20, 10))


In [None]:
# Add a column to df that indicates which of the adj, verb, noun and adv ratios is the biggest.
# idxmax(axis=1) yields the name of the winning column:
# "adj_ratio", "verb_ratio", "noun_ratio" or "adv_ratio"
df['POS_ratio'] = df[['adj_ratio', 'verb_ratio', 'noun_ratio', 'adv_ratio']].idxmax(axis=1)

# in df['POS_ratio'] rename adj_ratio to adj_most, verb_ratio to verb_most, noun_ratio to noun_most, adv_ratio to adv_most
df['POS_ratio'] = df['POS_ratio'].replace({'adj_ratio': 'adj_most', 'verb_ratio': 'verb_most', 'noun_ratio': 'noun_most', 'adv_ratio': 'adv_most'})

# show the distribution of the POS_ratio (number of reviews per dominant part of speech)
df['POS_ratio'].value_counts().plot(kind='bar');

In [None]:
# 2-D KDE plots of each sentiment score against the word count of the review.
# The x variable is `word_count`, so titles and axis labels say "Word Count".
kde_specs = [
    # (score column, label, colormap, x-limits, y-limits or None for automatic)
    ('positive_score', 'Positive', 'Blues', (0, 600), (0, 0.2)),
    ('negative_score', 'Negative', 'Reds', (0, 600), (0, 0.2)),
    ('objective_score', 'Objective', 'Greens', (0, 700), None),
]

for score_col, label, cmap, xlim, ylim in kde_specs:
    plt.figure(figsize=(14, 10))
    sns.kdeplot(data=results_df, x='word_count', y=score_col, fill=True, cmap=cmap, thresh=0.05)
    plt.title(f'KDE Plot: Word Count vs {label} Sentiment Score')
    plt.xlabel('Word Count')
    plt.ylabel(f'{label} Sentiment Score')
    plt.xlim(*xlim)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.show()

In [None]:
# Reload the extended CSV file
# NOTE(review): this replaces the de-duplicated df (and the ratio columns added to it)
# from the cells above with the full file contents - confirm that the cells below
# are meant to run on the un-deduplicated data
df = pd.read_csv(extended_file)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Based on the 574 Lecture 41-Data Science
# Scatterplot of votes_funny vs positive_score.
# Both columns live in `df` (the extended reviews file); `results` is a plain
# list and has no `.df` attribute.
fig, ax = plt.subplots(figsize=(10, 10))

sns.scatterplot(data=df, x="votes_funny", y="positive_score", ax=ax)
ax.set(title='Votes_funny vs positive_score', xlabel='Votes_funny', ylabel='Positive_score');

In [None]:
# Based on the 574 Lecture 41-Data Science
# Scatterplot of votes_funny vs positive_score with a fitted regression line.
# lmplot is a figure-level function: it creates its own figure and returns a
# FacetGrid, so no plt.subplots() call is needed beforehand. The columns are
# read from `df` (the extended reviews file).
grid = sns.lmplot(data=df, x="votes_funny", y="positive_score", height=10, aspect=1)
grid.set(title='Votes_funny vs positive_score', xlabel='Votes_funny', ylabel='Positive_score');

In [None]:
# Calculate the correlation coefficient between votes_funny and positive_score
# (Series.corr defaults to the Pearson method)
correlation = df['votes_funny'].corr(df['positive_score'])
print(f'Correlation coefficient between votes_funny and positive_score: {correlation}')

In [None]:
# This is from 574 Lecture 41-Data Science
# Pairwise correlation matrix of the numeric columns
numeric_df = df.select_dtypes(include=['float64', 'int64'])  # keep only the float64 / int64 columns

# NOTE: despite its name, r2 holds Pearson correlation coefficients (r), not r-squared
r2 = numeric_df.corr(method='pearson')   # pearson: the standard linear correlation coefficient
#r2 = df.corr(method='spearman') # performs a rank-ordering first
display(r2)

In [None]:
# Heatmap of the absolute pairwise correlations stored in r2.
# plt.subplots returns (figure, axes) in that order; the heatmap is drawn on
# that axes explicitly rather than on whichever axes happens to be current.
fig, ax1 = plt.subplots(figsize=(13, 13))
sns.heatmap(r2.abs(),
            annot=True,
            fmt=".1f",  # show numbers, but with 1 digit only
            ax=ax1);