In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format ='retina'
import nltk
import numpy as np
from textblob import TextBlob
from textblob import Word
import seaborn as sns

In [None]:
# Load the Roblox review dataset from the CSV export into a dataframe
robloxData = pd.read_csv(filepath_or_buffer="ROBLOX_REVIEWS.csv")

In [None]:
# Drop the columns the analysis does not use: the row counter ('Unnamed: 0')
# and the two identifier columns.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the columns have already been removed.
robloxData = robloxData.drop(
    columns=['Unnamed: 0', 'review_id', 'pseudo_author_id'],
    errors='ignore',
)

In [None]:
# Some reviews in the original dataset are empty. Replace the missing values
# with '' and cast to str so every review is a string.
robloxData['review_text'] = (
    robloxData['review_text']
    .fillna(value='')
    .astype(dtype=str)
)

In [None]:
# Convert the timestamp column to pandas datetimes; unparseable values raise
robloxData['review_timestamp'] = pd.to_datetime(
    arg=robloxData['review_timestamp'],
    errors='raise',
)

In [None]:
# Derive hour-of-day and day-of-week columns from the review timestamp
robloxData = robloxData.assign(
    review_hour=lambda frame: frame['review_timestamp'].dt.hour,
    review_day=lambda frame: frame['review_timestamp'].dt.weekday,
)

## 1. Most common words
What are the most common words reviewers use when assessing Roblox?

***
Cleaning the review column:

In [None]:
# Fetch the NLTK 'stopwords' resource used by the stopword filtering below
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
# English stopwords; used below to filter common words out of the reviews
stopw = stopwords.words('english')

In [None]:
# Lowercase every token of the review; re-joining on single spaces also
# collapses any run of whitespace
robloxData['simple_review'] = robloxData['review_text'].apply(
    lambda review: " ".join(map(str.lower, review.split()))
)

In [None]:
# Remove stopwords.
# Membership is tested against a frozenset rather than `stopw` itself: testing
# `word not in <list>` rescans the whole list for every token of every review,
# while a set lookup is constant-time and gives the same result.
stopwLookup = frozenset(stopw)
robloxData['simple_review'] = robloxData['simple_review'].apply(
    lambda review: " ".join(word for word in review.split() if word not in stopwLookup)
)

In [None]:
# Additional stopwords used to filter the reviews further: generic filler
# words plus app-specific terms such as 'game' and 'roblox'
extra_stopw = [
    "really", "get", "ever", "much", "many", "please",
    "play", "playing", "make", "app", "i'm", "im",
    "even", "give", "got", "u", "one", "can't",
    "people", "want", "also", "lot", "would", "still",
    "every", "thing", "try", "need", "game", "games",
    "roblox",
]

In [None]:
# Removing the extra stopwords from the reviews.
# The list is converted to a frozenset once so each token is checked with a
# constant-time lookup instead of a scan of the whole list; the filtered text
# is identical.
extraStopwLookup = frozenset(extra_stopw)
robloxData['simpler_review'] = robloxData['simple_review'].apply(
    lambda review: " ".join(word for word in review.split() if word not in extraStopwLookup)
)

In [None]:
# Remove punctuation
def remove_punctuation(text):
    """Return `text` with every character listed in `string.punctuation` deleted."""
    # Map each punctuation code point to None so str.translate drops it
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

import string

# Strip punctuation from the stopword-filtered reviews, then preview the result.
# NOTE(review): only 'simple_review' is cleaned here; 'simpler_review' was
# derived from it before this step and keeps its punctuation - confirm that is
# intended, since the word counts are computed from 'simpler_review'.
robloxData['simple_review'] = robloxData['simple_review'].apply(remove_punctuation)
robloxData[['simple_review']].head()

In [None]:
# Lemmatization: replace each token with the lemma TextBlob's Word returns
robloxData['simpler_review'] = robloxData['simpler_review'].apply(
    lambda review: " ".join(Word(token).lemmatize() for token in review.split())
)

***
Calculating word frequencies:

In [None]:
# Count how often each word occurs across all reviews: split every review into
# tokens, put one token per row, then tally them. The result is a Series
# indexed by word.
wordFreq = (
    robloxData['simpler_review']
    .str.split()
    .explode()
    .value_counts()
)

In [None]:
# Turn the word counts into a two-column dataframe: 'Word' and 'Count'
wordFreqDf = wordFreq.rename_axis('Word').reset_index(name='Count')

In [None]:
# Keep only the first 10 rows, i.e. the 10 most frequent words
topWords = wordFreqDf.iloc[:10]

In [None]:
# Display the top words with thousands separators in the Count column
topWords.style.format(formatter={"Count": "{:,}"})

In [None]:
# Displaying the first 30 reviews that contain a given word (here: "update")
#pd.set_option('display.max_colwidth', 50)
#robloxData[robloxData['simpler_review'].str.contains('update', case=False, na=False)][['simpler_review']].head(30)

## 2. Common words amongst the top 100 most-liked reviews

In [None]:
# Sort by like count, highest first, and keep the first 100 reviews
topReviews = (
    robloxData
    .sort_values('review_likes', ascending=False)
    .iloc[:100]
)

In [None]:
# Tally the words used in the top-liked reviews and keep the 10 most frequent.
# The columns are named explicitly ('Word', 'Count'): a bare reset_index()
# leaves pandas' default column names, so the {"Count": ...} formatter applied
# when this table is displayed matched no column and was silently skipped.
topLikedFreq = (
    topReviews['simpler_review']
    .str.split()
    .explode()
    .value_counts()
    .rename_axis('Word')
    .reset_index(name='Count')
    .head(10)
)

In [None]:
# Display the table, applying a thousands-separator format to "Count"
topLikedFreq.style.format(formatter={"Count": "{:,}"})

## 3. Is there a relationship between the length of a review and its rating?

In [None]:
# Length of each review, measured in characters of the original review text
robloxData['review_length'] = robloxData['review_text'].map(len)

In [None]:
# Mean review length for each review rating, shown as a two-column dataframe
# (review_rating, avg_rvw_len)
(
    robloxData
    .groupby('review_rating')['review_length']
    .mean()
    .rename('avg_rvw_len')
    .reset_index()
)

In [None]:
# Determining if the results are statistically significant using a one-way ANOVA test
import scipy.stats as stats

# Build one sample of review lengths per rating.
# groupby() leaves out rows whose rating is missing. Filtering with
# `== rating` for each value of .unique() would instead create an empty sample
# for NaN (NaN never compares equal to itself), which makes f_oneway return
# NaN; it also rescans the whole dataframe once per rating.
ratingGroups = [
    lengths for _, lengths in robloxData.groupby('review_rating')['review_length']
]

# Perform one-way ANOVA
f_stat, p_val = stats.f_oneway(*ratingGroups)

print("F-statistic: ", f_stat)
print("P-value: ", p_val)

# Significance threshold for the test
alpha = 0.05
if p_val < alpha:
    print("The differences in average review length across different review ratings are statistically significant.")
else:
    print("The differences in average review length across different review ratings are not statistically significant.")

### Conclusion
The differences between the average lengths of reviews across the ratings are statistically significant. This indicates that users who are highly satisfied with the app, giving it a rating of 5, tend to write shorter reviews. Users who are more neutral or moderately satisfied with the app, giving it a rating of 3, tend to write longer reviews.

## 4. Does the time of day a review is posted have an effect on the sentiment of the review?

***
Assessing sentiment of the reviews:

In [None]:
# Function to calculate sentiment polarity
def get_sentiment(text):
    """Return TextBlob's sentiment polarity for `text`, rounded to 2 decimals."""
    polarity = TextBlob(text).sentiment.polarity
    return round(polarity, 2)

In [None]:
# Score every cleaned review with get_sentiment and store the polarity
robloxData['sentiment'] = robloxData['simple_review'].map(get_sentiment)

In [None]:
# Map a numeric sentiment score to a qualitative label
def getAnalysis(score):
    """Return 'Negative' for scores below 0, 'Neutral' for exactly 0, else 'Positive'."""
    if score < 0:
        return 'Negative'
    return 'Neutral' if score == 0 else 'Positive'

# Label each review's sentiment score as Negative / Neutral / Positive
robloxData['sent_simple'] = robloxData['sentiment'].map(getAnalysis)

In [None]:
# Assess sentiment counts for specific words
#robloxData[robloxData['simpler_review'].str.contains("robux", case=False, na=False)].groupby('sent_simple').size().reset_index(name='counts')

****
Calculating and applying sentiment polarity and quantifiers:

In [None]:
# Count the reviews in each sentiment class and show the counts with
# thousands separators
sentimentSummary = robloxData.groupby('sent_simple')[['simpler_review']].count()
sentimentSummary.style.format("{:,}")

***
Classifying the times of day:

In [None]:
# Create categories for the periods of a day
def classifyPeriodOfDay(datetime):
    """Return the period of day for a value exposing an `.hour` attribute.

    Hours 5-11 are "Morning", 12-16 "Afternoon", 17-20 "Evening"; every other
    value falls through to "Night".
    """
    hour = datetime.hour
    # (label, first hour inclusive, end hour exclusive)
    bands = (
        ("Morning", 5, 12),
        ("Afternoon", 12, 17),
        ("Evening", 17, 21),
    )
    for label, start, end in bands:
        if start <= hour < end:
            return label
    return "Night"

# Display order for the periods (used when plotting)
periodOrder = ['Morning', 'Afternoon', 'Evening', 'Night']

In [None]:
# Tag each review with the period of day in which it was posted
robloxData['day_period'] = robloxData['review_timestamp'].map(classifyPeriodOfDay)

***
Creating a new dataframe with specific variables and plotting it:

In [None]:
# Create a new dataframe for only the variables needed.
# .copy() makes this an independent frame; assigning a column to a bare
# column-subset of robloxData triggers pandas' SettingWithCopyWarning.
todSentPlot = robloxData[['review_timestamp', 'day_period', 'sent_simple']].copy()

# Ensure the day periods remain in a specific order
todSentPlot['day_period'] = pd.Categorical(todSentPlot['day_period'], categories=periodOrder, ordered=True)

# Count reviews per (period, sentiment).
# observed=False is passed explicitly so every period in periodOrder appears on
# the axis even when it has no reviews, regardless of the pandas version's
# default for categorical groupers.
sentimentCounts = (
    todSentPlot
    .groupby(['day_period', 'sent_simple'], observed=False)
    .size()
    .unstack(fill_value=0)
)

# Plot
p1 = sentimentCounts.plot(kind='bar', figsize=(10, 6));

plt.title('Sentiment Counts by Period of Day')
plt.xlabel('Period of Day')
plt.ylabel('Review Count by Sentiments')
plt.xticks(rotation=0)  # Set x-axis labels to horizontal
plt.legend(title='Sentiment', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

***
Bar chart only looking at neutral and negative reviews:

In [None]:
# Independent copy of the reviews whose sentiment label is not 'Positive'
robloxDataNoPos = robloxData.loc[robloxData['sent_simple'].ne('Positive')].copy()

In [None]:
# Create a new dataframe for only the variables needed.
# .copy() makes this an independent frame; assigning a column to a bare
# column-subset of another dataframe triggers pandas' SettingWithCopyWarning.
todSentPlot = robloxDataNoPos[['review_timestamp', 'day_period', 'sent_simple']].copy()

# Ensure the day periods remain in a specific order
todSentPlot['day_period'] = pd.Categorical(todSentPlot['day_period'], categories=periodOrder, ordered=True)

# Count reviews per (period, sentiment).
# observed=False is passed explicitly so every period in periodOrder appears on
# the axis even when it has no reviews, regardless of the pandas version's
# default for categorical groupers.
sentimentCounts = (
    todSentPlot
    .groupby(['day_period', 'sent_simple'], observed=False)
    .size()
    .unstack(fill_value=0)
)

# Plot
p1 = sentimentCounts.plot(kind='bar', figsize=(8, 3));

plt.title('Sentiment Counts by Period of Day')
plt.xlabel('Period of Day')
plt.ylabel('Review Count by Sentiments')
plt.xticks(rotation=0)  # Set x-axis labels to horizontal
plt.legend(title='Sentiment', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

## 5. Attitudes toward app versions

In [None]:
# Keep only the first five characters of each app version string
robloxData['app_version_simple'] = robloxData['author_app_version'].str.slice(stop=5)

# Mean rating per truncated app version, rounded to two decimals
versionRating = (
    robloxData
    .groupby('app_version_simple')['review_rating']
    .mean()
    .round(2)
    .reset_index()
)

In [None]:
# Line graph of the average rating for each truncated app version
versionRatingGraph = versionRating.plot(
    x='app_version_simple',
    y='review_rating',
    title='Roblox Ratings Across Version Updates',
    xlabel='App Version (Truncated)',
    ylabel='Average App Rating',
    yticks=np.arange(1, 5.5, 0.5),
    figsize=(10, 3),
    grid=True,
);

## 6. Ratings over time

In [None]:
# Grouping by time and rating
timeRating = robloxData.groupby('review_timestamp')['review_rating'].mean().reset_index().round(2)

# Plot a line graph of the per-timestamp averages computed above.
# This plots timeRating itself: plotting versionRating here would just redraw
# the app-version data from the previous section and clobber versionRatingGraph.
timeRatingGraph = timeRating.plot(
    x='review_timestamp',
    y='review_rating',
    figsize=(10, 3),
    title='Roblox Ratings Over Time (Per Timestamp)',
    xlabel='Review Timestamp',
    ylabel='Average App Rating',
    grid=True
);

In [None]:
# Bucket every review into its calendar month
robloxData['year_month'] = robloxData['review_timestamp'].dt.to_period('M')

# Mean rating per month; the period labels are converted back to timestamps
# so matplotlib can place them on a date axis
monthly_avg_ratings = (
    robloxData
    .groupby('year_month')['review_rating']
    .mean()
    .reset_index()
    .assign(year_month=lambda frame: frame['year_month'].dt.to_timestamp())
)

# Plot the monthly averages (y-axis limits are left to matplotlib's autoscale)
plt.figure(figsize=(12, 6))
plt.plot(
    monthly_avg_ratings['year_month'],
    monthly_avg_ratings['review_rating'],
    marker='o',
)
plt.xlabel('Date')
plt.ylabel('Average Rating')
plt.title('Average App Rating Over Time (Monthly)')
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Bucket every review into its calendar day
robloxData['year_month_day'] = robloxData['review_timestamp'].dt.to_period('D')

# Mean rating per day; the period labels are converted back to timestamps
# so matplotlib can place them on a date axis
daily_avg_ratings = (
    robloxData
    .groupby('year_month_day')['review_rating']
    .mean()
    .reset_index()
    .assign(year_month_day=lambda frame: frame['year_month_day'].dt.to_timestamp())
)

# Plot the daily averages with the y-axis fixed to the 1-5 range
plt.figure(figsize=(12, 6))
plt.plot(
    daily_avg_ratings['year_month_day'],
    daily_avg_ratings['review_rating'],
)
plt.xlabel('Date')
plt.ylabel('Average Rating')
plt.title('Average App Rating Over Time (Daily)')
plt.ylim(1, 5)
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()