In [49]:
# Notebook-wide definitions.
#
# Every later cell relies on the names imported here (json, numpy, go, iplot),
# so this cell must be run first on a fresh kernel.

import json
import numpy

# NOTE(review): __version__, download_plotlyjs and plot are not referenced in
# the visible cells — confirm whether they are still needed.
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go

# Prepare the notebook for offline plotly rendering via iplot().
# NOTE(review): connected=True presumably loads plotly.js from the CDN rather
# than embedding it in the notebook — confirm against the plotly.offline docs.
init_notebook_mode(connected=True)

# Assessing Whether Reviews Are Equal

TODO: Explain context from lab notebook.

A user who has received a product for free has either had it gifted to them by a friend or a developer. It is therefore tough to discern whether they are a reviewer, a typical consumer, or somewhere in between. We eschew the notion of personal motivation by using pre-classified information, and looking solely at the wealth of data collected about the users' habits.

## R1

### Aim

To discern whether reviews left by users who have received the product for free are biased.

### Purpose

If there is a trend either way then this means that reviews cannot be treated as an atomically equal metric; this is problematic as there are so many of them, and may result in the need for a new score system to be created for the dataset (at least I have control of this). Simply factoring this information into graphical representations is not useful because the usefulness of its visibility relies on the personal assumptions of the viewer; this is what I wish to make concrete.

I believe one of three things will happen:

- Free reviews will favour products more positively (as they do not have the stigma of having spent money on the experience). They may also have a personal relationship with the developer.

- Free reviews will be more spread out and perhaps more considered because the person receiving a product for free may be a professional reviewer, community influencer, or at least somebody aware that the context of their purchase will be highlighted.

- There will be no trend due to the wide mix of people reviewing - perhaps most people leaving reviews on products that were received for free received them as gifts and therefore even though it was not their money, they still feel obliged to give an honest opinion.

### Investigation

// TODO: Talk about ignoring price of games that are not free - could do some basic investigation there.

Out of all 1,533,188 reviews:
- 1,162,276 (~75.81%) positive.
- 370,912 (~24.19%) negative.

Out of all reviews,
- 1,496,026 (~97.58%) were submitted by users who purchased the product themselves.
- 37,162 (~2.42%) were submitted by users who received the product for free.

Out of 1,496,026 purchased reviews:
- 1,133,417 positive (~75.76%).
- 362,609 negative (~24.24%).

The lack of change from the overall split is due to the comparatively small number of free reviews, which therefore requires further investigation.

Out of 37,162 free reviews:
- 28,859 (~77.66%) positive.
- 8,303 (~22.34%) negative.

This appears to suggest that free reviews are slightly more biased towards positive recommendations. In order to verify this, we need to look at their deviation from the ratio on the product for which they were left.

For every product that has **at least one free review**, we build an intermediate index in order to construct the same set of comparisons. In it, we **discount products that have no purchased reviews** (in which case they are free or anomalous).

In [64]:
review_demographic_count = json.load(open('./dumps/review-demographic-count.json'))

num_purchased_products_with_free_reviews = 0

percent_purchased_reviews_per_product = []

# This is the deviation of the free review ratio to the purchased ratio of the same product.
deviation_per_product = []
deviation_and_total_free_per_product = []

num_more_free_than_purchased = 0
num_positive_deviation = 0
num_negative_deviation = 0

for item in review_demographic_count:
    if item['is_free'] == True:
        continue
    
    num_purchased_products_with_free_reviews += 1
    
    if item['total_reviews_purchased'] < item['total_reviews_free']:
        num_more_free_than_purchased += 1
    
    # Compute the split for free and purchased.
    percent_purchased = item['total_reviews_purchased'] / item['total_reviews'] * 100
    percent_purchased_reviews_per_product.append(percent_purchased)
    
    # Compute the positive percentages for free and purchased.
    positive_percent_purchased = item['total_reviews_purchased_positive'] / item['total_reviews_purchased'] * 100
    positive_percent_free = item['total_reviews_free_positive'] / item['total_reviews_free'] * 100
    
    # Compute the difference between them.
    deviation = positive_percent_free - positive_percent_purchased
    deviation_per_product.append(deviation)
    
    if deviation > 0:
        num_positive_deviation += 1
    if deviation < 0:
        num_negative_deviation += 1
    
    deviation_and_total_free_per_product.append({
        'total_free': item['total_reviews_free'],
        'deviation': deviation
    })
    
average_percent_purchased_reviews_per_product = numpy.average(percent_purchased_reviews_per_product)
average_deviation_per_product = numpy.average(deviation_per_product)

print("Number of paid products with more free reviews than purchased reviews: " + str(num_more_free_than_purchased))
print("Number of paid products with free reviews: " + str(num_purchased_products_with_free_reviews))
print("Average percent of purchased reviews per product: " + str(average_percent_purchased_reviews_per_product))
print("Average deviation in percent of free positive reviews against purchased positive reviews: " + str(average_deviation_per_product))
print("Number of products for which this deviation is positive: " + str(num_positive_deviation))
print("Number of products for which this deviation is negative" + str(num_negative_deviation))

Number of paid products with more free reviews than purchased reviews: 6
Number of paid products with free reviews: 795
Average percent of purchased reviews per product: 91.898367361
Average deviation in percent of free positive reviews against purchased positive reviews: 4.30510289632
Number of products for which this deviation is positive: 570
Number of products for which this deviation is negative217


This is almost sound, except the deviation might be drastic if there are only a couple of free reviews per product (for example, 70% positive purchased with 1000 reviews versus 100% positive free with 1 review is unfair to compare).
We can graph this to get a better understanding of what is happening.

// TODO: A double bar graph which will inherently show the deviation.

In [53]:
# Order the products by how many free reviews they received. The list is
# sorted in place, so it stays sorted for anything that reads it afterwards.
deviation_and_total_free_per_product.sort(key=lambda entry: entry['total_free'])

# One marker per product: free review count against deviation.
x = [entry['total_free'] for entry in deviation_and_total_free_per_product]
y = [entry['deviation'] for entry in deviation_and_total_free_per_product]

layout = go.Layout(
    title='Deviation Of Free Reviews',
    xaxis={'title': "Total Free Reviews"},
    yaxis={'title': "Deviation Of Ratio From Purchased Reviews"},
    showlegend=False,
)

data = [go.Scatter(x=x, y=y, mode='markers')]

fig = go.Figure(data=data, layout=layout)

iplot(fig)

It appears that products with fewer free reviews have wilder deviation from the purchased percentage. I did not expect this disparity to continue past 10 reviews. This suggests that there is only a slight trend. This is particularly unexpected as there are few products for which there are more free reviews than purchased reviews.

However, there are significantly (353) more products for which the percentage of positive reviews is greater in the subset of those left for free, so although the deviation may not be drastic, it is reasonable to state that free reviews are more positive.

Finally, having observed this, we can also look at which products free reviews are left on, and therefore the cases to which the finding is most applicable - are they left chiefly on large, popular games, or on younger games (in order to promote them)?

There are several ways to do this - the most straightforward is to look at the ratio of free to purchased reviews per genre.

In [95]:
# Load the genre list; the context manager closes the file once it is parsed.
with open('./dumps/genres.json') as genres_file:
    genres = json.load(genres_file)

# One accumulator per genre, in the order the genres appear in the dump.
genres_counts = [
    {
        "genre": genre['genre'],
        "total_products": genre['count'],
        "total_reviews_free": 0,
        "total_reviews": 0
    }
    for genre in genres
]

# Index the accumulators by genre name so each product genre is a dictionary
# lookup instead of a scan over every genre. setdefault keeps the first
# accumulator if a name is repeated, which is the one a first-match scan would
# have updated.
# NOTE(review): assumes genre names are hashable (presumably strings) — confirm
# against the dump.
genre_count_by_name = {}
for genre_count in genres_counts:
    genre_count_by_name.setdefault(genre_count['genre'], genre_count)

# Add every product's review totals to each genre it is tagged with. Genres
# that are not in the genre list are ignored.
for product in review_demographic_count:
    for genre_name in product['genres']:
        genre_count = genre_count_by_name.get(genre_name)
        if genre_count is None:
            continue
        genre_count['total_reviews_free'] += product['total_reviews_free']
        genre_count['total_reviews'] += product['total_reviews']

# Free reviews as a percentage of all reviews; genres without reviews get 0.
for genre_count in genres_counts:
    if genre_count['total_reviews'] == 0:
        genre_count['percentage'] = 0
    else:
        genre_count['percentage'] = genre_count['total_reviews_free'] / genre_count['total_reviews'] * 100

# Highest share of free reviews first.
genres_counts.sort(key=lambda genre_count: genre_count['percentage'], reverse=True)

x = [genre_count['genre'] for genre_count in genres_counts]
y = [genre_count['percentage'] for genre_count in genres_counts]
# Shown on hover so the reader can judge the sample size behind each bar.
names = ["Total Reviews: " + str(genre_count['total_reviews']) for genre_count in genres_counts]

layout = go.Layout(
    title='Free Reviews Per Genre',
    xaxis=dict(
        title='Genre'
    ),
    yaxis=dict(
        title='Percent Reviews Free'
    ),
    showlegend=False
)

data = [go.Bar(
    x=x,
    y=y,
    hovertext=names
)]

fig = go.Figure(data=data, layout=layout)

iplot(fig)

As we can see, software and casual games receive the most free/gifted reviews as a proportion of all their reviews. This is somewhat influenced by the number of reviews per genre, which can be seen on hover.

### Results

- On average, ~8% of reviews per product are submitted by users who have received the product for free.
- For ~71.7% of paid products with free reviews (570 of 795), the percentage of positive reviews is higher among free reviews than among purchased reviews.
- On average, the percentage of positive reviews on a product is ~4.3 percentage points higher among free reviews than among purchased reviews.
- Proportionally, more free reviews are submitted for software products and casual games.