In [7]:
# Notebook-wide definitions.

import json
import numpy

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go

# Initialise plotly's notebook mode once, before any cell calls iplot().
# NOTE(review): connected=True presumably pulls plotly.js from the network rather
# than embedding it in the notebook — confirm if this must render offline.
init_notebook_mode(connected=True)

# Exploring Relationships Between Sentiment And Updates

This is a grouping of explorations that focus on the role user score plays in the system - seeking to understand what it is influenced by, and any relationships it has with other metrics.

## 1

### Aim

Products that receive more updates also receive higher ratings.

### Purpose

This is based on the assumption that in an update, a developer will attempt to expand and improve the experience of the user or increase positive engagement, and therefore there should be a positive correlation between update and score metrics. If this holds true, updates have an obvious impact that can be further explored; however, the result may be skewed by products that happened to be well received upon release. Furthermore, it will not account for opinion that has waned over time.

### Investigation

I will first look at how many games are in each score band, as this may help explain some further results.

I will then take the number of updates and the user score for each product and present them in a scatter plot. I believe there will be an aggregation of points to the top-right; that is, the games that have been updated the most will also have the higher ratings.

In [8]:
# Bar chart: how many games fall into each user-score band.

# Read the dump through a context manager so the file handle is closed as soon
# as parsing finishes (the bare `json.load(open(...))` form leaves it open
# until the garbage collector gets to it).
with open('./dumps/user-score-distribution.json') as dump_file:
    user_score_distribution = json.load(dump_file)

# One bar per score band.
x = [item['score'] for item in user_score_distribution]
# Bar height is the number of entries stored under 'review_count' for the band.
# NOTE(review): presumably one entry per game — confirm against the dump schema.
y = [len(item['review_count']) for item in user_score_distribution]

layout = go.Layout(
    title='Number Of Games Per User Score',
    xaxis=dict(
        title="User Score"
    ),
    yaxis=dict(
        title="Total Games"
    ),
    showlegend=False
)

data = [go.Bar(
    x=x,
    y=y
)]

fig = go.Figure(data=data, layout=layout)

iplot(fig)

In [9]:
# Bar chart: spread (standard deviation) of update counts within each score band.

# Context manager closes the dump's file handle deterministically.
with open('./dumps/user-score-update-distribution.json') as dump_file:
    user_score_update_distribution = json.load(dump_file)

x = []
y = []

for item in user_score_update_distribution:
    update_counts = item['update_counts']
    x.append(item['score'])
    # numpy.std of an empty sequence returns NaN *and* emits the
    # "Degrees of freedom <= 0" / "invalid value encountered" RuntimeWarnings.
    # Score bands with no products are therefore short-circuited to NaN, which
    # plots the same way but keeps the cell output clean.
    if len(update_counts) > 0:
        y.append(numpy.std(update_counts))
    else:
        y.append(float('nan'))

layout = go.Layout(
    title='Standard Deviation of Total Updates Per User Score',
    hovermode='closest',
    xaxis=dict(
        title="User Score"
    ),
    yaxis=dict(
        title="Standard Deviation"
    ),
    showlegend=False
)

data = [go.Bar(
    x=x,
    y=y
)]

fig = go.Figure(data=data, layout=layout)

iplot(fig)


Degrees of freedom <= 0 for slice


invalid value encountered in true_divide


invalid value encountered in double_scalars



In [10]:
# Scatter plot: number of updates against user score, one point per product.

# Context manager closes the dump's file handle deterministically.
with open('./dumps/user-score-update-count.json') as dump_file:
    user_score_update_count = json.load(dump_file)

x = [item['user_score'] for item in user_score_update_count]
y = [item['update_count'] for item in user_score_update_count]

layout = go.Layout(
    title='Updates and User Scores',
    hovermode='closest',
    xaxis=dict(
        title="User Score"
    ),
    yaxis=dict(
        title="No. Updates"
    ),
    showlegend=False
)

data = [go.Scatter(
    x=x,
    y=y,
    mode='markers'
)]

fig = go.Figure(data=data, layout=layout)

iplot(fig)

### Results

The pattern is much weaker than I was expecting; however, it is still visible: **all games that have more than 124 updates have a user score higher than 50** (and ergo more than half of the users like it). I did not expect there to still be a solid representation of games with so few updates within each score band; this leads me to suspect that some of these have fewer reviews behind them, making the user score less useful.

I want to grade the points based on the number of reviews that informed their score, and see how this differs when using 'true score'.

## 2

### Aim

Products that receive more updates also receive higher ratings - graded by review count, and compared with their score rank.

### Purpose

To see if the trend is more apparent in products whose user score is based on more opinions (and is therefore more trustworthy).

### Investigation

I use an additional color dimension that is relative to a logarithm of the number of reviews, and further reduce the opacity of points that fall below a review-count threshold. Then, I plot the same chart using score rank instead.

In [11]:
# Two scatter plots of update count — against user score and against score
# rank — with each point coloured by log2(review count) and faded out when the
# product has few reviews.

# Points whose log2(review_count) is below this value (i.e. fewer than
# 2**10 = 1024 reviews) are drawn almost transparent.
REVIEW_LOG_OPACITY_THRESHOLD = 10

# Context manager closes the dump's file handle deterministically.
with open('./dumps/user-score-update-count.json') as dump_file:
    user_score_update_count = json.load(dump_file)

x = []
x2 = []
y = []
colors = []
opacities = []
tags = []

for item in user_score_update_count:
    x.append(item['user_score'])
    x2.append(item['score_rank'])
    y.append(item['update_count'])
    # Hover text shown for each point.
    tags.append("Product Id: {0}, Total Reviews: {1}".format(
            item['product_id'],
            item['review_count']
        )
    )
    # NOTE(review): a review_count of 0 would yield -inf (with a RuntimeWarning)
    # here — confirm the dump never contains products without reviews.
    review_log = numpy.log2(item['review_count'])
    colors.append(review_log)
    opacities.append(0.1 if review_log < REVIEW_LOG_OPACITY_THRESHOLD else 1.0)


def _updates_layout(title, x_title):
    """Return the shared layout: update count on the y axis, `x_title` on the x axis."""
    return go.Layout(
        title=title,
        hovermode='closest',
        xaxis=dict(
            title=x_title
        ),
        yaxis=dict(
            title="No. Updates"
        ),
        showlegend=False
    )


def _review_graded_scatter(x_values):
    """Return a one-trace list plotting `x_values` against the update counts.

    Marker colour, opacity and hover text come from the per-product lists
    built in the loop above, so both charts grade their points identically.
    """
    return [go.Scatter(
        x=x_values,
        y=y,
        mode='markers',
        marker=dict(
            color=colors,
            opacity=opacities,
            colorscale='Viridis'
        ),
        text=tags
    )]


layout = _updates_layout('Updates and User Scores', "User Score")
layout2 = _updates_layout('Updates and Score Rank', "Score Rank")

data = _review_graded_scatter(x)
data2 = _review_graded_scatter(x2)

fig = go.Figure(data=data, layout=layout)
fig2 = go.Figure(data=data2, layout=layout2)

iplot(fig)
iplot(fig2)

### Results

The pattern is stronger when looking at user score, and suggests that products that are updated more often do receive more positive recommendations. Conversely, unsuccessful products rarely reach more than 100 updates. However, since the standard deviation generally appears to increase as the score increases, this suggests that products do not need to rely on updates to be successful.

When the products are placed in the context of the entire platform (i.e., ranked based on user score relative to all user scores), there are no meaningful patterns.

## 3

### Aim



### Purpose



### Investigation

