In [1]:
# Notebook-wide definitions.

import time
import json
import numpy
import pandas
from fbprophet import Prophet

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go

init_notebook_mode(connected=True)

# Exploring Relationships Between CCU and Other Metrics

Explain the context here...

## C2

### Aim

All products churn over time.

### Purpose

To verify assumptions to use in further investigation.

### Investigation

Looking at the start and end CCU would not account for seasonality. Most products start with 0, so to compare that with 1 would not be indicative of churn. What we can do is compare the final value with the median, and furthermore compare the means of chunks of time in CCU (compressing it).

Products that don't exhibit this may either be successful, or simply young enough to still be in their initial discovery period of exposure.

## C2

### Aim

Products that receive more updates have higher daily CCU.

### Purpose

To see if there is a link between updates and CCU and perhaps spur further investigation - for example, do updates prevent churn?

### Investigation

Having pre-computed the update frequency of products we can chart this against a measurement of the overall CCU of a product; mean is appropriate here as many products have no CCU tracked (aside from the developers themselves) until they gain some exposure. 

In [2]:
ccu = json.load(open('./dumps/ccu.json'))
update_frequencies = json.load(open('./dumps/update-frequencies.json'))

# Do some initial exploration.
max_ccus = [i['max_ccu'] for i in ccu]

print("Median of max CCU for all products: " + str(numpy.median(max_ccus)))
print("Mean of max CCU for all products: " + str(numpy.mean(max_ccus)))
print("Max of max CCU for all products: " + str(numpy.amax(max_ccus)))

bins = numpy.linspace(0, 18, 9)
histogram = numpy.histogram([numpy.log2(i) for i in max_ccus], bins)

layout = go.Layout(
    title='Product Popularity',
    xaxis=dict(
        title="Max CCU Log2"
    ),
    yaxis=dict(
        title="Number Products Per Bucket"
    ),
    showlegend=False
)

data = [go.Bar(
    x=histogram[1],
    y=histogram[0]
)]

fig = go.Figure(data=data, layout=layout)

iplot(fig)

x = []
y = []

for i in ccu:
    for j in update_frequencies:
        if i['product_id'] == j['product_id']:
            if i['max_ccu'] > 1185:
                x.append(j['update_frequency_per_four_weeks'])
                y.append(numpy.log2(i['stddev_ccu']))
            break

layout = go.Layout(
    title='Developer Engagement Against CCU',
    xaxis=dict(
        title="Update Frequency"
    ),
    yaxis=dict(
        title="Standard Deviation of CCU Log2"
    ),
    showlegend=False
)

data = [go.Scatter(
    x=x,
    y=y,
    mode = 'markers'
)]

fig = go.Figure(data=data, layout=layout)

iplot(fig)

# TODO: Linear regression.

Median of max CCU for all products: 54.0
Mean of max CCU for all products: 1184.50051177
Max of max CCU for all products: 151239


### Results

It is clear that most products remain relatively unseen. The median for max CCU is 54, while the mean for max CCU is 1185.

There is a lot of noise. This is reduced by using logarithm to remove outliers, and furthermore looking at products above the mean CCU (and ergo ones that are more popular). In doing so is a slight trend towards higher CCU for popular products with more updates versus popular products with less updates.

Linear regression is shown used to predcit this trend.

## C2

### Aim

Games receive a 'spike' in CCU after an update.

### Purpose

To see if there is a link between updates and CCU following, and perhaps spur further investigation.

### Investigation

First we start granular to find positive cases. There are several approaches to verifying this.

- Graph CCU.
- Investigate effect of seasonality.
- For every update, look at the change around itself.
- For every change point, look at the the proximity of updates.

Product '221100', or 'Day Z' represents the most data in the set. It is appropriate for use here because:

- It is the kind of game, genre-wise, that fosters updates as an incentive for use. An MMO, and ergo living game, with an slowly growing array of tangible content and features.
- The wealth of data means there are lots of examples to choose from.
- There is a [community curated list](https://dayz.gamepedia.com/Changelog) of updates that were meaningful. While this is not complete, it is enough to piece together a full list of actual game content changes.
- The game has not participated in any large promotions, which will reduce the influence of external factors.

In [75]:
def create_timeseries_figure(timeseries_data, product_id, ccu_override=None, minima_x=[], minima_y=[]):
    x = [time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(item['day_start_time'])) for item in timeseries_data]
    y1 = []
    if ccu_override is None:
        y1 = [item['peak_ccu'] for item in timeseries_data]
    else: 
        y1 = ccu_override
    y2 = [item['total_updates'] for item in timeseries_data]
    
    data = [
        go.Scatter(
            x=x,
            y=y1,
            mode='line',
            name="Peak CCU"
        ),
        go.Bar(
            x=x,
            y=y2,
            name="Total Updates",
            xaxis='x',
            yaxis='y2' 
        )
    ]
    
    if(len(minima_x) > 0):
        data.append(
            go.Scatter(
                x=minima_x,
                y=minima_y,
                mode='markers',
                name='Local Minima',
                xaxis='x',
                yaxis='y'
            )
        )

    layout = go.Layout(
        title='CCU Timeseries ' + product_id,
        xaxis=dict(
            title="Day Start Time",
            autorange=True,
        ),
        yaxis=dict(
            title="Peak CCU"
        ),
        yaxis2=dict(
            title="Total Updates",
            overlaying='y',
            side='right',
            anchor='x'
        ),
        showlegend=True
    )

    fig = go.Figure(data=data, layout=layout)
    
    return fig

timeseries_221100 = json.load(open('./dumps/timeseries/221100-timeseries.json'))
iplot(create_timeseries_figure(timeseries_221100, "221100"))

Clearly not all updates have a direct or immediate impact on CCU. There is a seasonality that distorts this perception, however some clearly coincide with downward trends. There are specific problems to note with the data itself:

- Some CCU seems to have been approximated. There is not much that can be done here for the context of the entire dataset without a significant amount of effort, however the systems were not expected to have been 100% accurate.
- Some updates misalign with CCU spikes by a day in either direction. When investigating some of these manually, the CCU spike would seem to be a result of them. I would chalk this up to anticipation, or updates coming towards the end of local maxima.

There are also some general problems...

- Players may not check out an update on the same day of release - they may wait until it is convenient to them.
- Most games see user churn over time post-release, in both CCU and user sentiment.
- CCU may change due to external factors, however developers with strong communication will usually relay these factors via an update (during a sale, for example).
- As observed when examining sentiment, there is a period of uncertainty at release.

We should therefore remove the updates that were not modifications to game content, and attempt to remove seaasonality.

We first look at one example of a successful update, on December 10th, 2015. The signal is relatively unchanging despite updates, until this day. Looking at the ['patch notes' for this day](http://www.dayztv.com/news/2015/12/dayz-0-59-update-features-items-list/), we can see that this was indeed a substantial update to the game content - new vehicles, items, and a map for players. These are the kinds of things we want to capture.

In [78]:
timeseries_221100_alternative = json.load(open('./dumps/timeseries/221100-timeseries-alternative.json'))

# Get the same CCU.
y = [item['peak_ccu'] for item in timeseries_221100_alternative]

iplot(create_timeseries_figure(timeseries_221100_alternative, "221100 Cleaned", y))

We can now note a few things:

- Periods without updates see steep drops in CCU.
- Although not all updates appear to have the same effect, almost every update prior to 2017 (after there was a large gap in development) is proceeded by an upswing in CCU.
- Every substantial increase in CCU coincides with an update.
- Similar to sentiment, the impact of engagement on CCU wanes as the player base drops (over time).

Updates that follow others incredibly closely are likely bug-fixes to the previous content release, however we will not remove them here.

Before performing change-point analysis, we can clean up the data representation, making it stationary and charting local minima. The seasonality seems to be weekly, with peaks happening 7 days apart. It needs to be using a sliding average to emphasise change points. This will not account for things like holidays, but will paint a clearer picture.

In [79]:
from scipy.signal import argrelmin

x = [time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(item['day_start_time'])) for item in timeseries_221100_alternative]

# Remove the seasonality.
y = pandas.Series(y).rolling(window=25,center=False).mean()
y = pandas.Series(y).rolling(window=18,center=False).mean()
y = y.tolist()

minima = argrelmin(numpy.array(y), order=5)[0]

minima_x = []
minima_y = []

for m in minima:
    minima_x.append(x[m])
    minima_y.append(y[m])
            
iplot(create_timeseries_figure(timeseries_221100_alternative, "221100 Minima", y, minima_x, minima_y))


invalid value encountered in less


invalid value encountered in less

