In [1]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from moztelemetry.dataset import Dataset

%matplotlib inline

### Basics

We will use the Dataset API to fetch data. Documentation can be found at https://python-moztelemetry.readthedocs.io/en/stable/api.html#dataset.

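The goal of this notebook is to look at Nightly clients enrolled in the WebRender pref-flip experiment (`prefflip-webrender-v1-1-1474484`): how they split across experiment branches, compositors and WebRender feature statuses. Let's see how many parallel workers we have at our disposal: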

In [2]:
# `sc` is not defined in this notebook; it is presumably the SparkContext
# injected by the notebook environment -- confirm before running elsewhere.
sc.defaultParallelism

32

We can look at the schema of the dataset we are interested in:

In [3]:
# Dimensions of the 'telemetry' source; the .where() filters used later in
# this notebook are keyed on names from this list.
Dataset.from_source('telemetry').schema

[u'submissionDate',
 u'sourceName',
 u'sourceVersion',
 u'docType',
 u'appName',
 u'appUpdateChannel',
 u'appVersion',
 u'appBuildId']

Let's create a Dataset of Telemetry submissions for a given submission date:

In [4]:
# Restrict the 'telemetry' source to main pings submitted on 2018-08-08 from
# the nightly channel.  To pin a single build as well, add e.g.
# appBuildId='20180721100146' to the filter below.
pings_dataset = Dataset.from_source('telemetry').where(
    docType='main',
    submissionDate='20180808',
    appUpdateChannel="nightly",
)

Select only the properties we need and then take a 10% sample:

In [5]:
# Alias -> path inside the ping for every property pulled out of each ping.
selected_properties = {
    'buildId': 'application.buildId',
    'content_paint': 'payload.processes.content.histograms.CONTENT_PAINT_TIME.values',
    'tab_switch': 'payload.histograms.FX_TAB_SWITCH_TOTAL_E10S_MS.values',
    'frame_time': 'payload.processes.gpu.histograms.CONTENT_FRAME_TIME.values',
    'frame_time_sum': 'payload.processes.gpu.histograms.CONTENT_FRAME_TIME.sum',
    'composite_time': 'payload.processes.gpu.histograms.COMPOSITE_TIME',
    'checkerboardin': 'payload.processes.gpu.histograms.CHECKERBOARD_SEVERITY.values',
    'experiments': 'environment.experiments',
    'osName': 'environment.system.os.name',
    'gfx': 'environment.system.gfx',
}

# 'clientId' is selected under its own name; everything else is aliased.
selection = pings_dataset.select('clientId', **selected_properties)

# Fetch a 10% sample of the matching records as an RDD.
pings = selection.records(sc, sample=0.1)

This 'sampling' is based on s3 files and is highly
susceptible to skew. Use only for quicker performance
while prototyping.
fetching 633.07509MB in 2478 files...


In [6]:
# We add two extra steps. The first rewrites the ping to have some
# information more easily accessible (like the primary adapter),
# and the second step removes any pings that don't have adapter
# information.
def rewrite_ping(p):
    """Expose the primary graphics adapter of a ping as ``p['adapter']``.

    Args:
        p: ping dict; its optional 'gfx' entry is expected to be a dict
           with an 'adapters' list.

    Returns:
        The same dict, with an added 'adapter' key holding the first
        entry of ``p['gfx']['adapters']``, or None when the ping has no
        'gfx' section or its adapter list is missing or empty.
    """
    # 'gfx' can be absent or None.  Fall back to an empty dict so such
    # pings are dropped like any other ping without adapters, instead of
    # raising AttributeError on ``None.get``.
    gfx = p.get('gfx') or {}
    adapters = gfx.get('adapters')
    if not adapters:
        return None

    # The first adapter in the list is treated as the primary one.
    p['adapter'] = adapters[0]
    return p

def filter_ping(p):
    """Return True only for pings that carry an 'adapter' entry.

    rewrite_ping returns None for pings without adapter information, so
    None has to be rejected explicitly: ``'adapter' in None`` would raise
    TypeError instead of filtering the record out.
    """
    return p is not None and 'adapter' in p
# Rewrite every ping, keep the ones that have adapter information, and cache
# the result so later cells do not recompute it.
rpings = pings.map(rewrite_ping).filter(filter_ping).cache()
rpings.count()

18824

To prevent pseudoreplication, let's consider only a single submission for each client. Because this step requires a distributed shuffle, it should only be run after extracting the attributes of interest with *Dataset.select()*.

In [7]:
# Keep one ping per client: key each ping by its clientId, collapse every
# key to one of its pings, then drop the key again.
pings_by_client = rpings.keyBy(lambda ping: ping['clientId'])
one_ping_per_client = pings_by_client.reduceByKey(lambda kept, _other: kept)
subset = one_ping_per_client.values()

Caching is fundamental as it allows for an iterative, real-time development workflow:

In [8]:
# Cache the one-ping-per-client RDD so the cells below can reuse it.
cached = subset.cache()

How many pings are we looking at?

In [9]:
# Number of pings left after keeping a single ping per clientId.
cached.count()

13730

In [10]:
def reports_wr_qualified(p):
    """True when the ping's gfx section has a 'features' entry containing 'wrQualified'."""
    return "features" in p["gfx"] and "wrQualified" in p["gfx"]["features"]

# From here on `cached` only holds pings that report the wrQualified feature.
cached = cached.filter(reports_wr_qualified)
cached.count()

12009

In [11]:
def in_wr_experiment(p):
    """True when the ping lists the WebRender pref-flip experiment among its experiments."""
    experiments = p.get("experiments")
    return bool(experiments) and "prefflip-webrender-v1-1-1474484" in experiments

wrExperiment = cached.filter(in_wr_experiment)

# Compositor in use, counted over all pings enrolled in the experiment.
(
    wrExperiment
    .map(lambda p: p["gfx"]["features"]["compositor"])
    .countByValue()
)

defaultdict(int, {u'basic': 116, u'd3d11': 2432, u'webrender': 640})

In [12]:
def experiment_branch(p):
    """Return the branch of the WebRender pref-flip experiment recorded in the ping."""
    return p["experiments"]["prefflip-webrender-v1-1-1474484"]["branch"]

# Number of enrolled pings per experiment branch.
wrExperiment.map(experiment_branch).countByValue()

defaultdict(int, {u'control': 1600, u'treatment': 1588})

In [13]:
# Number of enrolled pings per build id.
(
    wrExperiment
    .map(lambda p: p['buildId'])
    .countByValue()
)

defaultdict(int,
            {u'20180725103029': 6,
             u'20180725220116': 2,
             u'20180726001822': 5,
             u'20180726100339': 6,
             u'20180726220124': 5,
             u'20180727103347': 11,
             u'20180727231224': 3,
             u'20180728101501': 10,
             u'20180728220145': 4,
             u'20180729100102': 8,
             u'20180729220222': 4,
             u'20180730100211': 9,
             u'20180730221422': 10,
             u'20180731105217': 18,
             u'20180731220208': 5,
             u'20180801100116': 18,
             u'20180801223951': 16,
             u'20180802100128': 26,
             u'20180802220056': 35,
             u'20180803104322': 41,
             u'20180803220259': 14,
             u'20180804124335': 28,
             u'20180804220307': 28,
             u'20180805100054': 61,
             u'20180805231147': 74,
             u'20180806100140': 241,
             u'20180806220216': 221,
             u'20180

In [14]:
def in_treatment_branch(p):
    """True when the ping is enrolled in the 'treatment' branch of the experiment."""
    return p["experiments"]["prefflip-webrender-v1-1-1474484"]["branch"] == "treatment"

def wr_qualified_available(p):
    """True when the ping reports wrQualified with status 'available'."""
    return p["gfx"]["features"]["wrQualified"]["status"] == "available"

treatment = wrExperiment.filter(in_treatment_branch)
qTreatment = treatment.filter(wr_qualified_available)

# Compositor in use among qualified treatment-branch pings.
(
    qTreatment
    .map(lambda p: p["gfx"]["features"]["compositor"])
    .countByValue()
)

defaultdict(int, {u'basic': 23, u'd3d11': 395, u'webrender': 600})

In [19]:
# WebRender feature status among qualified treatment pings that report the
# 'webrender' feature at all.
(
    qTreatment
    .filter(lambda p: "webrender" in p["gfx"]["features"])
    .map(lambda p: p["gfx"]["features"]["webrender"]["status"])
    .countByValue()
)

defaultdict(int, {u'available': 589, u'opt-in': 367, u'unavailable': 35})

In [20]:
def d3d11_with_webrender_feature(p):
    """True when the ping's compositor is 'd3d11' and it reports a 'webrender' feature."""
    features = p["gfx"]["features"]
    return features["compositor"] == 'd3d11' and 'webrender' in features

# Qualified treatment pings whose compositor is d3d11 rather than WebRender.
wrt = qTreatment.filter(d3d11_with_webrender_feature)

# WebRender feature status reported by those pings.
(
    wrt
    .map(lambda p: p["gfx"]["features"]["webrender"]["status"])
    .countByValue()
)

defaultdict(int, {u'opt-in': 413, u'unavailable': 11})