In [1]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from moztelemetry.dataset import Dataset

%matplotlib inline

### Basics

We will use the Dataset API to fetch data. Documentation can be found at: https://python-moztelemetry.readthedocs.io/en/stable/api.html#dataset

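The goal of this example is to check whether Nightly clients enrolled in the Normandy dummy pref-flip experiments actually report the corresponding `app.normandy.test.*` preference. Let's see how many parallel workers we have at our disposal: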

In [2]:
# `sc` is not created in this notebook; presumably it is the SparkContext
# injected by the hosting environment -- TODO confirm.
sc.defaultParallelism

32

We can look at the schema of the dataset we are interested in:

In [3]:
# List the dimensions of the 'telemetry' source; the .where() filters used
# further down (docType, submissionDate, appUpdateChannel) are among them.
Dataset.from_source('telemetry').schema

[u'submissionDate',
 u'sourceName',
 u'sourceVersion',
 u'docType',
 u'appName',
 u'appUpdateChannel',
 u'appVersion',
 u'appBuildId']

Let's create a Dataset of Telemetry submissions for a given submission date:

In [4]:
# Nightly-channel 'main' pings submitted on 2018-10-22.
# To narrow this down to a single build, add an `appBuildId` condition
# (it is one of the dimensions listed in the schema above).
pings_dataset = Dataset.from_source('telemetry').where(
    docType='main',
    submissionDate='20181022',
    appUpdateChannel="nightly"
)

Select only the properties we need and then take a 10% sample:

In [5]:
# Output field name -> dotted path of the value inside the ping.
# Most histogram entries pull only `.values`; `composite_time` pulls the
# whole histogram object and `frame_time_sum` pulls only `.sum`.
ping_fields = dict(
    buildId='application.buildId',
    content_paint='payload.processes.content.histograms.CONTENT_PAINT_TIME.values',
    tab_switch='payload.histograms.FX_TAB_SWITCH_TOTAL_E10S_MS.values',
    frame_time='payload.processes.gpu.histograms.CONTENT_FRAME_TIME.values',
    frame_time_sum='payload.processes.gpu.histograms.CONTENT_FRAME_TIME.sum',
    composite_time='payload.processes.gpu.histograms.COMPOSITE_TIME',
    checkerboardin='payload.processes.gpu.histograms.CHECKERBOARD_SEVERITY.values',
    experiments='environment.experiments',
    settings='environment.settings',
    osName='environment.system.os.name',
    gfx='environment.system.gfx'
)

# 'clientId' keeps its own name; everything else is aliased via ping_fields.
# sample=0.1 asks for roughly a tenth of the data; the library itself warns
# (see the cell output) that this sampling is file-based and prone to skew.
pings = (
    pings_dataset
    .select('clientId', **ping_fields)
    .records(sc, sample=0.1)
)

This 'sampling' is based on s3 files and is highly
susceptible to skew. Use only for quicker performance
while prototyping.
fetching 669.64267MB in 2231 files...


To prevent pseudoreplication, let's consider only a single submission for each client. As this step requires a distributed shuffle, it should always be run only after extracting the attributes of interest with *Dataset.select()*.

In [6]:
# Keep one ping per client: key every ping by its clientId, collapse each key
# to a single ping, then drop the key again.
# The reducer just returns its first argument, so no particular ping
# (e.g. the most recent one) is guaranteed to be the survivor.
subset = (
    pings
    .keyBy(lambda ping: ping['clientId'])
    .reduceByKey(lambda kept, _other: kept)
    .values()
)

Caching is fundamental as it allows for an iterative, real-time development workflow:

In [7]:
# Mark the deduplicated pings for caching so the cells below, which all start
# from `cached`, can reuse them instead of recomputing the pipeline above.
cached = subset.cache()

How many pings (one per client after the deduplication above) are we looking at?

In [8]:
# Number of pings left after keeping a single ping per clientId.
cached.count()

13821

In [9]:
# Peek at a single record to see what the selected fields look like.
cached.take(1)

[{'buildId': u'20181021220134',
  'checkerboardin': None,
  'clientId': u'27add80f-9421-49d0-b9b0-66544c7ab05c',
  'composite_time': {u'bucket_count': 50,
   u'histogram_type': 0,
   u'range': [1, 1000],
   u'sum': 279254,
   u'values': {u'0': 26052,
    u'1': 46311,
    u'10': 65,
    u'11': 42,
    u'12': 64,
    u'14': 73,
    u'16': 177,
    u'18': 66,
    u'2': 50486,
    u'20': 28,
    u'23': 13,
    u'26': 9,
    u'29': 21,
    u'3': 20132,
    u'33': 9,
    u'37': 8,
    u'4': 6925,
    u'42': 3,
    u'47': 4,
    u'5': 2905,
    u'53': 2,
    u'6': 1424,
    u'60': 2,
    u'67': 1,
    u'7': 837,
    u'75': 3,
    u'8': 426,
    u'84': 0,
    u'9': 129}},
  'content_paint': {u'0': 80526,
   u'1': 2176,
   u'10': 53,
   u'107': 1,
   u'11': 42,
   u'12': 70,
   u'120': 5,
   u'135': 3,
   u'14': 53,
   u'16': 37,
   u'171': 1,
   u'18': 37,
   u'192': 1,
   u'2': 529,
   u'20': 38,
   u'23': 30,
   u'26': 15,
   u'29': 12,
   u'3': 259,
   u'33': 6,
   u'37': 8,
   u'388': 1,
 

In [12]:
# Pings (one per client) whose `experiments` field is present, non-empty and
# lists the "no default" dummy pref-flip experiment.
experiment = cached.filter(
    lambda ping: "experiments" in ping
    and ping["experiments"]
    and "pref-flip-dummy-pref-no-default-1500230" in ping["experiments"]
)
experiment.count()

715

In [13]:
# How many of the enrolled pings do NOT report the
# 'app.normandy.test.without_default' pref in settings.userPrefs?
# (Presumably this is the pref the experiment flips -- verify.)
(
    experiment
    .filter(lambda ping: 'app.normandy.test.without_default' not in ping['settings']['userPrefs'])
    .count()
)

0

In [14]:
# Pings (one per client) whose `experiments` field is present, non-empty and
# lists the "default false" dummy pref-flip experiment.
# NOTE: this rebinds `experiment`; the cells below refer to this experiment.
experiment = cached.filter(
    lambda ping: "experiments" in ping
    and ping["experiments"]
    and "pref-flip-dummy-pref-default-false-1500230" in ping["experiments"]
)
experiment.count()

715

In [15]:
# How many of the enrolled pings do NOT report the
# 'app.normandy.test.with_false_default' pref in settings.userPrefs?
(
    experiment
    .filter(lambda ping: 'app.normandy.test.with_false_default' not in ping['settings']['userPrefs'])
    .count()
)

359

In [16]:
# Pings (one per client) whose `experiments` field is present, non-empty and
# lists the "default true" dummy pref-flip experiment.
# NOTE: this rebinds `experiment`; the cells below refer to this experiment.
experiment = cached.filter(
    lambda ping: "experiments" in ping
    and ping["experiments"]
    and "pref-flip-dummy-pref-default-true-1500230" in ping["experiments"]
)
experiment.count()

758

In [25]:
# Among enrolled pings that do NOT report the
# 'app.normandy.test.with_true_default' pref in settings.userPrefs,
# tally which experiment branch each one is in.
(
    experiment
    .filter(lambda ping: 'app.normandy.test.with_true_default' not in ping['settings']['userPrefs'])
    .map(lambda ping: ping["experiments"]["pref-flip-dummy-pref-default-true-1500230"]["branch"])
    .countByValue()
)

defaultdict(int, {u'true': 394})

In [19]:
# Show one enrolled ping that DOES report the
# 'app.normandy.test.with_true_default' pref in settings.userPrefs.
(
    experiment
    .filter(lambda ping: 'app.normandy.test.with_true_default' in ping['settings']['userPrefs'])
    .take(1)
)

[{'buildId': u'20181019100103',
  'checkerboardin': None,
  'clientId': u'83a1a744-b755-45bb-a667-4de10ccd699e',
  'composite_time': None,
  'content_paint': None,
  'experiments': {u'pref-flip-dummy-pref-default-true-1500230': {u'branch': u'false',
    u'type': u'normandy-exp'}},
  'frame_time': None,
  'frame_time_sum': None,
  'gfx': {u'ContentBackend': u'Skia',
   u'D2DEnabled': None,
   u'DWriteEnabled': None,
   u'adapters': [{u'GPUActive': True,
     u'RAM': None,
     u'description': u'Intel Open Source Technology Center -- Mesa DRI Intel(R) HD Graphics 530 (Skylake GT2) ',
     u'deviceID': u'Mesa DRI Intel(R) HD Graphics 530 (Skylake GT2) ',
     u'driver': None,
     u'driverDate': None,
     u'driverVersion': u'3.0 Mesa 18.2.2',
     u'subsysID': None,
     u'vendorID': u'Intel Open Source Technology Center'}],
   u'features': {u'compositor': u'basic',
    u'gpuProcess': {u'status': u'unused'},
    u'webrender': {u'status': u'unavailable'},
    u'wrQualified': {u'status': u