In [1]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from moztelemetry.dataset import Dataset

%matplotlib inline

### Basics

We will use the Dataset API to fetch data.  Documentation can be found at: https://python-moztelemetry.readthedocs.io/en/stable/api.html#dataset

The goal of this example is to plot the startup distribution for each OS. Let's see how many parallel workers we have at our disposal:

In [2]:
sc.defaultParallelism

32

We can look at the schema of the dataset we are interested in:

In [3]:
Dataset.from_source('telemetry').schema

[u'submissionDate',
 u'sourceName',
 u'sourceVersion',
 u'docType',
 u'appName',
 u'appUpdateChannel',
 u'appVersion',
 u'appBuildId']

Let's create a Dataset of Telemetry submissions for a given submission date:

In [32]:
pings_dataset = (
    Dataset.from_source('telemetry')
    .where(docType='main')
    #.where(appBuildId='20180721100146')
    .where(submissionDate='20180820')
    .where(appUpdateChannel="release")
)

Select only the properties we need and then take a 10% sample:

In [33]:
pings = (
    pings_dataset
    .select(
        'clientId',
        buildId='application.buildId',
        experiments='environment.experiments',
        os='environment.system.os',
        gfx='environment.system.gfx')
    .records(sc, sample=0.01)
)

This 'sampling' is based on s3 files and is highly
susceptible to skew. Use only for quicker performance
while prototyping.
fetching 8878.45489MB in 1672 files...


In [34]:
#pings = (
#    pings_dataset
#    .records(sc, sample=0.01)
#)
#pings.take(1)

In [35]:
pings.count()

2272516

In [36]:
pings.take(4)

[{'buildId': u'20180807170231',
  'checkerboardin': None,
  'clientId': u'313521b7-7dd8-4a2f-86aa-c80b65bfcb34',
  'composite_time': None,
  'content_paint': None,
  'content_paint_phase': None,
  'experiments': {u'hotfix-http-throttling-v2-bug-1462906': {u'branch': u'hotfix',
    u'type': u'normandy-exp'},
   u'rollout-release-61-tls-fallback-1-3': {u'branch': u'active',
    u'type': u'normandy-prefrollout'}},
  'frame_time': None,
  'frame_time_sum': None,
  'gfx': {u'ContentBackend': u'Skia',
   u'D2DEnabled': False,
   u'DWriteEnabled': True,
   u'adapters': [{u'GPUActive': True,
     u'RAM': 1024,
     u'description': u'AMD Radeon HD 6700 Series',
     u'deviceID': u'0x673e',
     u'driver': u'aticfx64 aticfx64 aticfx64 aticfx32 aticfx32 aticfx32 atiumd64 atidxx64 atidxx64 atiumdag atidxx32 atidxx32 atiumdva atiumd6a atitmm64',
     u'driverDate': u'2-26-2016',
     u'driverVersion': u'15.301.1901.0',
     u'subsysID': u'23101787',
     u'vendorID': u'0x1002'}],
   u'features': {u

In [37]:
# We add two extra steps. The first rewrites the ping to have some
# information more easily accessible (like the primary adapter),
# and the second step removes any pings that don't have adapter
# information.
def rewrite_ping(p):
    adapters = p.get('gfx', None).get('adapters', None)
    if not adapters:
        return None
    adapter = adapters[0]
            
    p['adapter'] = adapter
            
    # Convert the version to a tuple of integers.
    #if 'driverVersion' in adapter:
    #    p['driverVersion'] = [int(n) for n in adapter['driverVersion'].split('.') if n.isdigit()]
    return p

def filter_ping(p):
    return 'adapter' in p
rpings = pings.map(rewrite_ping).filter(filter_ping)
rpings = rpings.cache()
rpings.count()

2272516

To prevent pseudoreplication, let's consider only a single submission for each client. As this step requires a distributed shuffle, it should always be run only after extracting the attributes of interest with *Dataset.select()*.

In [38]:
subset = (
    rpings
    .map(lambda p: (p['clientId'], p))
    .reduceByKey(lambda p1, p2: p1)
    .map(lambda p: p[1])
)

Caching is fundamental as it allows for an iterative, real-time development workflow:

In [59]:
cached = subset.cache()

How many pings are we looking at?

In [60]:
cached.count()

2118920

In [61]:
wrQualified = cached.filter(lambda p: "features" in p["gfx"])
wrQualified = wrQualified.filter(lambda p: "wrQualified" in p["gfx"]["features"])
wrQualified.count()

2

In [62]:
wrQualified.map(lambda p: p["gfx"]["features"]["wrQualified"]["status"]).countByValue()

defaultdict(int, {u'blocked': 2})

In [63]:
wrQualified.map(lambda p: p['adapter']['vendorID']).countByValue()

defaultdict(int, {u'0x8086': 1, u'X.Org': 1})

In [64]:
wrQualified.map(lambda p: p["gfx"]["features"]["wrQualified"]["status"]).countByValue()

defaultdict(int, {u'blocked': 2})

In [65]:
wrQualified.map(lambda p: p["os"]["name"]).countByValue()

defaultdict(int, {u'Linux': 1, u'Windows_NT': 1})

In [66]:
wrAvailable = cached.filter(lambda p: p["gfx"]["features"]["wrQualified"]["status"] == "available" )
wrAvailable.count()

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 18 in stage 101.0 failed 4 times, most recent failure: Lost task 18.3 in stage 101.0 (TID 3534, ip-172-31-4-232.us-west-2.compute.internal, executor 1): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/lib/spark/python/pyspark/worker.py", line 177, in main
    process()
  File "/usr/lib/spark/python/pyspark/worker.py", line 172, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/lib/spark/python/pyspark/rdd.py", line 2423, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/lib/spark/python/pyspark/rdd.py", line 2423, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/lib/spark/python/pyspark/rdd.py", line 2423, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/lib/spark/python/pyspark/rdd.py", line 346, in func
    return f(iterator)
  File "/usr/lib/spark/python/pyspark/rdd.py", line 1041, in <lambda>
    return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
  File "/usr/lib/spark/python/pyspark/rdd.py", line 1041, in <genexpr>
    return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
  File "<ipython-input-66-f8052b31a96f>", line 1, in <lambda>
KeyError: 'wrQualified'

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1708)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1696)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1695)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1695)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:855)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:855)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:855)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1923)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1867)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:671)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2029)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2050)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2069)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2094)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:935)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:467)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.GeneratedMethodAccessor122.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/lib/spark/python/pyspark/worker.py", line 177, in main
    process()
  File "/usr/lib/spark/python/pyspark/worker.py", line 172, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/lib/spark/python/pyspark/rdd.py", line 2423, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/lib/spark/python/pyspark/rdd.py", line 2423, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/lib/spark/python/pyspark/rdd.py", line 2423, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/lib/spark/python/pyspark/rdd.py", line 346, in func
    return f(iterator)
  File "/usr/lib/spark/python/pyspark/rdd.py", line 1041, in <lambda>
    return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
  File "/usr/lib/spark/python/pyspark/rdd.py", line 1041, in <genexpr>
    return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
  File "<ipython-input-66-f8052b31a96f>", line 1, in <lambda>
KeyError: 'wrQualified'

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [None]:
wrBlocked = cached.filter(lambda p: p["gfx"]["features"]["wrQualified"]["status"] == "blocked" )
wrBlocked.count()

In [None]:
100.*wrAvailable.count()/cached.count()

In [75]:
import json
import urllib2

gpu_db = json.load(urllib2.urlopen('https://raw.githubusercontent.com/jrmuizel/gpu-db/master/nvidia.json'))
devices = {}
for gen in gpu_db['10de'].items():
    for chipset in gen[1].items():
        for dev in chipset[1]:
            #print dev, gen[0]
            devices[int(dev,16)] = chipset[0]

In [68]:
wrQualified.map(lambda p: devices[int(p["adapter"]["deviceID"],16)]).countByValue()

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 7 in stage 103.0 failed 4 times, most recent failure: Lost task 7.3 in stage 103.0 (TID 3571, ip-172-31-4-232.us-west-2.compute.internal, executor 1): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/lib/spark/python/pyspark/worker.py", line 177, in main
    process()
  File "/usr/lib/spark/python/pyspark/worker.py", line 172, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/lib/spark/python/pyspark/serializers.py", line 268, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/usr/lib/spark/python/pyspark/rdd.py", line 830, in func
    initial = next(iterator)
  File "/usr/lib/spark/python/pyspark/rdd.py", line 1239, in countPartition
    for obj in iterator:
  File "<ipython-input-68-ad0a8c05fdad>", line 1, in <lambda>
ValueError: invalid literal for int() with base 16: 'AMD HAWAII (DRM 2.50.0, 4.18.3-arch1-1-ARCH, LLVM 6.0.1)'

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1708)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1696)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1695)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1695)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:855)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:855)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:855)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1923)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1867)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:671)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2029)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2050)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2069)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2094)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:935)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:467)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.GeneratedMethodAccessor122.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/lib/spark/python/pyspark/worker.py", line 177, in main
    process()
  File "/usr/lib/spark/python/pyspark/worker.py", line 172, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/lib/spark/python/pyspark/serializers.py", line 268, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/usr/lib/spark/python/pyspark/rdd.py", line 830, in func
    initial = next(iterator)
  File "/usr/lib/spark/python/pyspark/rdd.py", line 1239, in countPartition
    for obj in iterator:
  File "<ipython-input-68-ad0a8c05fdad>", line 1, in <lambda>
ValueError: invalid literal for int() with base 16: 'AMD HAWAII (DRM 2.50.0, 4.18.3-arch1-1-ARCH, LLVM 6.0.1)'

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [None]:
wrQualified.map(lambda p: devices[int(p["adapter"]["deviceID"],16)]).filter(lambda p: p.endswith("M")).count()

In [None]:
wrQualified.map(lambda p: p["gfx"]["features"]["compositor"]).countByValue()

In [69]:
nv = cached.filter(lambda p: p["adapter"]["vendorID"] == "0x10de")
nv.count()

342477

In [70]:
nv10 = nv.filter(lambda p: p["os"]["name"] == "Windows_NT" and p["os"]["version"] == "10.0")
nv10.count()

147134

In [71]:
nv10tesla = nv10.filter(lambda p: int(p["adapter"]["deviceID"],16) >= 1000)
nv10tesla.count()

145446

In [80]:
nv10teslanom = nv10tesla.filter(lambda p: int(p["adapter"]["deviceID"],16) in devices).filter(lambda p: not devices[int(p["adapter"]["deviceID"],16)].endswith("M"))
nv10teslanom.count()

In [84]:
nv10tesla.filter(lambda p: int(p["adapter"]["deviceID"],16) not in devices).map(lambda p: p['adapter']['deviceID']).countByValue()

defaultdict(int, {u'0x0f03': 1, u'0x1399': 1, u'0x179c': 12})

In [82]:
100.*nv10teslanom.count()/cached.count()

6.205095048420894

In [42]:
nv10blocked = nv10tesla.filter(lambda p: p["gfx"]["features"]["wrQualified"]["status"] == "blocked")
nv10blocked.count()

245

In [43]:
nv10blocked.map(lambda p: devices[int(p["adapter"]["deviceID"],16)]).countByValue()

defaultdict(int,
            {u'G84GLM': 1,
             u'G86GLM': 1,
             u'G86M': 1,
             u'G92': 1,
             u'G92GLM': 1,
             u'G94M': 1,
             u'G96M': 1,
             u'G98M': 3,
             u'GF106GLM': 1,
             u'GF106M': 2,
             u'GF108GLM': 2,
             u'GF108M': 5,
             u'GF116M': 3,
             u'GF117M': 5,
             u'GF119M': 3,
             u'GK104': 3,
             u'GK104GLM': 2,
             u'GK104M': 3,
             u'GK106': 3,
             u'GK106GLM': 4,
             u'GK106M': 7,
             u'GK107': 2,
             u'GK107GLM': 3,
             u'GK107M': 7,
             u'GK208BM': 2,
             u'GK208M': 2,
             u'GM107': 2,
             u'GM107GLM': 12,
             u'GM107M': 8,
             u'GM108M': 5,
             u'GM200': 2,
             u'GM204': 10,
             u'GM204M': 17,
             u'GM206': 4,
             u'GM206M': 1,
             u'GP102': 4,
             u

In [20]:
wrExperiment = cached.filter(lambda p: "experiments" in p and p["experiments"]).filter(lambda p: "prefflip-webrender-v1-1-1474484" in p["experiments"])
wrExperiment.map(lambda p: p["gfx"]["features"]["compositor"]).countByValue()

defaultdict(int, {u'basic': 123, u'd3d11': 2622, u'webrender': 664})

In [21]:
wrExperiment = wrExperiment.filter(lambda p: p["gfx"]["features"]["wrQualified"]["status"] == "available")
wrExperiment = wrExperiment.filter(lambda p: len(p["gfx"]["monitors"]) == 1 and p["gfx"]["monitors"][0]["refreshRate"] == 60)

In [22]:
wrExperiment.map(lambda p: p["experiments"]["prefflip-webrender-v1-1-1474484"]["branch"]).countByValue()

defaultdict(int, {u'control': 495, u'treatment': 472})

In [23]:
treatment = wrExperiment.filter(lambda p: p["experiments"]["prefflip-webrender-v1-1-1474484"]["branch"] == "treatment")
control = wrExperiment.filter(lambda p: p["experiments"]["prefflip-webrender-v1-1-1474484"]["branch"] == "control")


In [24]:
#wrt = qTreatment.filter(lambda p: p["gfx"]["features"]["compositor"] == 'd3d11').filter(lambda p: 'webrender' in p["gfx"]["features"])
#wrt.map(lambda p: p["gfx"]["features"]["webrender"]["status"]).countByValue()
#wrt.count()

In [25]:
cached.filter(lambda p: p['checkerboardin']).map(lambda p: p["gfx"]["features"]["compositor"]).countByValue()

defaultdict(int,
            {u'basic': 196,
             u'd3d11': 4122,
             u'none': 1,
             u'opengl': 1,
             u'webrender': 341})

In [26]:
checked = cached.filter(lambda p: p['checkerboardin']).filter(lambda p: p["gfx"]["features"]["compositor"] == "webrender")
checked.map(lambda p: p["gfx"]["D2DEnabled"]).countByValue()

defaultdict(int, {None: 3, False: 2, True: 336})

In [27]:
treatment.map(lambda p: p["gfx"]["features"]["compositor"]).countByValue()

defaultdict(int, {u'basic': 8, u'd3d11': 193, u'webrender': 271})

In [28]:
wrQualified.take(1)

[{'adapter': {u'GPUActive': True,
   u'RAM': 2048,
   u'description': u'NVIDIA GeForce GTX 660',
   u'deviceID': u'0x11c0',
   u'driver': u'C:\\WINDOWS\\System32\\DriverStore\\FileRepository\\nv_dispi.inf_amd64_38c9bee769f9ef1f\\nvldumdx.dll,C:\\WINDOWS\\System32\\DriverStore\\FileRepository\\nv_dispi.inf_amd64_38c9bee769f9ef1f\\nvldumdx.dll,C:\\WINDOWS\\System32\\DriverStore\\FileRepository\\nv_dispi.inf_amd64_38c9bee769f9ef1f\\nvldumdx.dll,C:\\WINDOWS\\System32\\DriverStore\\FileRepository\\nv_dispi.inf_amd64_38c9bee769f9ef1f\\nvldumdx.dll C:\\WINDOWS\\System32\\DriverStore\\FileRepository\\nv_dispi.inf_amd64_38c9bee769f9ef1f\\nvldumd.dll,C:\\WINDOWS\\System32\\DriverStore\\FileRepository\\nv_dispi.inf_amd64_38c9bee769f9ef1f\\nvldumd.dll,C:\\WINDOWS\\System32\\DriverStore\\FileRepository\\nv_dispi.inf_amd64_38c9bee769f9ef1f\\nvldumd.dll,C:\\WINDOWS\\System32\\DriverStore\\FileRepository\\nv_dispi.inf_amd64_38c9bee769f9ef1f\\nvldumd.dll',
   u'driverDate': u'5-7-2018',
   u'driverVers

In [30]:
wrEnabled = treatment.filter(lambda p: p["gfx"]["features"]["compositor"] == "webrender")
wrDisabled = control.filter(lambda p: p["gfx"]["features"]["compositor"] == "d3d11")
wrEnabled.count(), wrDisabled.count()

(271, 476)