In [2]:
!pip install opendp-smartnoise

Collecting opendp-smartnoise
  Using cached opendp_smartnoise-0.1.4.2-py3-none-any.whl (193 kB)
Collecting antlr4-python3-runtime==4.8
  Using cached antlr4-python3-runtime-4.8.tar.gz (112 kB)
Collecting pandasql
  Using cached pandasql-0.7.3.tar.gz (26 kB)
Collecting msrest
  Using cached msrest-0.6.21-py2.py3-none-any.whl (85 kB)
Collecting statsmodels
  Using cached statsmodels-0.12.2-cp37-cp37m-manylinux1_x86_64.whl (9.5 MB)
Collecting opendp-smartnoise-core
  Using cached opendp_smartnoise_core-0.2.2-py3-none-any.whl (12.7 MB)
Collecting requests-oauthlib>=0.5.0
  Using cached requests_oauthlib-1.3.0-py2.py3-none-any.whl (23 kB)
Collecting isodate>=0.6.0
  Using cached isodate-0.6.0-py2.py3-none-any.whl (45 kB)
Collecting patsy>=0.5
  Using cached patsy-0.5.1-py2.py3-none-any.whl (231 kB)
Building wheels for collected packages: antlr4-python3-runtime, pandasql
  Building wheel for antlr4-python3-runtime (setup.py) ... [?25ldone
[?25h  Created wheel for antlr4-python3-runtime: fi

In [4]:
import wmfdata
from dataclasses import dataclass
import numpy as np
import random
import math
from IPython.display import Latex
from opendp.smartnoise.core.api import LibraryWrapper 

In [5]:
# Open the Spark session used by every query below.
# ship_python_env=True makes wmfdata pack the active conda environment and
# ship it to the executors (see the cell output), so the opendp package
# installed above is importable inside RDD lambdas.
spark = wmfdata.spark.get_session(
    type='yarn-large',
    ship_python_env=True,
    app_name='pyspark-large — differential privacy opendp test — htriedman',
)

Collecting packages...
Packing environment at '/home/htriedman/.conda/envs/2021-08-31T19.46.22_htriedman' to 'conda-2021-08-31T19.46.22_htriedman.tgz'
[########################################] | 100% Completed |  1min  6.2s


Will ship conda-2021-08-31T19.46.22_htriedman.tgz to remote Spark executors.
PySpark executors will use conda-2021-08-31T19.46.22_htriedman/bin/python3.


In [6]:
# Load one hourly partition of wmf.pageview_actor (2021-08-15, hour 6, UTC)
# as an RDD of rows with the fields
# (page_title, page_id, project, country, actor_signature).
# Rows without a page_id are excluded by the WHERE clause.
rdd = (
    spark
    .sql("""
SELECT
  pageview_info['page_title'] as page_title,
  page_id,
  pageview_info['project'] as project,
  geocoded_data['country'] as country,
  actor_signature
FROM wmf.pageview_actor
WHERE year = 2021 AND month = 8 AND day = 15 AND hour = 6 AND page_id IS NOT NULL
""")
    .rdd
)

In [11]:
def add_laplace_noise(x, eps, sensitivity):
    """Pass a single number through the SmartNoise Laplace mechanism.

    Args:
        x: the value to privatise.
        eps: epsilon handed to the mechanism.
        sensitivity: sensitivity handed to the mechanism.

    Returns:
        Whatever `LibraryWrapper().laplace_mechanism` returns for these
        arguments (presumably `x` plus Laplace noise -- confirm against the
        SmartNoise core documentation).
    """
    mechanism_args = dict(
        value=x,
        epsilon=eps,
        sensitivity=sensitivity,
        # constant-time sampling is explicitly switched off
        enforce_constant_time=False,
    )
    wrapper = LibraryWrapper()
    return wrapper.laplace_mechanism(**mechanism_args)

def add_laplace_noise_to_rdd(rdd, eps, max_partitions, max_per_partition):
    """Add Laplace noise to the value of every (key, value) pair in an RDD.

    The epsilon budget is divided evenly by `max_partitions`, and
    `max_per_partition` is used as the sensitivity of each value.

    Args:
        rdd: RDD of (key, numeric value) pairs.
        eps: total epsilon budget.
        max_partitions: divisor applied to `eps`.
        max_per_partition: sensitivity passed to the mechanism.

    Returns:
        A new RDD of (key, noisy value) pairs.
    """
    per_partition_eps = eps / max_partitions

    def _noisy_pair(pair):
        # keep the key, replace the value with its noisy version
        return (pair[0], add_laplace_noise(pair[1], per_partition_eps, max_per_partition))

    return rdd.map(_noisy_pair)

def add_gaussian_noise(x, eps, delta, sensitivity):
    """Pass a single number through the SmartNoise Gaussian mechanism.

    Args:
        x: the value to privatise.
        eps: epsilon handed to the mechanism.
        delta: delta handed to the mechanism.
        sensitivity: sensitivity handed to the mechanism.

    Returns:
        Whatever `LibraryWrapper().gaussian_mechanism` returns for these
        arguments (presumably `x` plus Gaussian noise -- confirm against the
        SmartNoise core documentation).
    """
    mechanism_args = dict(
        value=x,
        epsilon=eps,
        delta=delta,
        sensitivity=sensitivity,
        # constant-time sampling is explicitly switched off
        enforce_constant_time=False,
    )
    wrapper = LibraryWrapper()
    return wrapper.gaussian_mechanism(**mechanism_args)

def add_gaussian_noise_to_rdd(rdd, eps, delta, max_partitions, max_per_partition):
    """Add Gaussian noise to the value of every (key, value) pair in an RDD.

    The epsilon budget is divided evenly by `max_partitions`, and
    `max_per_partition` is used as the sensitivity of each value.

    NOTE(review): `delta` is forwarded unchanged (it is not divided by
    `max_partitions`) -- confirm that this is the intended composition.

    Args:
        rdd: RDD of (key, numeric value) pairs.
        eps: total epsilon budget.
        delta: delta passed to every mechanism invocation.
        max_partitions: divisor applied to `eps`.
        max_per_partition: sensitivity passed to the mechanism.

    Returns:
        A new RDD of (key, noisy value) pairs.
    """
    per_partition_eps = eps / max_partitions

    def _noisy_pair(pair):
        # keep the key, replace the value with its noisy version
        return (pair[0], add_gaussian_noise(pair[1], per_partition_eps, delta, max_per_partition))

    return rdd.map(_noisy_pair)

def calculate_threshold(eps, delta, max_partitions, max_per_partition):
    """Return the count threshold used to suppress small noisy counts.

    With b = max_per_partition / (eps / max_partitions) -- the same
    per-partition epsilon and sensitivity used when adding Laplace noise --
    the result is -b * ln(2 * b * delta).

    Args:
        eps: total epsilon budget.
        delta: delta parameter; must be positive.
        max_partitions: divisor applied to `eps`.
        max_per_partition: per-partition sensitivity.

    Returns:
        The threshold as a float.

    Raises:
        ZeroDivisionError: if `eps` or `max_partitions` is 0.
        ValueError: if 2 * b * delta is not positive (from `math.log`).
    """
    laplace_scale = max_per_partition / (eps / max_partitions)
    return -laplace_scale * math.log(2 * laplace_scale * delta)

In [13]:
# do bounded DP count
def do_count(rdd, eps, delta, max_partitions, max_per_partition, noise_kind, top_n=200):
    """Return the largest noisy pageview counts after bounding per-actor contributions.

    Pipeline:
      1. bound each actor's pageviews by random sampling,
      2. count the remaining views per (project, country, page_id, page_title),
      3. add Laplace or Gaussian noise to every count,
      4. drop counts below `calculate_threshold(eps, delta, ...)`,
      5. round and return the `top_n` largest counts.

    Args:
        rdd: RDD of rows exposing `actor_signature`, `page_id`, `project`,
            `country` and `page_title` attributes.
        eps: total epsilon budget.
        delta: delta used for the threshold (and for Gaussian noise).
        max_partitions: sampling bound applied per actor (see step comments).
        max_per_partition: maximum views kept per (actor, page_id).
        noise_kind: "laplace" or "gaussian".
        top_n: how many of the largest counts to return (default 200).

    Returns:
        A list of ((project, country, page_id, page_title), rounded_count)
        tuples ordered by descending count.

    Raises:
        ValueError: if `noise_kind` is not "laplace" or "gaussian". Without
            this check an unrecognised value would return un-noised counts.
    """
    if noise_kind not in ("laplace", "gaussian"):
        raise ValueError(
            "noise_kind must be 'laplace' or 'gaussian', got {!r}".format(noise_kind)
        )

    # Computed once, up front. The argument order must be (eps, delta, ...):
    # swapping them yields a hugely negative threshold that filters nothing.
    # NOTE(review): the threshold formula is built from the Laplace scale but is
    # also applied to Gaussian-noised counts -- confirm that is intended.
    threshold = calculate_threshold(eps, delta, max_partitions, max_per_partition)

    # rekey to a tuple of (actor signature, page id)
    # ((actor_signature, page_id), [pageview])
    dp_count_rdd = rdd.map(lambda x: ((x.actor_signature, x.page_id), [x]))

    # randomly keep at most `max_per_partition` pageviews for each (actor signature, page id) tuple
    # ((actor_signature, page_id), [pageview]) {max length of max_per_partition}
    dp_count_rdd = dp_count_rdd.reduceByKey(lambda x, y: random.sample(x + y, min(len(x) + len(y), max_per_partition)))

    # rekey to just actor signature
    # (actor_signature, [pageview]) {one entry per (actor, page id)}
    dp_count_rdd = dp_count_rdd.map(lambda x: ((x[0][0], x[1])))

    # merge each actor's per-page lists, randomly keeping at most `max_partitions`
    # individual pageviews on every merge.
    # NOTE: the sample is drawn over pageviews, not over whole per-page lists, so an
    # actor with several pages ends up with at most `max_partitions` pageviews in total.
    # An actor with a single page is presumably never passed through this function
    # (reduceByKey only merges when a key has more than one value) and keeps up to
    # `max_per_partition` pageviews.
    dp_count_rdd = dp_count_rdd.reduceByKey(lambda x, y: random.sample(x + y, min(len(x) + len(y), max_partitions)))

    # drop actor signature as key
    # ([pageview])
    dp_count_rdd = dp_count_rdd.map(lambda x: x[1])

    # unnest lists of pageviews using a flatmap
    # (pageview)
    dp_count_rdd = dp_count_rdd.flatMap(lambda x: x)

    # now that contributions are bounded, count views per tuple
    dp_count_rdd = dp_count_rdd.map(lambda x: ((x.project, x.country, x.page_id, x.page_title), 1))
    dp_count_rdd = dp_count_rdd.reduceByKey(lambda x, y: (x + y))

    if noise_kind == "laplace":
        # add laplace noise to counts
        dp_count_rdd = add_laplace_noise_to_rdd(dp_count_rdd, eps, max_partitions, max_per_partition)
    else:
        # add gaussian noise to counts
        dp_count_rdd = add_gaussian_noise_to_rdd(dp_count_rdd, eps, delta, max_partitions, max_per_partition)

    # suppress tuples whose noisy count falls below the threshold
    dp_count_rdd = dp_count_rdd.filter(lambda x: x[1] >= threshold)

    # round view count to integers for readability
    dp_count_rdd = dp_count_rdd.map(lambda x: (x[0], round(x[1], 0)))

    return dp_count_rdd.takeOrdered(top_n, key=lambda x: -x[1])

In [9]:
# Contribution bounds: an actor is limited to `max_partitions` pages and to
# `max_per_partition` pageviews on each of them, so one actor contributes at
# most max_per_partition * max_partitions pageviews in total.
max_partitions, max_per_partition = 5, 2

# Privacy budget shared by both noise mechanisms.
eps, delta = 1, 5e-8

In [14]:
# Top noisy pageview counts using the Laplace mechanism.
do_count(
    rdd,
    eps=eps,
    delta=delta,
    max_partitions=max_partitions,
    max_per_partition=max_per_partition,
    noise_kind="laplace",
)

[(('en.wikipedia', 'United States', 15580374, 'Main_Page'), 49278.0),
 (('de.wikipedia', 'Germany', 5248757, 'Wikipedia:Hauptseite'), 24422.0),
 (('ja.wikipedia', 'Japan', 253348, 'メインページ'), 24252.0),
 (('en.wikipedia', 'India', 15580374, 'Main_Page'), 16174.0),
 (('en.wikipedia', 'United Kingdom', 15580374, 'Main_Page'), 13111.0),
 (('fr.wikipedia', 'Russia', 1034876, 'Questionnaire'), 11548.0),
 (('ja.wikipedia', 'Japan', 3093109, 'ジャッキー・ウー'), 11447.0),
 (('fr.wikipedia', 'France', 10635368, 'Wikipédia:Accueil_principal'),
  10302.0),
 (('en.wikipedia', 'India', 3349824, 'Vikram_Batra'), 9975.0),
 (('ja.wikipedia', 'Japan', 2069252, '鍛治舎巧'), 9652.0),
 (('it.wikipedia', 'Italy', 665216, "Gianfranco_D'Angelo"), 9565.0),
 (('ja.wikipedia', 'Japan', 303551, '馬淵史郎'), 8340.0),
 (('en.wikipedia', 'India', 2499568, 'Independence_Day_(India)'), 8194.0),
 (('en.wikipedia', 'Australia', 15580374, 'Main_Page'), 7942.0),
 (('en.wikipedia', 'Iran', 15580374, 'Main_Page'), 7925.0),
 (('ja.wikipedia

In [15]:
# Top noisy pageview counts using the Gaussian mechanism.
do_count(
    rdd,
    eps=eps,
    delta=delta,
    max_partitions=max_partitions,
    max_per_partition=max_per_partition,
    noise_kind="gaussian",
)

[(('en.wikipedia', 'United States', 15580374, 'Main_Page'), 49256.0),
 (('de.wikipedia', 'Germany', 5248757, 'Wikipedia:Hauptseite'), 24451.0),
 (('ja.wikipedia', 'Japan', 253348, 'メインページ'), 24302.0),
 (('en.wikipedia', 'India', 15580374, 'Main_Page'), 16133.0),
 (('en.wikipedia', 'United Kingdom', 15580374, 'Main_Page'), 13123.0),
 (('fr.wikipedia', 'Russia', 1034876, 'Questionnaire'), 11564.0),
 (('ja.wikipedia', 'Japan', 3093109, 'ジャッキー・ウー'), 11472.0),
 (('fr.wikipedia', 'France', 10635368, 'Wikipédia:Accueil_principal'),
  10360.0),
 (('en.wikipedia', 'India', 3349824, 'Vikram_Batra'), 10026.0),
 (('ja.wikipedia', 'Japan', 2069252, '鍛治舎巧'), 9673.0),
 (('it.wikipedia', 'Italy', 665216, "Gianfranco_D'Angelo"), 9572.0),
 (('ja.wikipedia', 'Japan', 303551, '馬淵史郎'), 8375.0),
 (('en.wikipedia', 'India', 2499568, 'Independence_Day_(India)'), 8207.0),
 (('en.wikipedia', 'Iran', 15580374, 'Main_Page'), 7948.0),
 (('en.wikipedia', 'Australia', 15580374, 'Main_Page'), 7926.0),
 (('en.wikipedi