In [18]:
import matplotlib.pyplot as plt
import os
import requests
import pandas as pd
import seaborn as sns

from pyspark.sql import SparkSession
from tmlt.analytics.keyset import KeySet
from tmlt.analytics.privacy_budget import PureDPBudget
from tmlt.analytics.query_builder import QueryBuilder
from tmlt.analytics.session import Session

# Fetch the public library-members CSV and keep a local copy for Spark to read.
# The timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(
    'https://tumult-public.s3.amazonaws.com/library-members.csv',
    timeout=60,
)
# Fail loudly on an HTTP error instead of saving an error page as "members.csv".
r.raise_for_status()
# Write the raw bytes so the local file matches what the server sent, independent
# of the platform's default text encoding and newline translation.
with open("members.csv", "wb") as f:
    f.write(r.content)

# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()
# The first CSV row holds the column names; column types are inferred by Spark.
members_df = spark.read.csv("members.csv", header=True, inferSchema=True)


In [None]:
!pip uninstall --yes tmlt.analytics

In [4]:
!pip install --extra-index-url https://kjarmul-0:46LZeoVNvaS8EsoTwPTq@gitlab.com/api/v4/projects/17405343/packages/pypi/simple tmlt.analytics --upgrade

Looking in indexes: https://pypi.org/simple, https://kjarmul-0:****@gitlab.com/api/v4/projects/17405343/packages/pypi/simple


In [19]:
# Wrap the members DataFrame in a Tumult Analytics session. The session is
# registered under the source id "members" (the name the queries refer to)
# and is given a total pure-DP budget of epsilon = 1.1.
session = Session.from_dataframe(
    dataframe=members_df,
    source_id="members",
    privacy_budget=PureDPBudget(epsilon=1.1),
)

In [20]:
# Values of the "education_level" column used as group-by keys.
education_level_values = [
    "up-to-high-school",
    "high-school-diploma",
    "bachelors-associate",
    "masters-degree",
    "doctorate-professional",
]
edu_levels = KeySet.from_dict({"education_level": education_level_values})

In [21]:
# Average age per education level; low/high give the age bounds passed to the
# aggregation.
edu_average_age_query = QueryBuilder("members").groupby(edu_levels).average(
    "age", low=0, high=100
)

# Evaluating the query spends epsilon = 0.6 of the session's privacy budget.
edu_average_ages = session.evaluate(
    edu_average_age_query, privacy_budget=PureDPBudget(epsilon=0.6)
)

# Display the groups ordered by their average age, without truncating values.
sorted_average_ages = edu_average_ages.sort("age_average")
sorted_average_ages.show(truncate=False)


                                                                                

+----------------------+------------------+
|education_level       |age_average       |
+----------------------+------------------+
|up-to-high-school     |17.872960848287114|
|high-school-diploma   |43.701353108296814|
|bachelors-associate   |46.54830917874396 |
|masters-degree        |49.72571893651655 |
|doctorate-professional|52.51534618129907 |
+----------------------+------------------+



In [23]:
# Group-by keys: the gender values to report a count for.
gender_keys = KeySet.from_dict({
    "gender": ["female", "male", "nonbinary", "unspecified"],
})

# Count, per gender, the members aged 40 or older whose education level is a
# masters degree or a doctorate/professional degree.
# NOTE(review): this query used to group by `binned_age_gender_keys`, a name that
# is not defined anywhere in this notebook, so a fresh "Restart & Run All" failed
# with a NameError. Grouping by the `gender_keys` built above is presumably what
# was intended - confirm.
query = (
    QueryBuilder("members")
    .filter("age >= 40")
    .filter(
        "education_level='masters-degree' or education_level='doctorate-professional'"
    )
    .groupby(gender_keys)
    .count()
)

# This evaluation requests epsilon = 0.4 from the session. Running the cell again
# on the same session requests another 0.4; once the remaining budget is smaller
# than that, evaluate() raises "Cannot answer measurement without exceeding
# maximum privacy loss". Re-create the session before re-running.
gender_counts = session.evaluate(query, privacy_budget=PureDPBudget(0.4))
gender_counts.show(truncate=False)

RuntimeError: Cannot answer measurement without exceeding maximum privacy loss: it needs 3602879701896397/9007199254740992, but the remaining budget is 225179981368525/2251799813685248