In [None]:
import matplotlib.pyplot as plt
import os
import requests
import pandas as pd
import seaborn as sns

from pyspark.sql import SparkSession
from tmlt.analytics.keyset import KeySet
from tmlt.analytics.privacy_budget import PureDPBudget
from tmlt.analytics.query_builder import QueryBuilder, ColumnType, BinningSpec
from tmlt.analytics.session import Session


# Reuse the active Spark session if there is one; otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Read the members CSV, treating the first row as the header and letting
# Spark infer each column's type.
members_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("data/members.csv")
)


In [None]:
# Wrap the raw members table in a Session registered under the source id
# "members". Every query evaluated through this session draws from the
# total budget of epsilon = 1.1.
session = Session.from_dataframe(
    dataframe=members_df,
    source_id="members",
    privacy_budget=PureDPBudget(epsilon=1.1),
)

In [None]:
# List the column names of the raw members DataFrame (read directly from
# members_df, not through the Session).
members_df.columns

I'm curious if there is a correlation between education_level and books_borrowed. Let's take a look!

In [None]:
# Show the distinct education_level values present in the data.
# NOTE: this inspects members_df directly rather than going through
# session.evaluate, so it is a plain (non-private) look at the raw table.
members_df.select("education_level").distinct().show(truncate=False)

I first need to build a KeySet with the values I'd like to use...

In [None]:
# Group-by keys for the education query: the education_level values we want
# one output row for.
edu_levels = KeySet.from_dict(
    dict(
        education_level=[
            "up-to-high-school",
            "high-school-diploma",
            "bachelors-associate",
            "masters-degree",
            "doctorate-professional",
        ],
    )
)

Now I can use the QueryBuilder to group by education level and then compute an average. Here I am clamping the number of books borrowed to the range 0 to 100 (the `low` and `high` bounds of the average).

In [None]:
# Average of books_borrowed per education level, with the low/high bounds
# for books_borrowed set to 0 and 100.
edu_average_books_query = (
    QueryBuilder("members").groupby(edu_levels).average("books_borrowed", low=0, high=100)
)

# Evaluating the query spends 0.6 of the session's epsilon.
edu_average_books = session.evaluate(
    edu_average_books_query, privacy_budget=PureDPBudget(epsilon=0.6)
)

# Display the per-level averages in ascending order.
edu_average_books.orderBy("books_borrowed_average").show(truncate=False)


There doesn't seem to be any correlation to find here! I wonder if age might be a better indicator, maybe even connected with an education level?

To take a look, I first want to create age groups by binning the age in ranges.

In [None]:
# Bin edges every ten years: 0, 10, 20, ..., 100.
age_binspec = BinningSpec(list(range(0, 101, 10)))

# Group-by keys for the binned age column: one key per bin of the spec.
age_bin_keys = KeySet.from_dict({"age_binned": age_binspec.bins()})

Now I can filter and group by age... Here I am singling out those with master's degrees or doctorates, and I am using a new upper bound for books borrowed, as I think 100 was too high!

In [None]:
# Keep only members with a master's or doctorate/professional degree, bin
# their ages, and average books_borrowed per age bin with bounds 0 and 22.
binned_age_with_filter_query = (
    QueryBuilder("members")
    .filter("education_level='masters-degree' or education_level='doctorate-professional'")
    .bin_column("age", age_binspec)
    .groupby(age_bin_keys)
    .average("books_borrowed", low=0, high=22)
)

# Evaluating the query spends 0.4 of the session's epsilon.
session.evaluate(
    binned_age_with_filter_query,
    privacy_budget=PureDPBudget(epsilon=0.4),
).show(truncate=False)

Oye! I can see that there is a lot of noise added to some of these rows. What did I do wrong? In this case, I filtered on education level and then grouped by age, without taking into account that some of the age groups would likely be underrepresented after my filter. The likelihood that an 8-year-old has a master's degree is quite small...

In the future, I might run a query like the following first! Getting an idea for books borrowed by age before filtering... :)

In [None]:
# Same age-binned average as the filtered query, but over all members
# (no education filter), with bounds 0 and 22 for books_borrowed.
binned_age_query = (
    QueryBuilder("members")
    .bin_column("age", age_binspec)
    .groupby(age_bin_keys)
    .average("books_borrowed", low=0, high=22)
)

# Evaluating the query spends 0.1 of the session's epsilon.
session.evaluate(
    binned_age_query,
    privacy_budget=PureDPBudget(epsilon=0.1),
).show(truncate=False)

Or even just looking at a count....

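Oh no! I ran out of budget! The three queries above spent 0.6 + 0.4 + 0.1 = 1.1, which is the entire epsilon I gave the session when I created it.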

Good news: Tumult Labs has a bunch of notebooks to try out with this dataset, and there is an option to set your budget to infinity as you play around and get to know the library. That said, when you are using Tumult or any differential privacy library in production, you'll need to first make real decisions on your queries and budget!

Take a look at their tutorials and happy privacying!