# Create Synthetic Dataset

In [1]:
import os
import random

import pandas as pd
# Load the two top-1000-domain tables (impressions and unique users) and
# join them into a single frame keyed on the domain name.
df_hits = pd.read_csv('data/q3/impressions-top-1000-domains.csv')
df_users = pd.read_csv('data/q3/unique_users-top-1000-domains.csv')
# Inner join: only domains present in both files are kept.
df = pd.merge(df_users, df_hits, how='inner', on='url_domain')

In [2]:
# citizen browser had 1,917 panelists as of December 2020
# these are the "data subjects" whose privacy would need to be protected
N_PANELISTS = 1917
# ids run from 1 to N_PANELISTS inclusive, hence the +1 on the exclusive
# upper bound of range()
user_ids = pd.Series(range(1, N_PANELISTS + 1))

In [3]:
# accumulator for every synthetic impression record (starts out empty)
domain_impressions = list()

In [4]:
# Pool of calendar days an impression can be assigned to.
# Q3 2021 runs from July 1st through September 30th, both inclusive.
dates_q3_2021 = pd.Series(
    pd.date_range("2021-07-01", "2021-09-30", freq="D").date
)

# Simple distribution for "n_impressions": a uniform draw from this list
# yields 1 with probability 60%, 2 with 20%, and 4 with 20%.
dist = [1] * 3 + [2, 4]

In [5]:
# synthesize impressions!
# Use a dedicated, seeded generator so that Restart & Run All reproduces
# exactly the same synthetic dataset.
SEED = 42
rng = random.Random(SEED)

# Drawing single items from plain Python lists is far cheaper than calling
# Series.sample() once per synthetic row.
panel_ids = user_ids.tolist()
candidate_dates = dates_q3_2021.tolist()

# Start from an empty list here so that re-running this cell does not append
# a second copy of every impression.
domain_impressions = []

for url_domain, unique_users, impressions in zip(
    df['url_domain'], df['unique_users'], df['impressions']
):
    # Pick (without replacement) the panelists who may have seen this domain.
    # Clamp to the panel size so a domain reporting more unique users than
    # there are panelists does not make the draw raise ValueError.
    n_users = min(int(unique_users), len(panel_ids))
    users = rng.sample(panel_ids, n_users)
    if not users:
        # no panelist to attribute impressions to
        continue
    # only include 10% of the impressions to reduce data size
    for _ in range(int(impressions / 10)):
        domain_impressions.append({
            'user_id': rng.choice(users),
            'date': rng.choice(candidate_dates),
            'n_impressions': rng.choice(dist),
            'url_domain': url_domain,
        })

In [6]:
# save to csv
SAMPLE_SIZE = 1000
SAMPLE_SEED = 42  # fixed so the small sample file is reproducible

# to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs('synthetic_impressions', exist_ok=True)

synthetic_impressions = pd.DataFrame(domain_impressions)
synthetic_impressions.to_csv('synthetic_impressions/synthetic_q3_impressions.csv',index=False)

# Cap the sample size at the number of available rows; asking sample() for
# more rows than exist (without replacement) raises ValueError.
n_sample = min(SAMPLE_SIZE, len(synthetic_impressions))
synthetic_impressions.sample(n_sample, random_state=SAMPLE_SEED).to_csv('synthetic_impressions/synthetic_q3_impressions_1k_sample.csv',index=False)

In [7]:
# preview ten randomly chosen rows of the synthetic data
synthetic_impressions.sample(n=10)

Unnamed: 0,user_id,date,n_impressions,url_domain
15647,569,2021-08-23,4,nytimes.com
14361,1603,2021-07-19,1,cnn.com
43520,711,2021-07-14,1,breitbart.com
32770,1861,2021-08-05,1,sports.yahoo.com
36910,77,2021-08-28,4,dailywire.com
38523,19,2021-07-02,1,pinknews.co.uk
57893,274,2021-08-22,1,louderwithcrowder.com
24450,1188,2021-09-22,1,forbes.com
28070,752,2021-09-26,1,yahoo.com
41323,574,2021-07-28,4,iheartdogs.com
