# Chapter 9 - Data Science
## Data Preparation

## 0 - Setting up the notebook

In [1]:
import json
import random
from datetime import date, timedelta
import faker

In [2]:
# One shared Faker generator; the cells below call it for usernames, names,
# e-mail addresses, ages and postal addresses.
fake = faker.Faker()

In [3]:
# How many distinct usernames the dataset should contain.
usernames_no = 1000
usernames = set()

# A set discards repeated draws, so keep asking Faker for usernames
# until the target number of distinct values has been collected.
while usernames_no > len(usernames):
    usernames.add(fake.user_name())

In [4]:
def get_random_name_and_gender():
    """Pick a gender at random and return a fake name that matches it.

    Returns:
        tuple: ``(name, gender)`` where ``gender`` is ``'M'`` or ``'F'``.
    """
    # Draws above this threshold yield a male user, so about 60% of the
    # generated users end up female.
    female_share = .6
    if random.random() > female_share:
        return fake.name_male(), 'M'
    return fake.name_female(), 'F'

def get_users(usernames):
    """Create one fake user profile per username.

    Args:
        usernames: iterable of username strings.

    Returns:
        list: one JSON-encoded string per username, each holding the keys
        ``username``, ``name``, ``gender``, ``email``, ``age`` and ``address``.
    """
    def _as_json(username):
        # Name and gender are drawn together so that they stay consistent.
        full_name, gender = get_random_name_and_gender()
        profile = {
            'username': username,
            'name': full_name,
            'gender': gender,
            'email': fake.email(),
            'age': fake.random_int(min=18, max=90),
            'address': fake.address()
        }
        return json.dumps(profile)

    return [_as_json(username) for username in usernames]

# Build one JSON-encoded user record per generated username,
# then preview the first three records.
users = get_users(usernames)
users[:3]

['{"username": "uwalker", "name": "Tanya Gray", "gender": "F", "email": "fmarquez@example.net", "age": 43, "address": "PSC 3134, Box 2577\\nAPO AP 25102"}',
 '{"username": "tyrone78", "name": "Michael Dodson", "gender": "M", "email": "michael54@example.net", "age": 56, "address": "5993 Miller Ways Apt. 148\\nWest Jacob, AR 59795"}',
 '{"username": "vstuart", "name": "Melissa Frank", "gender": "F", "email": "sandra87@example.com", "age": 86, "address": "16810 Pamela Prairie\\nTracyhaven, FM 21572"}']

In [5]:
# campaign name format:
# InternalType_StartDate_EndDate_TargetAge_TargetGender_Currency
def get_type():
    """Return one of the internal campaign type codes, picked at random."""
    # The codes carry no meaning; they only imitate internal identifiers.
    internal_codes = ('AKX', 'BYU', 'GRZ', 'KTR')
    return random.choice(internal_codes)

def get_start_end_dates():
    """Generate a random campaign period.

    The start date lies within 365 days either side of today and the
    campaign lasts between 1 and 730 days.

    Returns:
        tuple: ``(start, end)`` as strings formatted ``YYYYMMDD``.
    """
    length_days = random.randint(1, 2 * 365)
    # A positive shift moves the start into the past, a negative one into the future.
    shift_days = random.randint(-365, 365)
    first_day = date.today() - timedelta(days=shift_days)
    last_day = first_day + timedelta(days=length_days)
    return tuple(day.strftime("%Y%m%d") for day in (first_day, last_day))

def get_age():
    """Return a random target age range formatted as ``'<low>-<high>'``.

    The lower bound is a multiple of 5 from 20 to 45 and the range is
    5 to 25 years wide, also in steps of 5.
    """
    lower = random.randrange(20, 46, 5)
    width = random.randrange(5, 26, 5)
    upper = lower + width
    return f'{lower}-{upper}'

def get_gender():
    """Return a random target-gender code: ``'M'``, ``'F'`` or ``'B'``."""
    # NOTE(review): 'B' presumably stands for "both" -- confirm.
    gender_codes = ('M', 'F', 'B')
    return random.choice(gender_codes)

def get_currency():
    """Return a random currency code: ``'GBP'``, ``'EUR'`` or ``'USD'``."""
    currency_codes = ('GBP', 'EUR', 'USD')
    return random.choice(currency_codes)

def get_campaign_name():
    """Assemble a random campaign name.

    The name has the layout
    ``InternalType_StartDate_EndDate_TargetAge_TargetGender_Currency``.

    Returns:
        str: the six fields joined with underscores.
    """
    # List items are evaluated left to right, so the random draws happen in
    # field order: type, dates, age, gender, currency.
    fields = [
        get_type(),
        *get_start_end_dates(),
        get_age(),
        get_gender(),
        get_currency(),
    ]
    return '_'.join(fields)

In [6]:
# campaign data:
# name, budget, spent, clicks, impressions
def get_campaign_data():
    """Generate the figures for one random campaign.

    Returns:
        dict: keys ``cmp_name``, ``cmp_bgt``, ``cmp_spent``, ``cmp_clicks``
        and ``cmp_impr``.
    """
    record = {'cmp_name': get_campaign_name()}
    record['cmp_bgt'] = random.randint(10**3, 10**6)
    # The amount spent is capped by the budget drawn just above.
    record['cmp_spent'] = random.randint(10**2, record['cmp_bgt'])
    # Triangular distribution between 100 and 100,000 with its mode at 20,000.
    record['cmp_clicks'] = int(
        random.triangular(10**2, 10**5, 0.2 * 10**5))
    # Gaussian with mean 500,000 and standard deviation 2, truncated to an int.
    record['cmp_impr'] = int(random.gauss(0.5 * 10**6, 2))
    return record

In [7]:
def get_data(users):
    """Pair every user with a random batch of campaigns.

    Args:
        users: iterable of user records (stored as given under ``'user'``).

    Returns:
        list: one ``{'user': ..., 'campaigns': [...]}`` dict per user, where
        ``campaigns`` holds between 2 and 8 campaign dicts.
    """
    def _random_campaigns():
        # The batch size is drawn first, then that many campaigns are generated.
        how_many = random.randint(2, 8)
        return [get_campaign_data() for _ in range(how_many)]

    return [
        {'user': user, 'campaigns': _random_campaigns()}
        for user in users
    ]

In [8]:
# Nested structure: one entry per user, each holding that user's JSON string
# and a list of randomly generated campaign dicts.
rough_data = get_data(users)
rough_data[:2] # let's take a peek

[{'user': '{"username": "uwalker", "name": "Tanya Gray", "gender": "F", "email": "fmarquez@example.net", "age": 43, "address": "PSC 3134, Box 2577\\nAPO AP 25102"}',
  'campaigns': [{'cmp_name': 'AKX_20240505_20250130_25-30_M_EUR',
    'cmp_bgt': 965482,
    'cmp_spent': 801270,
    'cmp_clicks': 47633,
    'cmp_impr': 499998},
   {'cmp_name': 'KTR_20240425_20260318_30-50_M_GBP',
    'cmp_bgt': 480386,
    'cmp_spent': 363936,
    'cmp_clicks': 79887,
    'cmp_impr': 499999},
   {'cmp_name': 'AKX_20221225_20240219_20-35_F_USD',
    'cmp_bgt': 161578,
    'cmp_spent': 118809,
    'cmp_clicks': 65196,
    'cmp_impr': 500000}]},
 {'user': '{"username": "tyrone78", "name": "Michael Dodson", "gender": "M", "email": "michael54@example.net", "age": 56, "address": "5993 Miller Ways Apt. 148\\nWest Jacob, AR 59795"}',
  'campaigns': [{'cmp_name': 'AKX_20221222_20241111_20-30_B_EUR',
    'cmp_bgt': 23646,
    'cmp_spent': 13083,
    'cmp_clicks': 45014,
    'cmp_impr': 499997},
   {'cmp_name': '

In [9]:
# Flatten the nested structure into one record per campaign, each carrying
# its owner's JSON string under the 'user' key.
data = []
for datum in rough_data:
    for campaign in datum['campaigns']:
        # Build a fresh dict instead of calling campaign.update(...): the
        # campaign objects are shared with rough_data, so updating them in
        # place would silently alter rough_data and make its earlier preview
        # stale. Key order (cmp_* fields first, then 'user') is unchanged.
        data.append({**campaign, 'user': datum['user']})
data[:2] # let's take another peek

[{'cmp_name': 'AKX_20240505_20250130_25-30_M_EUR',
  'cmp_bgt': 965482,
  'cmp_spent': 801270,
  'cmp_clicks': 47633,
  'cmp_impr': 499998,
  'user': '{"username": "uwalker", "name": "Tanya Gray", "gender": "F", "email": "fmarquez@example.net", "age": 43, "address": "PSC 3134, Box 2577\\nAPO AP 25102"}'},
 {'cmp_name': 'KTR_20240425_20260318_30-50_M_GBP',
  'cmp_bgt': 480386,
  'cmp_spent': 363936,
  'cmp_clicks': 79887,
  'cmp_impr': 499999,
  'user': '{"username": "uwalker", "name": "Tanya Gray", "gender": "F", "email": "fmarquez@example.net", "age": 43, "address": "PSC 3134, Box 2577\\nAPO AP 25102"}'}]

In [None]:
# Persist the flattened campaign records as a single JSON document.
with open('data.json', 'w') as stream:
    json.dump(data, stream)