# Chapter 9 - Data Science
## Data Preparation

## 0 - Setting up the notebook

In [6]:
import json
import random
from datetime import date, timedelta

import faker

## 1 - Preparing the Data

In [7]:
# create the faker to populate the data
fake = faker.Faker()

In [8]:
usernames = set()
usernames_no = 1000

# populate the set with 1000 unique usernames
while len(usernames) < usernames_no:
    usernames.add(fake.user_name())

In [9]:
def get_random_name_and_gender():
    skew = .6  # 60% of users will be female
    male = random.random() > skew
    if male:
        return fake.name_male(), 'M'
    else:
        return fake.name_female(), 'F'

# for each username, create a complete user profile
# simulate user data coming from an API. It is a list
# of JSON strings (users).
def get_users(usernames):
    users = []
    for username in usernames:
        name, gender = get_random_name_and_gender()
        user = {
            'username': username,
            'name': name,
            'gender': gender,
            'email': fake.email(),
            'age': fake.random_int(min=18, max=90),
            'address': fake.address(),
        }
        users.append(json.dumps(user))
    return users

users = get_users(usernames)
users[:3]

['{"username": "tracie73", "name": "David Maxwell", "gender": "M", "email": "rodriguezbobby@example.net", "age": 41, "address": "414 Arnold Street\\nRoyberg, IL 80169"}',
 '{"username": "anna27", "name": "Colleen Peterson MD", "gender": "F", "email": "brandon94@example.com", "age": 86, "address": "29760 Chad Summit\\nWest Davidstad, PR 44505"}',
 '{"username": "joshuamartin", "name": "Andrew Price", "gender": "M", "email": "sean59@example.com", "age": 83, "address": "845 Wiggins Greens\\nMandyview, ND 30513"}']

In [10]:
# campaign name format:
# InternalType_StartDate_EndDate_TargetAge_TargetGender_Currency
def get_type():
    # just some gibberish internal codes
    types = ['AKX', 'BYU', 'GRZ', 'KTR']
    return random.choice(types)

def get_start_end_dates():
    duration = random.randint(1, 2 * 365)
    offset = random.randint(-365, 365)
    start = date.today() - timedelta(days=offset)
    end = start + timedelta(days=duration)
    
    def _format_date(date_):
        return date_.strftime("%Y%m%d")
    
    return _format_date(start), _format_date(end)

def get_age():
    age = random.randrange(20, 46, 5)
    diff = random.randrange(5, 26, 5)
    return '{}-{}'.format(age, age + diff)

def get_gender():
    return random.choice(('M', 'F', 'B'))

def get_currency():
    return random.choice(('GBP', 'EUR', 'USD'))

def get_campaign_name():
    separator = '_'
    type_ = get_type()
    start, end = get_start_end_dates()
    age = get_age()
    gender = get_gender()
    currency = get_currency()
    return separator.join(
        (type_, start, end, age, gender, currency))

In [11]:
# campaign data:
# name, budget, spent, clicks, impressions
def get_campaign_data():
    name = get_campaign_name()
    budget = random.randint(10**3, 10**6)
    spent = random.randint(10**2, budget)    
    clicks = int(random.triangular(10**2, 10**5, 0.2 * 10**5))    
    impressions = int(random.gauss(0.5 * 10**6, 2))
    return {
        'cmp_name': name,
        'cmp_bgt': budget,
        'cmp_spent': spent,
        'cmp_clicks': clicks,
        'cmp_impr': impressions
    }

In [12]:
# assemble the logic to get the final version of the rough data
# data will be a list of dictionaries. Each dictionary will follow
# this structure:
# {'user': user_json, 'campaigns': [c1, c2, ...]}
# where user_json is the JSON string version of a user data dict
# and c1, c2, ... are campaign dicts as returned by
# get_campaign_data

def get_data(users):
    data = []
    for user in users:
        campaigns = [get_campaign_data()
                     for _ in range(random.randint(2, 8))]
        data.append({'user': user, 'campaigns': campaigns})
    return data

## 2 - Cleaning the data

In [13]:
# fetch simulated rough data
rough_data = get_data(users)

rough_data[:2]  # let's take a peek

[{'user': '{"username": "tracie73", "name": "David Maxwell", "gender": "M", "email": "rodriguezbobby@example.net", "age": 41, "address": "414 Arnold Street\\nRoyberg, IL 80169"}',
  'campaigns': [{'cmp_name': 'KTR_20240725_20250515_45-50_F_EUR',
    'cmp_bgt': 178525,
    'cmp_spent': 116926,
    'cmp_clicks': 43985,
    'cmp_impr': 499998},
   {'cmp_name': 'GRZ_20240325_20250718_30-40_F_GBP',
    'cmp_bgt': 897803,
    'cmp_spent': 745120,
    'cmp_clicks': 87866,
    'cmp_impr': 500000},
   {'cmp_name': 'BYU_20230621_20231001_25-30_M_GBP',
    'cmp_bgt': 435973,
    'cmp_spent': 76953,
    'cmp_clicks': 39710,
    'cmp_impr': 500002}]},
 {'user': '{"username": "anna27", "name": "Colleen Peterson MD", "gender": "F", "email": "brandon94@example.com", "age": 86, "address": "29760 Chad Summit\\nWest Davidstad, PR 44505"}',
  'campaigns': [{'cmp_name': 'AKX_20231224_20240209_45-50_B_GBP',
    'cmp_bgt': 604350,
    'cmp_spent': 40386,
    'cmp_clicks': 83080,
    'cmp_impr': 499999},
   {

In [14]:
# Let's start from having a different version of the data
# I want a list whose items will be dicts. Each dict is 
# the original campaign dict plus the user JSON

data = []
for datum in rough_data:
    for campaign in datum['campaigns']:
        campaign.update({'user': datum['user']})
        data.append(campaign)
data[:2]  # let's take another peek

[{'cmp_name': 'KTR_20240725_20250515_45-50_F_EUR',
  'cmp_bgt': 178525,
  'cmp_spent': 116926,
  'cmp_clicks': 43985,
  'cmp_impr': 499998,
  'user': '{"username": "tracie73", "name": "David Maxwell", "gender": "M", "email": "rodriguezbobby@example.net", "age": 41, "address": "414 Arnold Street\\nRoyberg, IL 80169"}'},
 {'cmp_name': 'GRZ_20240325_20250718_30-40_F_GBP',
  'cmp_bgt': 897803,
  'cmp_spent': 745120,
  'cmp_clicks': 87866,
  'cmp_impr': 500000,
  'user': '{"username": "tracie73", "name": "David Maxwell", "gender": "M", "email": "rodriguezbobby@example.net", "age": 41, "address": "414 Arnold Street\\nRoyberg, IL 80169"}'}]

In [15]:
# Warning: Uncommenting and executing this cell will overwrite data.json
#with open('data.json', 'w') as stream:
#     stream.write(json.dumps(data))