# Your Use Case


You work at an e-commerce company selling online groceries and other home goods.

The company:

- operates in your country
- employs you in a role similar to your current one (or your dream role)
- wants to create a beautiful experience
- cares about retention, trust and therefore privacy
- works in an agile fashion and has several product, software and data teams



There will be a variety of coding and non-coding challenges. If you are coding, please install the repository: 

`kjam/practical-data-privacy`




In [1]:
!pip install Faker



In [2]:
import json
import os
import random
from collections import Counter
from pprint import pprint

from faker import Faker


# Single Faker instance shared by every cell below that generates fake data.
fake = Faker('en_US') #change locale if you'd like!

In [3]:
# Pool of membership numbers (0-4999) that sales rows are drawn from.
USER_MEMBERSHIP_NUMBERS = range(5000)

# Product catalogue: category name -> list of item names in that category.
# Item names must stay unique across categories, because ITEM_LOOKUP is keyed
# by item name alone.
# loosely adapted from https://www.ziplist.com/grocery-list-template/categories
CATEGORIES_AND_ITEMS = {
    "Fruits": ["bananas", "apples", "grapes", "oranges", "strawberries", "avocados", "peaches", "pineapple", "pears"],
    "Vegetables": ["potatoes", "tomatoes", "onions", "carrots", "lettuce", "broccoli", "peppers", "celery", "garlic", "cucumbers"],
    "Canned Goods": ["olives", "soup", "canned tuna", "veggies", "fruit"],
    "Frozen Foods": ["fish", "ice cream", "pizza", "frozen potatoes", "ready meals"],
    "Meat and Animal Products": ["chicken", "beef", "pork", "sausage", "eggs"],
    "Vegetarian and Vegan Alternatives": ["veggie burgers", "tofu", "vegan fish", "oat milk", "vegan sausages"],
    "Fish and shellfish": ["shrimp", "crab", "clams", "tuna", "salmon", "tilapia"],
    "Dairy": ["cheese", "milk", "yogurt", "butter"],
    "Deli": ["sliced cheese", "ham", "turkey", "salami"],
    "Condiments & Spices": ["salt", "sugar", "pepper", "oregano", "cinnamon", "ketchup", "mayonnaise", "mustard"],
    "Sauces & Oils": ["olive oil", "tomato sauce", "hot sauce", "soy sauce"],
    "Snacks": ["chips", "crackers", "pretzels", "popcorn", "peanuts", "nuts", "candy"],
    "Bread & Bakery": ["whole wheat", "white", "italian", "sandwich", "tortillas", "pies", "muffins", "bagels", "cookies"],
    "Beverages": ["water", "coffee", "juice", "soda", "tea", "beer", "wine"],
    "Pasta/Rice": ["spaghetti", "macaroni", "noodles", "white rice"],
    "Cereal": ["oats", "rice", "wheat", "granola"],
    "Baking": ["flour", "baking powder", "sprinkles"],
    "Personal Care": ["shampoo", "conditioner", "soap", "deodorant", "toothpaste", "dental floss", "shaving cream", "razor blades"],
    "Health Care": ["band-aid", "hydrogen peroxide", "rubbing alcohol", "pain reliever", "antacids"],
    "Paper & Wrap": ["toilet paper", "paper towels", "tissues", "aluminum foil", "zip bags"],
    "Household Supplies": ["detergent", "softener", "bleach", "dish soap", "air freshener", "gloves", "sponge", "trash bags", "batteries"],
    "Baby Items": ["baby food", "diapers", "wet wipes", "moisturizing lotion"],
    "Other items": ["pet food", "flowers", "tobacco"],
}

# (category name, relative weight) pairs. generate_items feeds these weights
# to random.choices when picking a category; they add up to 100, so each
# weight can be read as a percentage.
CATEGORIES_WITH_WEIGHTS = [
    ("Fruits", 10),
    ("Vegetables", 10),
    ("Canned Goods", 4),
    ("Frozen Foods", 5),
    ("Meat and Animal Products", 5),
    ("Vegetarian and Vegan Alternatives", 5),
    ("Fish and shellfish", 5),
    ("Dairy", 5),
    ("Deli", 3),
    ("Condiments & Spices", 3),
    ("Sauces & Oils", 3),
    ("Snacks", 5),
    ("Bread & Bakery", 5),
    ("Beverages", 5),
    ("Pasta/Rice", 7),
    ("Cereal", 3),
    ("Baking", 3),
    ("Personal Care", 4),
    ("Health Care", 2),
    ("Paper & Wrap", 2),
    ("Household Supplies", 3),
    ("Baby Items", 2),
    ("Other items", 1),
]

# Thirty store locations in "City, State" form. City names come from Faker's
# unique proxy, so no two location strings are the same.
STORE_LOCATIONS = [f"{fake.unique.city()}, {fake.state()}" for _ in range(30)]

# Location string -> store ID: a four-digit number, unique per store.
STORE_DICT = {loc: fake.unique.random_int(min=1000, max=9999) for loc in STORE_LOCATIONS}

def flatten(lols):
    """Concatenate the elements of every sub-iterable in ``lols`` into one list.

    Only one level of nesting is removed; the order of elements is kept.
    """
    flat = []
    for sublist in lols:
        flat.extend(sublist)
    return flat

# Item name -> item ID: a five-digit number, unique per catalogue item.
ITEM_LOOKUP = {
    item_name: fake.unique.random_int(min=10000, max=99999)
    for item_name in flatten(CATEGORIES_AND_ITEMS.values())
}

In [4]:
# Sanity check: show the total of the category weights and confirm it is
# exactly 100.
total_weight = sum(weight for _, weight in CATEGORIES_WITH_WEIGHTS)
print(total_weight)
total_weight == 100

100


True

In [5]:
# Sanity check: the most frequent store ID must occur exactly once, i.e. no
# two stores share an ID.
store_id_counts = Counter(STORE_DICT.values())
most_common, num_most_common = store_id_counts.most_common(1)[0]
num_most_common == 1

True

In [6]:
# Sanity check: the most frequent item ID must occur exactly once, i.e. no
# two items share an ID.
item_id_counts = Counter(ITEM_LOOKUP.values())
most_common, num_most_common = item_id_counts.most_common(1)[0]
num_most_common == 1

True

In [7]:
def generate_items(CATEGORIES_AND_ITEMS, with_quantity=False):
    """Draw one random line item: a weighted category, then an item from it.

    Args:
        CATEGORIES_AND_ITEMS: mapping of category name to the list of item
            names available in that category.
        with_quantity: accepted but never read by the body; a quantity is
            always part of the result.

    Returns:
        dict with the keys "category", "item", "item_id" and "quantity".
    """
    # Split the (name, weight) pairs into two parallel sequences.
    names, weights = zip(*CATEGORIES_WITH_WEIGHTS)
    category = random.choices(names, weights, k=1)[0]
    item = random.choice(CATEGORIES_AND_ITEMS[category])
    item_id = ITEM_LOOKUP[item]
    # Quantities 1-4, weighted 30/30/20/10.
    quantity = random.choices([1, 2, 3, 4], [30, 30, 20, 10], k=1)[0]
    return {
        "category": category,
        "item": item,
        "item_id": item_id,
        "quantity": quantity,
    }

In [8]:
# Generate 50,000 sales rows. Each sale happens at one random store, holds
# between 3 and 29 line items, and gets a sale_id counting up from 3000.
SALES_DATASET = []

for i in range(50000):
    location = random.choice(STORE_LOCATIONS)

    # Roughly half of the sales carry a membership number; the rest are
    # recorded with an empty string.
    if random.random() <= 0.5:
        membership_number = str(random.choice(USER_MEMBERSHIP_NUMBERS))
    else:
        membership_number = ""

    sale_datetime = fake.date_time().isoformat()
    basket_size = random.randrange(3, 30, 1)
    basket = [generate_items(CATEGORIES_AND_ITEMS) for _ in range(basket_size)]

    SALES_DATASET.append({
        "membership_number": membership_number,
        "sale_datetime": sale_datetime,
        "sale_id": 3000 + i,
        "items": basket,
        "store_location": location,
        "store_id": STORE_DICT[location],
    })

In [9]:
# Pretty-print the first five generated sales as a quick visual check.
for sale in SALES_DATASET[:5]:
    pprint(sale)

{'items': [{'category': 'Fish and shellfish',
            'item': 'salmon',
            'item_id': 41258,
            'quantity': 1},
           {'category': 'Fruits',
            'item': 'avocados',
            'item_id': 44717,
            'quantity': 2},
           {'category': 'Health Care',
            'item': 'hydrogen peroxide',
            'item_id': 53586,
            'quantity': 3},
           {'category': 'Snacks',
            'item': 'nuts',
            'item_id': 20537,
            'quantity': 1},
           {'category': 'Condiments & Spices',
            'item': 'mustard',
            'item_id': 39739,
            'quantity': 3},
           {'category': 'Frozen Foods',
            'item': 'ready meals',
            'item_id': 70579,
            'quantity': 2},
           {'category': 'Deli', 'item': 'ham', 'item_id': 67720, 'quantity': 2},
           {'category': 'Frozen Foods',
            'item': 'ready meals',
            'item_id': 70579,
            'quantity': 1},
 

In [10]:
# Create the output directory if it is missing; otherwise open() below raises
# FileNotFoundError when the notebook runs somewhere without a data/ folder.
os.makedirs('data', exist_ok=True)

# Persist the generated sales as pretty-printed JSON.
with open('data/sales.json', 'w', encoding='utf-8') as f:
    json.dump(SALES_DATASET, f, indent=4)

In [11]:
!head -n 50 data/sales.json

[
    {
        "membership_number": "",
        "sale_datetime": "1990-03-04T03:53:32.827930",
        "sale_id": 3000,
        "items": [
            {
                "category": "Fish and shellfish",
                "item": "salmon",
                "item_id": 41258,
                "quantity": 1
            },
            {
                "category": "Fruits",
                "item": "avocados",
                "item_id": 44717,
                "quantity": 2
            },
            {
                "category": "Health Care",
                "item": "hydrogen peroxide",
                "item_id": 53586,
                "quantity": 3
            },
            {
                "category": "Snacks",
                "item": "nuts",
                "item_id": 20537,
                "quantity": 1
            },
            {
                "category": "Condiments & Spices",
                "item": "mustard",
                "item_id": 39739,
                "quantity": 3
        

In [12]:
# Three store IDs that the shopping lists (and, later, the inventory) refer to.
STORE_SAMPLE = random.sample(list(STORE_DICT.values()), 3)

# Five shopping lists, each with 3-29 line items and two preferred store IDs
# picked from STORE_SAMPLE.
LIST_DATASET = [
    {
        "list_items": [
            generate_items(CATEGORIES_AND_ITEMS)
            for _ in range(random.randrange(3, 30, 1))
        ],
        "store_location_preferences": random.sample(STORE_SAMPLE, 2),
    }
    for _ in range(5)
]

In [13]:
LIST_DATASET

[{'list_items': [{'category': 'Frozen Foods',
    'item': 'ready meals',
    'item_id': 70579,
    'quantity': 1},
   {'category': 'Frozen Foods',
    'item': 'fish',
    'item_id': 37256,
    'quantity': 2},
   {'category': 'Fish and shellfish',
    'item': 'salmon',
    'item_id': 41258,
    'quantity': 2},
   {'category': 'Pasta/Rice',
    'item': 'white rice',
    'item_id': 44001,
    'quantity': 3},
   {'category': 'Frozen Foods',
    'item': 'frozen potatoes',
    'item_id': 29709,
    'quantity': 2},
   {'category': 'Fish and shellfish',
    'item': 'crab',
    'item_id': 92164,
    'quantity': 3},
   {'category': 'Vegetables',
    'item': 'peppers',
    'item_id': 26725,
    'quantity': 2},
   {'category': 'Condiments & Spices',
    'item': 'sugar',
    'item_id': 43831,
    'quantity': 2},
   {'category': 'Fruits', 'item': 'bananas', 'item_id': 19917, 'quantity': 2},
   {'category': 'Pasta/Rice',
    'item': 'macaroni',
    'item_id': 11656,
    'quantity': 2},
   {'category'

In [14]:
# Create the output directory if it is missing; otherwise open() below raises
# FileNotFoundError when the notebook runs somewhere without a data/ folder.
os.makedirs('data', exist_ok=True)

# Persist the generated shopping lists as pretty-printed JSON.
with open('data/lists.json', 'w', encoding='utf-8') as f:
    json.dump(LIST_DATASET, f, indent=4)

In [15]:
# One inventory row per (sampled store, catalogue item), each with a random
# stock quantity of 1-4. STORE_SAMPLE holds store IDs, so the value is written
# straight into "store_id".
INVENTORY_LIST = [
    {
        "category": category,
        "item": item_name,
        "item_id": ITEM_LOOKUP[item_name],
        "quantity": random.choices([1, 2, 3, 4], [30, 30, 20, 10], k=1)[0],
        "store_id": store_id,
    }
    for store_id in STORE_SAMPLE
    for category, item_names in CATEGORIES_AND_ITEMS.items()
    for item_name in item_names
]

In [16]:
INVENTORY_LIST

[{'category': 'Fruits',
  'item': 'bananas',
  'item_id': 19917,
  'quantity': 1,
  'store_id': 8040},
 {'category': 'Fruits',
  'item': 'apples',
  'item_id': 83260,
  'quantity': 2,
  'store_id': 8040},
 {'category': 'Fruits',
  'item': 'grapes',
  'item_id': 78138,
  'quantity': 2,
  'store_id': 8040},
 {'category': 'Fruits',
  'item': 'oranges',
  'item_id': 39399,
  'quantity': 1,
  'store_id': 8040},
 {'category': 'Fruits',
  'item': 'strawberries',
  'item_id': 85617,
  'quantity': 4,
  'store_id': 8040},
 {'category': 'Fruits',
  'item': 'avocados',
  'item_id': 44717,
  'quantity': 3,
  'store_id': 8040},
 {'category': 'Fruits',
  'item': 'peaches',
  'item_id': 28815,
  'quantity': 3,
  'store_id': 8040},
 {'category': 'Fruits',
  'item': 'pineapple',
  'item_id': 53555,
  'quantity': 1,
  'store_id': 8040},
 {'category': 'Fruits',
  'item': 'pears',
  'item_id': 99461,
  'quantity': 3,
  'store_id': 8040},
 {'category': 'Vegetables',
  'item': 'potatoes',
  'item_id': 89599,

In [17]:
# Create the output directory if it is missing; otherwise open() below raises
# FileNotFoundError when the notebook runs somewhere without a data/ folder.
os.makedirs('data', exist_ok=True)

# Persist the generated inventory as pretty-printed JSON.
with open('data/inventory.json', 'w', encoding='utf-8') as f:
    json.dump(INVENTORY_LIST, f, indent=4)

## testing logic for federated exercise

### Additional challenge:

- Think through how you might actually incorporate testing in a federated environment as a team exercise! 

In [26]:
import pandas as pd

# Demand: every item on a shopping list counts against each store the list
# prefers, recorded as a negative quantity.
total_demand_list = [
    [store_id, wanted['item_id'], -wanted['quantity']]
    for shopping_list in LIST_DATASET
    for store_id in shopping_list['store_location_preferences']
    for wanted in shopping_list['list_items']
]

# Supply: the stocked quantity per store and item, kept positive.
total_inventory_list = [
    [stock['store_id'], stock['item_id'], stock['quantity']]
    for stock in INVENTORY_LIST
]

# Stack demand and supply rows, then net them out per (store, item). A
# negative total means the lists ask for more than the store has in stock.
consolidated_demand = pd.DataFrame(
    total_demand_list + total_inventory_list,
    columns=['store_id', 'item_id', 'quantity'],
)

final_result = (
    consolidated_demand
    .groupby(['store_id', 'item_id'], as_index=False)['quantity']
    .sum()
)

In [30]:
# Print one [store_id, item_id, net quantity] row per line.
for line in final_result.to_numpy():
    print(line)

[ 4936 10801    -1]
[ 4936 10981     1]
[ 4936 11656    -1]
[ 4936 13557    -2]
[ 4936 13983     1]
[ 4936 14563     4]
[ 4936 14783    -2]
[ 4936 15399     1]
[ 4936 15671     3]
[ 4936 16905    -2]
[ 4936 16948     3]
[ 4936 16979     2]
[ 4936 19917    -1]
[ 4936 20537     1]
[ 4936 20907     2]
[ 4936 21681     1]
[ 4936 22280     0]
[ 4936 22481     1]
[ 4936 23802     1]
[ 4936 24109     0]
[ 4936 26611     2]
[ 4936 26682     1]
[ 4936 26725     0]
[ 4936 27122     3]
[ 4936 28805    -1]
[ 4936 28815    -2]
[ 4936 29034    -1]
[ 4936 29709    -4]
[ 4936 29740     1]
[ 4936 31738    -3]
[ 4936 31814     2]
[ 4936 33171     0]
[ 4936 33242     1]
[ 4936 33590     1]
[ 4936 33900     3]
[ 4936 33964     1]
[ 4936 35531     1]
[ 4936 35861     2]
[ 4936 37032     1]
[ 4936 37256    -8]
[ 4936 38077     2]
[ 4936 38184     3]
[ 4936 38562     1]
[ 4936 39293     2]
[ 4936 39399     0]
[ 4936 39679     4]
[ 4936 39739     1]
[ 4936 40631    -4]
[ 4936 40634     1]
[ 4936 40982    -4]
