In [1]:
import csv

import mariadb
import pandas as pd
from columnar import columnar

In [2]:
# Settings for the warehouse database; no password is supplied for the root
# account, and autocommit is switched on for the whole session.
connection_settings = dict(
    host="127.0.0.1",
    user="root",
    database="data-warehouse",
    autocommit=True,
)
connection = mariadb.connect(**connection_settings)

# Single shared cursor reused by every cell below.
cursor = connection.cursor()

In [3]:
# Date window for the analysis; BETWEEN includes both endpoints.
begin_date = "2020-04-05"
end_date = "2020-04-18"

# SELECT that picks the transactions falling inside the window.
subquery = (
    "SELECT * FROM transactions WHERE transaction_date BETWEEN '{begin_date}' AND '{end_date}'"
).format(begin_date=begin_date, end_date=end_date)

# Drop-then-create keeps the cell re-runnable: the subset table is rebuilt
# from scratch every time.
for statement in (
    "DROP TABLE IF EXISTS transactions_subset",
    "CREATE TABLE transactions_subset {q}".format(q=subquery),
):
    cursor.execute(statement)

## Number of rows within date range

In [4]:
# COUNT(*) yields a single row with a single column; pull that scalar out and
# leave it as the cell's displayed value.
cursor.execute("SELECT COUNT(*) FROM transactions_subset")
subset_row_count = cursor.fetchall()[0][0]
subset_row_count

733507

## Average per day statistics

In [5]:
# Both per-day averages divide by the number of distinct sales days in the
# subset. The same DATE(transaction_date) expression is used in both queries so
# they share one denominator even if transaction_date carries a time component.
cursor.execute(
    "SELECT COUNT(*) / COUNT(DISTINCT DATE(transaction_date)) FROM transactions_subset"
)
items_per_day = float(cursor.fetchall()[0][0])

# Distinct transaction ids per day; reported below as "customers per day".
cursor.execute(
    "SELECT (SELECT COUNT(DISTINCT transaction_id) FROM transactions_subset) / (SELECT COUNT(DISTINCT DATE(transaction_date)) FROM transactions_subset)"
)
customers_per_day = float(cursor.fetchall()[0][0])

cursor.execute("SELECT COUNT(*) FROM catalog")
num_catalog_items = cursor.fetchall()[0][0]

# Catalog items outside the seven categories that are checked against explicit
# weights further down the notebook.
cursor.execute(
    "SELECT COUNT(*) FROM catalog WHERE item_type NOT IN ('MILK', 'BABY FOOD', 'CEREAL', 'DIAPERS', 'BREAD', 'PEANUT BUTTER', 'JELLY/JAM')"
)
num_unweighted_items = cursor.fetchall()[0][0]
# Stored as a fraction in [0, 1], not as a 0-100 percentage.
percent_unweighted_items = num_unweighted_items / num_catalog_items

# NOTE(review): the constants below presumably mirror the data generator's
# parameters (a 1100-1150 customer range plus 50 extra on two days out of
# seven, and 1-90 items per customer) -- confirm against the generator spec.
expected_customers_per_day = ((1100 + 1150) / 2) + (50 * (2.0 / 7.0))
expected_items_per_customer = ((90 - 1) / 2) + 1
# Items per customer is rounded to a whole number before multiplying.
expected_items_per_day = expected_customers_per_day * int(
    round(expected_items_per_customer)
)

print(
    "Average total items sold per day: {actual} (expected: {expected})".format(
        actual=int(round(items_per_day)), expected=int(round(expected_items_per_day))
    )
)
print(
    "Average total customers per day: {actual} (expected: {expected})".format(
        actual=int(round(customers_per_day)),
        expected=int(round(expected_customers_per_day)),
    )
)
print(
    "Average items per customer: {actual} (expected: {expected})".format(
        actual=int(round(items_per_day / customers_per_day)),
        expected=int(round(expected_items_per_customer)),
    )
)
# Reuse the fraction computed above instead of recomputing it inline.
print("Percentage of unweighted items: {x}".format(x=percent_unweighted_items))
print(
    "Expected sales per day of each unweighted item: {x:0.2f}".format(
        x=(items_per_day / num_catalog_items) / percent_unweighted_items
    )
)

Average total items sold per day: 52393 (expected: 52407)
Average total customers per day: 1136 (expected: 1139)
Average items per customer: 46 (expected: 46)
Percentage of unweighted items: 0.8
Expected sales per day of each unweighted item: 31.56


## Build tables containing daily averages

In [6]:
def build_avg_per_day_table(table_name: str, target_column: str) -> None:
    """Rebuild ``table_name`` with per-day sales averages grouped by a catalog column.

    The table is dropped if it exists and recreated from ``transactions_subset``
    joined to ``catalog`` on ``sku``. It has one row per distinct value of
    ``catalog.<target_column>`` and three columns:

    * ``<target_column>`` (TEXT) -- the grouping value.
    * ``count`` (INT) -- rows in the group divided by the number of distinct
      sales days in the group, rounded.
    * ``percent_of_customers`` (FLOAT) -- the unrounded per-day count divided by
      the average number of distinct transactions per day over the whole
      subset. This is a ratio, not a 0-100 percentage, and can exceed 1.

    Args:
        table_name: Name of the table to (re)create.
        target_column: Column of ``catalog`` to group by (e.g. ``"item_type"``
            or ``"sku"``).

    Both arguments are interpolated into the SQL text as identifiers, so only
    trusted, hard-coded names should be passed. The statement runs on the
    module-level ``cursor``.
    """
    # One expression for "distinct sales days" so the per-group numerator and
    # the overall denominator count days the same way.
    distinct_days = "COUNT(DISTINCT DATE(transaction_date))"

    cursor.execute("DROP TABLE IF EXISTS {table_name}".format(table_name=table_name))
    cursor.execute(
        """
        CREATE TABLE {table_name} ({target_column} TEXT, count INT, percent_of_customers FLOAT)
            SELECT
                catalog.{target_column},
                ROUND({count}) as count,
                ({count}) / (SELECT ({num_transactions}) / ({num_days})) as percent_of_customers
            FROM transactions_subset
            JOIN catalog ON transactions_subset.sku = catalog.sku
            GROUP BY catalog.{target_column}
        """.format(
            table_name=table_name,
            target_column=target_column,
            count="COUNT(*) / {days}".format(days=distinct_days),
            num_transactions="SELECT COUNT(DISTINCT transaction_id) FROM transactions_subset",
            num_days="SELECT {days} FROM transactions_subset".format(
                days=distinct_days
            ),
        )
    )

### Average sales per day of each category

In [7]:
# Rebuild the per-category daily averages, then show a handful of rows as a
# sanity check (the cell's value is the fetched preview).
build_avg_per_day_table(
    table_name="avg_item_categories_sold_per_day",
    target_column="item_type",
)
cursor.execute("SELECT * FROM avg_item_categories_sold_per_day LIMIT 5")
cursor.fetchall()

[('', 8846, 7.78991),
 ('ACETOMINIFEN', 148, 0.130079),
 ('ASPIRIN', 303, 0.267141),
 ('BABY FOOD', 231, 0.203673),
 ('BAKED GOODS OTHER THAN BREAD', 2276, 2.0044)]

#### Verify distribution of weighted categories

In [8]:
# (category, expected share of customers buying it) pairs.
# NOTE(review): the compound entries look like law-of-total-probability sums,
# e.g. CEREAL = P(milk) * P(cereal | milk) + P(no milk) * P(cereal | no milk);
# confirm the individual probabilities against the data generator's spec.
weighted_categories = [
    ("MILK", 0.7),
    ("BABY FOOD", 0.2),
    ("CEREAL", (0.7 * 0.5) + (0.3 * 0.05)),
    ("DIAPERS", (0.2 * 0.8) + (0.8 * 0.01)),
    ("BREAD", 0.5),
    ("PEANUT BUTTER", 0.1),
    ("JELLY/JAM", (0.1 * 0.9) + (0.9 * 0.05)),
]

# The query text is constant; the category is bound as a parameter rather than
# spliced into the SQL string, so quoting is handled by the driver.
category_share_query = (
    "SELECT percent_of_customers FROM avg_item_categories_sold_per_day "
    "WHERE item_type = ?"
)

data = []
for category, expected in weighted_categories:
    cursor.execute(category_share_query, (category,))
    # Exactly one row per category is expected; a missing category surfaces
    # here as an IndexError.
    actual = cursor.fetchall()[0][0]
    data.append(
        [
            category,
            round(expected, 3),
            round(actual, 3),
            round(abs(actual - expected), 3),
        ]
    )


table = columnar(
    data, ["catgeory", "expected", "actual", "difference"], no_borders=True
)
print(table)

          
  CATGEORY       EXPECTED  ACTUAL  DIFFERENCE  
    
  MILK           0.7       0.701   0.001       
  BABY FOOD      0.2       0.204   0.004       
  CEREAL         0.365     0.364   0.001       
  DIAPERS        0.168     0.171   0.003       
  BREAD          0.5       0.498   0.002       
  PEANUT BUTTER  0.1       0.104   0.004       
  JELLY/JAM      0.135     0.14    0.005       



### Average sales per day of each item

In [9]:
# Rebuild the per-SKU daily averages, then preview a few rows joined back to
# the catalog so each SKU shows up with a readable description.
build_avg_per_day_table(
    table_name="avg_items_sold_per_day",
    target_column="sku",
)
cursor.execute(
    """
    SELECT item_type, CONCAT_WS(' ', manufacturer, product_name, size), count, percent_of_customers 
        FROM avg_items_sold_per_day
        JOIN catalog ON avg_items_sold_per_day.sku = catalog.sku LIMIT 5
    """
)
cursor.fetchall()

[('RICE/RICE MIX', 'Zatarains Jambalaya Rice Mix 12\xa0oz', 31, 0.0269845),
 ('RICE/RICE MIX', 'Zatarains Jambalaya Rice Mix 8\xa0oz', 33, 0.0288087),
 ('', 'Yucatan Guacamole Regular 8\xa0oz', 30, 0.0267329),
 ('COFFEE/CREAMER', 'Yuban Coffee Original Blend 12\xa0oz', 31, 0.0268587),
 ('YOGURT', 'Yoplait GoGurt Variety Pack 8\xa0ct', 32, 0.0285571)]

#### Verify distribution of unweighted categories

In [10]:
# Mean of the rounded per-day counts across every SKU whose category is not
# one of the seven weighted ones.
cursor.execute(
    """
    SELECT AVG(count)
        FROM avg_items_sold_per_day
        JOIN catalog ON avg_items_sold_per_day.sku = catalog.sku
        WHERE item_type NOT IN ('MILK', 'BABY FOOD', 'CEREAL', 'DIAPERS', 'BREAD', 'PEANUT BUTTER', 'JELLY/JAM')
    """
)
actual_unweighted_sales = float(cursor.fetchall()[0][0])

# Same expectation as printed in the per-day statistics cell, derived from the
# values that cell left in the kernel.
expected_unweighted_sales = (
    items_per_day / num_catalog_items
) / percent_unweighted_items

print(
    "Expected sales per day of each unweighted item: {x:0.2f}".format(
        x=expected_unweighted_sales
    )
)
print(
    "Actual sales per day of each unweighted item: {x:0.2f}".format(
        x=actual_unweighted_sales
    )
)

Expected sales per day of each unweighted item: 31.56
Actual sales per day of each unweighted item: 30.10


## Dump average items sold per day to CSV file

In [11]:
# Join the per-SKU daily averages back to the catalog so every exported row
# carries its descriptive columns.
cursor.execute(
    """
    SELECT catalog.sku, item_type, manufacturer, product_name, size, count, percent_of_customers
        FROM avg_items_sold_per_day
        JOIN catalog ON avg_items_sold_per_day.sku = catalog.sku
    """
)
colnames = [desc[0] for desc in cursor.description]

# fetchall() returns the complete result set in one call, so a single frame and
# a single write are enough; no loop (and no risk of repeated header rows).
# Naming the columns on the frame means the header is written even when the
# query returns no rows.
avg_items_sold_per_day_df = pd.DataFrame(cursor.fetchall(), columns=colnames)

# Let pandas open the file itself: it then controls newline handling, and the
# explicit UTF-8 encoding keeps non-ASCII characters in the product text (such
# as non-breaking spaces) intact on every platform.
avg_items_sold_per_day_df.to_csv(
    "avg_items_sold_per_day.csv", index=False, encoding="utf-8"
)