In [1]:
import os

import mariadb
import pandas as pd
from columnar import columnar

In [2]:
# Connection settings default to the local development warehouse. Each value
# can be overridden through an environment variable so that host names and
# credentials do not have to be edited into (or committed with) the notebook:
#   DW_HOST, DW_USER, DW_PASSWORD, DW_DATABASE
connection_settings = {
    "host": os.environ.get("DW_HOST", "127.0.0.1"),
    "user": os.environ.get("DW_USER", "root"),
    "database": os.environ.get("DW_DATABASE", "data-warehouse"),
    # Commit every statement as soon as it executes.
    "autocommit": True,
}
# Only pass a password when one is configured, so the default behaviour
# (connecting without a password argument) stays exactly as before.
if os.environ.get("DW_PASSWORD"):
    connection_settings["password"] = os.environ["DW_PASSWORD"]

connection = mariadb.connect(**connection_settings)
# Single shared cursor used by every cell below.
cursor = connection.cursor()

In [3]:
# Inclusive date window (SQL BETWEEN) analysed by the rest of the notebook.
# NOTE(review): if transaction_date stores a time of day, the upper bound only
# matches midnight of end_date — confirm the column type.
begin_date = "2020-12-17"
end_date = "2021-01-01"

# SELECT that picks the rows of `transactions` falling inside the window.
window = {"begin_date": begin_date, "end_date": end_date}
subquery = (
    "SELECT * FROM transactions "
    "WHERE transaction_date BETWEEN '{begin_date}' AND '{end_date}'"
).format(**window)

# Materialise the window as transactions_subset, replacing any copy left over
# from an earlier run so the cell can be re-executed safely.
for statement in (
    "DROP TABLE IF EXISTS transactions_subset",
    "CREATE TABLE transactions_subset {q}".format(q=subquery),
):
    cursor.execute(statement)

## Number of rows within date range

In [4]:
# Size of the working table; the value of the first column of the first
# returned row is displayed as the cell output.
cursor.execute("SELECT COUNT(*) FROM transactions_subset")
count_rows = cursor.fetchall()
count_rows[0][0]

794242

## Average per day statistics

In [5]:
# --- Observed daily averages --------------------------------------------------
# Rows of transactions_subset per distinct calendar day; the report below
# treats every row as one item sold.
cursor.execute(
    "SELECT COUNT(*) / COUNT(DISTINCT DATE(transaction_date)) FROM transactions_subset"
)
items_per_day = float(cursor.fetchall()[0][0])

# Distinct transaction ids per distinct calendar day. DATE() is applied here as
# well so the number of days is computed exactly as for items_per_day,
# whatever the column type of transaction_date is.
cursor.execute(
    "SELECT (SELECT COUNT(DISTINCT transaction_id) FROM transactions_subset) / (SELECT COUNT(DISTINCT DATE(transaction_date)) from transactions_subset)"
)
customers_per_day = float(cursor.fetchall()[0][0])

# --- Catalog composition ------------------------------------------------------
cursor.execute("SELECT COUNT(*) FROM catalog")
num_catalog_items = cursor.fetchall()[0][0]

# Catalog items outside the seven categories that are verified separately as
# "weighted" categories further down in the notebook.
cursor.execute(
    "SELECT COUNT(*) FROM catalog WHERE item_type NOT IN ('MILK', 'BABY FOOD', 'CEREAL', 'DIAPERS', 'BREAD', 'PEANUT BUTTER', 'JELLY/JAM')"
)
num_unweighted_items = cursor.fetchall()[0][0]
# Despite the name this is a fraction in [0, 1], not a value out of 100.
percent_unweighted_items = num_unweighted_items / num_catalog_items

# --- Expected values ----------------------------------------------------------
# NOTE(review): 1100 and 1150 look like the bounds of the daily customer count,
# and 50 * (2 / 7) like a two-days-a-week bump averaged over the week — confirm
# against the parameters used to generate the data.
expected_customers_per_day = ((1100 + 1150) / 2) + (50 * (2.0 / 7.0))
# NOTE(review): presumably the midpoint of a 1..90 items-per-customer range —
# confirm.
expected_items_per_customer = ((90 - 1) / 2) + 1
# The per-customer expectation is rounded to a whole item before scaling.
expected_items_per_day = expected_customers_per_day * int(
    round(expected_items_per_customer)
)

# --- Report -------------------------------------------------------------------
print(
    "Average total items sold per day: {actual} (expected: {expected})".format(
        actual=int(round(items_per_day)), expected=int(round(expected_items_per_day))
    )
)
print(
    "Average total customers per day: {actual} (expected: {expected})".format(
        actual=int(round(customers_per_day)),
        expected=int(round(expected_customers_per_day)),
    )
)
print(
    "Average items per customer: {actual} (expected: {expected})".format(
        actual=int(round(items_per_day / customers_per_day)),
        expected=int(round(expected_items_per_customer)),
    )
)
# Reuse the fraction computed above instead of repeating the division.
print("Percentage of unweighted items: {x}".format(x=percent_unweighted_items))
print(
    "Expected sales per day of each unweighted item: {x:0.2f}".format(
        x=(items_per_day / num_catalog_items) / percent_unweighted_items
    )
)

Average total items sold per day: 52949 (expected: 52407)
Average total customers per day: 1138 (expected: 1139)
Average items per customer: 47 (expected: 46)
Percentage of unweighted items: 0.8
Expected sales per day of each unweighted item: 31.90


## Build tables containing daily averages

In [6]:
def _require_sql_name(name: str, role: str, qualified: bool = False) -> None:
    """Raise ValueError unless ``name`` can be spliced into SQL as a bare identifier.

    Every part must be a non-empty string of letters, digits, ``_`` or ``$``.
    With ``qualified=True`` a dotted ``database.table`` form is accepted too.
    """
    parts = name.split(".") if (qualified and isinstance(name, str)) else [name]
    for part in parts:
        is_plain = (
            isinstance(part, str)
            and part != ""
            and all(ch.isalnum() or ch in "_$" for ch in part)
        )
        if not is_plain:
            raise ValueError(
                "{role} is not a plain SQL identifier: {name!r}".format(
                    role=role, name=name
                )
            )


def build_avg_per_day_table(table_name: str, target_column: str) -> None:
    """(Re)create ``table_name`` with per-day sales averages grouped by a catalog column.

    Joins ``transactions_subset`` to ``catalog`` on ``sku``, groups by
    ``catalog.<target_column>`` and stores one row per group with:

    * ``<target_column>`` (TEXT) - the grouping value;
    * ``count`` (INT) - rows in the group divided by the number of distinct
      days on which the group has rows, rounded;
    * ``percent_of_customers`` (FLOAT) - the same unrounded per-day figure
      divided by the overall number of distinct transaction ids per distinct
      day. It is a ratio, not a value out of 100.

    Any existing table called ``table_name`` is dropped first, so calling the
    function again simply rebuilds it. Uses the notebook-level ``cursor``.

    Args:
        table_name: Name of the table to (re)create.
        target_column: Column of ``catalog`` to group by.

    Raises:
        ValueError: If either argument is not a plain SQL identifier. Both are
            interpolated into the statement text because identifiers cannot be
            sent as bound parameters.
    """
    _require_sql_name(table_name, "table_name", qualified=True)
    _require_sql_name(target_column, "target_column")

    cursor.execute("DROP TABLE IF EXISTS {table_name}".format(table_name=table_name))
    cursor.execute(
        """
        CREATE TABLE {table_name} ({target_column} TEXT, count INT, percent_of_customers FLOAT)
            SELECT
                catalog.{target_column},
                ROUND({count}) as count,
                ({count}) / (SELECT ({num_transactions}) / ({num_days})) as percent_of_customers
            FROM transactions_subset
            JOIN catalog ON transactions_subset.sku = catalog.sku
            GROUP BY catalog.{target_column}
        """.format(
            table_name=table_name,
            target_column=target_column,
            # Per group: rows per distinct day on which the group appears.
            count="COUNT(*) / COUNT(DISTINCT DATE(transaction_date))",
            num_transactions="SELECT COUNT(DISTINCT transaction_id) FROM transactions_subset",
            # DATE() keeps the overall day count consistent with the per-group
            # day count used in `count`.
            num_days="SELECT COUNT(DISTINCT DATE(transaction_date)) from transactions_subset",
        )
    )

### Average sales per day of each category

In [7]:
# Build the per-category averages, then preview the first few categories.
build_avg_per_day_table("avg_item_categories_sold_per_day", "item_type")
# ORDER BY makes the five-row preview deterministic; without it the server is
# free to return any five rows.
cursor.execute(
    "SELECT * FROM avg_item_categories_sold_per_day ORDER BY item_type LIMIT 5"
)
cursor.fetchall()

[('', 8943, 7.85703),
 ('ACETOMINIFEN', 149, 0.130959),
 ('ASPIRIN', 301, 0.264789),
 ('BABY FOOD', 227, 0.199309),
 ('BAKED GOODS OTHER THAN BREAD', 2329, 2.04586)]

#### Verify distribution of weighted categories

In [8]:
# (category, expected share of customers buying it) for each weighted category.
# NOTE(review): the two-term entries look like conditional probabilities, e.g.
# CEREAL = P(milk) * 0.5 + P(no milk) * 0.05 with P(milk) = 0.7 — confirm
# against the rules used to generate the data.
weighted_categories = [
    ("MILK", 0.7),
    ("BABY FOOD", 0.2),
    ("CEREAL", (0.7 * 0.5) + (0.3 * 0.05)),
    ("DIAPERS", (0.2 * 0.8) + (0.8 * 0.01)),
    ("BREAD", 0.5),
    ("PEANUT BUTTER", 0.1),
    ("JELLY/JAM", (0.1 * 0.9) + (0.9 * 0.05)),
]


data = []
for category_name, expected_share in weighted_categories:
    # Bind the category as a parameter instead of formatting it into the SQL.
    cursor.execute(
        "SELECT percent_of_customers FROM avg_item_categories_sold_per_day WHERE item_type = ?",
        (category_name,),
    )
    matching_rows = cursor.fetchall()
    if not matching_rows:
        # Fail with the category name rather than a bare IndexError.
        raise LookupError(
            "no row for item_type {name!r} in avg_item_categories_sold_per_day".format(
                name=category_name
            )
        )
    actual_share = matching_rows[0][0]
    data.append(
        [
            category_name,
            round(expected_share, 3),
            round(actual_share, 3),
            round(abs(actual_share - expected_share), 3),
        ]
    )


table = columnar(
    data, ["catgeory", "expected", "actual", "difference"], no_borders=True
)
print(table)

          
  CATGEORY       EXPECTED  ACTUAL  DIFFERENCE  
    
  MILK           0.7       0.699   0.001       
  BABY FOOD      0.2       0.199   0.001       
  CEREAL         0.365     0.364   0.001       
  DIAPERS        0.168     0.168   0.0         
  BREAD          0.5       0.499   0.001       
  PEANUT BUTTER  0.1       0.097   0.003       
  JELLY/JAM      0.135     0.132   0.003       



#### Verify number of cases ordered for weighted categories

In [9]:
# avg_items_sold_per_day is read below, so (re)build it here instead of relying
# on a later cell or on a copy left in the database by an earlier run (which
# may have used a different date window). build_avg_per_day_table drops and
# recreates the table, so running it again is harmless.
build_avg_per_day_table("avg_items_sold_per_day", "sku")

data = []
for category_name, _expected_share in weighted_categories:
    # Mean, over the SKUs of this category, of the per-SKU daily sales count.
    cursor.execute(
        """
        SELECT AVG(count)
            FROM avg_items_sold_per_day
            JOIN catalog ON avg_items_sold_per_day.sku = catalog.sku
            WHERE item_type = ?
        """,
        (category_name,),
    )
    sales_per_item = float(cursor.fetchall()[0][0])
    # NOTE(review): 12 looks like the number of items per case and 365 like
    # days per year — confirm.
    expected_cases_ordered = (sales_per_item / 12) * 365

    # Largest total_cases_ordered value seen on any row of this category.
    cursor.execute(
        """
        SELECT MAX(total_cases_ordered)
            FROM transactions_subset
            JOIN catalog ON transactions_subset.sku = catalog.sku
            WHERE item_type = ?
        """,
        (category_name,),
    )
    actual_cases_ordered = float(cursor.fetchall()[0][0])

    # Signed relative error in percent; the sign is dropped when reported.
    error = (
        (expected_cases_ordered - actual_cases_ordered) / actual_cases_ordered
    ) * 100.0

    data.append(
        [
            category_name,
            int(round(expected_cases_ordered)),
            int(round(actual_cases_ordered)),
            abs(round(error, 2)),
        ]
    )

table = columnar(data, ["catgeory", "expected", "actual", "error"], no_borders=True)
print(table)

          
  CATGEORY       EXPECTED  ACTUAL  ERROR  
    
  MILK           4035      4053    0.44   
  BABY FOOD      60        48      25.56  
  CEREAL         140       146     4.12   
  DIAPERS        78        77      1.17   
  BREAD          357       375     4.69   
  PEANUT BUTTER  170       179     4.84   
  JELLY/JAM      1141      1174    2.84   



### Average sales per day of each item

In [10]:
# Build the per-SKU averages, then preview a few rows together with a readable
# product description assembled from the catalog columns.
build_avg_per_day_table("avg_items_sold_per_day", "sku")
# ORDER BY makes the five-row preview deterministic; without it the server is
# free to return any five rows.
cursor.execute(
    """
    SELECT item_type, CONCAT_WS(' ', manufacturer, product_name, size), count, percent_of_customers
        FROM avg_items_sold_per_day
        JOIN catalog ON avg_items_sold_per_day.sku = catalog.sku
        ORDER BY avg_items_sold_per_day.sku
        LIMIT 5
    """
)
cursor.fetchall()

[('RICE/RICE MIX', 'Zatarains Jambalaya Rice Mix 12\xa0oz', 31, 0.0269415),
 ('RICE/RICE MIX', 'Zatarains Jambalaya Rice Mix 8\xa0oz', 29, 0.0251259),
 ('', 'Yucatan Guacamole Regular 8\xa0oz', 31, 0.0271758),
 ('COFFEE/CREAMER', 'Yuban Coffee Original Blend 12\xa0oz', 31, 0.0273515),
 ('YOGURT', 'Yoplait GoGurt Variety Pack 8\xa0ct', 30, 0.0262387)]

#### Verify distribution of unweighted categories

In [11]:
# Mean of the per-SKU daily `count` over every SKU whose item_type is not one
# of the seven weighted categories.
unweighted_avg_query = """
    SELECT AVG(count)
        FROM avg_items_sold_per_day
        JOIN catalog ON avg_items_sold_per_day.sku = catalog.sku
        WHERE item_type NOT IN ('MILK', 'BABY FOOD', 'CEREAL', 'DIAPERS', 'BREAD', 'PEANUT BUTTER', 'JELLY/JAM')
    """
cursor.execute(unweighted_avg_query)

# Expected figure, derived from the totals computed in the statistics cell.
expected_unweighted_sales = (
    items_per_day / num_catalog_items
) / percent_unweighted_items
expected_line = "Expected sales per day of each unweighted item: {x:0.2f}"
print(expected_line.format(x=expected_unweighted_sales))

# Observed figure: the single value returned by the query above.
actual_line = "Actual sales per day of each unweighted item: {x:0.2f}"
actual_unweighted_sales = float(cursor.fetchall()[0][0])
print(actual_line.format(x=actual_unweighted_sales))

Expected sales per day of each unweighted item: 31.90
Actual sales per day of each unweighted item: 30.40


#### Verify number of cases ordered for unweighted categories

In [12]:
# Expected daily sales of one unweighted item, scaled to cases per year.
# NOTE(review): 12 looks like items per case and 365 like days per year — confirm.
sales_per_item = items_per_day / num_catalog_items / percent_unweighted_items
expected_cases_ordered = sales_per_item / 12 * 365

# Largest total_cases_ordered value on any row whose item_type is not one of
# the seven weighted categories.
max_cases_query = """
    SELECT MAX(total_cases_ordered)
        FROM transactions_subset
        JOIN catalog ON transactions_subset.sku = catalog.sku
        WHERE item_type NOT IN ('MILK', 'BABY FOOD', 'CEREAL', 'DIAPERS', 'BREAD', 'PEANUT BUTTER', 'JELLY/JAM')
    """
cursor.execute(max_cases_query)

expected_line = "Expected cases ordered per year of each unweighted item: {x:0.2f}"
print(expected_line.format(x=expected_cases_ordered))

actual_line = "Actual cases ordered per year of each unweighted item: {x:0.2f}"
actual_cases_ordered_unweighted = float(cursor.fetchall()[0][0])
print(actual_line.format(x=actual_cases_ordered_unweighted))

Expected cases ordered per year of each unweighted item: 970.21
Actual cases ordered per year of each unweighted item: 975.00
