In [1]:
import os
import warnings
from datetime import date, timedelta
from typing import Union

import mariadb
import pandas as pd
from columnar import columnar
from dotenv import load_dotenv
from IPython.core.interactiveshell import InteractiveShell
from pandas.tseries.holiday import USFederalHolidayCalendar

# Suppress every warning category so library warnings do not clutter cell output.
warnings.filterwarnings("ignore")

# Load variables from config.env into the process environment; the return value is discarded.
_ = load_dotenv("config.env")
# Echo every bare expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Schema name that gets interpolated into the SQL statements built in later cells.
DATABASE_NAME: str = "data_mart"

In [2]:
# Data mart server: credentials are read from the environment and no default
# schema is selected on the connection.
datamart_connection = mariadb.connect(
    user=os.getenv("user"),
    password=os.getenv("password"),
    host="127.0.0.1",
    port=23306,
    autocommit=False,
)
datamart_cursor = datamart_connection.cursor()

# Team 8 source server: connects as root with "data_warehouse" as the default schema.
team8_connection = mariadb.connect(
    user="root",
    database="data_warehouse",
    host="127.0.0.1",
    port=3306,
    autocommit=False,
)
team8_cursor = team8_connection.cursor()

# Part 1a - Replace product table with the conformed product table

In [3]:
# The original product dimension table was copied to the "backups" database;
# read it back in full and keep the column names reported by the cursor.
datamart_cursor.execute("SELECT * FROM backups.product_dimension")
columns: list = [column_description[0] for column_description in datamart_cursor.description]
original_product_dimensions: pd.DataFrame = pd.DataFrame(
    data=datamart_cursor.fetchall(),
    columns=columns,
)
original_product_dimensions.shape
original_product_dimensions.columns

(2075, 14)

Index(['product_key', 'sku', 'product_name', 'product_class_id',
       'product_subcategory', 'product_category', 'product_department',
       'product_family', 'size', 'number_per_case', 'brand_name',
       'manufacturer', 'supplier', 'product_class_source_key'],
      dtype='object')

In [4]:
# Read the conformed product list from a tab-separated text file.
new_product_dimensions: pd.DataFrame = pd.read_csv("ConformedProducts.txt", sep="\t")
# Bare expressions: both the shape and the column index are echoed as cell output.
new_product_dimensions.shape
new_product_dimensions.columns

(2075, 11)

Index(['ProductKey1', 'sku', 'product_name', 'product_class_id', 'subcategory',
       'category', 'department', 'product_family', 'size', 'brandName',
       'supplier'],
      dtype='object')

### Add new product class source to meta table

In [5]:
# Register source key 7 for classes taken from the conformed product table.
# INSERT IGNORE keeps the cell re-runnable when the key already exists.
datamart_cursor.execute(
    f"""
    INSERT IGNORE INTO {DATABASE_NAME}.product_class_source (source_key, source) VALUES (?, ?)
    """,
    (7, "FROM CONFORMED PRODUCT TABLE"),
)
datamart_connection.commit()

### Create intermediate mapping table

In [6]:
# Guard flag: set to True to drop and rebuild the SKU-level mapping table.
remap_product_categories: bool = False

if remap_product_categories:
    # Source key recorded for products whose class id changed under the
    # conformed product table (the same key registered in product_class_source).
    conformed_source_key: int = 7

    # Build SKU lookups once instead of scanning the conformed frame for every
    # original row. keep="first" mirrors "take the first matching row";
    # .tolist() yields plain Python scalars that the DB driver can bind.
    conformed_rows = new_product_dimensions.drop_duplicates(subset="sku", keep="first")
    conformed_skus = conformed_rows["sku"].tolist()
    new_key_by_sku = dict(zip(conformed_skus, conformed_rows["ProductKey1"].tolist()))
    new_class_id_by_sku = dict(zip(conformed_skus, conformed_rows["product_class_id"].tolist()))

    # Validate BEFORE the destructive DROP so an incomplete conformed file
    # cannot leave the mapping table missing or half-populated.
    missing_skus = [sku for sku in original_product_dimensions["sku"].tolist() if sku not in new_key_by_sku]
    if missing_skus:
        raise LookupError(
            "SKUs missing from the conformed product table: {skus}".format(skus=missing_skus[:10])
        )

    datamart_cursor.execute("DROP TABLE IF EXISTS {db}.product_dimension_change_mapping".format(db=DATABASE_NAME))
    datamart_cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS {db}.product_dimension_change_mapping (
            sku INT UNSIGNED PRIMARY KEY NOT NULL,
            old_product_key INT UNSIGNED NOT NULL,
            old_product_class_id INT UNSIGNED NOT NULL,
            old_product_class_source_key INT UNSIGNED NOT NULL,
            new_product_key INT UNSIGNED NOT NULL,
            new_product_class_id INT UNSIGNED NOT NULL,
            new_product_class_source_key INT UNSIGNED NOT NULL
        )
        """.format(
            db=DATABASE_NAME
        )
    )

    # The statement text is loop-invariant, so format it once.
    insert_mapping_sql = """
        INSERT INTO {db}.product_dimension_change_mapping (
            sku, old_product_key, old_product_class_id, old_product_class_source_key,
            new_product_key, new_product_class_id, new_product_class_source_key
        ) VALUES (?, ?, ?, ?, ?, ?, ?)
        """.format(
        db=DATABASE_NAME
    )

    for original_dimension in original_product_dimensions.itertuples():
        new_product_class_id = new_class_id_by_sku[original_dimension.sku]
        # A changed class id is attributed to the conformed table; an unchanged
        # one keeps its original source key.
        if original_dimension.product_class_id != new_product_class_id:
            new_product_class_source_key = conformed_source_key
        else:
            new_product_class_source_key = original_dimension.product_class_source_key

        datamart_cursor.execute(
            insert_mapping_sql,
            (
                original_dimension.sku,
                original_dimension.product_key,
                original_dimension.product_class_id,
                original_dimension.product_class_source_key,
                new_key_by_sku[original_dimension.sku],
                new_product_class_id,
                new_product_class_source_key,
            ),
        )
    # Single commit so the mapping rows land together.
    datamart_connection.commit()

In [7]:
# Spot-check: print the first 25 rows of the mapping table.
datamart_cursor.execute(f"SELECT * FROM {DATABASE_NAME}.product_dimension_change_mapping LIMIT 25")
mapping_preview = datamart_cursor.fetchall()
for row in mapping_preview:
    print(row)

(42081001, 1, 57, 2, 1, 57, 2)
(42082001, 2, 57, 2, 2, 57, 2)
(42083001, 3, 83, 6, 3, 99, 7)
(42084001, 4, 7, 4, 4, 90, 7)
(42085001, 5, 6, 1, 5, 6, 1)
(42086001, 6, 48, 6, 6, 260, 7)
(42087001, 7, 65, 6, 7, 65, 6)
(42088001, 8, 0, 0, 8, 77, 7)
(42089001, 9, 30, 3, 9, 99, 7)
(42090001, 10, 30, 1, 10, 30, 1)
(42091001, 11, 35, 1, 11, 35, 1)
(42092001, 12, 5, 2, 12, 10, 7)
(42093001, 13, 5, 2, 13, 5, 2)
(42094001, 14, 30, 1, 14, 30, 1)
(42095001, 15, 30, 1, 15, 30, 1)
(42096001, 16, 30, 1, 16, 30, 1)
(42097001, 17, 30, 1, 17, 30, 1)
(42098001, 18, 30, 1, 18, 30, 1)
(42099001, 19, 35, 1, 19, 35, 1)
(42100001, 20, 52, 6, 20, 52, 6)
(42101001, 21, 30, 1, 21, 30, 1)
(42102001, 22, 30, 1, 22, 30, 1)
(42103001, 23, 1, 1, 23, 1, 1)
(42104001, 24, 13, 1, 24, 13, 1)
(42105001, 25, 13, 1, 25, 13, 1)


### Create new Product Dimension table with conformed data

In [8]:
# Rebuild the product dimension from the conformed file (guarded by the same
# flag as the mapping-table cell).
if remap_product_categories:
    datamart_cursor.execute("DROP TABLE IF EXISTS {db}.product_dimension".format(db=DATABASE_NAME))
    datamart_cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS {db}.product_dimension (
            product_key INT PRIMARY KEY AUTO_INCREMENT,
            sku INT UNSIGNED NOT NULL,
            product_name TEXT NOT NULL,
            product_class_id INT UNSIGNED,
            product_subcategory TEXT,
            product_category TEXT,
            product_department TEXT,
            product_family TEXT,
            size TEXT NOT NULL,
            brand_name TEXT,
            supplier TEXT NOT NULL
        )
        """.format(
            db=DATABASE_NAME
        )
    )

    # product_key is written explicitly from ProductKey1 rather than left to
    # AUTO_INCREMENT: the change-mapping table records ProductKey1 as the new
    # product key, so the dimension must not depend on the file's row order.
    insert_product_sql = """
        INSERT INTO {db}.product_dimension (product_key, sku, product_name, product_class_id, product_subcategory,
        product_category, product_department, product_family, size, brand_name, supplier)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """.format(
        db=DATABASE_NAME
    )

    for row in new_product_dimensions.itertuples():
        datamart_cursor.execute(
            insert_product_sql,
            (
                row.ProductKey1,
                row.sku,
                row.product_name,
                row.product_class_id,
                row.subcategory,
                row.category,
                row.department,
                row.product_family,
                row.size,
                row.brandName,
                row.supplier,
            ),
        )
    # Commit once after the loop (previously a commit was issued per row),
    # matching how the mapping table is loaded.
    datamart_connection.commit()

In [9]:
# Spot-check: print the first 25 rows of the rebuilt product dimension.
datamart_cursor.execute(f"SELECT * FROM {DATABASE_NAME}.product_dimension LIMIT 25")
product_preview = datamart_cursor.fetchall()
for row in product_preview:
    print(row)

(1, 42081001, 'Jambalaya Rice Mix', 57, 'Rice', 'Starchy Foods', 'Starchy Foods', 'Food', '12 oz', 'Zatarains', 'Rowan Warehouse')
(2, 42082001, 'Jambalaya Rice Mix', 57, 'Rice', 'Starchy Foods', 'Starchy Foods', 'Food', '8 oz', 'Zatarains', 'Rowan Warehouse')
(3, 42083001, 'Guacamole Regular', 99, 'Fresh Fruit', 'Fruit', 'Produce', 'Food', '8 oz', 'Yucatan', 'Rowan Warehouse')
(4, 42084001, 'Coffee Original Blend', 90, 'Coffee', 'Hot Beverages', 'Beverages', 'Drink', '12 oz', 'Yuban', 'Rowan Warehouse')
(5, 42085001, 'GoGurt Variety Pack', 6, 'Yogurt', 'Dairy', 'Dairy', 'Food', '8 ct', 'Yoplait', 'Rowan Warehouse')
(6, 42086001, 'Italian Dressing', 260, 'Dressings', 'Baking Goods', 'Baking Products', 'Food', '16 oz', 'Wishbone', 'Rowan Warehouse')
(7, 42087001, 'Cheeseburger Heat & Serve Sliders', 65, 'Hamburger', 'Meat', 'Meat', 'Food', '29.28 oz', 'White Castle', 'Rowan Warehouse')
(8, 42088001, 'Choice Cuts Poultry', 77, 'Fresh Chicken', 'Meat', 'Deli', 'Food', '36 oz', 'Whiskas', 

# Part 1b - Ingest transactions for Jan-June 2021 from Team 7 / Team 8 databases

### Expand date dimension table

In [10]:
starting_key = 366  # last date key from 2020
year = 2021
current_date: date = date(year, 1, 1)
stop_date: date = date(year, 12, 31)
fiscal_year_start = date(year, 8, 1)

# US federal holidays within the year, as plain `date` objects.
holidays = [
    holiday.date()
    for holiday in USFederalHolidayCalendar()
    .holidays(start=current_date.isoformat(), end=stop_date.isoformat())
    .to_pydatetime()
]

# (season name, (first day, last day)) with inclusive bounds; winter appears
# twice because it wraps around the year boundary.
seasons = [
    ("winter", (date(year, 1, 1), date(year, 3, 20))),
    ("spring", (date(year, 3, 21), date(year, 6, 20))),
    ("summer", (date(year, 6, 21), date(year, 9, 22))),
    ("autumn", (date(year, 9, 23), date(year, 12, 20))),
    ("winter", (date(year, 12, 21), date(year, 12, 31))),
]

# One row per calendar day; INSERT IGNORE makes re-running the cell harmless.
while current_date <= stop_date:
    time_tuple = current_date.timetuple()
    weekday = current_date.strftime("%A")

    # Days before the fiscal year start are attributed to the previous year.
    if current_date >= fiscal_year_start:
        fiscal_year = time_tuple.tm_year
    else:
        fiscal_year = time_tuple.tm_year - 1

    season_name = next(
        name for name, (first_day, last_day) in seasons if first_day <= current_date <= last_day
    )

    datamart_cursor.execute(
        f"""
        INSERT IGNORE INTO {DATABASE_NAME}.dates (
            date_key, datetime, day_in_month, day_in_year, week_number, month_number, month_text,
            quarter, year, fiscal_year, is_holiday, is_weekend, season
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """,
        (
            starting_key + time_tuple.tm_yday,
            current_date.isoformat(),
            time_tuple.tm_mday,
            time_tuple.tm_yday,
            current_date.isocalendar()[1],
            time_tuple.tm_mon,
            current_date.strftime("%B"),
            (time_tuple.tm_mon - 1) // 3 + 1,
            time_tuple.tm_year,
            fiscal_year,
            current_date in holidays,
            weekday in ("Saturday", "Sunday"),
            season_name,
        ),
    )

    current_date += timedelta(days=1)
datamart_connection.commit()

In [11]:
# Spot-check: print the first 25 date rows with keys above the 2020 range.
datamart_cursor.execute(f"SELECT * FROM {DATABASE_NAME}.dates WHERE date_key > {starting_key} LIMIT 25")
date_preview = datamart_cursor.fetchall()
for row in date_preview:
    print(row)

(367, datetime.date(2021, 1, 1), 1, 1, 53, 1, 'January', 1, 2021, 2020, 1, 0, 'winter')
(368, datetime.date(2021, 1, 2), 2, 2, 53, 1, 'January', 1, 2021, 2020, 0, 1, 'winter')
(369, datetime.date(2021, 1, 3), 3, 3, 53, 1, 'January', 1, 2021, 2020, 0, 1, 'winter')
(370, datetime.date(2021, 1, 4), 4, 4, 1, 1, 'January', 1, 2021, 2020, 0, 0, 'winter')
(371, datetime.date(2021, 1, 5), 5, 5, 1, 1, 'January', 1, 2021, 2020, 0, 0, 'winter')
(372, datetime.date(2021, 1, 6), 6, 6, 1, 1, 'January', 1, 2021, 2020, 0, 0, 'winter')
(373, datetime.date(2021, 1, 7), 7, 7, 1, 1, 'January', 1, 2021, 2020, 0, 0, 'winter')
(374, datetime.date(2021, 1, 8), 8, 8, 1, 1, 'January', 1, 2021, 2020, 0, 0, 'winter')
(375, datetime.date(2021, 1, 9), 9, 9, 1, 1, 'January', 1, 2021, 2020, 0, 1, 'winter')
(376, datetime.date(2021, 1, 10), 10, 10, 1, 1, 'January', 1, 2021, 2020, 0, 1, 'winter')
(377, datetime.date(2021, 1, 11), 11, 11, 2, 1, 'January', 1, 2021, 2020, 0, 0, 'winter')
(378, datetime.date(2021, 1, 12), 

### Ingest Team 8 data into staging table

In [12]:
begin_date = "2021-01-01"
end_date = "2021-06-30"
team8_staging_table = "team8.transactions_2021"

# Guard flag: set to True to drop, recreate and reload the staging table.
ingest_team8_data: bool = False

if ingest_team8_data:
    # (Re)create the 2021 staging table on the data mart server.
    datamart_cursor.execute(f"DROP TABLE IF EXISTS {team8_staging_table}")
    datamart_cursor.execute(
        f"""
        CREATE TABLE {team8_staging_table}
        (
            id INT PRIMARY KEY NOT NULL,
            transaction_id BIGINT UNSIGNED NOT NULL,
            customer_id BIGINT UNSIGNED NOT NULL,
            sku INT UNSIGNED NOT NULL,
            sale_price DECIMAL(65,2) UNSIGNED NOT NULL,
            transaction_date DATE NOT NULL,
            items_left INT UNSIGNED NOT NULL,
            total_cases_ordered INT UNSIGNED NOT NULL
        )
        """
    )

    # Pull the date-bounded transactions from the Team 8 source connection.
    team8_cursor.execute(
        f"SELECT * FROM transactions WHERE transaction_date BETWEEN '{begin_date}' AND '{end_date}'"
    )

    insert_statement = f"""
            INSERT INTO {team8_staging_table} (
                id, transaction_id, customer_id, sku, sale_price,
                transaction_date, items_left, total_cases_ordered
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """

    # Copy row by row: commit every 100,000 rows and report progress every million.
    for count, transaction in enumerate(team8_cursor, start=1):
        datamart_cursor.execute(
            insert_statement,
            tuple(transaction[position] for position in range(8)),
        )
        if count % 100000 == 0:
            datamart_connection.commit()
        if count % 1000000 == 0:
            print(count)
    # Flush the final partial batch.
    datamart_connection.commit()

In [13]:
# Report how many rows the staging table holds.
datamart_cursor.execute(f"SELECT COUNT(*) FROM {team8_staging_table}")
staged_row_count = datamart_cursor.fetchall()[0][0]
print(f"Team 8 transactions for Jan-June 2021: {staged_row_count:,}")

Team 8 transactions for Jan-June 2021: 9,584,035


### Add Team 8 data to daily sales fact table

In [14]:
# Guard flag: set to True to append the Team 8 aggregates to the daily fact table.
regen_sales_facts_daily_level: bool = False

if regen_sales_facts_daily_level:
    # The statement, read from the innermost subquery outwards:
    #   1. groups team8.transactions_2021 by sku and transaction_date, producing
    #      number_sold_today (COUNT(*)) and sales_total (SUM(sale_price)), and
    #      picks up base_price from product_catalog;
    #   2. joins product_dimension on sku to obtain product_key and derives
    #      cost_of_items_sold and gross_profit from base_price;
    #   3. joins dates on transaction_date = dates.datetime for date_key;
    #   4. joins stores restricted to store_key = 8.
    # NOTE(review): this is a plain INSERT with no prior delete, so running it
    # twice presumably appends the same rows again -- confirm against the
    # fact table's constraints, which are not visible here.
    # NOTE(review): the staging table name is spelled out in the SQL rather than
    # taken from team8_staging_table; keep the two in sync.
    datamart_cursor.execute(
        """
        INSERT INTO {db}.sales_facts_daily_level (date_key, product_key, store_key, number_sold_today, cost_of_items_sold, sales_total, gross_profit)
        SELECT date_key, product_key, store_key, number_sold_today, cost_of_items_sold, sales_total, gross_profit FROM
        (SELECT * FROM 
        (SELECT t.*, product_dimension.product_key, number_sold_today * base_price AS cost_of_items_sold, sales_total - (number_sold_today * base_price) as gross_profit FROM
            (SELECT transactions_2021.*, COUNT(*) AS number_sold_today, SUM(sale_price) AS sales_total, base_price
            FROM team8.transactions_2021
            JOIN {db}.product_catalog ON transactions_2021.sku = product_catalog.sku
            GROUP BY sku, transaction_date) t
            JOIN {db}.product_dimension ON t.sku = product_dimension.sku) t
        JOIN {db}.dates ON t.transaction_date = dates.datetime) t
        JOIN {db}.stores ON store_key = 8
        """.format(
            db=DATABASE_NAME
        )
    )
    datamart_connection.commit()

# Part 2 - Report generator

In [17]:
def generate_sales_report(
    begin_date: Union[date, str] = date(2000, 1, 1), end_date: Union[date, str] = date(2100, 1, 1), store: int = 0, department: str = "%"
) -> str:
    """Render a sales / profit table per store and department for a date range.

    Args:
        begin_date: First day of the range (inclusive), as a ``date`` or a
            ``"%Y-%m-%d"`` string.
        end_date: Last day of the range (inclusive), same accepted forms.
        store: Store key to report on; any value <= 0 selects every store.
        department: SQL ``LIKE`` pattern for the product department; the
            default ``"%"`` matches all departments.

    Returns:
        The text table produced by ``columnar`` with the columns store,
        department, total sales and total profit. When the query yields no
        rows, a single row of ``"N/A"`` values is rendered instead.

    Raises:
        ValueError: If a string date does not match ``"%Y-%m-%d"``.
    """
    # Imported here because the notebook's import cell only brings in `date`
    # and `timedelta`; without it the string branches below raise NameError.
    from datetime import datetime

    store_key: str = str(store) if store > 0 else "%"

    if isinstance(begin_date, str):
        begin_date = datetime.strptime(begin_date, "%Y-%m-%d").date()

    if isinstance(end_date, str):
        end_date = datetime.strptime(end_date, "%Y-%m-%d").date()

    # NOTE(review): the report reads the "_sample" fact table in the "backups"
    # schema -- confirm this is the intended source.
    fact_table = "sales_facts_daily_level_sample"
    date_range = (str(begin_date), str(end_date))

    # Identifiers (schema / table names) cannot be bound, so they are still
    # formatted in; every caller-supplied value goes through a `?` placeholder,
    # like the other statements in this notebook.
    datamart_cursor.execute(
        """
        SELECT store, department, CONVERT(CONCAT('$', FORMAT(sales_total, 2)) USING latin1) AS 'total sales',
        CONVERT(CONCAT('$', FORMAT(gross_profit, 2)) USING latin1) AS 'total profit' FROM
        ((SELECT IFNULL(store_key, 'ALL') AS store, IFNULL(product_department, 'TOTAL') AS department,
            SUM(sales_total) AS sales_total,
            SUM(gross_profit) AS gross_profit
            FROM backups.{fact_table}
        JOIN {db}.dates ON {fact_table}.date_key = dates.date_key
        JOIN {db}.product_dimension ON {fact_table}.product_key = product_dimension.product_key
        WHERE datetime BETWEEN ? AND ? AND product_department LIKE ? AND store_key LIKE ?
        GROUP BY product_department, store_key WITH ROLLUP)
        UNION
        (SELECT store_key, "TOTAL", SUM(sales_total), SUM(gross_profit) FROM backups.{fact_table}
        JOIN {db}.dates ON {fact_table}.date_key = dates.date_key
        WHERE datetime BETWEEN ? AND ? AND store_key LIKE ?
        GROUP BY store_key)) t
        WHERE department LIKE ? AND store LIKE ?
        ORDER BY department, store
        """.format(
            db=DATABASE_NAME,
            fact_table=fact_table,
        ),
        # Bound in the order the placeholders appear: per-department subquery,
        # per-store totals subquery, then the outer filter.
        (
            *date_range,
            department,
            store_key,
            *date_range,
            store_key,
            department,
            store_key,
        ),
    )

    colnames = [desc[0] for desc in datamart_cursor.description]
    rows = datamart_cursor.fetchall()
    # columnar needs at least one row, so substitute a placeholder row.
    table_rows = [list(row) for row in rows] if rows else [["N/A", "N/A", "N/A", "N/A"]]
    sales_table = columnar(
        table_rows,
        colnames,
        no_borders=True,
        terminal_width=100,
    )
    return sales_table


def generate_quarterly_sales_report(quarter: int, year: int, store: int = 0, department: str = "%") -> str:
    """Render the sales report for one calendar quarter.

    Args:
        quarter: Calendar quarter number (1-4).
        year: Calendar year of the quarter.
        store: Store key forwarded to ``generate_sales_report``.
        department: Department pattern forwarded to ``generate_sales_report``.

    Returns:
        The text table returned by ``generate_sales_report`` for the first
        through last day of the quarter.
    """
    # A scalar Period replaces the field-based PeriodIndex(year=..., quarter=...)
    # constructor, which recent pandas releases deprecate.
    period = pd.Period(year=year, quarter=quarter, freq="Q")
    begin_date = period.start_time.date()
    end_date = period.end_time.date()
    return generate_sales_report(store=store, department=department, begin_date=begin_date, end_date=end_date)


In [18]:
# print(generate_sales_report(begin_date="2020-1-1", end_date="2021-12-31"))

# Q2 reports for stores 7 and 8, comparing 2020 with 2021 for each store.
for report_store in (7, 8):
    for report_year in (2020, 2021):
        print(generate_quarterly_sales_report(quarter=2, year=report_year, store=report_store))

          
  STORE  DEPARTMENT          TOTAL SALES  TOTAL PROFIT  
    
  7      Baby Food           $666.89      $43.64        
  7      Baked Goods         $12,808.51   $837.91       
  7      Baking Products     $14,259.66   $932.84       
  7      Beverages           $84,951.28   $5,557.61     
  7      Breakfast Foods     $3,418.11    $223.81       
  7      Canned Foods        $14,734.46   $963.88       
  7      Dairy               $14,525.92   $950.29       
  7      Deli                $9,664.10    $632.22       
  7      Eggs                $330.48      $21.62        
  7      Frozen Foods        $26,836.42   $1,755.60     
  7      Health and Hygiene  $7,393.42    $483.68       
  7      Household           $7,811.67    $511.16       
  7      Meat                $2,971.93    $194.42       
  7      Pet Foods           $152.03      $9.95         
  7      Produce             $28,034.04   $1,834.08     
  7      Seafood             $314.69      $20.59        
  7      Snack 