In [1]:
import os
from csv import DictReader

import mariadb
import pandas as pd
from dotenv import load_dotenv
from IPython.core.interactiveshell import InteractiveShell

# Load environment variables from config.env (the connection cell reads "user" and
# "password" via os.getenv). The result is bound to `_`, presumably so the return
# value is not echoed as cell output.
_ = load_dotenv("config.env")
# IPython option: display the value of every top-level expression in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Database name interpolated into the SQL statements of this notebook.
DATABASE_NAME: str = "data_mart"

In [2]:
# Open one shared connection to the MariaDB server on localhost:23306.
# Credentials are taken from the environment; autocommit is off, so writes
# only persist after an explicit connection.commit().
connection = mariadb.connect(
    user=os.environ.get("user"),
    password=os.environ.get("password"),
    host="127.0.0.1",
    port=23306,
    autocommit=False,
)
cursor = connection.cursor()

## Part 1 - Replace product table with the conformed product table

In [3]:
# The original product dimension table was copied to the "backups" database;
# load that copy into a DataFrame, taking the column names from the cursor metadata.
cursor.execute("SELECT * FROM backups.product_dimension")
columns: list = [column_meta[0] for column_meta in cursor.description]
backup_rows = cursor.fetchall()
original_product_dimensions: pd.DataFrame = pd.DataFrame(
    data=backup_rows,
    columns=columns,
)
original_product_dimensions.shape
original_product_dimensions.columns

(2075, 14)

Index(['product_key', 'sku', 'product_name', 'product_class_id',
       'product_subcategory', 'product_category', 'product_department',
       'product_family', 'size', 'number_per_case', 'brand_name',
       'manufacturer', 'supplier', 'product_class_source_key'],
      dtype='object')

In [4]:
# Read the conformed product data from a tab-separated text file.
new_product_dimensions: pd.DataFrame = pd.read_csv(
    filepath_or_buffer="ConformedProducts.txt",
    delimiter="\t",
)
new_product_dimensions.shape
new_product_dimensions.columns

(2075, 11)

Index(['ProductKey1', 'sku', 'product_name', 'product_class_id', 'subcategory',
       'category', 'department', 'product_family', 'size', 'brandName',
       'supplier'],
      dtype='object')

### Add new product class source to meta table

In [5]:
# Register the conformed product table as product class source 7.
# INSERT IGNORE skips the insert instead of failing when the row already
# exists (assuming source_key is a unique key — confirm against the schema).
conformed_source_row: tuple = (7, "FROM CONFORMED PRODUCT TABLE")
cursor.execute(
    f"""
    INSERT IGNORE INTO {DATABASE_NAME}.product_class_source (source_key, source) VALUES (?, ?)
    """,
    conformed_source_row,
)
connection.commit()

### Create intermediate mapping table

In [6]:
# Guard flag: the mapping table (and, in a later cell, the product dimension
# table) is only dropped and rebuilt when this is True.
remap_product_categories: bool = False

if remap_product_categories:
    # source_key of the "FROM CONFORMED PRODUCT TABLE" row in product_class_source.
    conformed_source_key: int = 7

    cursor.execute("DROP TABLE IF EXISTS {db}.product_dimension_change_mapping".format(db=DATABASE_NAME))
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS {db}.product_dimension_change_mapping (
            sku INT UNSIGNED PRIMARY KEY NOT NULL,
            old_product_key INT UNSIGNED NOT NULL,
            old_product_class_id INT UNSIGNED NOT NULL,
            old_product_class_source_key INT UNSIGNED NOT NULL,
            new_product_key INT UNSIGNED NOT NULL,
            new_product_class_id INT UNSIGNED NOT NULL,
            new_product_class_source_key INT UNSIGNED NOT NULL
        )
        """.format(
            db=DATABASE_NAME
        )
    )

    # Index the conformed rows by SKU once instead of scanning the whole frame
    # for every original row. setdefault keeps the first row per SKU, matching
    # a "first match wins" lookup.
    conformed_by_sku: dict = {}
    for conformed_dimension in new_product_dimensions.itertuples(index=False):
        conformed_by_sku.setdefault(conformed_dimension.sku, conformed_dimension)

    # The statement is loop-invariant, so it is built a single time.
    insert_mapping_sql: str = """
        INSERT INTO {db}.product_dimension_change_mapping (
            sku, old_product_key, old_product_class_id, old_product_class_source_key,
            new_product_key, new_product_class_id, new_product_class_source_key
        ) VALUES (?, ?, ?, ?, ?, ?, ?)
        """.format(
        db=DATABASE_NAME
    )

    for original_dimension in original_product_dimensions.itertuples():
        new_dimension = conformed_by_sku.get(original_dimension.sku)
        if new_dimension is None:
            # Fail with the offending SKU rather than an opaque positional-index error.
            raise KeyError(
                "SKU {sku} is missing from the conformed product data".format(sku=original_dimension.sku)
            )

        new_product_class_id = new_dimension.product_class_id
        # A changed class id is attributed to the conformed table; an unchanged
        # one keeps the source key it already had.
        if original_dimension.product_class_id != new_product_class_id:
            new_product_class_source_key = conformed_source_key
        else:
            new_product_class_source_key = original_dimension.product_class_source_key

        cursor.execute(
            insert_mapping_sql,
            (
                original_dimension.sku,
                original_dimension.product_key,
                original_dimension.product_class_id,
                original_dimension.product_class_source_key,
                new_dimension.ProductKey1,
                new_product_class_id,
                new_product_class_source_key,
            ),
        )
    # Single commit once every mapping row has been inserted.
    connection.commit()

In [7]:
# Spot-check the mapping table: print its first 25 rows as raw tuples.
mapping_preview_sql: str = f"SELECT * FROM {DATABASE_NAME}.product_dimension_change_mapping LIMIT 25"
cursor.execute(mapping_preview_sql)
mapping_preview_rows = cursor.fetchall()
for row in mapping_preview_rows:
    print(row)

(42081001, 1, 57, 2, 1, 57, 2)
(42082001, 2, 57, 2, 2, 57, 2)
(42083001, 3, 83, 6, 3, 99, 7)
(42084001, 4, 7, 4, 4, 90, 7)
(42085001, 5, 6, 1, 5, 6, 1)
(42086001, 6, 48, 6, 6, 260, 7)
(42087001, 7, 65, 6, 7, 65, 6)
(42088001, 8, 0, 0, 8, 77, 7)
(42089001, 9, 30, 3, 9, 99, 7)
(42090001, 10, 30, 1, 10, 30, 1)
(42091001, 11, 35, 1, 11, 35, 1)
(42092001, 12, 5, 2, 12, 10, 7)
(42093001, 13, 5, 2, 13, 5, 2)
(42094001, 14, 30, 1, 14, 30, 1)
(42095001, 15, 30, 1, 15, 30, 1)
(42096001, 16, 30, 1, 16, 30, 1)
(42097001, 17, 30, 1, 17, 30, 1)
(42098001, 18, 30, 1, 18, 30, 1)
(42099001, 19, 35, 1, 19, 35, 1)
(42100001, 20, 52, 6, 20, 52, 6)
(42101001, 21, 30, 1, 21, 30, 1)
(42102001, 22, 30, 1, 22, 30, 1)
(42103001, 23, 1, 1, 23, 1, 1)
(42104001, 24, 13, 1, 24, 13, 1)
(42105001, 25, 13, 1, 25, 13, 1)


### Create new Product Dimension table with conformed data

In [8]:
# Rebuild product_dimension from the conformed data. Runs only when the
# remap_product_categories flag is True.
if remap_product_categories:
    cursor.execute("DROP TABLE IF EXISTS {db}.product_dimension".format(db=DATABASE_NAME))
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS {db}.product_dimension (
            product_key INT PRIMARY KEY AUTO_INCREMENT,
            sku INT UNSIGNED NOT NULL,
            product_name TEXT NOT NULL,
            product_class_id INT UNSIGNED,
            product_subcategory TEXT,
            product_category TEXT,
            product_department TEXT,
            product_family TEXT,
            size TEXT NOT NULL,
            brand_name TEXT,
            supplier TEXT NOT NULL
        )
        """.format(
            db=DATABASE_NAME
        )
    )

    # product_key is written explicitly from ProductKey1 instead of being left
    # to AUTO_INCREMENT: the change-mapping table stores ProductKey1 as
    # new_product_key, so the two must agree regardless of the file's row order.
    # The statement is loop-invariant, so it is built a single time.
    insert_product_sql: str = """
        INSERT INTO {db}.product_dimension (product_key, sku, product_name, product_class_id, product_subcategory,
        product_category, product_department, product_family, size, brand_name, supplier)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """.format(
        db=DATABASE_NAME
    )

    for row in new_product_dimensions.itertuples():
        cursor.execute(
            insert_product_sql,
            (
                row.ProductKey1,
                row.sku,
                row.product_name,
                row.product_class_id,
                row.subcategory,
                row.category,
                row.department,
                row.product_family,
                row.size,
                row.brandName,
                row.supplier,
            ),
        )
    # Commit once after all rows are inserted (not once per row), so the load
    # is a single transaction and matches how the mapping table is populated.
    connection.commit()

In [9]:
# Spot-check the product dimension table: print its first 25 rows as raw tuples.
product_preview_sql: str = f"SELECT * FROM {DATABASE_NAME}.product_dimension LIMIT 25"
cursor.execute(product_preview_sql)
product_preview_rows = cursor.fetchall()
for row in product_preview_rows:
    print(row)

(1, 42081001, 'Jambalaya Rice Mix', 57, 'Rice', 'Starchy Foods', 'Starchy Foods', 'Food', '12 oz', 'Zatarains', 'Rowan Warehouse')
(2, 42082001, 'Jambalaya Rice Mix', 57, 'Rice', 'Starchy Foods', 'Starchy Foods', 'Food', '8 oz', 'Zatarains', 'Rowan Warehouse')
(3, 42083001, 'Guacamole Regular', 99, 'Fresh Fruit', 'Fruit', 'Produce', 'Food', '8 oz', 'Yucatan', 'Rowan Warehouse')
(4, 42084001, 'Coffee Original Blend', 90, 'Coffee', 'Hot Beverages', 'Beverages', 'Drink', '12 oz', 'Yuban', 'Rowan Warehouse')
(5, 42085001, 'GoGurt Variety Pack', 6, 'Yogurt', 'Dairy', 'Dairy', 'Food', '8 ct', 'Yoplait', 'Rowan Warehouse')
(6, 42086001, 'Italian Dressing', 260, 'Dressings', 'Baking Goods', 'Baking Products', 'Food', '16 oz', 'Wishbone', 'Rowan Warehouse')
(7, 42087001, 'Cheeseburger Heat & Serve Sliders', 65, 'Hamburger', 'Meat', 'Meat', 'Food', '29.28 oz', 'White Castle', 'Rowan Warehouse')
(8, 42088001, 'Choice Cuts Poultry', 77, 'Fresh Chicken', 'Meat', 'Deli', 'Food', '36 oz', 'Whiskas', 