#### Load the CSV files into DataFrames

In [None]:
import pandas as pd
import numpy as np

# Read the raw customers extract into a DataFrame and preview its first five rows.
customers_df = pd.read_csv(filepath_or_buffer='./data/raw/customers.csv')
customers_df.head()

In [None]:
# Read the raw orders extract into a DataFrame and preview its first five rows.
orders_df = pd.read_csv(filepath_or_buffer='./data/raw/orders.csv')
orders_df.head()

In [None]:
# Read the raw products extract into a DataFrame and preview its first five rows.
products_df = pd.read_csv(filepath_or_buffer='./data/raw/products.csv')
products_df.head()

#### Clean the null values in the customers table

In [None]:
# List every customer row whose 'name' is missing, to inspect them before filling.
print(customers_df.loc[customers_df['name'].isnull()])

In [None]:
# Replace missing customer names with the placeholder 'Guest'.
# fillna touches only the missing entries and keeps the column as a pandas
# Series with its existing dtype, instead of rebuilding the whole column as a
# NumPy array.
customers_df['name'] = customers_df['name'].fillna('Guest')

In [None]:
# Collect the rows whose name is now 'Guest' to verify that the fill took effect.
guest_check = customers_df.loc[customers_df['name'].eq('Guest')]
guest_check

In [None]:
# List every customer row whose 'email' is missing, to inspect them before filling.
print(customers_df.loc[customers_df['email'].isnull()])

In [None]:
# Replace missing customer emails with the placeholder 'none_provided'.
# fillna touches only the missing entries and keeps the column as a pandas
# Series with its existing dtype, instead of rebuilding the whole column as a
# NumPy array.
customers_df['email'] = customers_df['email'].fillna('none_provided')

# Rows that now carry the placeholder, shown to verify the replacement.
provision_check = customers_df[customers_df['email'] == 'none_provided']
provision_check

#### Clean the bad dates in the orders table

In [None]:
# Trial-parse 'order_date' without modifying orders_df; anything that cannot be
# parsed as a date comes back as NaT because of errors='coerce'.
convert = pd.to_datetime(orders_df['order_date'], errors='coerce')
# Keep the order rows whose trial parse produced NaT.
bad_dates = orders_df.loc[convert.isnull()]
bad_dates

In [None]:
# Replace the known bad 'order_date' values ('not_a_date' and 20002) with
# sentinel dates, then convert the column to datetime.
# NOTE(review): a CSV column that mixes dates with text is normally read as
# strings, so the bad value is probably stored as '20002' rather than the
# integer 20002 -- both forms are replaced here; confirm against the raw file.
orders_df['order_date'] = (
    orders_df['order_date']
    .replace('not_a_date', '1900-01-01')
    .replace(20002, '2002-01-01')
    .replace('20002', '2002-01-01')
)
# Any value that still cannot be parsed becomes NaT instead of raising.
orders_df['order_date'] = pd.to_datetime(orders_df['order_date'], errors='coerce')

# Number of order dates that are still missing after the conversion.
# sum must be *called*: without the parentheses this variable would hold the
# bound method object rather than the count.
null_check = orders_df['order_date'].isna().sum()
null_check

#### Create the dimension tables

In [None]:
# Show the products table; it is the source for product_cat_df and product_df.
products_df

In [None]:
# customer_dim
# The customers table is shown as-is; this cell builds no separate frame for
# the customer dimension.
customers_df

In [None]:
# product_category_dim
# One row per distinct product category; each row keeps the index label of the
# first product in which that category appears.
product_cat_df = (
    products_df['product_category']
    .drop_duplicates()
    .to_frame(name='product_category')
)
product_cat_df

In [None]:
# store_dim
# One row per distinct store_id found in the orders; each row keeps the index
# label of the first order in which that store appears.
store_df = (
    orders_df['store_id']
    .drop_duplicates()
    .to_frame(name='store_id')
)
store_df

In [None]:
# Lookup tables keyed by store_id.
store_name = {
    1: 'nyc_store',
    2: 'tampa_store',
    3: 'seattle_store',
    4: 'chicago_store',
}
store_region = {
    1: 'East',
    2: 'South',
    3: 'West',
    4: 'Midwest',
}

# Attach the descriptive columns to the store dimension. A store_id that is
# absent from a lookup table gets NaN in the corresponding column.
store_df['store_name'] = store_df['store_id'].map(store_name)
store_df['store_region'] = store_df['store_id'].map(store_region)
store_df

In [None]:
# product_dim
# The products table with its 'price' column removed; products_df itself is
# left unchanged because drop returns a new frame.
product_df = products_df.drop('price', axis='columns')
product_df