In [10]:
import pandas as pd

# Load the raw product-sales sample straight from the GitHub repository.
# The frame is kept untouched under the `_raw` suffix; later cells work on copies.
product_sales_url = 'https://raw.githubusercontent.com/juanpaul96/holcim_DAE_test/main/product_sales_sample.csv'
product_sales_raw = pd.read_csv(filepath_or_buffer=product_sales_url)

In [11]:
########## Data Exploration ##########

# This section checks (and fixes) the data types, counts the rows, looks for duplicates,
# and runs a QA check on the consistency of the sales and quantity columns.

In [12]:
# Check initial data types.
# sep="\n" puts the dtype listing on its own line; with the default separator a
# space is inserted after the newline and the first row is shifted out of alignment.
print("Dataset columns type:", product_sales_raw.dtypes, sep="\n")

Dataset columns type:
 Rest Owner                                  object
Rest Coop                                   object
REST_KEY                                     int64
MITM_KEY                                     int64
Menu Item                                   object
Menu Item Combo Meal Flag                   object
Reporting Day                               object
POS Consumer Price                         float64
POS Total Units Sold  Promo and Regular      int64
POS Promotion Units Sold                     int64
POS Units Sold                               int64
POS Combo Units Sold                         int64
dtype: object


In [17]:
import pandas as pd

# Build the typed, snake_case working frame from the raw data.
# Everything starts from a fresh copy, so the raw frame is never modified.
products_datatypes = product_sales_raw.copy()

# Convert the date column to datetime; values that cannot be parsed become NaT.
# NOTE(review): the saved output of this cell shows a warning raised by this call.
# Passing an explicit `format=` would presumably silence it - confirm the raw
# date layout first.
products_datatypes['Reporting Day'] = pd.to_datetime(
    products_datatypes['Reporting Day'],
    errors='coerce'
)

# Coerce numeric columns (using the exact names from df.columns, including the
# double space in the total-units column). Non-numeric values become NaN.
numeric_cols = [
    'REST_KEY',
    'MITM_KEY',
    'POS Consumer Price',
    'POS Total Units Sold  Promo and Regular',
    'POS Promotion Units Sold',
    'POS Units Sold',
    'POS Combo Units Sold',
]
for col in numeric_cols:
    products_datatypes[col] = pd.to_numeric(products_datatypes[col], errors='coerce')

# errors='coerce' hides bad values, so report how many cells were turned into
# NaT/NaN by the conversions above (nulls already present in the raw data are
# subtracted out).
converted_cols = ['Reporting Day'] + numeric_cols
coerced_counts = (
    products_datatypes[converted_cols].isna().sum()
    - product_sales_raw[converted_cols].isna().sum()
)
print(f"Values coerced to NaT/NaN during type conversion: {int(coerced_counts.sum())}")
if coerced_counts.sum() > 0:
    print(coerced_counts[coerced_counts > 0])

# Standardize all column names to snake_case.
# regex=False is explicit on both replacements so the patterns are always
# treated literally, whatever the pandas default is.
products_datatypes.columns = (
    products_datatypes.columns
      .str.strip()
      .str.lower()
      .str.replace(" ", "_", regex=False)
      .str.replace("$", "", regex=False)
)

# Rename for clarity (columns not listed keep their snake_case name).
# NOTE: despite the `_flag` suffix, `total_and_promo_flag` is a unit count (source
# column 'POS Total Units Sold  Promo and Regular'); the name is kept because the
# consistency check further down and the exported CSV use it.
products_datatypes = products_datatypes.rename(columns={
    'pos_consumer_price':         'consumer_price',
    'pos_total_units_sold__promo_and_regular': 'total_and_promo_flag',
    'pos_promotion_units_sold':   'promo_units_sold',
    'pos_units_sold':             'regular_units_sold',
    'pos_combo_units_sold':       'combo_units_sold',
    'menu_item_combo_meal_flag':  'combo_meal_flag'
})

# Peek at the result
products_datatypes.head()


  products_datatypes['Reporting Day'] = pd.to_datetime(


Unnamed: 0,rest_owner,rest_coop,rest_key,mitm_key,menu_item,combo_meal_flag,reporting_day,consumer_price,total_and_promo_flag,promo_units_sold,regular_units_sold,combo_units_sold
0,SMITH JOE,SEA/TCA WA CP-0024,1364,1,00000001-REUBEN,N,2011-01-01,0.95,25,0,25,6
1,SMITH JOE,SEA/TCA WA CP-0024,5357,1,00000001-REUBEN,N,2011-01-01,0.95,10,0,10,1
2,SMITH JOE,SEA/TCA WA CP-0024,13369,1,00000001-REUBEN,N,2011-01-01,0.95,9,0,9,3
3,SMITH JOE,SEA/TCA WA CP-0024,13604,1,00000001-REUBEN,N,2011-01-01,0.95,24,0,24,4
4,SMITH JOE,SEA/TCA WA CP-0024,1364,1,00000001-REUBEN,N,2011-01-02,0.95,28,0,28,4


In [18]:
# Confirm the data types after conversion and renaming.
# sep="\n" puts the dtype listing on its own line; with the default separator a
# space is inserted after the newline and the first row is shifted out of alignment.
print("Dataset columns type:", products_datatypes.dtypes, sep="\n")

Dataset columns type:
 rest_owner                      object
rest_coop                       object
rest_key                         int64
mitm_key                         int64
menu_item                       object
combo_meal_flag                 object
reporting_day           datetime64[ns]
consumer_price                 float64
total_and_promo_flag             int64
promo_units_sold                 int64
regular_units_sold               int64
combo_units_sold                 int64
dtype: object


In [20]:
# First and last reporting day covered by the data
products_datatypes["reporting_day"].aggregate(['min', 'max'])

min   2011-01-01
max   2011-01-31
Name: reporting_day, dtype: datetime64[ns]

In [22]:
# Missing-day check: list every calendar day between the first and last
# reporting day that does not appear in the data.

# Re-parse the column as datetime and normalize every value to midnight
products_datatypes['reporting_day'] = (
    pd.to_datetime(products_datatypes['reporting_day'], errors='coerce')
      .dt.normalize()
)
reporting_days = products_datatypes['reporting_day']

# Full expected daily calendar from the earliest to the latest day in the data
full_range = pd.date_range(reporting_days.min(), reporting_days.max(), freq='D')

# Days of that calendar that never occur in the data (NaT values are ignored)
present = reporting_days.dropna().unique()
missing_days = full_range.difference(present)
missing_df = pd.DataFrame({'missing_reporting_day': missing_days})
missing_df

Unnamed: 0,missing_reporting_day


In [23]:
# QA check: the reported total should equal promo units + regular units.

# Row-wise promo + regular, treating missing values as 0
promo_units = products_datatypes['promo_units_sold'].fillna(0)
regular_units = products_datatypes['regular_units_sold'].fillna(0)
products_datatypes['sum_promo_plus_regular'] = promo_units.add(regular_units)

# True where the reported total matches the computed sum
products_datatypes['check_total_vs_sum'] = products_datatypes['total_and_promo_flag'].eq(
    products_datatypes['sum_promo_plus_regular']
)

# Keep only the rows that fail the check and report how many there are
mismatches = products_datatypes.loc[~products_datatypes['check_total_vs_sum']]
print(f"Found {len(mismatches)} rows where total_and_promo_flag ≠ promo + regular")

# Show the first few mismatches with the columns needed to investigate them
preview_cols = ['menu_item', 'reporting_day', 'total_and_promo_flag', 'sum_promo_plus_regular']
print(mismatches[preview_cols].head())


Found 0 rows where total_and_promo_flag ≠ promo + regular
Empty DataFrame
Columns: [menu_item, reporting_day, total_and_promo_flag, sum_promo_plus_regular]
Index: []


In [38]:
# Persist the cleaned frame as CSV, without the index column.
# NOTE(review): the QA helper columns `sum_promo_plus_regular` and
# `check_total_vs_sum` added to this frame by the consistency check are still
# present at this point and are written to the file - confirm that is intended.
products_datatypes.to_csv(path_or_buf='products_silver.csv', index=False)