In [64]:
import pandas as pd

# Load the report export. header=3 makes pandas take the column labels from
# row index 3 (0-based); the lines above it are not kept as data.
df = pd.read_csv(
    "datasets/rep_s_00014_SMRY.csv",
    header=3,
)

# Quick look at the first five rows
df.head(n=5)


Unnamed: 0,Product Desc,Qty,Total Price,Unnamed: 3,Total Cost,Total Cost %,Total Profit,Unnamed: 7,Total Profit %,Unnamed: 9
0,Stories - Bir Hasan,,,,,,,,,
1,TAKE AWAY,,,,,,,,,
2,BEVERAGES,,,,,,,,,
3,COLD BAR SECTION,,,,,,,,,
4,1 SHOT DECAFE,404.0,0.0,,3856.85,0.0,-3856.85,,100.0,


In [65]:
# Audit missing data: print the share of NaN cells per column and collect the
# columns that are almost entirely empty so that a later cell can drop them.
NAN_DROP_THRESHOLD = 90  # percent of NaN cells at or above which a column is flagged


def find_mostly_empty_columns(frame, threshold=NAN_DROP_THRESHOLD):
    """Print a per-column NaN summary and return the mostly-empty columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to inspect. It is not modified.
    threshold : float, default NAN_DROP_THRESHOLD
        Percentage of NaN cells (0-100) at or above which a column is flagged.

    Returns
    -------
    list
        Labels of the columns whose NaN percentage is >= ``threshold``,
        in column order.
    """
    n_rows = len(frame)
    flagged = []
    for col, nan_count in frame.isna().sum().items():
        # Guard against an empty frame so the percentage is 0 instead of a
        # division-by-zero NaN.
        nan_percent = (nan_count / n_rows) * 100 if n_rows else 0.0
        print(f"In column '{col}' there are {nan_percent:.2f}% NaN values ({nan_count} empty cells)")
        if nan_percent >= threshold:
            flagged.append(col)
    return flagged


useless = find_mostly_empty_columns(df)

In column 'Product Desc' there are 0.00% NaN values (0 empty cells)
In column 'Qty' there are 7.15% NaN values (1042 empty cells)
In column 'Total Price' there are 7.15% NaN values (1043 empty cells)
In column 'Unnamed: 3' there are 97.29% NaN values (14186 empty cells)
In column 'Total Cost' there are 7.15% NaN values (1043 empty cells)
In column 'Total Cost %' there are 7.15% NaN values (1043 empty cells)
In column 'Total Profit' there are 7.15% NaN values (1043 empty cells)
In column 'Unnamed: 7' there are 97.29% NaN values (14186 empty cells)
In column 'Total Profit %' there are 7.15% NaN values (1042 empty cells)
In column 'Unnamed: 9' there are 97.29% NaN values (14186 empty cells)


In [66]:
# Inspect the parsed column labels (note the "Unnamed: N" filler columns).
df.columns

Index(['Product Desc', 'Qty', 'Total Price', 'Unnamed: 3', 'Total Cost',
       'Total Cost %', 'Total Profit', 'Unnamed: 7', 'Total Profit %',
       'Unnamed: 9'],
      dtype='object')

In [67]:
# Remove non-data rows whose first-column text is exactly one of `bad_values`
# after trimming whitespace. Presumably these are the report date and the
# column-header line repeated inside the export -- confirm against the raw file.
col0 = df.columns[0]  # first column (should be "Product Desc")

bad_values = {"22-Jan-26", "Product Desc"}

is_header_row = df[col0].astype(str).str.strip().isin(bad_values)
df = df.loc[~is_header_row].reset_index(drop=True)

In [68]:
# Drop the columns flagged by the NaN audit (those with >= 90% NaN cells).
# errors="ignore" keeps this cell safe to re-run: without it, a second run
# raises KeyError because the flagged columns are already gone.
df = df.drop(columns=useless, errors="ignore")

In [69]:
# Flag rows in which every value column is NaN (True) versus rows that carry
# at least one value (False).
# "Product Desc" is left out of the check because it holds the row's text.
# The flag column itself and the hierarchy columns that a later cell adds are
# left out as well: `all_nan` is boolean and never NaN, so including it on a
# re-run would silently turn every flag to False.
non_measure_cols = ["Product Desc", "all_nan", "Branch", "Department", "Category", "Division"]
df["all_nan"] = df.drop(columns=non_measure_cols, errors="ignore").isna().all(axis=1)

In [70]:
import re
import pandas as pd
import numpy as np

# -------------------------
# Hierarchy-parsing configuration
# -------------------------
# This cell works on the `df` already in memory from the cells above.
# To start from a saved file instead, load it first, e.g.:
#   df = pd.read_csv("cleaned.csv")
#
# Columns this cell lists as required:
#   'Product Desc', 'Qty', 'Total Price', 'Total Cost', 'Total Cost %',
#   'Total Profit', 'Total Profit %'
col_desc, col_qty = "Product Desc", "Qty"

# Hierarchy levels, ordered from outermost to innermost.
levels = ["Branch", "Department", "Category", "Division"]

# Matches a closing row such as "Total By Division:" (case-insensitive,
# optional colon, surrounding whitespace allowed); group 1 is the level name.
total_pat = re.compile(
    r"^\s*Total\s+By\s+(Branch|Department|Category|Division)\s*:?\s*$",
    flags=re.IGNORECASE,
)

def is_blank(x) -> bool:
    """Return True when ``x`` is missing or only looks like a missing value.

    A value counts as blank if pandas considers it NA, or if its string form,
    trimmed and lower-cased, is empty, "nan" or "none".
    """
    if pd.isna(x):
        return True
    normalized = str(x).strip().lower()
    return normalized in ("", "nan", "none")

def extract_branch_name(s: str) -> str:
    """Return the part of a branch header that follows the first dash.

    The input is converted to ``str`` and trimmed. A spaced dash (" - ") is
    tried first, then a bare "-"; the text after the first occurrence is
    returned, trimmed. When there is no dash the trimmed text is returned
    unchanged.
    """
    s = str(s).strip()
    for separator in (" - ", "-"):
        _, found, remainder = s.partition(separator)
        if found:
            return remainder.strip()
    return s

def choose_open_level(current: dict) -> str:
    """Return the outermost hierarchy level that is still unset in ``current``.

    Levels are checked in the order given by the module-level ``levels`` list.
    When every level already has a value, "Division" is returned, so a further
    header row overwrites the innermost level.
    """
    return next((lv for lv in levels if current[lv] is None), "Division")

def clear_from(level: str, current: dict):
    """Reset ``level`` and every level nested below it to None, in place.

    Nesting order comes from the module-level ``levels`` list. Raises
    ValueError if ``level`` is not one of those levels.
    """
    start = levels.index(level)
    current.update(dict.fromkeys(levels[start:]))

# -------------------------
# Parse & fill hierarchy
# -------------------------
# One top-to-bottom pass over the report rows. It tracks the currently open
# Branch / Department / Category / Division and records it for every row:
#   * "Total By <Level>:" rows are tagged with the hierarchy they close, then
#     that level and everything nested below it are reset.
#   * Rows with text but no Qty are treated as headers and open the outermost
#     level that is still unset (Division is overwritten when all are set).
#   * Every other row simply inherits the open hierarchy.
# NOTE: this must run while the header rows are still in `df`, i.e. before the
# all-NaN rows are filtered out.
current = {lv: None for lv in levels}
out = {lv: [] for lv in levels}

# Iterate over just the two columns that drive the parse instead of using
# df.iterrows(): no per-row Series is built, and a missing required column
# raises KeyError rather than silently producing an empty hierarchy.
for txt_raw, qty_val in zip(df[col_desc], df[col_qty]):
    txt = "" if is_blank(txt_raw) else str(txt_raw).strip()

    closing = total_pat.match(txt)
    if closing:
        # Record BEFORE clearing so the total row keeps the hierarchy it sums up.
        for lv in levels:
            out[lv].append(current[lv])
        # The pattern is case-insensitive; .title() normalises to the `levels` spelling.
        clear_from(closing.group(1).title(), current)
        continue

    if txt and pd.isna(qty_val):
        # Header row: reset the level being opened and everything below it,
        # then store the new label (branch headers keep only the part after the dash).
        open_level = choose_open_level(current)
        clear_from(open_level, current)
        current[open_level] = extract_branch_name(txt) if open_level == "Branch" else txt

    # Header rows (after the update above) and ordinary rows are both tagged
    # with the hierarchy that is open at this point.
    for lv in levels:
        out[lv].append(current[lv])

# Attach one column per hierarchy level.
for lv in levels:
    df[lv] = out[lv]

# Optional: persist the result
# df.to_csv("cleaned_with_hierarchy.csv", index=False)

df.head(30)

Unnamed: 0,Product Desc,Qty,Total Price,Total Cost,Total Cost %,Total Profit,Total Profit %,all_nan,Branch,Department,Category,Division
0,Stories - Bir Hasan,,,,,,,True,Bir Hasan,,,
1,TAKE AWAY,,,,,,,True,Bir Hasan,TAKE AWAY,,
2,BEVERAGES,,,,,,,True,Bir Hasan,TAKE AWAY,BEVERAGES,
3,COLD BAR SECTION,,,,,,,True,Bir Hasan,TAKE AWAY,BEVERAGES,COLD BAR SECTION
4,1 SHOT DECAFE,404.0,0.0,3856.85,0.0,-3856.85,100.0,False,Bir Hasan,TAKE AWAY,BEVERAGES,COLD BAR SECTION
5,2 SHOT DECAFE,637.0,0.0,12162.45,0.0,-12162.45,100.0,False,Bir Hasan,TAKE AWAY,BEVERAGES,COLD BAR SECTION
6,3 SHOT DECAFE,113.0,0.0,3236.32,0.0,-3236.32,100.0,False,Bir Hasan,TAKE AWAY,BEVERAGES,COLD BAR SECTION
7,ADD BANANA SAUCE LARGE,4.0,266.67,139.97,52.49,126.69,47.51,False,Bir Hasan,TAKE AWAY,BEVERAGES,COLD BAR SECTION
8,ADD BANANA SAUCE MEDIUM,47.0,1487.39,822.34,55.29,665.04,44.71,False,Bir Hasan,TAKE AWAY,BEVERAGES,COLD BAR SECTION
9,ADD BANANA SAUCE SMALL,28.0,893.72,489.91,54.82,403.82,45.18,False,Bir Hasan,TAKE AWAY,BEVERAGES,COLD BAR SECTION


In [71]:
# Row count per Division label, as a sanity check on what the hierarchy pass assigned.
df["Division"].value_counts()

Division
COLD BAR SECTION       4451
FROZEN YOGHURT         2486
HOT BAR SECTION        2204
MEDIUM                  567
GRAB&GO FOOD            566
COFFEE PASTRY           488
SANDWICHES              459
GRAB&GO BEVERAGES       378
FRENCH PASTRY           337
CROISSANT               221
CINNAMON ROLLS          210
SUBS                    198
SMALL                   179
COOKIES                 145
PLAT DE JOUR            124
POP UP BEVERAGE          98
OFFER                    89
HEALTHY SECTION          81
LARGE                    39
LATTE SMALL              32
SALADS BAR               30
YOGHURT COMBO SMALL      21
PIZZA                    15
CREAM                    14
250 ML                   12
CRUNCH                   10
250ML/24                  8
YOUGHURT                  8
WRAP                      8
SPECULOSE LOTUS           6
CLEMENTINE 330ML/24       5
BITES                     4
330ML/24                  3
Name: count, dtype: int64

### Some rows still contain NaN values, so we will remove every row in which all of the value columns are NaN

In [72]:
# Keep only the rows that carry at least one value (all_nan is False),
# renumber the index from 0, then slice off the final row.
# NOTE: .iloc[:-1] removes the last ROW, not a column -- presumably a trailing
# total line; confirm. Re-running this cell removes one more row each time.
df = (
    df[~df["all_nan"]]
    .reset_index(drop=True)
    .iloc[:-1]
)

In [73]:
## Re-run the NaN audit on the cleaned frame:
# print the share of NaN cells per column and collect any column that is
# still at least 90% empty.
useless = []
n_rows = len(df)
for col, nan_count in df.isna().sum().items():
    nan_percent = (nan_count / n_rows) * 100
    print(f"In column '{col}' there are {nan_percent:.2f}% NaN values ({nan_count} empty cells)")
    if nan_percent >= 90:
        useless.append(col)
# Result: 0% NaN in the value columns -- nailed it.

In column 'Product Desc' there are 0.00% NaN values (0 empty cells)
In column 'Qty' there are 0.00% NaN values (0 empty cells)
In column 'Total Price' there are 0.00% NaN values (0 empty cells)
In column 'Total Cost' there are 0.00% NaN values (0 empty cells)
In column 'Total Cost %' there are 0.00% NaN values (0 empty cells)
In column 'Total Profit' there are 0.00% NaN values (0 empty cells)
In column 'Total Profit %' there are 0.00% NaN values (0 empty cells)
In column 'all_nan' there are 0.00% NaN values (0 empty cells)
In column 'Branch' there are 0.00% NaN values (0 empty cells)
In column 'Department' there are 0.19% NaN values (25 empty cells)
In column 'Category' there are 0.51% NaN values (67 empty cells)
In column 'Division' there are 1.12% NaN values (147 empty cells)


In [74]:
# Display the cleaned frame (pandas elides the middle rows in the rendered output).
df

Unnamed: 0,Product Desc,Qty,Total Price,Total Cost,Total Cost %,Total Profit,Total Profit %,all_nan,Branch,Department,Category,Division
0,1 SHOT DECAFE,404.00,0.00,3856.85,0.00,-3856.85,100.00,False,Bir Hasan,TAKE AWAY,BEVERAGES,COLD BAR SECTION
1,2 SHOT DECAFE,637.00,0.00,12162.45,0.00,-12162.45,100.00,False,Bir Hasan,TAKE AWAY,BEVERAGES,COLD BAR SECTION
2,3 SHOT DECAFE,113.00,0.00,3236.32,0.00,-3236.32,100.00,False,Bir Hasan,TAKE AWAY,BEVERAGES,COLD BAR SECTION
3,ADD BANANA SAUCE LARGE,4.00,266.67,139.97,52.49,126.69,47.51,False,Bir Hasan,TAKE AWAY,BEVERAGES,COLD BAR SECTION
4,ADD BANANA SAUCE MEDIUM,47.00,1487.39,822.34,55.29,665.04,44.71,False,Bir Hasan,TAKE AWAY,BEVERAGES,COLD BAR SECTION
...,...,...,...,...,...,...,...,...,...,...,...,...
13138,ADD TUNA,7.00,1027.03,334.50,32.57,692.53,67.43,False,Stories kaslik,TAKE AWAY,FOOD,SUBS
13139,Total By Division:,51.00,5579.58,2381.77,42.69,3197.81,57.31,False,Stories kaslik,TAKE AWAY,FOOD,SUBS
13140,Total By Category:,10528.50,1287075.68,484453.55,37.64,802622.12,62.36,False,Stories kaslik,TAKE AWAY,FOOD,
13141,Total By Department:,22252.50,2818715.92,844897.47,29.97,1973818.45,70.03,False,Stories kaslik,TAKE AWAY,,
