## We'll do the same thing as for dataset14

In [None]:
import pandas as pd
import numpy as np
import re
# Load the raw report export; row 3 (zero-based) of the file supplies the column names.
df = pd.read_csv(filepath_or_buffer="datasets/given/rep_s_00191_SMRY-3.csv", header=3)


# Report the share of missing values per column and remember every column
# that is at least 90% empty in `useless`.
row_total = len(df)
missing_per_column = df.isna().sum()  # one NaN count per column

useless = []
for col, nan_count in missing_per_column.items():
    nan_percent = (nan_count / row_total) * 100
    print(f"In column '{col}' there are {nan_percent:.2f}% NaN values ({nan_count} empty cells)")
    if nan_percent < 90:
        continue
    useless.append(col)

In column 'Description' there are 0.01% NaN values (1 empty cells)
In column 'Barcode' there are 94.93% NaN values (13419 empty cells)
In column 'Qty' there are 10.73% NaN values (1517 empty cells)
In column 'Total Amount' there are 8.19% NaN values (1158 empty cells)
In column 'Unnamed: 4' there are 97.47% NaN values (13778 empty cells)


In [22]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,Description,Barcode,Qty,Total Amount,Unnamed: 4
0,Branch: Stories - Bir Hasan,,,,
1,Division: HOT BAR SECTION,,,,
2,Group: BLACK COFFEE,,,,
3,ESPRESSO,,3313.0,500103.75,
4,DOUBLE ESPRESSO,,3200.0,498631.33,


In [23]:
# List every distinct value found in the two near-empty columns before deciding to drop them.
pd.concat([df[name] for name in ("Barcode", "Unnamed: 4")]).unique()

array([nan, 'Years:2025 Months:0', 'Barcode',
       'Copyright Â© 2026 Omega Software, Inc. All Rights ', 359.0],
      dtype=object)

In [24]:
## Barcode and "Unnamed: 4" are almost entirely empty (>= 90% NaN), so drop
## every column that the NaN report collected in `useless`.
df = df.drop(columns = useless)

## Clean up non-data rows: lines whose first column is just the repeated
## header text ("Description") or the literal "19-Jan-26"
## (presumably the report date line — confirm against the raw file).
col0 = df.columns[0]  # first remaining column ("Description" in this export)

bad_values = {"19-Jan-26", "Description"}

df = df[~df[col0].astype(str).str.strip().isin(bad_values)].reset_index(drop=True)

# Drop the last row, which is useless.
# .copy() makes `df` an independent frame instead of a slice of the filtered
# one, so adding columns later does not raise SettingWithCopyWarning.
# NOTE: this positional drop is not idempotent — re-running the cell removes
# one more row each time.
df = df.iloc[:-1].copy()

In [25]:
# Flag rows whose value columns are all NaN (True) versus rows that carry at
# least one value (False). Rows such as the "Branch: ..." / "Division: ..." /
# "Group: ..." headers shown by df.head() have no values and get True.
#
# Only "Description" is required to exist; the helper column "all_nan" and the
# hierarchy columns are excluded when present so that re-running this cell
# gives the same flags. Without that, the never-null "all_nan" column from a
# previous run would turn every flag into False.
derived_cols = ["all_nan", "Branch", "Division", "Group"]
value_part = df.drop(columns=["Description"]).drop(columns=derived_cols, errors="ignore")

# assign() returns a new frame, so this also works on a sliced `df` without
# triggering SettingWithCopyWarning.
df = df.assign(all_nan=value_part.isna().all(axis=1))

In [26]:
# Hierarchy levels, ordered from outermost to innermost.
levels = ["Branch", "Division", "Group"]

# The first column carries the hierarchy text (e.g. "Branch: X", "Division: Y", "Group: Z").
col_desc = df.columns[0]

# Name of the quantity column when the frame has one, otherwise None.
# NOTE(review): qty_col is not read anywhere in the visible cells — confirm it is still needed.
qty_col = None
if "Qty" in df.columns:
    qty_col = "Qty"

# -------------------------
# Patterns (case-insensitive)
# -------------------------
# Opening row: "<Level>: <Name>"
open_pat = re.compile(r"^\s*(Branch|Division|Group)\s*:\s*(.+?)\s*$", re.IGNORECASE)
# Closing row: "Total by <Level>: <Name>"
close_pat = re.compile(r"^\s*Total\s+by\s+(Branch|Division|Group)\s*:\s*(.+?)\s*$", re.IGNORECASE)

def is_blank(x) -> bool:
    """Return True when x is missing (per pd.isna) or is empty/whitespace-only once converted to str."""
    if pd.isna(x):
        return True
    return not str(x).strip()

def clear_from(level: str, current: dict):
    """Reset `level` and every level listed after it in `levels` to None, in place.

    With levels = Branch, Division, Group: clearing Branch resets all three,
    clearing Division resets Division and Group, clearing Group resets only Group.
    Raises ValueError (from list.index) when `level` is not one of `levels`.
    """
    start = levels.index(level)
    current.update(dict.fromkeys(levels[start:]))

# -------------------------
# Parse & fill columns
# -------------------------
# Walk the description column top to bottom while tracking which
# Branch / Division / Group is currently open, and record that state per row:
#   * "X: Name"          opens level X and resets the levels nested under it;
#                        the opening row is tagged with the new state.
#   * "Total by X: Name" closes level X; the total row itself is tagged with
#                        the hierarchy that was open when it was reached.
#   * any other row      inherits the current hierarchy unchanged.
current = {lv: None for lv in levels}
out = {lv: [] for lv in levels}

# Only the description column is read, so iterate it directly instead of
# building a full row Series per record with iterrows().
for raw in df[col_desc]:
    txt = "" if is_blank(raw) else str(raw).strip()

    # A closing row takes precedence; only otherwise is the text tested as an opening row.
    m_close = close_pat.match(txt)
    m_open = None if m_close else open_pat.match(txt)

    if m_open:
        opening_level = m_open.group(1).title()  # Branch/Division/Group
        # Reset this level and everything nested under it, then set the new name.
        clear_from(opening_level, current)
        current[opening_level] = m_open.group(2).strip()

    # Tag the row (opening, closing and normal rows alike) with the current state.
    for lv in levels:
        out[lv].append(current[lv])

    if m_close:
        # The level is closed only after the total row has been tagged.
        clear_from(m_close.group(1).title(), current)

# Attach the hierarchy columns. assign() returns a new frame, which avoids
# SettingWithCopyWarning when `df` is a slice of an earlier frame.
df = df.assign(**{lv: out[lv] for lv in levels})


### Some rows still contain only NaN values, so we will remove every row whose value columns are all NaN

In [34]:

# Keep only the rows that carry at least one value (all_nan is False),
# renumber the index from 0, then discard the final row.
# NOTE(review): iloc[:-1] removes the last *row*, not a column; the helper
# column "all_nan" is still present afterwards — confirm that is intended.
df = df.loc[~df["all_nan"]].reset_index(drop=True).iloc[:-1]

In [37]:
# Write the cleaned frame to disk without the positional index.
df.to_csv(path_or_buf="my_dataframe191.csv", index=False)

In [36]:
## Re-run the NaN check on the cleaned frame:
# print the share of missing values per column and rebuild `useless`
# with the columns that are at least 90% empty.
n_rows = len(df)
nan_counts = df.isna().sum()  # one NaN count per column

useless = []
for col, nan_count in nan_counts.items():
    nan_percent = (nan_count / n_rows) * 100
    print(f"In column '{col}' there are {nan_percent:.2f}% NaN values ({nan_count} empty cells)")
    if nan_percent < 90:
        continue
    useless.append(col)

In column 'Description' there are 0.00% NaN values (0 empty cells)
In column 'Qty' there are 0.00% NaN values (0 empty cells)
In column 'Total Amount' there are 0.00% NaN values (0 empty cells)
In column 'all_nan' there are 0.00% NaN values (0 empty cells)
In column 'Branch' there are 0.00% NaN values (0 empty cells)
In column 'Division' there are 0.20% NaN values (24 empty cells)
In column 'Group' there are 3.47% NaN values (426 empty cells)
