# Data Cleaning

The dataset at hand

In [1397]:
import pandas as pd
import numpy as np
import plotly.express as px

import os


data_root = "data/daily"
files = os.listdir(data_root)
for file in files:
    print(file)

Product Reports 01 February 2024 - 01 February 2024.csv
Product Reports 29 December 2023 - 29 December 2023.csv
Product Reports 06 July 2023 - 06 July 2023.csv
Product Reports 18 October 2022 - 18 October 2022.csv
Product Reports 25 November 2023 - 25 November 2023.csv
Product Reports 10 January 2024 - 10 January 2024.csv
Product Reports 15 March 2024 - 15 March 2024.csv
Product Reports 22 January 2022 - 22 January 2022.csv
Product Reports 24 January 2023 - 24 January 2023.csv
Product Reports 07 July 2023 - 07 July 2023.csv
Product Reports 10 October 2022 - 10 October 2022.csv
Product Reports 13 February 2022 - 13 February 2022.csv
Product Reports 16 October 2023 - 16 October 2023.csv
Product Reports 06 November 2022 - 06 November 2022.csv
Product Reports 29 August 2023 - 29 August 2023.csv
Product Reports 26 August 2022 - 26 August 2022.csv
Product Reports 20 May 2022 - 20 May 2022.csv
Product Reports 11 December 2022 - 11 December 2022.csv
Product Reports 23 August 2023 - 23 August 2

Data Loading

In [1398]:
import re
from datetime import datetime


dfs = []
for file in files:
    match = re.search(r'\d{2} (.*?) \d{4}', file)
    if match is None: print(file)
    # year = re.search(r'\d{4}', file).group()
    date = datetime.strptime(match.group(), r"%d %B %Y").date()
    # print(match.group())
    df = pd.read_csv(f"{data_root}/{file}")
    df["date"] = date
    dfs.append(df)

df = pd.concat(dfs)

In [1399]:
df.head()

Unnamed: 0,Menu Item Name,Menu Category,QTY sold,Sales exc Tax.,Discounts,Sales inc Tax.(a),Cost inc Tax(b),Gross Profit(a)-(b),Markup Percentage %(a-b)/b * 100%,date
0,53.Dwaeji galbi ribs(default),RESTAURANT Grill BBQ 52-60,1,£ 12.29,£ 0.00,£ 12.90,£ 0.00,£ 12.90,0 %,2024-02-01
1,54.beef Bulgogi(Standard),RESTAURANT Grill BBQ 52-60,1,£ 12.29,£ 0.00,£ 12.90,£ 0.00,£ 12.90,0 %,2024-02-01
2,57.Chargrilled Squid(Standard),RESTAURANT Grill BBQ 52-60,1,£ 12.29,£ 0.00,£ 12.90,£ 0.00,£ 12.90,0 %,2024-02-01
3,"58.Lettuce, Garlic & Chilli(Standard)",RESTAURANT Grill BBQ 52-60,2,£ 6.67,£ 0.00,£ 7.00,£ 0.00,£ 7.00,0 %,2024-02-01
4,Kirin Ichi Ban(Standard),Bottled Beer & Cider,1,£ 4.29,£ 0.00,£ 4.50,£ 0.00,£ 4.50,0 %,2024-02-01


In [1400]:
df.shape

(87809, 10)

Missing daily records. There are 2 possible reasons why a record is missing:
1. The restaurant was closed that day e.g. holiday
2. The file for the record is simply missing (can be obtained)

In [1401]:
pd.date_range(start = '2022-01-01', end = '2024-02-29' ).difference(df.date.unique())

DatetimeIndex(['2022-12-25', '2023-05-12', '2023-05-13', '2023-05-14',
               '2023-05-15', '2023-05-16', '2023-05-17', '2023-05-18',
               '2023-05-19', '2023-05-20', '2023-05-21', '2023-05-22',
               '2023-05-23', '2023-05-24', '2023-05-25', '2023-05-26',
               '2023-05-27', '2023-05-28', '2023-05-29', '2023-05-30',
               '2023-05-31', '2023-06-01', '2023-06-02', '2023-06-03',
               '2023-06-04', '2023-06-05', '2023-06-06', '2023-06-07',
               '2023-06-08', '2023-06-09', '2023-06-10', '2023-06-11',
               '2023-06-12', '2023-06-13', '2023-06-14', '2023-06-15',
               '2023-06-16', '2023-06-17', '2023-06-18', '2023-06-19',
               '2023-06-20', '2023-06-21', '2023-06-22', '2023-06-23',
               '2023-06-24', '2023-06-25', '2023-06-26', '2023-06-27',
               '2023-06-28', '2023-06-29', '2023-06-30', '2023-07-01',
               '2023-07-02', '2023-07-03', '2023-07-04', '2023-07-05',
      

In [1402]:
# TODO: visualize missing records

missing_dates = pd.date_range(start = '2022-01-01', end = '2024-02-29' ).difference(df.date.unique()).to_frame()
missing_dates.columns = ["date"]
missing_dates = missing_dates.reset_index(drop=True)
missing_dates["is_missing"] = True

missing_df = pd.DataFrame({"date": df.date.unique()})
missing_df["is_missing"] = False

missing_df = pd.concat([missing_dates, missing_df])
missing_df.date = pd.to_datetime(missing_df.date)
missing_df.head()

Unnamed: 0,date,is_missing
0,2022-12-25,True
1,2023-05-12,True
2,2023-05-13,True
3,2023-05-14,True
4,2023-05-15,True


In [1403]:
px.bar(missing_df, x="date", y="is_missing", title="Missing Records (Daily)", color="is_missing")

Drop useless columns

In [1404]:
df = df.drop(df.columns[8], axis=1)

Renaming columns for ease of manipulation

In [1405]:
df.columns

Index(['Menu Item Name', 'Menu Category', 'QTY sold', 'Sales exc Tax.',
       'Discounts', 'Sales inc Tax.(a)', 'Cost inc Tax(b)',
       'Gross Profit(a)-(b)', 'date'],
      dtype='object')

In [1406]:
df.columns = ["name", "category", "sold", "sales_inc_tax", "discount", "sales_exc_tax", "cost_inc_tax", "gross_profit", "date"]

In [1407]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 87809 entries, 0 to 179
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           87809 non-null  object
 1   category       86474 non-null  object
 2   sold           87809 non-null  int64 
 3   sales_inc_tax  87809 non-null  object
 4   discount       87809 non-null  object
 5   sales_exc_tax  87809 non-null  object
 6   cost_inc_tax   87809 non-null  object
 7   gross_profit   87809 non-null  object
 8   date           87809 non-null  object
dtypes: int64(1), object(8)
memory usage: 6.7+ MB


Columns that are supposed to be `float` are being parsed as `object`. This is due to the values having:
- the currency sign (which in this case is `£`)
- the percentage sign `%`

In [1408]:
currency_columns = ["sales_inc_tax", "discount", "sales_exc_tax", "cost_inc_tax", "gross_profit"]

for col in currency_columns:
    df[col] = df[col].str.replace("-|£|%", "", regex=True).str.strip().astype(float)

df.head()

Unnamed: 0,name,category,sold,sales_inc_tax,discount,sales_exc_tax,cost_inc_tax,gross_profit,date
0,53.Dwaeji galbi ribs(default),RESTAURANT Grill BBQ 52-60,1,12.29,0.0,12.9,0.0,12.9,2024-02-01
1,54.beef Bulgogi(Standard),RESTAURANT Grill BBQ 52-60,1,12.29,0.0,12.9,0.0,12.9,2024-02-01
2,57.Chargrilled Squid(Standard),RESTAURANT Grill BBQ 52-60,1,12.29,0.0,12.9,0.0,12.9,2024-02-01
3,"58.Lettuce, Garlic & Chilli(Standard)",RESTAURANT Grill BBQ 52-60,2,6.67,0.0,7.0,0.0,7.0,2024-02-01
4,Kirin Ichi Ban(Standard),Bottled Beer & Cider,1,4.29,0.0,4.5,0.0,4.5,2024-02-01


In [1409]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 87809 entries, 0 to 179
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           87809 non-null  object 
 1   category       86474 non-null  object 
 2   sold           87809 non-null  int64  
 3   sales_inc_tax  87809 non-null  float64
 4   discount       87809 non-null  float64
 5   sales_exc_tax  87809 non-null  float64
 6   cost_inc_tax   87809 non-null  float64
 7   gross_profit   87809 non-null  float64
 8   date           87809 non-null  object 
dtypes: float64(5), int64(1), object(3)
memory usage: 6.7+ MB


As seen above the column types are fixed.

In [1410]:
df.describe()

Unnamed: 0,sold,sales_inc_tax,discount,sales_exc_tax,cost_inc_tax,gross_profit
count,87809.0,87809.0,87809.0,87809.0,87809.0,87809.0
mean,4.784453,32.637691,0.333108,34.144147,0.0,34.144147
std,28.103427,191.849225,2.645087,200.740288,0.0,200.740288
min,1.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,6.67,0.0,7.0,0.0,7.0
50%,2.0,10.38,0.0,10.9,0.0,10.9
75%,3.0,20.76,0.0,21.8,0.0,21.8
max,704.0,4669.56,120.09,4884.38,0.0,4884.38


## Empty and duplicate data
Checking for empty values and duplicates

In [1411]:
df.duplicated().sum()

1

In [1412]:
df.isna().sum()

name                0
category         1335
sold                0
sales_inc_tax       0
discount            0
sales_exc_tax       0
cost_inc_tax        0
gross_profit        0
date                0
dtype: int64

In [1413]:
df[df.category.isna()]

Unnamed: 0,name,category,sold,sales_inc_tax,discount,sales_exc_tax,cost_inc_tax,gross_profit,date
102,Total Sales,,191,1482.66,2.40,1550.75,0.0,1550.75,2024-02-01
0,prawn tempura rice (-),,11,228.25,0.00,228.25,0.0,228.25,2023-12-29
135,Total Sales,,354,2576.90,6.54,2689.46,0.0,2689.46,2023-12-29
0,box(-),,1,0.17,0.08,0.17,0.0,0.17,2023-07-06
99,Total Sales,,172,1140.15,22.03,1191.72,0.0,1191.72,2023-07-06
...,...,...,...,...,...,...,...,...,...
91,Total Sales,,169,1135.16,13.12,1187.98,0.0,1187.98,2022-01-09
0,takeaway box(-),,8,2.00,0.00,2.00,0.0,2.00,2023-04-16
110,Total Sales,,240,1552.80,12.63,1628.22,0.0,1628.22,2023-04-16
0,container(-),,15,3.75,0.00,3.75,0.0,3.75,2022-11-19


As seen above, there are some legitimate products with no category. The other ones are `Total Sales` just noise.

I will remove these as they are not needed.

In [1414]:
df = df[~(df.name == "Total Sales")]

In [1415]:
df.isna().sum()

name               0
category         586
sold               0
sales_inc_tax      0
discount           0
sales_exc_tax      0
cost_inc_tax       0
gross_profit       0
date               0
dtype: int64

The remaining products with no category are custom sales that are not at the menu at the time.
Thus I will replace it with `other` category. 

In [1416]:
df  = df.fillna("other")
df.head()

Unnamed: 0,name,category,sold,sales_inc_tax,discount,sales_exc_tax,cost_inc_tax,gross_profit,date
0,53.Dwaeji galbi ribs(default),RESTAURANT Grill BBQ 52-60,1,12.29,0.0,12.9,0.0,12.9,2024-02-01
1,54.beef Bulgogi(Standard),RESTAURANT Grill BBQ 52-60,1,12.29,0.0,12.9,0.0,12.9,2024-02-01
2,57.Chargrilled Squid(Standard),RESTAURANT Grill BBQ 52-60,1,12.29,0.0,12.9,0.0,12.9,2024-02-01
3,"58.Lettuce, Garlic & Chilli(Standard)",RESTAURANT Grill BBQ 52-60,2,6.67,0.0,7.0,0.0,7.0,2024-02-01
4,Kirin Ichi Ban(Standard),Bottled Beer & Cider,1,4.29,0.0,4.5,0.0,4.5,2024-02-01


Sort by quantity sold

In [1417]:
df = df.sort_values("sold", ascending=False, ignore_index=True)
df.head()

Unnamed: 0,name,category,sold,sales_inc_tax,discount,sales_exc_tax,cost_inc_tax,gross_profit,date
0,Asahi(Pint),Draught Beer,82,406.1,0.0,426.4,0.0,426.4,2022-11-12
1,Asahi(Pint),Draught Beer,64,316.46,0.52,332.28,0.0,332.28,2022-10-08
2,Asahi(Pint),Draught Beer,63,312.0,0.0,327.6,0.0,327.6,2023-01-10
3,Asahi(Pint),Draught Beer,63,312.0,0.0,327.6,0.0,327.6,2023-11-25
4,Asahi(Pint),Draught Beer,62,306.55,0.52,321.88,0.0,321.88,2022-05-21


## Data Standardisation

- (easy) name and categories must be lowercase
- (easy) add a new column to classify drinks and food `is_food`
- (hard) some names are different but refer to same product: combine these
- (medium) some categories are different but refer to same category: combine these
- (easy) remove unnecessary noise from the names and categories e.g. product number, emojis
- (medium) introduce a new column: `variant` for more granular analysis. To illustrate:

| name |
|------|
|bibimbap(beef)|

into

| name | variant |
|------|---------|
| bibimbap | beef |


In [1418]:
df.name = df.name.str.lower()
df.category = df.category.str.lower()

df.name = df.name.str.replace("^(\d{1,2}\.?)\s*", "", regex=True)

In [1419]:
df.category.unique()

array(['draught beer', 'donburi 29-39', 'soft drinks', 'cocktails',
       'korean starter 53-61', 'donburi 29-38', 'sides', 'other',
       'sides extras', 'japan rice dis 25-32', 'restaurant special',
       'spirits', 'traditional 1-14', 'buns 24-28', 'traditional 1-15',
       'lunch bento', 'noodles 47-48', 'buns 25-26',
       'bottled beer & cider', 'buns 33-38', 'bbq (korean) 75 - 84',
       'korean soft drinks', 'japanese side 1-13', 'korean soju',
       'korean side 14-24', 'tea / coffee', 'special mains 39-46',
       'sushi', 'set menu', 'sushi rolls (4pc)', 'vegan',
       'ramen & noodle 39-40', 'spirits & shots', 'rice dishes 70-74',
       'dessert', 'white wine', 'robata grill 15-23',
       'korean dishes 41-51', 'asian cocktails', 'korean sides 49-52',
       'robata grill 16-24', 'soup 66-69', 'korean nood 62-64',
       'korean set menu', 'classic cocktails', 'korean nood 62-65',
       'rose wine', 'red wine', 'restaurant grill bbq 52-60',
       'sushi rolls 4p

In [1420]:
df.name.unique()


Output cache limit (currently 1000 entries) hit.
Flushing oldest 200 entries.



array(['asahi(pint)', 'chicken katsu curry(standard)', 'tap water(glass)',
       'peroni(pint)', 'cocktail(-)', 'coke(standard)', 'stella(pint)',
       'diet coke(standard)', 'mandu (5pc)(standard)',
       'chilli fries(standard)', 'pan seared seabass(-)',
       'camden pale ale(pint)', 'ramen pork belly w/extra veg(-)',
       'lemonade(standard)', 'drinks(-)', 'takeaway box(-)',
       'egg fried rice(-)', 'mocktail(-)', 'restaurant special set',
       'tequila(standard)', 'steamed rice(standard)',
       'prawn tempura(standard)', 'asahi(half)', 'trial(-)', 'box(-)',
       'asia daisy(standard)', 'crispy duck bun',
       'bento mandu dumplings(4pc)(-)', 'yaki soba(chicken)',
       'chicken teriyaki(standard)', 'lucky buddha(-)', 'container(-)',
       'fever tree tonic(standard)', 'tequila(-)',
       'kimchi fried rice chix and pork(-)', 'chicken harumaki(standard)',
       'rib eye(standard)', 'chicken katsu  curry 日式咖喱鸡(-)',
       'singha lager', 'aloe vera(standard)', '

In [1421]:
df[df.category.str.lower().str.contains("cocktail")]

Unnamed: 0,name,category,sold,sales_inc_tax,discount,sales_exc_tax,cost_inc_tax,gross_profit,date
47,cocktail(-),cocktails,37,259.00,0.0,259.0,0.0,259.0,2022-06-18
49,cocktail(-),cocktails,36,252.00,0.0,252.0,0.0,252.0,2022-02-26
60,cocktail(-),cocktails,34,238.00,0.0,238.0,0.0,238.0,2022-01-22
85,cocktail(-),cocktails,30,209.30,0.7,209.3,0.0,209.3,2022-07-15
86,cocktail(-),cocktails,30,210.00,0.0,210.0,0.0,210.0,2022-01-29
...,...,...,...,...,...,...,...,...,...
86841,restaurant plum sour(standard),cocktails,1,6.67,0.0,7.0,0.0,7.0,2022-01-04
86900,matcha mojito(standard),cocktails,1,6.67,0.0,7.0,0.0,7.0,2022-09-15
86945,cocktail(-),cocktails,1,7.00,0.0,7.0,0.0,7.0,2022-01-04
87012,asia daisy(standard),cocktails,1,6.67,0.0,7.0,0.0,7.0,2023-07-20


Extracting `variant` from the products. Fortunately, variant is easily found. 

Each name is in the form `product name(variant)`

In [1422]:
product = df.name.str.extract("(?P<name>.+)\((?P<variant>.+)\)")
product.tail()

Unnamed: 0,name,variant
87055,cocktail,-
87056,sweet potato fries,-
87057,dolsot bimbimbap,beef
87058,kimchi fried rice,pork
87059,,


Some names still have `(...)` in their names but this is fine as they are part of the name itself and not a variant of the product.

We can see below that it's all `name (#pc)`.

In [1423]:
product[product.name.str.contains("\(.+\)", regex=True).fillna(False)].head()

Unnamed: 0,name,variant
148,mandu (5pc),standard
280,mandu (5pc),standard
553,mandu (5pc),standard
679,bento mandu dumplings(4pc),-
683,mandu (5pc),standard


Some products also do not have their variants encoded in their original names as they don't have any variants. This leads to their `name` and `variant` value being `NaN` when performing `pandas.extract`. 

We can see this below from their original `name`

In [1424]:
df[product.name.isna()].head()

Unnamed: 0,name,category,sold,sales_inc_tax,discount,sales_exc_tax,cost_inc_tax,gross_profit,date
366,restaurant special set,restaurant special,19,541.05,0.0,568.1,0.0,568.1,2023-10-21
647,crispy duck bun,buns 24-28,15,112.86,0.0,118.5,0.0,118.5,2023-09-30
652,crispy duck bun,buns 24-28,15,112.86,0.0,118.5,0.0,118.5,2023-02-17
773,crispy duck bun,buns 33-38,14,118.67,0.0,124.6,0.0,124.6,2024-03-16
861,singha lager,draught beer,14,72.8,0.0,72.8,0.0,72.8,2023-07-21


Populate `name` and `variant` using the extracted values

Make sure to only populate `name` with non-null values

In [1425]:
df["variant"] = product.variant
df.loc[product[product.name.notna()].index, "name"] = product[product.name.notna()].name
df[["name", "variant"]].head(20)

Unnamed: 0,name,variant
0,asahi,pint
1,asahi,pint
2,asahi,pint
3,asahi,pint
4,asahi,pint
5,chicken katsu curry,standard
6,asahi,pint
7,asahi,pint
8,tap water,glass
9,asahi,pint


### Unifying Names and Categories

The most time consuming by far in this dataset is the correction of product names and categories.
It will require some manual work but I will try to do majority of the work with pandas.

My main strategy is going to use a technique called [approximate string matching](https://en.wikipedia.org/wiki/Approximate_string_matching) also called as fuzzy string matching to find names that are similar.

In [1426]:
df.category.unique()

array(['draught beer', 'donburi 29-39', 'soft drinks', 'cocktails',
       'korean starter 53-61', 'donburi 29-38', 'sides', 'other',
       'sides extras', 'japan rice dis 25-32', 'restaurant special',
       'spirits', 'traditional 1-14', 'buns 24-28', 'traditional 1-15',
       'lunch bento', 'noodles 47-48', 'buns 25-26',
       'bottled beer & cider', 'buns 33-38', 'bbq (korean) 75 - 84',
       'korean soft drinks', 'japanese side 1-13', 'korean soju',
       'korean side 14-24', 'tea / coffee', 'special mains 39-46',
       'sushi', 'set menu', 'sushi rolls (4pc)', 'vegan',
       'ramen & noodle 39-40', 'spirits & shots', 'rice dishes 70-74',
       'dessert', 'white wine', 'robata grill 15-23',
       'korean dishes 41-51', 'asian cocktails', 'korean sides 49-52',
       'robata grill 16-24', 'soup 66-69', 'korean nood 62-64',
       'korean set menu', 'classic cocktails', 'korean nood 62-65',
       'rose wine', 'red wine', 'restaurant grill bbq 52-60',
       'sushi rolls 4p

In [1427]:
df.name.unique()

array(['asahi', 'chicken katsu curry', 'tap water', 'peroni', 'cocktail',
       'coke', 'stella', 'diet coke', 'mandu (5pc)', 'chilli fries',
       'pan seared seabass', 'camden pale ale',
       'ramen pork belly w/extra veg', 'lemonade', 'drinks',
       'takeaway box', 'egg fried rice', 'mocktail',
       'restaurant special set', 'tequila', 'steamed rice',
       'prawn tempura', 'trial', 'box', 'asia daisy', 'crispy duck bun',
       'bento mandu dumplings(4pc)', 'yaki soba', 'chicken teriyaki',
       'lucky buddha', 'container', 'fever tree tonic',
       'kimchi fried rice chix and pork', 'chicken harumaki', 'rib eye',
       'chicken katsu  curry 日式咖喱鸡', 'singha lager', 'aloe vera',
       'chicken katsu bun', 'spicy squid', 'sweet potato fries',
       'restaurant special set with seafood pancake', 'vegetable tempura',
       'duck harumaki', 'yaki soba vegetable 1/2 portion', 'corona',
       'bento salmon/sushi', 'salmon teriaki rice',
       'jinro cham yi sul (fresh)', 

In [1428]:
from thefuzz import fuzz, process

Remove numbers and noise from name

In [1429]:
for col in ["name", "category"]:
    df[col] = df[col].str.replace("\(.+\)", "", regex=True)
    df[col] = df[col].str.replace("^(\d{1,2}[a-z]?\.?)\s*", "", regex=True)

df.name = df.name.str.strip()
df.tail()

Unnamed: 0,name,category,sold,sales_inc_tax,discount,sales_exc_tax,cost_inc_tax,gross_profit,date,variant
87055,cocktail,cocktails,1,7.0,0.0,7.0,0.0,7.0,2023-12-11,-
87056,sweet potato fries,sides extras,1,3.33,0.0,3.5,0.0,3.5,2023-12-11,-
87057,dolsot bimbimbap,korean dishes 41-51,1,13.24,0.0,13.9,0.0,13.9,2023-12-11,beef
87058,kimchi fried rice,korean dishes 41-51,1,12.29,0.0,12.9,0.0,12.9,2023-12-11,pork
87059,restaurant buns platter,buns 24-28,1,18.0,0.0,18.9,0.0,18.9,2022-11-19,


Remove numbers from category

In [1430]:
df.category = df.category.str.replace("\d{1,2}\s?\-\s?\d{1,2}", "", regex=True)
df.category = df.category.str.strip()
df.category.unique()

array(['draught beer', 'donburi', 'soft drinks', 'cocktails',
       'korean starter', 'sides', 'other', 'sides extras',
       'japan rice dis', 'restaurant special', 'spirits', 'traditional',
       'buns', 'lunch bento', 'noodles', 'bottled beer & cider', 'bbq',
       'korean soft drinks', 'japanese side', 'korean soju',
       'korean side', 'tea / coffee', 'special mains', 'sushi',
       'set menu', 'sushi rolls', 'vegan', 'ramen & noodle',
       'spirits & shots', 'rice dishes', 'dessert', 'white wine',
       'robata grill', 'korean dishes', 'asian cocktails', 'korean sides',
       'soup', 'korean nood', 'korean set menu', 'classic cocktails',
       'rose wine', 'red wine', 'restaurant grill bbq',
       'sushi rolls 4pc/8pc', 'mocktails', 'champagne', 'platters',
       'restaurant specials', 'sushi platters', 'bbq platters', 'extras',
       'new items', 'hotpot', 'seared salmon nigiri'], dtype=object)

*Seared salmon nigiri* should be a product with name nigiri and variant seared salmon.

In [1431]:
indices = df[df.category == "seared salmon nigiri"].index
df.loc[indices, "name"] = "nigiri"
df.loc[indices, "category"] = "sushi"
df.loc[indices, "variant"] = "seared salmon"

There is another unnecessary category called `extras`: I can either just drop this or merge it into `other` category.

I will choose to merge it just in case.

In [1432]:
df[df.category == "extras"]

Unnamed: 0,name,category,sold,sales_inc_tax,discount,sales_exc_tax,cost_inc_tax,gross_profit,date,variant
24937,d. egg,extras,2,0.95,0.0,1.0,0.0,1.0,2023-03-18,standard


In [1433]:
indices = df[df.category == "extras"].index
df.loc[indices, "name"] = "egg"
df.loc[indices, "category"] = "other"
df.loc[indices, "variant"] = "standard"

In [1434]:
# df[df.category == "korean side"].groupby("name").agg({"sold": "sum", "sales_exc_tax": "sum"}).sort_values("sold", ascending=False)

To properly correct the names and categories, I need the correct and true values.

In [1435]:
drinks_categories = [
    "asian cocktails",
    "cocktails",
    "mocktails",
    "draught beer",
    "soft drinks",
    "korean soft drinks",
    "bottled beer & cider",
    "korean soju",
    "tea / coffee",
    "spirits",
    "red wine",
    "rose wine",
    "white wine",
    "champagne",
]

food_categories = [
    "japanese starters",
    "korean starters",
    "sides",    # unify korean sides, sides, and sides extras
    "buns",
    "bbq",
    "korean rice dishes",
    "japanese noodles",
    "korean noodles",
    "japanese rice dishes",
    "robata grill", 
    "restaurant special",
    "sushi",
    "sushi rolls",
    "sushi platter",
    "bbq platter",
    "lunch bento",
    "set menu",
    "hotpot",
    "dessert",
    "new items",    # I think this category isn't really needed
    "vegan",
    "soup",
]

other = [
    "other",
]

In [1436]:
df[df.category == "korean sides"]

Unnamed: 0,name,category,sold,sales_inc_tax,discount,sales_exc_tax,cost_inc_tax,gross_profit,date,variant
2283,kimchi,korean sides,9,33.43,0.00,35.10,0.0,35.10,2023-01-29,standard
2306,kimchi,korean sides,9,33.43,0.00,35.10,0.0,35.10,2022-05-13,standard
2986,kimchi,korean sides,8,29.71,0.00,31.20,0.0,31.20,2023-01-08,standard
2990,kimchi,korean sides,8,29.71,0.00,31.20,0.0,31.20,2023-01-22,standard
3391,kimchi,korean sides,8,26.67,0.00,28.00,0.0,28.00,2022-04-01,standard
...,...,...,...,...,...,...,...,...,...,...
86778,mooli kimchi,korean sides,1,2.76,0.00,2.90,0.0,2.90,2022-12-29,standard
86849,mooli kimchi,korean sides,1,2.38,0.00,2.50,0.0,2.50,2022-01-04,standard
86905,kimchi,korean sides,1,3.17,0.57,3.33,0.0,3.33,2022-09-15,standard
87016,kimchi set,korean sides,1,7.14,0.00,7.50,0.0,7.50,2023-07-20,standard


In [1437]:
fuzzed = df.category.apply(
    lambda r: process.extractOne(
        r,
        drinks_categories + food_categories + other,
        # scorer=fuzz.token_set_ratio,
    )
).to_frame()
fuzzed.columns = ["fuzzed_category"]

In [1438]:
temp = fuzzed.explode("fuzzed_category")
fuzzed = pd.concat([temp.iloc[::2, :], temp.iloc[1::2, :]], axis=1)
fuzzed.columns = ["fuzzed_category", "score"]
fuzzed["category"] = df.category
fuzzed.head()

Unnamed: 0,fuzzed_category,score,category
0,draught beer,100,draught beer
1,draught beer,100,draught beer
2,draught beer,100,draught beer
3,draught beer,100,draught beer
4,draught beer,100,draught beer


In [1439]:
fuzzed[fuzzed.score < 100].sort_values("score")

Unnamed: 0,fuzzed_category,score,category
5,soft drinks,51,donburi
73872,other,51,traditional
73861,other,51,traditional
29929,other,51,traditional
29932,other,51,traditional
...,...,...,...
18523,korean starters,97,korean starter
67712,korean starters,97,korean starter
18505,korean starters,97,korean starter
39410,korean starters,97,korean starter


To get some idea about the the transformation on a more visual basis, I will plot the original categories and their respective score averaged (mean).

Things to notice are:
- `donburi` is low, that's fine because it's going to be corrected to `japanese rice dish`
- `ramen & noodle` is low; will be corrected to `japanese noodles`
- there are 2 sushi roll categories
- `special mains` is low; will be corrected to `restaurant special`
- `traditional` is low; will be corrected to `japanese starters`

In [1440]:
px.bar(
    fuzzed.groupby("category").score.mean().reset_index(),
    x="category",
    y="score",
    title="Similarity Score (Mean) of Transformed Category Names",
)

Check the mapping of categories to see any thing we need to hardcode.
1. `traditional` should be `japanese starters`
2. `donburi` should be `japanese rice dishes`
3. `ramen & noodle` should be `japanese noodles`
4. `platters`  should be `bbq platters`
5. `korean side` should be `korean starters`
6. `japanese side` should be `japanese starters`
7. `rice dishes` should be `japanese rice dishes`
8. `sushi rolls 4pc/8pc` should be `sushi rolls`

In [1441]:
fuzzed.groupby("category").first().reset_index().sort_values("score")

Unnamed: 0,category,fuzzed_category,score
49,traditional,other,51
9,donburi,soft drinks,51
41,special mains,restaurant special,66
28,ramen & noodle,korean noodles,71
12,japan rice dis,japanese rice dishes,82
27,platters,sushi platter,84
17,korean side,korean soft drinks,86
13,japanese side,japanese rice dishes,86
15,korean nood,korean noodles,88
25,noodles,japanese noodles,90


In [1442]:
category_mapping = fuzzed.set_index("category")["fuzzed_category"].to_dict()

category_mapping["traditional"] = "japanese starters"
category_mapping["donburi"] = "japanese rice dishes"
category_mapping["ramen & noodle"] = "japanese noodles"
category_mapping["platters"] = "bbq platters"
category_mapping["korean side"] = "korean starters"
category_mapping["japanese side"] = "japanese starters"
category_mapping["rice dishes"] = "japanese rice dishes"
category_mapping["sushi rolls 4pc/8pc"] = "sushi rolls"
category_mapping

{'draught beer': 'draught beer',
 'donburi': 'japanese rice dishes',
 'soft drinks': 'soft drinks',
 'cocktails': 'cocktails',
 'korean starter': 'korean starters',
 'sides': 'sides',
 'other': 'other',
 'sides extras': 'sides',
 'japan rice dis': 'japanese rice dishes',
 'restaurant special': 'restaurant special',
 'spirits': 'spirits',
 'traditional': 'japanese starters',
 'buns': 'buns',
 'lunch bento': 'lunch bento',
 'noodles': 'japanese noodles',
 'bottled beer & cider': 'bottled beer & cider',
 'bbq': 'bbq',
 'korean soft drinks': 'korean soft drinks',
 'japanese side': 'japanese starters',
 'korean soju': 'korean soju',
 'korean side': 'korean starters',
 'tea / coffee': 'tea / coffee',
 'special mains': 'restaurant special',
 'sushi': 'sushi',
 'set menu': 'set menu',
 'sushi rolls': 'sushi rolls',
 'vegan': 'vegan',
 'ramen & noodle': 'japanese noodles',
 'spirits & shots': 'spirits',
 'rice dishes': 'japanese rice dishes',
 'dessert': 'dessert',
 'white wine': 'white wine',


In [1443]:
df.category = df.category.replace(category_mapping)
df.category.unique()


array(['draught beer', 'japanese rice dishes', 'soft drinks', 'cocktails',
       'korean starters', 'sides', 'other', 'restaurant special',
       'spirits', 'japanese starters', 'buns', 'lunch bento',
       'japanese noodles', 'bottled beer & cider', 'bbq',
       'korean soft drinks', 'korean soju', 'tea / coffee', 'sushi',
       'set menu', 'sushi rolls', 'vegan', 'dessert', 'white wine',
       'robata grill', 'korean rice dishes', 'asian cocktails', 'soup',
       'korean noodles', 'rose wine', 'red wine', 'mocktails',
       'champagne', 'bbq platters', 'sushi platter', 'bbq platter',
       'new items', 'hotpot'], dtype=object)

As all things engineering, data cleaning is an iterative process where there is no one technique that will solve all of the problems at hand. Approximate string matching cleaned most of the data but it will not clean everything. I will have to do some manual inspection to find records that were not covered by approximate string matching, find the pattern, and automate it so that when new data is collected data preparating is a unified process that requires minimal manual inspection.

There are still some issues with the `category`, there are products that are in different categories. These should be unified as well.
categories to notice are:
- there products in korean starters that should be in robata grill
- 

In [1444]:
df[df.category == "korean starters"].index

Index([  148,   280,   553,   683,   997,  1095,  1173,  1272,  1331,  1343,
       ...
       86853, 86894, 86907, 86909, 86973, 86974, 86975, 87018, 87037, 87038],
      dtype='int64', length=4072)

In [1445]:
robata_grill_mapping

{'pork yakitori': 'robata grill',
 'chilli beef': 'robata grill',
 'teriyaki salmon': 'robata grill',
 'teriyaki chicken': 'robata grill',
 'grill wings': 'robata grill',
 'king prawn': 'robata grill',
 'grilled squid': 'robata grill',
 'miso lamb': 'robata grill',
 'teriyaki beef': 'robata grill',
 'mixed vegetable': 'robata grill'}

In [1446]:
robata_grill_names = set(df[df.category == "robata grill"].name.values)
indices = df[df.name.isin(robata_grill_mapping) & (df.category == "korean starters")].index
df.loc[indices, "category"] = "robata grill"
# df.category = df.name.replace(robata_grill_mapping)

In [1447]:
df[df.category == "korean starters"].name.unique()

array(['mandu', 'fried chicken wings', 'kimchi pancake',
       'seafood pancake', 'kan pun ki', 'topokki', 'bossam pork belly',
       'teriyaki salmon side', 'teriyaki chicken side', 'corn cheese',
       'beef jap chae', 'tofu kimchi', 'dubu jorim'], dtype=object)

In [1448]:
salmon = df[(df.category == "korean starters") & (df.name == "teriyaki salmon side")].index
chicken = df[(df.category == "korean starters") & (df.name == "teriyaki chicken side")].index

df.loc[salmon, ["name", "category"]] = ["teriyaki salmon", "robata grill"]
df.loc[chicken, ["name", "category"]] = ["teriyaki chicken", "robata grill"]

In [1449]:
df[df.category == "korean starters"].name.unique()

array(['mandu', 'fried chicken wings', 'kimchi pancake',
       'seafood pancake', 'kan pun ki', 'topokki', 'bossam pork belly',
       'corn cheese', 'beef jap chae', 'tofu kimchi', 'dubu jorim'],
      dtype=object)

## Unifying Product Names

## Conclusion

In [1450]:
# df.to_csv("product_reports_partial_cleaned_2022-2024.csv", index=False)