# Data Cleaning

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px

import os

files = os.listdir("data")
for file in files:
    print(file)

Product Reports 01 January 2023 - 31 December 2023.csv
Product Reports 01 January 2024 - 29 February 2024.csv
Product Reports 01 January 2021 - 31 December 2021.csv
Product Reports 01 January 2022 - 31 December 2022.csv


Data Loading

In [3]:
import re
from datetime import datetime


dfs = []
for file in files:
    # match = re.search(r'\d{2} (.*?) \d{4}', file)
    year = re.search(r'\d{4}', file).group()
    # date = datetime.strptime(match.group(), r"%d %M %Y").date()
    # print(match.group())
    df = pd.read_csv(f"data/{file}")
    df["year"] = int(year)
    dfs.append(df)

df = pd.concat(dfs)

In [4]:
df.head()

Unnamed: 0,Menu Item Name,Menu Category,QTY sold,Sales exc Tax.,Discounts,Sales inc Tax.(a),Cost inc Tax(b),Gross Profit(a)-(b),Markup Percentage %(a-b)/b * 100%,year
0,box(-),,1550,£ 5889.10,£ 24.26,£ 5889.10,£ 0.00,£ 5889.10,0 %,2023
1,75. Rib Eye(Standard),BBQ (Korean) 75 - 84,598,£ 8491.85,£ 55.76,£ 8916.44,£ 0.00,£ 8916.44,0 %,2023
2,65D. Egg(Standard),Extras,2,£ 0.95,£ 0.00,£ 1.00,£ 0.00,£ 1.00,0 %,2023
3,76. Dwaeji galbi ribs(default),BBQ (Korean) 75 - 84,220,£ 2499.38,£ 10.65,£ 2624.35,£ 0.00,£ 2624.35,0 %,2023
4,77. beef Bulgogi(Standard),BBQ (Korean) 75 - 84,867,£ 9773.04,£ 121.61,£ 10261.69,£ 0.00,£ 10261.69,0 %,2023


In [5]:
df.shape

(1400, 10)

Renaming columns for ease of manipulation

In [6]:
df.columns

Index(['Menu Item Name', 'Menu Category', 'QTY sold', 'Sales exc Tax.',
       'Discounts', 'Sales inc Tax.(a)', 'Cost inc Tax(b)',
       'Gross Profit(a)-(b)', 'Markup Percentage %(a-b)/b * 100%', 'year'],
      dtype='object')

In [7]:
df.columns = ["name", "category", "sold", "sales_inc_tax", "discount", "sales_exc_tax", "cost_inc_tax", "gross_profit", "markup", "year"]

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1400 entries, 0 to 349
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           1400 non-null   object
 1   category       1392 non-null   object
 2   sold           1400 non-null   int64 
 3   sales_inc_tax  1400 non-null   object
 4   discount       1400 non-null   object
 5   sales_exc_tax  1400 non-null   object
 6   cost_inc_tax   1400 non-null   object
 7   gross_profit   1400 non-null   object
 8   markup         1400 non-null   object
 9   year           1400 non-null   int64 
dtypes: int64(2), object(8)
memory usage: 120.3+ KB


Columns that are supposed to be `float` are being parsed as `object`. This is due to the values having:
- the currency sign (which in this case is `£`)
- the percentage sign `%`

In [9]:
currency_columns = ["sales_inc_tax", "discount", "sales_exc_tax", "cost_inc_tax", "gross_profit"]
percentage_columns = ["markup"]

for col in currency_columns:
    df[col] = df[col].str.replace("£", "").str.strip().astype(float)

for col in percentage_columns:
    df[col] = df[col].str.replace("%", "").str.strip().astype(float)

df.head()

Unnamed: 0,name,category,sold,sales_inc_tax,discount,sales_exc_tax,cost_inc_tax,gross_profit,markup,year
0,box(-),,1550,5889.1,24.26,5889.1,0.0,5889.1,0.0,2023
1,75. Rib Eye(Standard),BBQ (Korean) 75 - 84,598,8491.85,55.76,8916.44,0.0,8916.44,0.0,2023
2,65D. Egg(Standard),Extras,2,0.95,0.0,1.0,0.0,1.0,0.0,2023
3,76. Dwaeji galbi ribs(default),BBQ (Korean) 75 - 84,220,2499.38,10.65,2624.35,0.0,2624.35,0.0,2023
4,77. beef Bulgogi(Standard),BBQ (Korean) 75 - 84,867,9773.04,121.61,10261.69,0.0,10261.69,0.0,2023


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1400 entries, 0 to 349
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           1400 non-null   object 
 1   category       1392 non-null   object 
 2   sold           1400 non-null   int64  
 3   sales_inc_tax  1400 non-null   float64
 4   discount       1400 non-null   float64
 5   sales_exc_tax  1400 non-null   float64
 6   cost_inc_tax   1400 non-null   float64
 7   gross_profit   1400 non-null   float64
 8   markup         1400 non-null   float64
 9   year           1400 non-null   int64  
dtypes: float64(6), int64(2), object(2)
memory usage: 120.3+ KB


As seen above the column types are fixed.

In [11]:
df.describe()

Unnamed: 0,sold,sales_inc_tax,discount,sales_exc_tax,cost_inc_tax,gross_profit,markup,year
count,1400.0,1400.0,1400.0,1400.0,1400.0,1400.0,1400.0,1400.0
mean,417.558571,2805.504229,37.7753,2935.522671,0.0,2935.522671,0.0,2022.49
std,4374.167324,29254.448751,409.809853,30607.512298,0.0,30607.512298,0.0,1.083986
min,1.0,0.0,-50.0,0.0,0.0,0.0,0.0,2021.0
25%,21.75,98.93,0.0,103.5,0.0,103.5,0.0,2022.0
50%,80.0,489.67,3.37,513.15,0.0,513.15,0.0,2023.0
75%,236.25,1572.7875,20.635,1647.405,0.0,1647.405,0.0,2023.0
max,105672.0,698744.26,11407.76,730360.67,0.0,730360.67,0.0,2024.0


## Empty and duplicate data
Checking for empty values and duplicates

In [12]:
df.duplicated().sum()

0

In [13]:
df.isna().sum()

name             0
category         8
sold             0
sales_inc_tax    0
discount         0
sales_exc_tax    0
cost_inc_tax     0
gross_profit     0
markup           0
year             0
dtype: int64

In [14]:
df[df.category.isna()]

Unnamed: 0,name,category,sold,sales_inc_tax,discount,sales_exc_tax,cost_inc_tax,gross_profit,markup,year
0,box(-),,1550,5889.1,24.26,5889.1,0.0,5889.1,0.0,2023
402,Total Sales,,100916,693532.78,6095.22,726228.89,0.0,726228.89,0.0,2023
0,cocumber kimchi(-),,196,1629.97,5.71,1629.97,0.0,1629.97,0.0,2024
309,Total Sales,,13778,106910.06,841.21,111991.12,0.0,111991.12,0.0,2024
0,additional charge for sashimi(-),,38,169.93,4.27,169.93,0.0,169.93,0.0,2021
336,Total Sales,,71925,464665.86,11407.76,486285.19,0.0,486285.19,0.0,2021
0,soju(-),,1188,6026.77,60.47,6026.77,0.0,6026.77,0.0,2022
349,Total Sales,,105672,698744.26,8098.52,730360.67,0.0,730360.67,0.0,2022


As seen above, there are some legitimate products with no category. The other ones are `Total Sales` just noise.

I will remove these as they are not needed.

In [15]:
df = df[~(df.name == "Total Sales")]

In [16]:
df.isna().sum()

name             0
category         4
sold             0
sales_inc_tax    0
discount         0
sales_exc_tax    0
cost_inc_tax     0
gross_profit     0
markup           0
year             0
dtype: int64

The remaining products with no category are custom sales that are not at the menu at the time.
Thus I will replace it with `custom` category. 

In [17]:
df  = df.fillna("custom")
df.head()

Unnamed: 0,name,category,sold,sales_inc_tax,discount,sales_exc_tax,cost_inc_tax,gross_profit,markup,year
0,box(-),custom,1550,5889.1,24.26,5889.1,0.0,5889.1,0.0,2023
1,75. Rib Eye(Standard),BBQ (Korean) 75 - 84,598,8491.85,55.76,8916.44,0.0,8916.44,0.0,2023
2,65D. Egg(Standard),Extras,2,0.95,0.0,1.0,0.0,1.0,0.0,2023
3,76. Dwaeji galbi ribs(default),BBQ (Korean) 75 - 84,220,2499.38,10.65,2624.35,0.0,2624.35,0.0,2023
4,77. beef Bulgogi(Standard),BBQ (Korean) 75 - 84,867,9773.04,121.61,10261.69,0.0,10261.69,0.0,2023


Sort by quantity sold

In [18]:
df = df.sort_values("sold", ascending=False, ignore_index=True)
df.head()

Unnamed: 0,name,category,sold,sales_inc_tax,discount,sales_exc_tax,cost_inc_tax,gross_profit,markup,year
0,Asahi(Pint),Draught Beer,5026,24832.01,61.59,26073.61,0.0,26073.61,0.0,2022
1,Asahi(Pint),Draught Beer,4733,23397.7,44.01,24567.59,0.0,24567.59,0.0,2023
2,Asahi(Pint),Draught Beer,3954,19442.52,146.15,20414.65,0.0,20414.65,0.0,2021
3,Coke(Standard),Soft Drinks,3852,9912.78,18.98,10408.42,0.0,10408.42,0.0,2023
4,Coke(Standard),Soft Drinks,3494,8931.53,55.69,9378.11,0.0,9378.11,0.0,2022


Drop markup column: it is irrelevant since it doesn't contain anything other than 0

In [19]:
df.markup.describe()

count    1396.0
mean        0.0
std         0.0
min         0.0
25%         0.0
50%         0.0
75%         0.0
max         0.0
Name: markup, dtype: float64

In [20]:
df = df.drop(columns=["markup"], errors="ignore")

## Data Standardisation

- (easy) name and categories must be lowercase
- (easy) add a new column to classify drinks and food `is_food`
- (hard) some names are different but refer to same product: combine these
- (medium) some categories are different but refer to same category: combine these
- (easy) remove unnecessary noise from the names and categories e.g. product number, emojis
- (medium) introduce a new column: `variant` for more granular analysis. To illustrate:

| name |
|------|
|bibimbap(beef)|

into

| name | variant |
|------|---------|
| bibimbap | beef |


In [21]:
df.name = df.name.str.lower()
df.category = df.category.str.lower()

In [22]:
df.category.unique()

array(['draught beer', 'soft drinks', 'cocktails', 'sides',
       'donburi 29-39', 'donburi 29-38', 'set menu', 'custom',
       'traditional 1-15', 'korean starter 53-61', 'buns 24-28',
       'traditional 1-14', 'rice dishes 70-74', 'bbq (korean) 75 - 84',
       'buns 25-26', 'sushi', 'sushi rolls (4pc)', 'korean sides 49-52',
       'lunch bento', 'korean soft drinks', 'noodles 47-48',
       'korean nood 62-65', 'soup 66-69', 'tea / coffee',
       'korean nood 62-64', 'robata grill 16-24', 'robata grill 15-23',
       'bottled beer & cider', 'japan rice dis 25-32', 'vegan',
       'sides extras', 'white wine', 'special mains 39-46',
       'platters 45-46', 'dessert', 'korean soju', 'platters 27-28',
       'spirits', 'sushi platters', 'buns 33-38', 'korean side 14-24',
       'rose wine', 'japanese side 1-13', 'ramen & noodle 39-40',
       'champagne', 'korean dishes 41-51', 'red wine', 'grill bbq 52-60',
       'korean set menu', 'sushi rolls 4pc/8pc', 'mocktails',
       're

In [23]:
df.name.unique()

array(['asahi(pint)', 'coke(standard)', 'diet coke(standard)',
       'cocktail(-)', 'chilli fries(standard)',
       '29. chicken katsu curry(standard)', 'tap water(glass)',
       'lunch/early set', 'egg fried rice(-)', 'peroni(pint)', 'box(-)',
       'mocktail(-)', 'lemonade(standard)', 'steamed rice(standard)',
       '02. prawn tempura(standard)', '59. mandu (5pc)(standard)',
       '27.crispy duck bun', '01. spicy squid(standard)', 'soju(-)',
       '34. salmon teriaki', '06. duck harumaki(standard)',
       '33. chicken teriyaki(standard)', '39. beef teriyaki(-)',
       '08. chicken karaagi(standard)',
       '61. fried chicken wings (4pc)(standard)',
       '70. dolsot bimbimbap(beef)', '34. beef teriyaki(-)',
       '77. beef bulgogi(standard)', 'stella(pint)',
       '25. chicken teriyaki(standard)', '11. takoyaki(standard)',
       'asia daisy(standard)', '05. chicken harumaki(standard)',
       'sweet potato fries(-)', 'chicken katsu  curry 日式咖喱鸡(-)',
       'maki (3pc)(s

In [24]:
df[df.category.str.lower().str.contains("cocktail")]

Unnamed: 0,name,category,sold,sales_inc_tax,discount,sales_exc_tax,cost_inc_tax,gross_profit,year
7,cocktail(-),cocktails,3013,20119.23,971.77,20119.23,0.0,20119.23,2021
9,cocktail(-),cocktails,2517,17540.88,78.12,17540.88,0.0,17540.88,2022
22,mocktail(-),cocktails,1530,5795.0,35.25,6084.75,0.0,6084.75,2022
23,cocktail(-),cocktails,1499,10462.16,30.84,10462.16,0.0,10462.16,2023
42,mocktail(-),cocktails,1109,4219.56,23.46,4430.54,0.0,4430.54,2023
50,mocktail(-),cocktails,945,3572.09,29.31,3750.69,0.0,3750.69,2021
63,asia daisy(standard),cocktails,815,5392.84,42.52,5662.48,0.0,5662.48,2022
158,asia daisy(standard),cocktails,469,3143.14,21.7,3300.3,0.0,3300.3,2023
216,asia daisy(standard),cocktails,363,2344.45,79.33,2461.67,0.0,2461.67,2021
278,cocktaill(-),cocktails,293,2329.75,14.25,2329.75,0.0,2329.75,2022


Extracting `variant` from the products. Fortunately, variant is easily found. 

Each name is in the form `product name(variant)`

In [25]:
product = df.name.str.extract("(?P<name>.+)\((?P<variant>.+)\)")
product.tail()

Unnamed: 0,name,variant
1391,soju,green grape🍇
1392,2. lunetta vino spumante rose brut,bottle
1393,kimchee fried rice,seafood
1394,jack daniels,double
1395,nigiri (3pc),beef


Some names still have `(...)` in their names but this is fine as they are part of the name itself and not a variant of the product.

We can see below that it's all `(#pc)`.

NOTE: some product also do not have their variants encoded in their original names as they don't have any variants. This lead to their name and variant value being NaN when performing pandas.extract

In [26]:
product[product.name.str.contains("\(.+\)", regex=True).fillna(False)]

Unnamed: 0,name,variant
29,59. mandu (5pc),standard
36,59. mandu (5pc),standard
43,59. mandu (5pc),standard
49,61. fried chicken wings (4pc),standard
53,61. fried chicken wings (4pc),standard
...,...,...
1360,gunkan (2pc),avacado
1365,gunkan (2pc),spicy tuna or spicy salmon
1374,sushi platter (16 pc),standard
1380,nigiri (3pc),tamago


Populate `name` and `variant` using the extracted values

Make sure to only populate `name` with non-null values

In [27]:
df["variant"] = product.variant
df.loc[product[product.name.notna()].index, "name"] = product[product.name.notna()].name
df[["name", "variant"]].head(20)

Unnamed: 0,name,variant
0,asahi,pint
1,asahi,pint
2,asahi,pint
3,coke,standard
4,coke,standard
5,diet coke,standard
6,diet coke,standard
7,cocktail,-
8,chilli fries,standard
9,cocktail,-


### Unifying Names and Categories

The most time consuming by far in this dataset is the correction of product names and categories.
It will require some manual work but I will try to do majority of the work with pandas.

My main strategy is going to use a technique called [approximate string matching](https://en.wikipedia.org/wiki/Approximate_string_matching) also called as fuzzy string matching to find names that are similar.

In [30]:
df.category.unique()

array(['draught beer', 'soft drinks', 'cocktails', 'sides',
       'donburi 29-39', 'donburi 29-38', 'set menu', 'custom',
       'traditional 1-15', 'korean starter 53-61', 'buns 24-28',
       'traditional 1-14', 'rice dishes 70-74', 'bbq (korean) 75 - 84',
       'buns 25-26', 'sushi', 'sushi rolls (4pc)', 'korean sides 49-52',
       'lunch bento', 'korean soft drinks', 'noodles 47-48',
       'korean nood 62-65', 'soup 66-69', 'tea / coffee',
       'korean nood 62-64', 'robata grill 16-24', 'robata grill 15-23',
       'bottled beer & cider', 'japan rice dis 25-32', 'vegan',
       'sides extras', 'white wine', 'special mains 39-46',
       'platters 45-46', 'dessert', 'korean soju', 'platters 27-28',
       'spirits', 'sushi platters', 'buns 33-38', 'korean side 14-24',
       'rose wine', 'japanese side 1-13', 'ramen & noodle 39-40',
       'champagne', 'korean dishes 41-51', 'red wine', 'grill bbq 52-60',
       'korean set menu', 'sushi rolls 4pc/8pc', 'mocktails',
       're

In [28]:
df.name.unique()

array(['asahi', 'coke', 'diet coke', 'cocktail', 'chilli fries',
       '29. chicken katsu curry', 'tap water', 'lunch/early set',
       'egg fried rice', 'peroni', 'box', 'mocktail', 'lemonade',
       'steamed rice', '02. prawn tempura', '59. mandu (5pc)',
       '27.crispy duck bun', '01. spicy squid', 'soju',
       '34. salmon teriaki', '06. duck harumaki', '33. chicken teriyaki',
       '39. beef teriyaki', '08. chicken karaagi',
       '61. fried chicken wings (4pc)', '70. dolsot bimbimbap',
       '34. beef teriyaki', '77. beef bulgogi', 'stella',
       '25. chicken teriyaki', '11. takoyaki', 'asia daisy',
       '05. chicken harumaki', 'sweet potato fries',
       'chicken katsu  curry 日式咖喱鸡', 'maki (3pc)', '77. bulgogi',
       'dynamite', 'camden pale ale', '24. chicken katsu bun',
       '50. kimchi', 'bento mandu dumplings(4pc)', 'spicy chicken katsu',
       '26. beef teriyaki', 'california', 'hai tai bong bong grape juice',
       '47. yaki soba', '63. kimchi ramen', '

In [33]:
from thefuzz import fuzz, process

Remove numbers and noise from name

In [56]:
for col in ["name", "category"]:
    df[col] = df[col].str.replace("\(.+\)", "", regex=True)
    df[col] = df[col].str.replace("^(\d{1,2}\.?)\s*", "", regex=True)

df.tail()

Unnamed: 0,name,category,sold,sales_inc_tax,discount,sales_exc_tax,cost_inc_tax,gross_profit,year,variant
1391,soju,korean soju,1,9.52,0.0,10.0,0.0,10.0,2024,green grape🍇
1392,lunetta vino spumante rose brut,champagne,1,18.1,0.0,19.0,0.0,19.0,2024,bottle
1393,kimchee fried rice,new items,1,7.58,1.99,7.96,0.0,7.96,2023,seafood
1394,jack daniels,spirits & shots,1,5.24,0.0,5.5,0.0,5.5,2024,double
1395,nigiri,sushi,1,4.76,0.0,5.0,0.0,5.0,2021,beef


In [60]:
df.category.unique()


array(['draught beer', 'soft drinks', 'cocktails', 'sides',
       'donburi 29-39', 'donburi 29-38', 'set menu', 'custom',
       'traditional 1-15', 'korean starter 53-61', 'buns 24-28',
       'traditional 1-14', 'rice dishes 70-74', 'bbq  75 - 84',
       'buns 25-26', 'sushi', 'sushi rolls ', 'korean sides 49-52',
       'lunch bento', 'korean soft drinks', 'noodles 47-48',
       'korean nood 62-65', 'soup 66-69', 'tea / coffee',
       'korean nood 62-64', 'robata grill 16-24', 'robata grill 15-23',
       'bottled beer & cider', 'japan rice dis 25-32', 'vegan',
       'sides extras', 'white wine', 'special mains 39-46',
       'platters 45-46', 'dessert', 'korean soju', 'platters 27-28',
       'spirits', 'sushi platters', 'buns 33-38', 'korean side 14-24',
       'rose wine', 'japanese side 1-13', 'ramen & noodle 39-40',
       'champagne', 'korean dishes 41-51', 'red wine', 'grill bbq 52-60',
       'korean set menu', 'sushi rolls 4pc/8pc', 'mocktails',
       'restaurant spec

Remove numbers from category

In [66]:
df.category = df.category.str.replace("\d{1,2}\s?\-\s?\d{1,2}", "", regex=True)
df.category.unique()

array(['draught beer', 'soft drinks', 'cocktails', 'sides', 'donburi ',
       'set menu', 'custom', 'traditional ', 'korean starter ', 'buns ',
       'rice dishes ', 'bbq  ', 'sushi', 'sushi rolls ', 'korean sides ',
       'lunch bento', 'korean soft drinks', 'noodles ', 'korean nood ',
       'soup ', 'tea / coffee', 'robata grill ', 'bottled beer & cider',
       'japan rice dis ', 'vegan', 'sides extras', 'white wine',
       'special mains ', 'platters ', 'dessert', 'korean soju', 'spirits',
       'sushi platters', 'korean side ', 'rose wine', 'japanese side ',
       'ramen & noodle ', 'champagne', 'korean dishes ', 'red wine',
       'grill bbq ', 'korean set menu', 'sushi rolls 4pc/8pc',
       'mocktails', 'restaurant special', 'asian cocktails',
       'restaurant specials', 'classic cocktails', 'spirits & shots',
       'bbq platters ', 'new items', 'starter break line',
       'seared salmon nigiri', 'extras'], dtype=object)

In [None]:
categories = [
    "traditional",
    "starters",
    "sides",
    "donburi",
    ""
]

In [71]:
pd.DataFrame(process.extract("sushi roll", df.category, limit=100))#[0].unique()

Unnamed: 0,0,1,2
0,sushi rolls,95,73
1,sushi rolls,95,81
2,sushi rolls,95,83
3,sushi rolls,95,120
4,sushi rolls,95,123
...,...,...,...
95,sushi,90,842
96,sushi,90,851
97,sushi,90,861
98,sushi,90,862


In [29]:
# df.to_csv("product_reports_cleaned_2021-2024.csv", index=False)